diff options
author | Vitaly Buka <vitalybuka@google.com> | 2024-03-27 10:37:41 -0700 |
---|---|---|
committer | Vitaly Buka <vitalybuka@google.com> | 2024-03-27 10:37:41 -0700 |
commit | 3c80a2aa608d9068623cde891cf71cd4f7c19f16 (patch) | |
tree | 45b8bc261e29f078314b6c33064318e441f7f64a | |
parent | 45cb6b7a7f942f8e83ea5039ca0fcee1788346b7 (diff) | |
parent | 3403aee7a38ac979dbb5e57c779063535c605f6d (diff) | |
download | llvm-users/vitalybuka/spr/instrprofiling-do-not-sanitize-pgo-instrumentation.zip llvm-users/vitalybuka/spr/instrprofiling-do-not-sanitize-pgo-instrumentation.tar.gz llvm-users/vitalybuka/spr/instrprofiling-do-not-sanitize-pgo-instrumentation.tar.bz2 |
Created using spr 1.3.4
355 files changed, 8165 insertions, 4319 deletions
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index b8e8ab2..ff61cf8 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # v2.1.2 + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 with: results_file: results.sarif results_format: sarif diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0dd026a..0fdd9e3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -454,6 +454,7 @@ Bug Fixes to C++ Support - Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343). - Fix an issue where a namespace alias could be defined using a qualified name (all name components following the first `::` were ignored). +- Fix an out-of-bounds crash when checking the validity of template partial specializations. (part of #GH86757). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 66da1c7..8af99a0 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -849,10 +849,89 @@ Check for performance anti-patterns when using Grand Central Dispatch. .. _optin-performance-Padding: -optin.performance.Padding -""""""""""""""""""""""""" +optin.performance.Padding (C, C++, ObjC) +"""""""""""""""""""""""""""""""""""""""" Check for excessively padded structs. +This checker detects structs with excessive padding, which can lead to wasted +memory thus decreased performance by reducing the effectiveness of the +processor cache. Padding bytes are added by compilers to align data accesses +as some processors require data to be aligned to certain boundaries. On others, +unaligned data access are possible, but impose significantly larger latencies. 
+ +To avoid padding bytes, the fields of a struct should be ordered by decreasing +by alignment. Usually, its easier to think of the ``sizeof`` of the fields, and +ordering the fields by ``sizeof`` would usually also lead to the same optimal +layout. + +In rare cases, one can use the ``#pragma pack(1)`` directive to enforce a packed +layout too, but it can significantly increase the access times, so reordering the +fields is usually a better solution. + + +.. code-block:: cpp + + // warn: Excessive padding in 'struct NonOptimal' (35 padding bytes, where 3 is optimal) + struct NonOptimal { + char c1; + // 7 bytes of padding + std::int64_t big1; // 8 bytes + char c2; + // 7 bytes of padding + std::int64_t big2; // 8 bytes + char c3; + // 7 bytes of padding + std::int64_t big3; // 8 bytes + char c4; + // 7 bytes of padding + std::int64_t big4; // 8 bytes + char c5; + // 7 bytes of padding + }; + static_assert(sizeof(NonOptimal) == 4*8+5+5*7); + + // no-warning: The fields are nicely aligned to have the minimal amount of padding bytes. + struct Optimal { + std::int64_t big1; // 8 bytes + std::int64_t big2; // 8 bytes + std::int64_t big3; // 8 bytes + std::int64_t big4; // 8 bytes + char c1; + char c2; + char c3; + char c4; + char c5; + // 3 bytes of padding + }; + static_assert(sizeof(Optimal) == 4*8+5+3); + + // no-warning: Bit packing representation is also accepted by this checker, but + // it can significantly increase access times, so prefer reordering the fields. + #pragma pack(1) + struct BitPacked { + char c1; + std::int64_t big1; // 8 bytes + char c2; + std::int64_t big2; // 8 bytes + char c3; + std::int64_t big3; // 8 bytes + char c4; + std::int64_t big4; // 8 bytes + char c5; + }; + static_assert(sizeof(BitPacked) == 4*8+5); + +The ``AllowedPad`` option can be used to specify a threshold for the number +padding bytes raising the warning. 
If the number of padding bytes of the struct +and the optimal number of padding bytes differ by more than the threshold value, +a warning will be raised. + +By default, the ``AllowedPad`` threshold is 24 bytes. + +To override this threshold to e.g. 4 bytes, use the +``-analyzer-config optin.performance.Padding:AllowedPad=4`` option. + + .. _optin-portability-UnixAPI: optin.portability.UnixAPI diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 60db3cf..7a8bd98 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2991,6 +2991,7 @@ enum CXCallingConv { CXCallingConv_AArch64SVEPCS = 18, CXCallingConv_M68kRTD = 19, CXCallingConv_PreserveNone = 20, + CXCallingConv_RISCVVectorCall = 21, CXCallingConv_Invalid = 100, CXCallingConv_Unexposed = 200 diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 318d4e5..80e6075 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3011,6 +3011,13 @@ def PreserveNone : DeclOrTypeAttr, TargetSpecificAttr<TargetAnyX86> { let Documentation = [PreserveNoneDocs]; } +def RISCVVectorCC: DeclOrTypeAttr, TargetSpecificAttr<TargetRISCV> { + let Spellings = [CXX11<"riscv", "vector_cc">, + C23<"riscv", "vector_cc">, + Clang<"riscv_vector_cc">]; + let Documentation = [RISCVVectorCCDocs]; +} + def Target : InheritableAttr { let Spellings = [GCC<"target">]; let Args = [StringArgument<"featuresStr">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 384aebb..3ea4d67 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -5494,6 +5494,17 @@ for clang builtin functions. }]; } +def RISCVVectorCCDocs : Documentation { + let Category = DocCatCallingConvs; + let Heading = "riscv::vector_cc, riscv_vector_cc, clang::riscv_vector_cc"; + let Content = [{ +The ``riscv_vector_cc`` attribute can be applied to a function. 
It preserves 15 +registers namely, v1-v7 and v24-v31 as callee-saved. Callers thus don't need +to save these registers before function calls, and callees only need to save +them if they use them. + }]; +} + def PreferredNameDocs : Documentation { let Category = DocCatDecl; let Content = [{ diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td index 27df731..e3263fe 100644 --- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td +++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td @@ -18,6 +18,7 @@ def err_no_output_file: Error<"no output file specified">; def err_no_such_header_file : Error<"no such %select{public|private|project}1 header file: '%0'">; def warn_no_such_excluded_header_file : Warning<"no such excluded %select{public|private}0 header file: '%1'">, InGroup<InstallAPIViolation>; def warn_glob_did_not_match: Warning<"glob '%0' did not match any header file">, InGroup<InstallAPIViolation>; +def err_no_such_umbrella_header_file : Error<"%select{public|private|project}1 umbrella header file not found in input: '%0'">; } // end of command line category. let CategoryName = "Verification" in { diff --git a/clang/include/clang/Basic/Specifiers.h b/clang/include/clang/Basic/Specifiers.h index 8586405..fb11e82 100644 --- a/clang/include/clang/Basic/Specifiers.h +++ b/clang/include/clang/Basic/Specifiers.h @@ -273,29 +273,30 @@ namespace clang { /// CallingConv - Specifies the calling convention that a function uses. 
enum CallingConv { - CC_C, // __attribute__((cdecl)) - CC_X86StdCall, // __attribute__((stdcall)) - CC_X86FastCall, // __attribute__((fastcall)) - CC_X86ThisCall, // __attribute__((thiscall)) - CC_X86VectorCall, // __attribute__((vectorcall)) - CC_X86Pascal, // __attribute__((pascal)) - CC_Win64, // __attribute__((ms_abi)) - CC_X86_64SysV, // __attribute__((sysv_abi)) - CC_X86RegCall, // __attribute__((regcall)) - CC_AAPCS, // __attribute__((pcs("aapcs"))) - CC_AAPCS_VFP, // __attribute__((pcs("aapcs-vfp"))) - CC_IntelOclBicc, // __attribute__((intel_ocl_bicc)) - CC_SpirFunction, // default for OpenCL functions on SPIR target - CC_OpenCLKernel, // inferred for OpenCL kernels - CC_Swift, // __attribute__((swiftcall)) + CC_C, // __attribute__((cdecl)) + CC_X86StdCall, // __attribute__((stdcall)) + CC_X86FastCall, // __attribute__((fastcall)) + CC_X86ThisCall, // __attribute__((thiscall)) + CC_X86VectorCall, // __attribute__((vectorcall)) + CC_X86Pascal, // __attribute__((pascal)) + CC_Win64, // __attribute__((ms_abi)) + CC_X86_64SysV, // __attribute__((sysv_abi)) + CC_X86RegCall, // __attribute__((regcall)) + CC_AAPCS, // __attribute__((pcs("aapcs"))) + CC_AAPCS_VFP, // __attribute__((pcs("aapcs-vfp"))) + CC_IntelOclBicc, // __attribute__((intel_ocl_bicc)) + CC_SpirFunction, // default for OpenCL functions on SPIR target + CC_OpenCLKernel, // inferred for OpenCL kernels + CC_Swift, // __attribute__((swiftcall)) CC_SwiftAsync, // __attribute__((swiftasynccall)) - CC_PreserveMost, // __attribute__((preserve_most)) - CC_PreserveAll, // __attribute__((preserve_all)) + CC_PreserveMost, // __attribute__((preserve_most)) + CC_PreserveAll, // __attribute__((preserve_all)) CC_AArch64VectorCall, // __attribute__((aarch64_vector_pcs)) - CC_AArch64SVEPCS, // __attribute__((aarch64_sve_pcs)) - CC_AMDGPUKernelCall, // __attribute__((amdgpu_kernel)) - CC_M68kRTD, // __attribute__((m68k_rtd)) - CC_PreserveNone, // __attribute__((preserve_none)) + CC_AArch64SVEPCS, // 
__attribute__((aarch64_sve_pcs)) + CC_AMDGPUKernelCall, // __attribute__((amdgpu_kernel)) + CC_M68kRTD, // __attribute__((m68k_rtd)) + CC_PreserveNone, // __attribute__((preserve_none)) + CC_RISCVVectorCall, // __attribute__((riscv_vector_cc)) }; /// Checks whether the given calling convention supports variadic diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h index 235b4da..c67503d 100644 --- a/clang/include/clang/InstallAPI/HeaderFile.h +++ b/clang/include/clang/InstallAPI/HeaderFile.h @@ -24,8 +24,6 @@ namespace clang::installapi { enum class HeaderType { - /// Unset or unknown type. - Unknown, /// Represents declarations accessible to all clients. Public, /// Represents declarations accessible to a disclosed set of clients. @@ -33,6 +31,8 @@ enum class HeaderType { /// Represents declarations only accessible as implementation details to the /// input library. Project, + /// Unset or unknown type. + Unknown, }; inline StringRef getName(const HeaderType T) { @@ -62,6 +62,8 @@ class HeaderFile { bool Excluded{false}; /// Add header file to processing. bool Extra{false}; + /// Specify that header file is the umbrella header for library. 
+ bool Umbrella{false}; public: HeaderFile() = delete; @@ -79,17 +81,21 @@ public: void setExtra(bool V = true) { Extra = V; } void setExcluded(bool V = true) { Excluded = V; } + void setUmbrellaHeader(bool V = true) { Umbrella = V; } bool isExtra() const { return Extra; } bool isExcluded() const { return Excluded; } + bool isUmbrellaHeader() const { return Umbrella; } bool useIncludeName() const { return Type != HeaderType::Project && !IncludeName.empty(); } bool operator==(const HeaderFile &Other) const { - return std::tie(Type, FullPath, IncludeName, Language, Excluded, Extra) == - std::tie(Other.Type, Other.FullPath, Other.IncludeName, - Other.Language, Other.Excluded, Other.Extra); + return std::tie(Type, FullPath, IncludeName, Language, Excluded, Extra, + Umbrella) == std::tie(Other.Type, Other.FullPath, + Other.IncludeName, Other.Language, + Other.Excluded, Other.Extra, + Other.Umbrella); } }; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5ecd2f9..3a1abd4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2234,7 +2234,8 @@ private: bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum); bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); - void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D); + void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D, + const llvm::StringMap<bool> &FeatureMap); bool CheckLoongArchBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index bf46766..5fe5c92 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -908,7 +908,7 @@ def PaddingChecker : Checker<"Padding">, "24", Released> ]>, - 
Documentation<NotDocumented>; + Documentation<HasDocumentation>; } // end: "padding" diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index f619d65..425f84e 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3445,6 +3445,7 @@ StringRef CXXNameMangler::getCallingConvQualifierName(CallingConv CC) { case CC_PreserveAll: case CC_M68kRTD: case CC_PreserveNone: + case CC_RISCVVectorCall: // FIXME: we should be mangling all of the above. return ""; diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index d2ffb23..8f3e26d 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3484,6 +3484,9 @@ StringRef FunctionType::getNameForCallConv(CallingConv CC) { case CC_PreserveAll: return "preserve_all"; case CC_M68kRTD: return "m68k_rtd"; case CC_PreserveNone: return "preserve_none"; + // clang-format off + case CC_RISCVVectorCall: return "riscv_vector_cc"; + // clang-format on } llvm_unreachable("Invalid calling convention."); @@ -4074,6 +4077,7 @@ bool AttributedType::isCallingConv() const { case attr::PreserveAll: case attr::M68kRTD: case attr::PreserveNone: + case attr::RISCVVectorCC: return true; } llvm_unreachable("invalid attr kind"); diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index f176d04..0aa1d93 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1071,6 +1071,9 @@ void TypePrinter::printFunctionAfter(const FunctionType::ExtInfo &Info, case CC_PreserveNone: OS << " __attribute__((preserve_none))"; break; + case CC_RISCVVectorCall: + OS << "__attribute__((riscv_vector_cc))"; + break; } } @@ -1960,6 +1963,9 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::PreserveNone: OS << "preserve_none"; break; + case attr::RISCVVectorCC: + OS << "riscv_vector_cc"; + break; case attr::NoDeref: OS << "noderef"; break; diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 
a6d4af2..f3d705e 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -467,3 +467,14 @@ ParsedTargetAttr RISCVTargetInfo::parseTargetAttr(StringRef Features) const { } return Ret; } + +TargetInfo::CallingConvCheckResult +RISCVTargetInfo::checkCallingConvention(CallingConv CC) const { + switch (CC) { + default: + return CCCR_Warning; + case CC_C: + case CC_RISCVVectorCall: + return CCCR_OK; + } +} diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index bfbdafb..78580b5 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -110,6 +110,8 @@ public: bool hasBFloat16Type() const override { return true; } + CallingConvCheckResult checkCallingConvention(CallingConv CC) const override; + bool useFP16ConversionIntrinsics() const override { return false; } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3cfdb26..fdb517e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -792,7 +792,8 @@ EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) { Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) { Intrinsic::ID inst = IsStart ? 
Intrinsic::vastart : Intrinsic::vaend; - return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue); + return Builder.CreateCall(CGM.getIntrinsic(inst, {ArgValue->getType()}), + ArgValue); } /// Checks if using the result of __builtin_object_size(p, @p From) in place of @@ -3018,7 +3019,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_va_copy: { Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer(); Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer(); - Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy), {DstPtr, SrcPtr}); + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy, {DstPtr->getType()}), + {DstPtr, SrcPtr}); return RValue::get(nullptr); } case Builtin::BIabs: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 475d96b..b8adf5c 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -74,6 +74,9 @@ unsigned CodeGenTypes::ClangCallConvToLLVMCallConv(CallingConv CC) { case CC_SwiftAsync: return llvm::CallingConv::SwiftTail; case CC_M68kRTD: return llvm::CallingConv::M68k_RTD; case CC_PreserveNone: return llvm::CallingConv::PreserveNone; + // clang-format off + case CC_RISCVVectorCall: return llvm::CallingConv::RISCV_VectorCall; + // clang-format on } } @@ -260,6 +263,9 @@ static CallingConv getCallingConventionForDecl(const ObjCMethodDecl *D, if (D->hasAttr<PreserveNoneAttr>()) return CC_PreserveNone; + if (D->hasAttr<RISCVVectorCCAttr>()) + return CC_RISCVVectorCall; + return CC_C; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 0e20de2..2a385d8 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1452,6 +1452,8 @@ static unsigned getDwarfCC(CallingConv CC) { return llvm::dwarf::DW_CC_LLVM_M68kRTD; case CC_PreserveNone: return llvm::dwarf::DW_CC_LLVM_PreserveNone; + case CC_RISCVVectorCall: + return llvm::dwarf::DW_CC_LLVM_RISCVVectorCall; } return 0; 
} diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index b55f433..72393be 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2065,8 +2065,11 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) { targetDiag(D->getLocation(), diag::note_defined_here, FD) << D; } - if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType()) - checkRVVTypeSupport(Ty, Loc, D); + if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType() && FD) { + llvm::StringMap<bool> CallerFeatureMap; + Context.getFunctionFeatureMap(CallerFeatureMap, FD); + checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap); + } // Don't allow SVE types in functions without a SVE target. if (Ty->isSVESizelessBuiltinType() && FD && FD->hasBody()) { diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp index 836c633..a312830 100644 --- a/clang/lib/Sema/SemaAPINotes.cpp +++ b/clang/lib/Sema/SemaAPINotes.cpp @@ -52,49 +52,54 @@ static void applyNullability(Sema &S, Decl *D, NullabilityKind Nullability, if (!Metadata.IsActive) return; - auto IsModified = [&](Decl *D, QualType QT, - NullabilityKind Nullability) -> bool { + auto GetModified = + [&](Decl *D, QualType QT, + NullabilityKind Nullability) -> std::optional<QualType> { QualType Original = QT; S.CheckImplicitNullabilityTypeSpecifier(QT, Nullability, D->getLocation(), isa<ParmVarDecl>(D), /*OverrideExisting=*/true); - return QT.getTypePtr() != Original.getTypePtr(); + return (QT.getTypePtr() != Original.getTypePtr()) ? 
std::optional(QT) + : std::nullopt; }; if (auto Function = dyn_cast<FunctionDecl>(D)) { - if (IsModified(D, Function->getReturnType(), Nullability)) { - QualType FnType = Function->getType(); - Function->setType(FnType); + if (auto Modified = + GetModified(D, Function->getReturnType(), Nullability)) { + const FunctionType *FnType = Function->getType()->castAs<FunctionType>(); + if (const FunctionProtoType *proto = dyn_cast<FunctionProtoType>(FnType)) + Function->setType(S.Context.getFunctionType( + *Modified, proto->getParamTypes(), proto->getExtProtoInfo())); + else + Function->setType( + S.Context.getFunctionNoProtoType(*Modified, FnType->getExtInfo())); } } else if (auto Method = dyn_cast<ObjCMethodDecl>(D)) { - QualType Type = Method->getReturnType(); - if (IsModified(D, Type, Nullability)) { - Method->setReturnType(Type); + if (auto Modified = GetModified(D, Method->getReturnType(), Nullability)) { + Method->setReturnType(*Modified); // Make it a context-sensitive keyword if we can. - if (!isIndirectPointerType(Type)) + if (!isIndirectPointerType(*Modified)) Method->setObjCDeclQualifier(Decl::ObjCDeclQualifier( Method->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); } } else if (auto Value = dyn_cast<ValueDecl>(D)) { - QualType Type = Value->getType(); - if (IsModified(D, Type, Nullability)) { - Value->setType(Type); + if (auto Modified = GetModified(D, Value->getType(), Nullability)) { + Value->setType(*Modified); // Make it a context-sensitive keyword if we can. 
if (auto Parm = dyn_cast<ParmVarDecl>(D)) { - if (Parm->isObjCMethodParameter() && !isIndirectPointerType(Type)) + if (Parm->isObjCMethodParameter() && !isIndirectPointerType(*Modified)) Parm->setObjCDeclQualifier(Decl::ObjCDeclQualifier( Parm->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); } } } else if (auto Property = dyn_cast<ObjCPropertyDecl>(D)) { - QualType Type = Property->getType(); - if (IsModified(D, Type, Nullability)) { - Property->setType(Type, Property->getTypeSourceInfo()); + if (auto Modified = GetModified(D, Property->getType(), Nullability)) { + Property->setType(*Modified, Property->getTypeSourceInfo()); // Make it a property attribute if we can. - if (!isIndirectPointerType(Type)) + if (!isIndirectPointerType(*Modified)) Property->setPropertyAttributes( ObjCPropertyAttribute::kind_null_resettable); } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 0844958..447e736 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5760,57 +5760,6 @@ static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall, bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall) { - // CodeGenFunction can also detect this, but this gives a better error - // message. - bool FeatureMissing = false; - SmallVector<StringRef> ReqFeatures; - StringRef Features = Context.BuiltinInfo.getRequiredFeatures(BuiltinID); - Features.split(ReqFeatures, ',', -1, false); - - // Check if each required feature is included - for (StringRef F : ReqFeatures) { - SmallVector<StringRef> ReqOpFeatures; - F.split(ReqOpFeatures, '|'); - - if (llvm::none_of(ReqOpFeatures, - [&TI](StringRef OF) { return TI.hasFeature(OF); })) { - std::string FeatureStrs; - bool IsExtension = true; - for (StringRef OF : ReqOpFeatures) { - // If the feature is 64bit, alter the string so it will print better in - // the diagnostic. 
- if (OF == "64bit") { - assert(ReqOpFeatures.size() == 1 && "Expected '64bit' to be alone"); - OF = "RV64"; - IsExtension = false; - } - if (OF == "32bit") { - assert(ReqOpFeatures.size() == 1 && "Expected '32bit' to be alone"); - OF = "RV32"; - IsExtension = false; - } - - // Convert features like "zbr" and "experimental-zbr" to "Zbr". - OF.consume_front("experimental-"); - std::string FeatureStr = OF.str(); - FeatureStr[0] = std::toupper(FeatureStr[0]); - // Combine strings. - FeatureStrs += FeatureStrs.empty() ? "" : ", "; - FeatureStrs += "'"; - FeatureStrs += FeatureStr; - FeatureStrs += "'"; - } - // Error message - FeatureMissing = true; - Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_requires_extension) - << IsExtension - << TheCall->getSourceRange() << StringRef(FeatureStrs); - } - } - - if (FeatureMissing) - return true; - // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx, // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*. switch (BuiltinID) { @@ -6714,36 +6663,35 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, return false; } -void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D) { - const TargetInfo &TI = Context.getTargetInfo(); - +void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D, + const llvm::StringMap<bool> &FeatureMap) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>()); unsigned EltSize = Context.getTypeSize(Info.ElementType); unsigned MinElts = Info.EC.getKnownMinValue(); if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) && - !TI.hasFeature("zve64d")) + !FeatureMap.lookup("zve64d")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d"; // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at // least zve64x else if (((EltSize == 64 && Info.ElementType->isIntegerType()) || MinElts == 1) && - !TI.hasFeature("zve64x")) + 
!FeatureMap.lookup("zve64x")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x"; - else if (Info.ElementType->isFloat16Type() && !TI.hasFeature("zvfh") && - !TI.hasFeature("zvfhmin")) + else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") && + !FeatureMap.lookup("zvfhmin")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfh or zvfhmin"; else if (Info.ElementType->isBFloat16Type() && - !TI.hasFeature("experimental-zvfbfmin")) + !FeatureMap.lookup("experimental-zvfbfmin")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin"; else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) && - !TI.hasFeature("zve32f")) + !FeatureMap.lookup("zve32f")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f"; // Given that caller already checked isRVVType() before calling this function, // if we don't have at least zve32x supported, then we need to emit error. - else if (!TI.hasFeature("zve32x")) + else if (!FeatureMap.lookup("zve32x")) Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x"; } diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 1c546e9..b6c4d3d 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -1269,10 +1269,18 @@ substituteParameterMappings(Sema &S, NormalizedConstraint &N, : SourceLocation())); Atomic.ParameterMapping.emplace(TempArgs, OccurringIndices.count()); } + SourceLocation InstLocBegin = + ArgsAsWritten->arguments().empty() + ? ArgsAsWritten->getLAngleLoc() + : ArgsAsWritten->arguments().front().getSourceRange().getBegin(); + SourceLocation InstLocEnd = + ArgsAsWritten->arguments().empty() + ? 
ArgsAsWritten->getRAngleLoc() + : ArgsAsWritten->arguments().front().getSourceRange().getEnd(); Sema::InstantiatingTemplate Inst( - S, ArgsAsWritten->arguments().front().getSourceRange().getBegin(), + S, InstLocBegin, Sema::InstantiatingTemplate::ParameterMappingSubstitution{}, Concept, - ArgsAsWritten->arguments().front().getSourceRange()); + {InstLocBegin, InstLocEnd}); if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs)) return true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 66aad25..8b44d24 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8962,8 +8962,13 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) { } } - if (T->isRVVSizelessBuiltinType()) - checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext)); + if (T->isRVVSizelessBuiltinType() && isa<FunctionDecl>(CurContext)) { + const FunctionDecl *FD = cast<FunctionDecl>(CurContext); + llvm::StringMap<bool> CallerFeatureMap; + Context.getFunctionFeatureMap(CallerFeatureMap, FD); + checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext), + CallerFeatureMap); + } } /// Perform semantic checking on a newly-created variable diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 0a62c65..f25f3af 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5271,6 +5271,9 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) { case ParsedAttr::AT_PreserveNone: D->addAttr(::new (S.Context) PreserveNoneAttr(S.Context, AL)); return; + case ParsedAttr::AT_RISCVVectorCC: + D->addAttr(::new (S.Context) RISCVVectorCCAttr(S.Context, AL)); + return; default: llvm_unreachable("unexpected attribute kind"); } @@ -5475,6 +5478,9 @@ bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC, case ParsedAttr::AT_PreserveNone: CC = CC_PreserveNone; break; + case ParsedAttr::AT_RISCVVectorCC: + CC = CC_RISCVVectorCall; + break; 
default: llvm_unreachable("unexpected attribute kind"); } @@ -9637,6 +9643,7 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_AMDGPUKernelCall: case ParsedAttr::AT_M68kRTD: case ParsedAttr::AT_PreserveNone: + case ParsedAttr::AT_RISCVVectorCC: handleCallConvAttr(S, D, AL); break; case ParsedAttr::AT_Suppress: diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 4636d89..f9e1ad0 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -638,8 +638,6 @@ ObjCPropertyDecl *Sema::CreatePropertyDecl(Scope *S, PDecl->setInvalidDecl(); } - ProcessDeclAttributes(S, PDecl, FD.D); - // Regardless of setter/getter attribute, we save the default getter/setter // selector names in anticipation of declaration of setter/getter methods. PDecl->setGetterName(GetterSel, GetterNameLoc); @@ -647,6 +645,8 @@ ObjCPropertyDecl *Sema::CreatePropertyDecl(Scope *S, PDecl->setPropertyAttributesAsWritten( makePropertyAttributesAsWritten(AttributesAsWritten)); + ProcessDeclAttributes(S, PDecl, FD.D); + if (Attributes & ObjCPropertyAttribute::kind_readonly) PDecl->setPropertyAttributes(ObjCPropertyAttribute::kind_readonly); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index d7521a5..fd94caa 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -138,7 +138,8 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_PreserveMost: \ case ParsedAttr::AT_PreserveAll: \ case ParsedAttr::AT_M68kRTD: \ - case ParsedAttr::AT_PreserveNone + case ParsedAttr::AT_PreserveNone: \ + case ParsedAttr::AT_RISCVVectorCC // Function type attributes. 
#define FUNCTION_TYPE_ATTRS_CASELIST \ @@ -7939,6 +7940,8 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { return createSimpleAttr<M68kRTDAttr>(Ctx, Attr); case ParsedAttr::AT_PreserveNone: return createSimpleAttr<PreserveNoneAttr>(Ctx, Attr); + case ParsedAttr::AT_RISCVVectorCC: + return createSimpleAttr<RISCVVectorCCAttr>(Ctx, Attr); } llvm_unreachable("unexpected attribute kind!"); } diff --git a/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes new file mode 100644 index 0000000..ccdc4e1 --- /dev/null +++ b/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes @@ -0,0 +1,8 @@ +Name: SomeOtherKit +Classes: + - Name: A + Methods: + - Selector: "methodB" + MethodKind: Instance + Availability: none + AvailabilityMsg: "anything but this" diff --git a/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes b/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes new file mode 100644 index 0000000..cd5475b --- /dev/null +++ b/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes @@ -0,0 +1,5 @@ +Name: SomeBrokenLib +Functions: + - Name: do_something_with_pointers + Nu llabilityOfRet: O + # the space is intentional, to make sure we don't crash on malformed API Notes diff --git a/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h b/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h new file mode 100644 index 0000000..b09c6f6 --- /dev/null +++ b/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h @@ -0,0 +1,6 @@ +#ifndef SOME_BROKEN_LIB_H +#define SOME_BROKEN_LIB_H + +void do_something_with_pointers(int *ptr1, int *ptr2); + +#endif // SOME_BROKEN_LIB_H diff --git a/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes b/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes new file mode 100644 index 0000000..33eeaaa --- /dev/null +++ b/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes @@ -0,0 +1,7 @@ +Name: SomeBrokenLib +Functions: 
+ - Name: do_something_with_pointers + NullabilityOfRet: O + - Name: do_something_with_pointers + NullabilityOfRet: O + diff --git a/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h b/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h new file mode 100644 index 0000000..b09c6f6 --- /dev/null +++ b/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h @@ -0,0 +1,6 @@ +#ifndef SOME_BROKEN_LIB_H +#define SOME_BROKEN_LIB_H + +void do_something_with_pointers(int *ptr1, int *ptr2); + +#endif // SOME_BROKEN_LIB_H diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h new file mode 100644 index 0000000..523de4f --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h @@ -0,0 +1 @@ +extern int FrameworkWithActualPrivateModule; diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap new file mode 100644 index 0000000..859d723 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module FrameworkWithActualPrivateModule { + umbrella header "FrameworkWithActualPrivateModule.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap new file mode 100644 index 0000000..e7fafe3 --- /dev/null +++ 
b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap @@ -0,0 +1,5 @@ +framework module FrameworkWithActualPrivateModule_Private { + umbrella header "FrameworkWithActualPrivateModule_Private.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes new file mode 100644 index 0000000..831cf1e --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes @@ -0,0 +1 @@ +Name: FrameworkWithActualPrivateModule_Private diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h new file mode 100644 index 0000000..c07a3e9 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h @@ -0,0 +1,2 @@ +#include <FrameworkWithActualPrivateModule/FrameworkWithActualPrivateModule.h> +extern int FrameworkWithActualPrivateModule_Private; diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h new file mode 100644 index 0000000..4f3b631 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h @@ -0,0 +1 @@ +extern int FrameworkWithWrongCase; diff --git 
a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap new file mode 100644 index 0000000..e97d361 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module FrameworkWithWrongCase { + umbrella header "FrameworkWithWrongCase.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes new file mode 100644 index 0000000..ae5447c --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes @@ -0,0 +1 @@ +Name: FrameworkWithWrongCase diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h new file mode 100644 index 0000000..d3d6148 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h @@ -0,0 +1 @@ +extern int FrameworkWithWrongCasePrivate; diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap new file mode 100644 index 0000000..04b96ad --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module FrameworkWithWrongCasePrivate { + umbrella header "FrameworkWithWrongCasePrivate.h" + 
export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap new file mode 100644 index 0000000..d6ad53c --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap @@ -0,0 +1 @@ +module FrameworkWithWrongCasePrivate.Inner {} diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes new file mode 100644 index 0000000..d7af293 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes @@ -0,0 +1 @@ +Name: FrameworkWithWrongCasePrivate diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h new file mode 100644 index 0000000..a95d19e --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h @@ -0,0 +1,11 @@ +@import LayeredKitImpl; + +// @interface declarations already don't inherit attributes from forward +// declarations, so in order to test this properly we have to /not/ define +// UpwardClass anywhere. 
+ +// @interface UpwardClass +// @end + +@protocol UpwardProto +@end diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..04bbe72 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module LayeredKit { + umbrella header "LayeredKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes new file mode 100644 index 0000000..bece28c --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes @@ -0,0 +1,9 @@ +Name: LayeredKitImpl +Classes: +- Name: PerfectlyNormalClass + Availability: none +- Name: UpwardClass + Availability: none +Protocols: +- Name: UpwardProto + Availability: none diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h new file mode 100644 index 0000000..99591d3 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h @@ -0,0 +1,7 @@ +@protocol UpwardProto; +@class UpwardClass; + +@interface PerfectlyNormalClass +@end + +void doImplementationThings(UpwardClass *first, id <UpwardProto> second) __attribute((unavailable)); diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap new file mode 100644 index 0000000..58a6e55 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework 
module LayeredKitImpl { + umbrella header "LayeredKitImpl.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..2d07e76 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module SimpleKit { + umbrella header "SimpleKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes new file mode 100644 index 0000000..817af12 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes @@ -0,0 +1,74 @@ +Name: SomeKit +Classes: + - Name: A + Methods: + - Selector: "transform:" + MethodKind: Instance + Availability: none + AvailabilityMsg: "anything but this" + - Selector: "transform:integer:" + MethodKind: Instance + NullabilityOfRet: N + Nullability: [ N, S ] + Properties: + - Name: intValue + PropertyKind: Instance + Availability: none + AvailabilityMsg: "wouldn't work anyway" + - Name: nonnullAInstance + PropertyKind: Instance + Nullability: N + - Name: nonnullAClass + PropertyKind: Class + Nullability: N + - Name: nonnullABoth + Nullability: N + - Name: B + Availability: none + AvailabilityMsg: "just don't" + - Name: C + Methods: + - Selector: "initWithA:" + MethodKind: Instance + DesignatedInit: true + - Name: OverriddenTypes + Methods: + - Selector: "methodToMangle:second:" + MethodKind: Instance + ResultType: 'char *' + Parameters: + - Position: 0 + Type: 'SOMEKIT_DOUBLE *' + - Position: 1 + Type: 'float *' + Properties: + - Name: intPropertyToMangle + PropertyKind: Instance + Type: 'double *' +Functions: + - Name: global_int_fun + ResultType: 'char *' + Parameters: + - Position: 0 + Type: 
'double *' + - Position: 1 + Type: 'float *' +Globals: + - Name: global_int_ptr + Type: 'double *' +SwiftVersions: + - Version: 3.0 + Classes: + - Name: A + Methods: + - Selector: "transform:integer:" + MethodKind: Instance + NullabilityOfRet: O + Nullability: [ O, S ] + Properties: + - Name: explicitNonnullInstance + PropertyKind: Instance + Nullability: O + - Name: explicitNullableInstance + PropertyKind: Instance + Nullability: N diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes new file mode 100644 index 0000000..28ede9d --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes @@ -0,0 +1,15 @@ +Name: SomeKit +Classes: + - Name: A + Methods: + - Selector: "privateTransform:input:" + MethodKind: Instance + NullabilityOfRet: N + Nullability: [ N, S ] + Properties: + - Name: internalProperty + Nullability: N +Protocols: + - Name: InternalProtocol + Availability: none + AvailabilityMsg: "not for you" diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h new file mode 100644 index 0000000..bc0c5da --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h @@ -0,0 +1,55 @@ +#ifndef SOMEKIT_H +#define SOMEKIT_H + +#define ROOT_CLASS __attribute__((objc_root_class)) + +ROOT_CLASS +@interface A +-(A*)transform:(A*)input; +-(A*)transform:(A*)input integer:(int)integer; + +@property (nonatomic, readonly, retain) A* someA; +@property (nonatomic, retain) A* someOtherA; + +@property (nonatomic) int intValue; +@end + +@interface B : A +@end + +@interface C : A +- (instancetype)init; +- (instancetype)initWithA:(A*)a; +@end + + +@interface MyClass : A +- Inst; ++ Clas; +@end + +struct CGRect { 
+ float origin; + float size; +}; +typedef struct CGRect NSRect; + +@interface I +- (void) Meth : (NSRect[4])exposedRects; +- (void) Meth1 : (const I*)exposedRects; +- (void) Meth2 : (const I*)exposedRects; +- (void) Meth3 : (I*)exposedRects; +- (const I*) Meth4; +- (const I*) Meth5 : (int) Arg1 : (const I*)Arg2 : (double)Arg3 : (const I*) Arg4 :(const volatile id) Arg5; +- (volatile const I*) Meth6 : (const char *)Arg1 : (const char *)Arg2 : (double)Arg3 : (const I*) Arg4 :(const volatile id) Arg5; +@end + +@class NSURL, NSArray, NSError; +@interface INTF_BLOCKS + + (void)getNonLocalVersionsOfItemAtURL:(NSURL *)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler; + + (void *)getNonLocalVersionsOfItemAtURL2:(NSURL *)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler; + + (NSError **)getNonLocalVersionsOfItemAtURL3:(int)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler; + + (id)getNonLocalVersionsOfItemAtURL4:(NSURL *)url completionHandler:(void (^)(int nonLocalFileVersions, NSError *error, NSURL*))completionHandler; +@end + +#endif diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..3abee2d --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module SomeKit { + umbrella header "SomeKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap new file mode 100644 index 0000000..bbda9d0 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap @@ -0,0 +1,8 @@ +module 
SomeKit.Private { + header "SomeKit_Private.h" + export * + + explicit module NullAnnotation { + header "SomeKit_PrivateForNullAnnotation.h" + } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap new file mode 100644 index 0000000..e310343 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap @@ -0,0 +1,8 @@ +explicit framework module SomeKit.Private { + header "SomeKit_Private.h" + explicit NullAnnotation { header "SomeKit_PrivateForNullAnnotation.h" } + export * + module * { export * } +syntax error + +} diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h new file mode 100644 index 0000000..c761112 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h @@ -0,0 +1,16 @@ +#ifndef SOMEKIT_PRIVATE_H +#define SOMEKIT_PRIVATE_H + +#import <SomeKit/SomeKit.h> + +@interface A(Private) +-(A*)privateTransform:(A*)input; + +@property (nonatomic) A* internalProperty; +@end + +@protocol InternalProtocol +@end + +#endif + diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h new file mode 100644 index 0000000..bae4456 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h @@ -0,0 +1,17 @@ +#ifndef SOMEKIT_PRIVATE_H +#define SOMEKIT_PRIVATE_H + +#import <SomeKit/SomeKitForNullAnnotation.h> + +@interface A(Private) +-(A*)privateTransform:(A*)input; + +@property (nonatomic) A* internalProperty; +@end + +@protocol InternalProtocol +- (id) MomeMethod; 
+@end + +#endif + diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes new file mode 100644 index 0000000..28ede9d --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes @@ -0,0 +1,15 @@ +Name: SomeKit +Classes: + - Name: A + Methods: + - Selector: "privateTransform:input:" + MethodKind: Instance + NullabilityOfRet: N + Nullability: [ N, S ] + Properties: + - Name: internalProperty + Nullability: N +Protocols: + - Name: InternalProtocol + Availability: none + AvailabilityMsg: "not for you" diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes new file mode 100644 index 0000000..2ad546b --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes @@ -0,0 +1,8 @@ +Name: SomeOtherKit +Classes: + - Name: A + Methods: + - Selector: "methodA" + MethodKind: Instance + Availability: none + AvailabilityMsg: "anything but this" diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes new file mode 100644 index 0000000..2ad546b --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes @@ -0,0 +1,8 @@ +Name: SomeOtherKit +Classes: + - Name: A + Methods: + - Selector: "methodA" + MethodKind: Instance + Availability: none + AvailabilityMsg: "anything but this" diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h new file mode 100644 index 0000000..3911d76 --- 
/dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h @@ -0,0 +1,9 @@ +#ifndef SOME_OTHER_KIT_H + +__attribute__((objc_root_class)) +@interface A +-(void)methodA; +-(void)methodB; +@end + +#endif diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..0aaad92 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module SomeOtherKit { + umbrella header "SomeOtherKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h new file mode 100644 index 0000000..d3376f1 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h @@ -0,0 +1 @@ +extern int TopLevelPrivateKit_Public; diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes new file mode 100644 index 0000000..ece1dd2 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes @@ -0,0 +1 @@ +garbage here because this file shouldn't get read
\ No newline at end of file diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..70faa54 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module TopLevelPrivateKit { + umbrella header "TopLevelPrivateKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap new file mode 100644 index 0000000..0958a14 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap @@ -0,0 +1,5 @@ +framework module TopLevelPrivateKit_Private { + umbrella header "TopLevelPrivateKit_Private.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes new file mode 100644 index 0000000..908dae0 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes @@ -0,0 +1 @@ +garbage here because this file shouldn't get read diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes new file mode 100644 index 0000000..4332362 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes @@ -0,0 +1,4 @@ +Name: 
TopLevelPrivateKit_Private +Globals: +- Name: TopLevelPrivateKit_Private + Type: float diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h new file mode 100644 index 0000000..39cbfe6 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h @@ -0,0 +1 @@ +extern int TopLevelPrivateKit_Private; diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes new file mode 100644 index 0000000..ece1dd2 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes @@ -0,0 +1 @@ +garbage here because this file shouldn't get read
\ No newline at end of file diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes new file mode 100644 index 0000000..572c714 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes @@ -0,0 +1,156 @@ +Name: VersionedKit +Classes: + - Name: TestProperties + SwiftObjCMembers: true + Properties: + - Name: accessorsOnly + PropertyKind: Instance + SwiftImportAsAccessors: true + - Name: accessorsOnlyForClass + PropertyKind: Class + SwiftImportAsAccessors: true + - Name: accessorsOnlyExceptInVersion3 + PropertyKind: Instance + SwiftImportAsAccessors: true + - Name: accessorsOnlyForClassExceptInVersion3 + PropertyKind: Class + SwiftImportAsAccessors: true +Functions: + - Name: unversionedRenameDUMP + SwiftName: 'unversionedRename_NOTES()' +Tags: + - Name: APINotedFlagEnum + FlagEnum: true + - Name: APINotedOpenEnum + EnumExtensibility: open + - Name: APINotedClosedEnum + EnumExtensibility: closed + - Name: SoonToBeCFEnum + EnumKind: CFEnum + - Name: SoonToBeNSEnum + EnumKind: NSEnum + - Name: SoonToBeCFOptions + EnumKind: CFOptions + - Name: SoonToBeNSOptions + EnumKind: NSOptions + - Name: SoonToBeCFClosedEnum + EnumKind: CFClosedEnum + - Name: SoonToBeNSClosedEnum + EnumKind: NSClosedEnum + - Name: UndoAllThatHasBeenDoneToMe + EnumKind: none +Typedefs: + - Name: MultiVersionedTypedef34Notes + SwiftName: MultiVersionedTypedef34Notes_NEW + - Name: MultiVersionedTypedef345Notes + SwiftName: MultiVersionedTypedef345Notes_NEW + - Name: MultiVersionedTypedef4Notes + SwiftName: MultiVersionedTypedef4Notes_NEW + - Name: MultiVersionedTypedef45Notes + SwiftName: MultiVersionedTypedef45Notes_NEW +SwiftVersions: + - Version: 3.0 + Classes: + - Name: MyReferenceType + SwiftBridge: '' + - Name: TestGenericDUMP + SwiftImportAsNonGeneric: true + - Name: TestProperties + SwiftObjCMembers: 
false + Properties: + - Name: accessorsOnlyInVersion3 + PropertyKind: Instance + SwiftImportAsAccessors: true + - Name: accessorsOnlyForClassInVersion3 + PropertyKind: Class + SwiftImportAsAccessors: true + - Name: accessorsOnlyExceptInVersion3 + PropertyKind: Instance + SwiftImportAsAccessors: false + - Name: accessorsOnlyForClassExceptInVersion3 + PropertyKind: Class + SwiftImportAsAccessors: false + - Name: Swift3RenamedOnlyDUMP + SwiftName: SpecialSwift3Name + - Name: Swift3RenamedAlsoDUMP + SwiftName: SpecialSwift3Also + Functions: + - Name: moveToPointDUMP + SwiftName: 'moveTo(a:b:)' + - Name: acceptClosure + Parameters: + - Position: 0 + NoEscape: false + - Name: privateFunc + SwiftPrivate: false + Tags: + - Name: MyErrorCode + NSErrorDomain: '' + - Name: NewlyFlagEnum + FlagEnum: false + - Name: OpenToClosedEnum + EnumExtensibility: open + - Name: ClosedToOpenEnum + EnumExtensibility: closed + - Name: NewlyClosedEnum + EnumExtensibility: none + - Name: NewlyOpenEnum + EnumExtensibility: none + Typedefs: + - Name: MyDoubleWrapper + SwiftWrapper: none + - Name: MultiVersionedTypedef34 + SwiftName: MultiVersionedTypedef34_3 + - Name: MultiVersionedTypedef34Header + SwiftName: MultiVersionedTypedef34Header_3 + - Name: MultiVersionedTypedef34Notes + SwiftName: MultiVersionedTypedef34Notes_3 + - Name: MultiVersionedTypedef345 + SwiftName: MultiVersionedTypedef345_3 + - Name: MultiVersionedTypedef345Header + SwiftName: MultiVersionedTypedef345Header_3 + - Name: MultiVersionedTypedef345Notes + SwiftName: MultiVersionedTypedef345Notes_3 + - Version: 5 + Typedefs: + - Name: MultiVersionedTypedef345 + SwiftName: MultiVersionedTypedef345_5 + - Name: MultiVersionedTypedef345Header + SwiftName: MultiVersionedTypedef345Header_5 + - Name: MultiVersionedTypedef345Notes + SwiftName: MultiVersionedTypedef345Notes_5 + - Name: MultiVersionedTypedef45 + SwiftName: MultiVersionedTypedef45_5 + - Name: MultiVersionedTypedef45Header + SwiftName: MultiVersionedTypedef45Header_5 + - 
Name: MultiVersionedTypedef45Notes + SwiftName: MultiVersionedTypedef45Notes_5 + - Version: 4 # Versions are deliberately ordered as "3, 5, 4" to catch bugs. + Classes: + - Name: Swift4RenamedDUMP + SwiftName: SpecialSwift4Name + Typedefs: + - Name: MultiVersionedTypedef34 + SwiftName: MultiVersionedTypedef34_4 + - Name: MultiVersionedTypedef34Header + SwiftName: MultiVersionedTypedef34Header_4 + - Name: MultiVersionedTypedef34Notes + SwiftName: MultiVersionedTypedef34Notes_4 + - Name: MultiVersionedTypedef345 + SwiftName: MultiVersionedTypedef345_4 + - Name: MultiVersionedTypedef345Header + SwiftName: MultiVersionedTypedef345Header_4 + - Name: MultiVersionedTypedef345Notes + SwiftName: MultiVersionedTypedef345Notes_4 + - Name: MultiVersionedTypedef4 + SwiftName: MultiVersionedTypedef4_4 + - Name: MultiVersionedTypedef4Header + SwiftName: MultiVersionedTypedef4Header_4 + - Name: MultiVersionedTypedef4Notes + SwiftName: MultiVersionedTypedef4Notes_4 + - Name: MultiVersionedTypedef45 + SwiftName: MultiVersionedTypedef45_4 + - Name: MultiVersionedTypedef45Header + SwiftName: MultiVersionedTypedef45Header_4 + - Name: MultiVersionedTypedef45Notes + SwiftName: MultiVersionedTypedef45Notes_4 diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h new file mode 100644 index 0000000..9ce9563 --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h @@ -0,0 +1,137 @@ +void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(x:y:)"))); + +void unversionedRenameDUMP(void) __attribute__((swift_name("unversionedRename_HEADER()"))); + +void acceptClosure(void (^ __attribute__((noescape)) block)(void)); + +void privateFunc(void) __attribute__((swift_private)); + +typedef double MyDoubleWrapper __attribute__((swift_wrapper(struct))); + +#if __OBJC__ +@class NSString; + +extern NSString 
*MyErrorDomain; + +enum __attribute__((ns_error_domain(MyErrorDomain))) MyErrorCode { + MyErrorCodeFailed = 1 +}; + +__attribute__((swift_bridge("MyValueType"))) +@interface MyReferenceType +@end + +@interface TestProperties +@property (nonatomic, readwrite, retain) id accessorsOnly; +@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClass; + +@property (nonatomic, readwrite, retain) id accessorsOnlyInVersion3; +@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClassInVersion3; + +@property (nonatomic, readwrite, retain) id accessorsOnlyExceptInVersion3; +@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClassExceptInVersion3; +@end + +@interface Base +@end + +@interface TestGenericDUMP<Element> : Base +- (Element)element; +@end + +@interface Swift3RenamedOnlyDUMP +@end + +__attribute__((swift_name("Swift4Name"))) +@interface Swift3RenamedAlsoDUMP +@end + +@interface Swift4RenamedDUMP +@end + +#endif + + +enum __attribute__((flag_enum)) FlagEnum { + FlagEnumA = 1, + FlagEnumB = 2 +}; + +enum __attribute__((flag_enum)) NewlyFlagEnum { + NewlyFlagEnumA = 1, + NewlyFlagEnumB = 2 +}; + +enum APINotedFlagEnum { + APINotedFlagEnumA = 1, + APINotedFlagEnumB = 2 +}; + + +enum __attribute__((enum_extensibility(open))) OpenEnum { + OpenEnumA = 1, +}; + +enum __attribute__((enum_extensibility(open))) NewlyOpenEnum { + NewlyOpenEnumA = 1, +}; + +enum __attribute__((enum_extensibility(closed))) NewlyClosedEnum { + NewlyClosedEnumA = 1, +}; + +enum __attribute__((enum_extensibility(open))) ClosedToOpenEnum { + ClosedToOpenEnumA = 1, +}; + +enum __attribute__((enum_extensibility(closed))) OpenToClosedEnum { + OpenToClosedEnumA = 1, +}; + +enum APINotedOpenEnum { + APINotedOpenEnumA = 1, +}; + +enum APINotedClosedEnum { + APINotedClosedEnumA = 1, +}; + + +enum SoonToBeCFEnum { + SoonToBeCFEnumA = 1 +}; +enum SoonToBeNSEnum { + SoonToBeNSEnumA = 1 +}; +enum SoonToBeCFOptions { + SoonToBeCFOptionsA = 1 +}; +enum SoonToBeNSOptions { 
+ SoonToBeNSOptionsA = 1 +}; +enum SoonToBeCFClosedEnum { + SoonToBeCFClosedEnumA = 1 +}; +enum SoonToBeNSClosedEnum { + SoonToBeNSClosedEnumA = 1 +}; +enum UndoAllThatHasBeenDoneToMe { + UndoAllThatHasBeenDoneToMeA = 1 +} __attribute__((flag_enum)) __attribute__((enum_extensibility(closed))); + + +typedef int MultiVersionedTypedef4; +typedef int MultiVersionedTypedef4Notes; +typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW"))); + +typedef int MultiVersionedTypedef34; +typedef int MultiVersionedTypedef34Notes; +typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW"))); + +typedef int MultiVersionedTypedef45; +typedef int MultiVersionedTypedef45Notes; +typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_NEW"))); + +typedef int MultiVersionedTypedef345; +typedef int MultiVersionedTypedef345Notes; +typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_NEW"))); diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap new file mode 100644 index 0000000..6d957fd --- /dev/null +++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap @@ -0,0 +1,5 @@ +framework module VersionedKit { + umbrella header "VersionedKit.h" + export * + module * { export * } +} diff --git a/clang/test/APINotes/Inputs/Headers/APINotes.apinotes b/clang/test/APINotes/Inputs/Headers/APINotes.apinotes new file mode 100644 index 0000000..08210fc --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/APINotes.apinotes @@ -0,0 +1,18 @@ +Name: HeaderLib +SwiftInferImportAsMember: true +Functions: + - Name: custom_realloc + NullabilityOfRet: N + Nullability: [ N, S ] + - Name: unavailable_function + Availability: none + AvailabilityMsg: "I beg you not to 
use this" + - Name: do_something_with_pointers + NullabilityOfRet: O + Nullability: [ N, O ] + +Globals: + - Name: global_int + Nullability: N + - Name: unavailable_global_int + Availability: none diff --git a/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes b/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes new file mode 100644 index 0000000..00f7b50 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes @@ -0,0 +1,10 @@ +Name: BrokenTypes +Functions: + - Name: break_me_function + ResultType: 'int * with extra junk' + Parameters: + - Position: 0 + Type: 'not_a_type' +Globals: + - Name: break_me_variable + Type: 'double' diff --git a/clang/test/APINotes/Inputs/Headers/BrokenTypes.h b/clang/test/APINotes/Inputs/Headers/BrokenTypes.h new file mode 100644 index 0000000..fee054b --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/BrokenTypes.h @@ -0,0 +1,8 @@ +#ifndef BROKEN_TYPES_H +#define BROKEN_TYPES_H + +char break_me_function(void *ptr); + +extern char break_me_variable; + +#endif // BROKEN_TYPES_H diff --git a/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes b/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes new file mode 100644 index 0000000..0f47ac6 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes @@ -0,0 +1,15 @@ +Name: ExternCtx +Globals: + - Name: globalInExternC + Availability: none + AvailabilityMsg: "oh no" + - Name: globalInExternCXX + Availability: none + AvailabilityMsg: "oh no #2" +Functions: + - Name: globalFuncInExternC + Availability: none + AvailabilityMsg: "oh no #3" + - Name: globalFuncInExternCXX + Availability: none + AvailabilityMsg: "oh no #4" diff --git a/clang/test/APINotes/Inputs/Headers/ExternCtx.h b/clang/test/APINotes/Inputs/Headers/ExternCtx.h new file mode 100644 index 0000000..669d443 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ExternCtx.h @@ -0,0 +1,11 @@ +extern "C" { + static int globalInExternC = 1; + + static void globalFuncInExternC() {} +} + 
+extern "C++" { + static int globalInExternCXX = 2; + + static void globalFuncInExternCXX() {} +} diff --git a/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes b/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes new file mode 100644 index 0000000..7dcb224 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes @@ -0,0 +1,37 @@ +Name: HeaderLib +SwiftInferImportAsMember: true +Functions: + - Name: custom_realloc + NullabilityOfRet: N + Nullability: [ N, S ] + - Name: unavailable_function + Availability: none + AvailabilityMsg: "I beg you not to use this" + - Name: do_something_with_pointers + NullabilityOfRet: O + Nullability: [ N, O ] + - Name: do_something_with_arrays + Parameters: + - Position: 0 + Nullability: N + - Position: 1 + Nullability: N + - Name: take_pointer_and_int + Parameters: + - Position: 0 + Nullability: N + NoEscape: true + - Position: 1 + NoEscape: true +Globals: + - Name: global_int + Nullability: N + - Name: unavailable_global_int + Availability: none +Tags: + - Name: unavailable_struct + Availability: none + +Typedefs: + - Name: unavailable_typedef + Availability: none diff --git a/clang/test/APINotes/Inputs/Headers/HeaderLib.h b/clang/test/APINotes/Inputs/Headers/HeaderLib.h new file mode 100644 index 0000000..8065249 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/HeaderLib.h @@ -0,0 +1,19 @@ +#ifndef HEADER_LIB_H +#define HEADER_LIB_H + +void *custom_realloc(void *member, unsigned size); + +int *global_int; + +int unavailable_function(void); +int unavailable_global_int; + +void do_something_with_pointers(int *ptr1, int *ptr2); +void do_something_with_arrays(int simple[], int nested[][2]); + +typedef int unavailable_typedef; +struct unavailable_struct { int x, y, z; }; + +void take_pointer_and_int(int *ptr1, int value); + +#endif diff --git a/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes new file mode 100644 index 
0000000..813eb50 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes @@ -0,0 +1,10 @@ +Name: InstancetypeModule +Classes: +- Name: SomeBaseClass + Methods: + - Selector: instancetypeFactoryMethod + MethodKind: Class + ResultType: SomeBaseClass * _Nonnull + - Selector: staticFactoryMethod + MethodKind: Class + ResultType: SomeBaseClass * _Nonnull diff --git a/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h new file mode 100644 index 0000000..767f201 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h @@ -0,0 +1,10 @@ +@interface Object +@end + +@interface SomeBaseClass : Object ++ (nullable instancetype)instancetypeFactoryMethod; ++ (nullable SomeBaseClass *)staticFactoryMethod; +@end + +@interface SomeSubclass : SomeBaseClass +@end diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h new file mode 100644 index 0000000..867a15c --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h @@ -0,0 +1 @@ +extern int ModuleWithWrongCase; diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h new file mode 100644 index 0000000..aa01429 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h @@ -0,0 +1 @@ +extern int ModuleWithWrongCasePrivate; diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes new file mode 100644 index 0000000..dc6dc50 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes @@ -0,0 +1 @@ +Name: ModuleWithWrongCasePrivate diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes 
b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes new file mode 100644 index 0000000..dc6dc50 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes @@ -0,0 +1 @@ +Name: ModuleWithWrongCasePrivate diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes new file mode 100644 index 0000000..e9da367 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes @@ -0,0 +1,53 @@ +--- +Name: Namespaces +Globals: + - Name: varInInlineNamespace + SwiftName: swiftVarInInlineNamespace +Functions: + - Name: funcInNamespace + SwiftName: inWrongContext() + - Name: funcInInlineNamespace + SwiftName: swiftFuncInInlineNamespace() +Tags: + - Name: char_box + SwiftName: InWrongContext +Namespaces: + - Name: Namespace1 + Typedefs: + - Name: my_typedef + SwiftName: SwiftTypedef + - Name: my_using_decl + SwiftName: SwiftUsingDecl + Globals: + - Name: varInNamespace + SwiftName: swiftVarInNamespace + Functions: + - Name: funcInNamespace + SwiftName: swiftFuncInNamespace() + Tags: + - Name: char_box + SwiftName: CharBox + Namespaces: + - Name: Nested1 + Globals: + - Name: varInNestedNamespace + SwiftName: swiftVarInNestedNamespace + Functions: + - Name: funcInNestedNamespace + SwiftName: swiftFuncInNestedNamespace(_:) + Tags: + - Name: char_box + SwiftName: NestedCharBox + Namespaces: + - Name: Namespace1 + Tags: + - Name: char_box + SwiftName: DeepNestedCharBox + - Name: Nested2 + Globals: + - Name: varInNestedNamespace + SwiftName: swiftAnotherVarInNestedNamespace + - Name: InlineNamespace1 + Functions: + - Name: funcInInlineNamespace + SwiftName: shouldNotSpellOutInlineNamespaces() diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.h b/clang/test/APINotes/Inputs/Headers/Namespaces.h new file mode 100644 index 0000000..6a79e99 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/Namespaces.h @@ -0,0 +1,39 @@ +namespace Namespace1 { 
namespace Nested1 {} } + +namespace Namespace1 { +static int varInNamespace = 1; +struct char_box { char c; }; +void funcInNamespace(); + +namespace Nested1 { +void funcInNestedNamespace(int i); +struct char_box { + char c; +}; +} + +namespace Nested1 { +static int varInNestedNamespace = 1; +void funcInNestedNamespace(int i); + +namespace Namespace1 { +struct char_box { char c; }; +} // namespace Namespace1 +} // namespace Nested1 + +namespace Nested2 { +static int varInNestedNamespace = 2; +} // namespace Nested2 + +namespace Nested1 { namespace Namespace1 {} } +} // namespace Namespace1 + +namespace Namespace1 { +typedef int my_typedef; +using my_using_decl = int; +} + +inline namespace InlineNamespace1 { +static int varInInlineNamespace = 3; +void funcInInlineNamespace(); +} diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes b/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes new file mode 100644 index 0000000..5f62284 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes @@ -0,0 +1,4 @@ +Name: HeaderLib +Globals: +- Name: PrivateLib + Type: float diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib.h b/clang/test/APINotes/Inputs/Headers/PrivateLib.h new file mode 100644 index 0000000..59aeef0 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/PrivateLib.h @@ -0,0 +1 @@ +extern int PrivateLib; diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes b/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes new file mode 100644 index 0000000..908dae0 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes @@ -0,0 +1 @@ +garbage here because this file shouldn't get read diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes new file mode 100644 index 0000000..5dbb83c --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes @@ -0,0 +1,9 @@ +--- +Name: SwiftImportAs 
+Tags: +- Name: ImmortalRefType + SwiftImportAs: reference +- Name: RefCountedType + SwiftImportAs: reference + SwiftReleaseOp: RCRelease + SwiftRetainOp: RCRetain diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h new file mode 100644 index 0000000..82b8a67 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h @@ -0,0 +1,6 @@ +struct ImmortalRefType {}; + +struct RefCountedType { int value; }; + +inline void RCRetain(RefCountedType *x) { x->value++; } +inline void RCRelease(RefCountedType *x) { x->value--; } diff --git a/clang/test/APINotes/Inputs/Headers/module.modulemap b/clang/test/APINotes/Inputs/Headers/module.modulemap new file mode 100644 index 0000000..98b4ee3 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/module.modulemap @@ -0,0 +1,31 @@ +module ExternCtx { + header "ExternCtx.h" +} + +module HeaderLib { + header "HeaderLib.h" +} + +module InstancetypeModule { + header "InstancetypeModule.h" +} + +module BrokenTypes { + header "BrokenTypes.h" +} + +module ModuleWithWrongCase { + header "ModuleWithWrongCase.h" +} + +module ModuleWithWrongCasePrivate { + header "ModuleWithWrongCasePrivate.h" +} + +module Namespaces { + header "Namespaces.h" +} + +module SwiftImportAs { + header "SwiftImportAs.h" +} diff --git a/clang/test/APINotes/Inputs/Headers/module.private.modulemap b/clang/test/APINotes/Inputs/Headers/module.private.modulemap new file mode 100644 index 0000000..2ecf322 --- /dev/null +++ b/clang/test/APINotes/Inputs/Headers/module.private.modulemap @@ -0,0 +1,5 @@ +module PrivateLib { + header "PrivateLib.h" +} + +module ModuleWithWrongCasePrivate.Inner {} diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes new file mode 100644 index 0000000..77db844 --- /dev/null +++ b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes @@ -0,0 +1,65 @@ +--- +Name: UIKit +Classes: + - 
Name: UIFont + Methods: + - Selector: 'fontWithName:size:' + MethodKind: Instance + Nullability: [ N ] + NullabilityOfRet: O + DesignatedInit: true +# CHECK: duplicate definition of method '-[UIFont fontWithName:size:]' + - Selector: 'fontWithName:size:' + MethodKind: Instance + Nullability: [ N ] + NullabilityOfRet: O + DesignatedInit: true + Properties: + - Name: familyName + Nullability: N + - Name: fontName + Nullability: N +# CHECK: duplicate definition of instance property 'UIFont.familyName' + - Name: familyName + Nullability: N +# CHECK: multiple definitions of class 'UIFont' + - Name: UIFont +Protocols: + - Name: MyProto + AuditedForNullability: true +# CHECK: multiple definitions of protocol 'MyProto' + - Name: MyProto + AuditedForNullability: true +Functions: + - Name: 'globalFoo' + Nullability: [ N, N, O, S ] + NullabilityOfRet: O + - Name: 'globalFoo2' + Nullability: [ N, N, O, S ] + NullabilityOfRet: O +Globals: + - Name: globalVar + Nullability: O + - Name: globalVar2 + Nullability: O +Tags: +# CHECK: cannot mix EnumKind and FlagEnum (for FlagAndEnumKind) + - Name: FlagAndEnumKind + FlagEnum: true + EnumKind: CFOptions +# CHECK: cannot mix EnumKind and FlagEnum (for FlagAndEnumKind2) + - Name: FlagAndEnumKind2 + EnumKind: CFOptions + FlagEnum: false +# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind) + - Name: ExtensibilityAndEnumKind + EnumExtensibility: open + EnumKind: CFOptions +# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind2) + - Name: ExtensibilityAndEnumKind2 + EnumKind: CFOptions + EnumExtensibility: closed +# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind3) + - Name: ExtensibilityAndEnumKind3 + EnumKind: none + EnumExtensibility: none diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h new file mode 100644 index 0000000..55313ae --- /dev/null +++ 
b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h @@ -0,0 +1 @@ +extern int yesOfCourseThisIsWhatUIKitLooksLike; diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap b/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap new file mode 100644 index 0000000..3d683d7 --- /dev/null +++ b/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap @@ -0,0 +1,3 @@ +module UIKit { + header "UIKit.h" +} diff --git a/clang/test/APINotes/availability.m b/clang/test/APINotes/availability.m new file mode 100644 index 0000000..2ddc2a7 --- /dev/null +++ b/clang/test/APINotes/availability.m @@ -0,0 +1,48 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 -fmodules -Wno-private-module -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify + +#include "HeaderLib.h" +#import <SomeKit/SomeKit.h> +#import <SomeKit/SomeKit_Private.h> + +int main() { + int i; + i = unavailable_function(); // expected-error{{'unavailable_function' is unavailable: I beg you not to use this}} + // expected-note@HeaderLib.h:8{{'unavailable_function' has been explicitly marked unavailable here}} + i = unavailable_global_int; // expected-error{{'unavailable_global_int' is unavailable}} + // expected-note@HeaderLib.h:9{{'unavailable_global_int' has been explicitly marked unavailable here}} + + unavailable_typedef t; // expected-error{{'unavailable_typedef' is unavailable}} + // expected-note@HeaderLib.h:14{{'unavailable_typedef' has been explicitly marked unavailable here}} + + struct unavailable_struct s; // expected-error{{'unavailable_struct' is unavailable}} + // expected-note@HeaderLib.h:15{{'unavailable_struct' has been explicitly marked unavailable here}} + + B *b = 0; // expected-error{{'B' is unavailable: just don't}} + // expected-note@SomeKit/SomeKit.h:15{{'B' has been explicitly marked unavailable here}} + + id<InternalProtocol> proto = 0; // 
expected-error{{'InternalProtocol' is unavailable: not for you}} + // expected-note@SomeKit/SomeKit_Private.h:12{{'InternalProtocol' has been explicitly marked unavailable here}} + + A *a = 0; + i = a.intValue; // expected-error{{intValue' is unavailable: wouldn't work anyway}} + // expected-note@SomeKit/SomeKit.h:12{{'intValue' has been explicitly marked unavailable here}} + + [a transform:a]; // expected-error{{'transform:' is unavailable: anything but this}} + // expected-note@SomeKit/SomeKit.h:6{{'transform:' has been explicitly marked unavailable here}} + + [a implicitGetOnlyInstance]; // expected-error{{'implicitGetOnlyInstance' is unavailable: getter gone}} + // expected-note@SomeKit/SomeKit.h:53{{'implicitGetOnlyInstance' has been explicitly marked unavailable here}} + [A implicitGetOnlyClass]; // expected-error{{'implicitGetOnlyClass' is unavailable: getter gone}} + // expected-note@SomeKit/SomeKit.h:54{{'implicitGetOnlyClass' has been explicitly marked unavailable here}} + [a implicitGetSetInstance]; // expected-error{{'implicitGetSetInstance' is unavailable: getter gone}} + // expected-note@SomeKit/SomeKit.h:56{{'implicitGetSetInstance' has been explicitly marked unavailable here}} + [a setImplicitGetSetInstance: a]; // expected-error{{'setImplicitGetSetInstance:' is unavailable: setter gone}} + // expected-note@SomeKit/SomeKit.h:56{{'setImplicitGetSetInstance:' has been explicitly marked unavailable here}} + [A implicitGetSetClass]; // expected-error{{'implicitGetSetClass' is unavailable: getter gone}} + // expected-note@SomeKit/SomeKit.h:57{{'implicitGetSetClass' has been explicitly marked unavailable here}} + [A setImplicitGetSetClass: a]; // expected-error{{'setImplicitGetSetClass:' is unavailable: setter gone}} + // expected-note@SomeKit/SomeKit.h:57{{'setImplicitGetSetClass:' has been explicitly marked unavailable here}} + return 0; +} + diff --git a/clang/test/APINotes/broken_types.m b/clang/test/APINotes/broken_types.m new file mode 100644 index 
0000000..ee33ff7 --- /dev/null +++ b/clang/test/APINotes/broken_types.m @@ -0,0 +1,19 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s 2> %t.err +// RUN: FileCheck %s < %t.err + +#include "BrokenTypes.h" + +// CHECK: <API Notes>:1:1: error: unknown type name 'not_a_type' +// CHECK-NEXT: not_a_type +// CHECK-NEXT: ^ + +// CHECK: <API Notes>:1:7: error: unparsed tokens following type +// CHECK-NEXT: int * with extra junk +// CHECK-NEXT: ^ + +// CHECK: BrokenTypes.h:4:6: error: API notes replacement type 'int *' has a different size from original type 'char' + +// CHECK: BrokenTypes.h:6:13: error: API notes replacement type 'double' has a different size from original type 'char' + +// CHECK: 5 errors generated. diff --git a/clang/test/APINotes/case-for-private-apinotes-file.c b/clang/test/APINotes/case-for-private-apinotes-file.c new file mode 100644 index 0000000..6aff3db --- /dev/null +++ b/clang/test/APINotes/case-for-private-apinotes-file.c @@ -0,0 +1,22 @@ +// REQUIRES: case-insensitive-filesystem + +// RUN: rm -rf %t +// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -F %S/Inputs/Frameworks -I %S/Inputs/Headers %s 2>&1 | FileCheck %s + +// RUN: rm -rf %t +// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -iframework %S/Inputs/Frameworks -isystem %S/Inputs/Headers %s -Werror + +// RUN: rm -rf %t +// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -iframework %S/Inputs/Frameworks -isystem %S/Inputs/Headers %s -Wnonportable-private-system-apinotes-path 2>&1 | FileCheck %s + +#include <ModuleWithWrongCase.h> +#include <ModuleWithWrongCasePrivate.h> +#include <FrameworkWithWrongCase/FrameworkWithWrongCase.h> +#include 
<FrameworkWithWrongCasePrivate/FrameworkWithWrongCasePrivate.h> +#include <FrameworkWithActualPrivateModule/FrameworkWithActualPrivateModule_Private.h> + +// CHECK-NOT: warning: +// CHECK: warning: private API notes file for module 'ModuleWithWrongCasePrivate' should be named 'ModuleWithWrongCasePrivate_private.apinotes', not 'ModuleWithWrongCasePrivate_Private.apinotes' +// CHECK-NOT: warning: +// CHECK: warning: private API notes file for module 'FrameworkWithWrongCasePrivate' should be named 'FrameworkWithWrongCasePrivate_private.apinotes', not 'FrameworkWithWrongCasePrivate_Private.apinotes' +// CHECK-NOT: warning: diff --git a/clang/test/APINotes/extern-context.cpp b/clang/test/APINotes/extern-context.cpp new file mode 100644 index 0000000..331dee0 --- /dev/null +++ b/clang/test/APINotes/extern-context.cpp @@ -0,0 +1,23 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternC -x c++ | FileCheck -check-prefix=CHECK-EXTERN-C %s +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternCXX -x c++ | FileCheck -check-prefix=CHECK-EXTERN-CXX %s +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternC -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-C %s +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternCXX -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-CXX %s + +#include "ExternCtx.h" + +// CHECK-EXTERN-C: 
Dumping globalInExternC: +// CHECK-EXTERN-C: VarDecl {{.+}} imported in ExternCtx globalInExternC 'int' +// CHECK-EXTERN-C: UnavailableAttr {{.+}} <<invalid sloc>> "oh no" + +// CHECK-EXTERN-CXX: Dumping globalInExternCXX: +// CHECK-EXTERN-CXX: VarDecl {{.+}} imported in ExternCtx globalInExternCXX 'int' +// CHECK-EXTERN-CXX: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #2" + +// CHECK-FUNC-EXTERN-C: Dumping globalFuncInExternC: +// CHECK-FUNC-EXTERN-C: FunctionDecl {{.+}} imported in ExternCtx globalFuncInExternC 'void ()' +// CHECK-FUNC-EXTERN-C: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #3" + +// CHECK-FUNC-EXTERN-CXX: Dumping globalFuncInExternCXX: +// CHECK-FUNC-EXTERN-CXX: FunctionDecl {{.+}} imported in ExternCtx globalFuncInExternCXX 'void ()' +// CHECK-FUNC-EXTERN-CXX: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #4" diff --git a/clang/test/APINotes/instancetype.m b/clang/test/APINotes/instancetype.m new file mode 100644 index 0000000..30339e5 --- /dev/null +++ b/clang/test/APINotes/instancetype.m @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -verify %s + +@import InstancetypeModule; + +void test() { + // The nullability is here to verify that the API notes were applied. 
+ int good = [SomeSubclass instancetypeFactoryMethod]; // expected-error {{initializing 'int' with an expression of type 'SomeSubclass * _Nonnull'}} + int bad = [SomeSubclass staticFactoryMethod]; // expected-error {{initializing 'int' with an expression of type 'SomeBaseClass * _Nonnull'}} +} diff --git a/clang/test/APINotes/module-cache.m b/clang/test/APINotes/module-cache.m new file mode 100644 index 0000000..5dcaf11 --- /dev/null +++ b/clang/test/APINotes/module-cache.m @@ -0,0 +1,65 @@ +// RUN: rm -rf %t + +// Set up directories +// RUN: mkdir -p %t/APINotes +// RUN: cp %S/Inputs/APINotes/SomeOtherKit.apinotes %t/APINotes/SomeOtherKit.apinotes +// RUN: mkdir -p %t/Frameworks +// RUN: cp -r %S/Inputs/Frameworks/SomeOtherKit.framework %t/Frameworks + +// First build: check that 'methodB' is unavailable but 'methodA' is available. +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes -F %t/Frameworks %s > %t/before.log 2>&1 +// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log + +// Do it again; now we're using caches. +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes -F %t/Frameworks %s > %t/before.log 2>&1 +// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-WITHOUT-REBUILD %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log + +// Add a blank line to the header to force the module to rebuild, without +// (yet) changing API notes. 
+// RUN: echo >> %t/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes -F %t/Frameworks %s > %t/before.log 2>&1 +// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/before.log +// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log + +// Change the API notes file, after the module has rebuilt once. +// RUN: echo ' - Selector: "methodA"' >> %t/APINotes/SomeOtherKit.apinotes +// RUN: echo ' MethodKind: Instance' >> %t/APINotes/SomeOtherKit.apinotes +// RUN: echo ' Availability: none' >> %t/APINotes/SomeOtherKit.apinotes +// RUN: echo ' AvailabilityMsg: "not here either"' >> %t/APINotes/SomeOtherKit.apinotes + +// Build again: check that both methods are now unavailable and that the module rebuilt. +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes -F %t/Frameworks %s > %t/after.log 2>&1 +// RUN: FileCheck -check-prefix=CHECK-METHODA %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-TWO-ERRORS %s < %t/after.log + +// Run the build again: check that both methods are now unavailable +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes -F %t/Frameworks %s > %t/after.log 2>&1 +// RUN: FileCheck -check-prefix=CHECK-METHODA %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-WITHOUT-REBUILD %s < %t/after.log +// RUN: FileCheck -check-prefix=CHECK-TWO-ERRORS %s < %t/after.log + +@import SomeOtherKit; + +void test(A *a) { + // CHECK-METHODA: error: 'methodA' is unavailable: not 
here either + [a methodA]; + + // CHECK-METHODB: error: 'methodB' is unavailable: anything but this + [a methodB]; +} + +// CHECK-REBUILD: remark: building module{{.*}}SomeOtherKit + +// CHECK-WITHOUT-REBUILD-NOT: remark: building module{{.*}}SomeOtherKit + +// CHECK-ONE-ERROR: 1 error generated. +// CHECK-TWO-ERRORS: 2 errors generated. + diff --git a/clang/test/APINotes/namespaces.cpp b/clang/test/APINotes/namespaces.cpp new file mode 100644 index 0000000..2f9d93c --- /dev/null +++ b/clang/test/APINotes/namespaces.cpp @@ -0,0 +1,69 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x objective-c++ +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_typedef -x objective-c++ | FileCheck -check-prefix=CHECK-TYPEDEF-IN-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_using_decl -x objective-c++ | FileCheck -check-prefix=CHECK-USING-DECL-IN-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::varInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I 
%S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::funcInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks 
-fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter varInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-INLINE-NAMESPACE %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter funcInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-INLINE-NAMESPACE %s + +#import <Namespaces.h> + +// CHECK-TYPEDEF-IN-NAMESPACE: Dumping Namespace1::my_typedef: +// CHECK-TYPEDEF-IN-NAMESPACE-NEXT: TypedefDecl {{.+}} imported in Namespaces my_typedef 'int' +// CHECK-TYPEDEF-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "SwiftTypedef" + +// CHECK-USING-DECL-IN-NAMESPACE: Dumping Namespace1::my_using_decl: +// CHECK-USING-DECL-IN-NAMESPACE-NEXT: TypeAliasDecl {{.+}} imported in Namespaces my_using_decl 'int' +// CHECK-USING-DECL-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "SwiftUsingDecl" + +// CHECK-GLOBAL-IN-NAMESPACE: Dumping Namespace1::varInNamespace: +// CHECK-GLOBAL-IN-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNamespace 'int' static cinit +// CHECK-GLOBAL-IN-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 1 +// CHECK-GLOBAL-IN-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInNamespace" + +// CHECK-FUNC-IN-NAMESPACE: Dumping Namespace1::funcInNamespace: +// CHECK-FUNC-IN-NAMESPACE-NEXT: FunctionDecl 
{{.+}} imported in Namespaces funcInNamespace 'void ()' +// CHECK-FUNC-IN-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInNamespace()" + +// CHECK-STRUCT-IN-NAMESPACE: Dumping Namespace1::char_box: +// CHECK-STRUCT-IN-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box +// CHECK-STRUCT-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "CharBox" + +// CHECK-GLOBAL-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::varInNestedNamespace: +// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNestedNamespace 'int' static cinit +// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 1 +// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInNestedNamespace" + +// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested2::varInNestedNamespace: +// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNestedNamespace 'int' static cinit +// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 2 +// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftAnotherVarInNestedNamespace" + +// CHECK-FUNC-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::funcInNestedNamespace: +// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: FunctionDecl {{.+}} imported in Namespaces funcInNestedNamespace 'void (int)' +// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: ParmVarDecl {{.+}} i 'int' +// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInNestedNamespace(_:)" + +// CHECK-STRUCT-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box: +// CHECK-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box +// CHECK-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "NestedCharBox" + +// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: Dumping 
Namespace1::Nested1::Namespace1::char_box: +// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box +// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "DeepNestedCharBox" + +// CHECK-GLOBAL-IN-INLINE-NAMESPACE: Dumping varInInlineNamespace: +// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInInlineNamespace 'int' static cinit +// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 3 +// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInInlineNamespace" + +// CHECK-FUNC-IN-INLINE-NAMESPACE: Dumping funcInInlineNamespace: +// CHECK-FUNC-IN-INLINE-NAMESPACE-NEXT: FunctionDecl {{.+}} imported in Namespaces funcInInlineNamespace 'void ()' +// CHECK-FUNC-IN-INLINE-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInInlineNamespace()" diff --git a/clang/test/APINotes/nullability.c b/clang/test/APINotes/nullability.c new file mode 100644 index 0000000..e07fc2e --- /dev/null +++ b/clang/test/APINotes/nullability.c @@ -0,0 +1,21 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify + +#include "HeaderLib.h" + +int main() { + custom_realloc(0, 0); // expected-warning{{null passed to a callee that requires a non-null argument}} + int i = 0; + do_something_with_pointers(&i, 0); + do_something_with_pointers(0, &i); // expected-warning{{null passed to a callee that requires a non-null argument}} + + extern void *p; + do_something_with_arrays(0, p); // expected-warning{{null passed to a callee that requires a non-null argument}} + do_something_with_arrays(p, 0); // expected-warning{{null passed to a callee that requires a non-null argument}} + + take_pointer_and_int(0, 0); // expected-warning{{null passed to a 
callee that requires a non-null argument}} + + float *fp = global_int; // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'int * _Nonnull'}} + return 0; +} + diff --git a/clang/test/APINotes/nullability.m b/clang/test/APINotes/nullability.m new file mode 100644 index 0000000..21ec668 --- /dev/null +++ b/clang/test/APINotes/nullability.m @@ -0,0 +1,46 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify + +// Test with Swift version 3.0. This should only affect the few APIs that have an entry in the 3.0 tables. + +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fapinotes-swift-version=3.0 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify -DSWIFT_VERSION_3_0 -fmodules-ignore-macro=SWIFT_VERSION_3_0 + +#import <SomeKit/SomeKit.h> + +int main() { + A *a; + +#if SWIFT_VERSION_3_0 + float *fp = // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'A * _Nullable'}} + [a transform: 0 integer: 0]; +#else + float *fp = // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'A *'}} + [a transform: 0 integer: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} +#endif + + [a setNonnullAInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + [A setNonnullAInstance: 0]; // no warning + a.nonnullAInstance = 0; // expected-warning{{null passed to a callee that requires a non-null argument}} + A* _Nonnull aPtr = a.nonnullAInstance; // no warning + + [a setNonnullAClass: 0]; // no warning + [A setNonnullAClass: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + + 
[a setNonnullABoth: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + [A setNonnullABoth: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + + [a setInternalProperty: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + +#if SWIFT_VERSION_3_0 + // Version 3 information overrides header information. + [a setExplicitNonnullInstance: 0]; // okay + [a setExplicitNullableInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} +#else + // Header information overrides unversioned information. + [a setExplicitNonnullInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} + [a setExplicitNullableInstance: 0]; // okay +#endif + + return 0; +} + diff --git a/clang/test/APINotes/objc-forward-declarations.m b/clang/test/APINotes/objc-forward-declarations.m new file mode 100644 index 0000000..e82bed2 --- /dev/null +++ b/clang/test/APINotes/objc-forward-declarations.m @@ -0,0 +1,12 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -F %S/Inputs/Frameworks %s -verify + +@import LayeredKit; + +void test( + UpwardClass *okayClass, + id <UpwardProto> okayProto, + PerfectlyNormalClass *badClass // expected-error {{'PerfectlyNormalClass' is unavailable}} +) { + // expected-note@LayeredKitImpl/LayeredKitImpl.h:4 {{'PerfectlyNormalClass' has been explicitly marked unavailable here}} +} diff --git a/clang/test/APINotes/objc_designated_inits.m b/clang/test/APINotes/objc_designated_inits.m new file mode 100644 index 0000000..1f2b8ed --- /dev/null +++ b/clang/test/APINotes/objc_designated_inits.m @@ -0,0 +1,17 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I 
%S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify + +#include "HeaderLib.h" +#import <SomeKit/SomeKit.h> + +@interface CSub : C +-(instancetype)initWithA:(A*)a; +@end + +@implementation CSub +-(instancetype)initWithA:(A*)a { // expected-warning{{designated initializer missing a 'super' call to a designated initializer of the super class}} + // expected-note@SomeKit/SomeKit.h:20 2{{method marked as designated initializer of the class here}} + self = [super init]; // expected-warning{{designated initializer invoked a non-designated initializer}} + return self; +} +@end diff --git a/clang/test/APINotes/properties.m b/clang/test/APINotes/properties.m new file mode 100644 index 0000000..f218092 --- /dev/null +++ b/clang/test/APINotes/properties.m @@ -0,0 +1,42 @@ +// RUN: rm -rf %t && mkdir -p %t + +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' | FileCheck -check-prefix=CHECK -check-prefix=CHECK-4 %s +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' -fapinotes-swift-version=3 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-3 %s + +@import VersionedKit; + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnly 'id' +// CHECK-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-NOT: Attr + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClass 'id' +// CHECK-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-NOT: Attr + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyInVersion3 'id' +// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftVersionedAdditionAttr {{.+}} 3.0{{$}} +// CHECK-4-NEXT: 
SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-NOT: Attr + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClassInVersion3 'id' +// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftVersionedAdditionAttr {{.+}} 3.0{{$}} +// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-NOT: Attr + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyExceptInVersion3 'id' +// CHECK-3-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}} +// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}} +// CHECK-NOT: Attr + +// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClassExceptInVersion3 'id' +// CHECK-3-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}} +// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>> +// CHECK-4-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}} +// CHECK-NOT: Attr + +// CHECK-LABEL: Decl diff --git a/clang/test/APINotes/retain-count-convention.m b/clang/test/APINotes/retain-count-convention.m new file mode 100644 index 0000000..4bf9610a --- /dev/null +++ b/clang/test/APINotes/retain-count-convention.m @@ -0,0 +1,38 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fdisable-module-hash -fsyntax-only -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/SimpleKit.pcm | FileCheck %s +// RUN: %clang_cc1 -ast-dump -ast-dump-filter 'DUMP' %t/ModulesCache/SimpleKit.pcm | FileCheck -check-prefix CHECK-DUMP %s + +#import <SimpleKit/SimpleKit.h> + +// CHECK: void *getCFOwnedToUnowned(void) 
__attribute__((cf_returns_not_retained)); +// CHECK: void *getCFUnownedToOwned(void) __attribute__((cf_returns_retained)); +// CHECK: void *getCFOwnedToNone(void) __attribute__((cf_unknown_transfer)); +// CHECK: id getObjCOwnedToUnowned(void) __attribute__((ns_returns_not_retained)); +// CHECK: id getObjCUnownedToOwned(void) __attribute__((ns_returns_retained)); +// CHECK: int indirectGetCFOwnedToUnowned(void * _Nullable *out __attribute__((cf_returns_not_retained))); +// CHECK: int indirectGetCFUnownedToOwned(void * _Nullable *out __attribute__((cf_returns_retained))); +// CHECK: int indirectGetCFOwnedToNone(void * _Nullable *out); +// CHECK: int indirectGetCFNoneToOwned(void **out __attribute__((cf_returns_not_retained))); + +// CHECK-LABEL: @interface MethodTest +// CHECK: - (id)getOwnedToUnowned __attribute__((ns_returns_not_retained)); +// CHECK: - (id)getUnownedToOwned __attribute__((ns_returns_retained)); +// CHECK: @end + +// CHECK-DUMP-LABEL: Dumping getCFAuditedToUnowned_DUMP: +// CHECK-DUMP-NEXT: FunctionDecl +// CHECK-DUMP-NEXT: CFReturnsNotRetainedAttr +// CHECK-DUMP-NEXT: CFAuditedTransferAttr +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping getCFAuditedToOwned_DUMP: +// CHECK-DUMP-NEXT: FunctionDecl +// CHECK-DUMP-NEXT: CFReturnsRetainedAttr +// CHECK-DUMP-NEXT: CFAuditedTransferAttr +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping getCFAuditedToNone_DUMP: +// CHECK-DUMP-NEXT: FunctionDecl +// CHECK-DUMP-NEXT: CFUnknownTransferAttr +// CHECK-DUMP-NOT: Attr diff --git a/clang/test/APINotes/search-order.m b/clang/test/APINotes/search-order.m new file mode 100644 index 0000000..17e81d5 --- /dev/null +++ b/clang/test/APINotes/search-order.m @@ -0,0 +1,25 @@ +// RUN: rm -rf %t && mkdir -p %t + +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_FRAMEWORK=1 -verify + +// RUN: %clang_cc1 -fmodules 
-fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %S/Inputs/APINotes -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_SEARCH_PATH=1 -verify + +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -iapinotes-modules %S/Inputs/APINotes -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_FRAMEWORK=1 -verify + +@import SomeOtherKit; + +void test(A *a) { +#if FROM_FRAMEWORK + [a methodA]; // expected-error{{unavailable}} + [a methodB]; + + // expected-note@SomeOtherKit/SomeOtherKit.h:5{{'methodA' has been explicitly marked unavailable here}} +#elif FROM_SEARCH_PATH + [a methodA]; + [a methodB]; // expected-error{{unavailable}} + + // expected-note@SomeOtherKit/SomeOtherKit.h:6{{'methodB' has been explicitly marked unavailable here}} +#else +# error Not something we need to test +#endif +} diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp new file mode 100644 index 0000000..904857e --- /dev/null +++ b/clang/test/APINotes/swift-import-as.cpp @@ -0,0 +1,16 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s + +#include <SwiftImportAs.h> + +// CHECK-IMMORTAL: Dumping ImmortalRefType: +// CHECK-IMMORTAL-NEXT: 
CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct ImmortalRefType +// CHECK-IMMORTAL: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference" + +// CHECK-REF-COUNTED: Dumping RefCountedType: +// CHECK-REF-COUNTED-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct RefCountedType +// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference" +// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:RCRetain" +// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease" diff --git a/clang/test/APINotes/top-level-private-modules.c b/clang/test/APINotes/top-level-private-modules.c new file mode 100644 index 0000000..0da72b2 --- /dev/null +++ b/clang/test/APINotes/top-level-private-modules.c @@ -0,0 +1,8 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify + +#include <PrivateLib.h> +#include <TopLevelPrivateKit/TopLevelPrivateKit_Private.h> + +void *testPlain = PrivateLib; // expected-error {{initializing 'void *' with an expression of incompatible type 'float'}} +void *testFramework = TopLevelPrivateKit_Private; // expected-error {{initializing 'void *' with an expression of incompatible type 'float'}} diff --git a/clang/test/APINotes/types.m b/clang/test/APINotes/types.m new file mode 100644 index 0000000..133d504 --- /dev/null +++ b/clang/test/APINotes/types.m @@ -0,0 +1,28 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fdisable-module-hash -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify +// RUN: %clang_cc1 -ast-print %t/ModulesCache/SimpleKit.pcm | FileCheck %s + +#import <SomeKit/SomeKit.h> +#import <SimpleKit/SimpleKit.h> + +// CHECK: struct 
__attribute__((swift_name("SuccessfullyRenamedA"))) RenamedAgainInAPINotesA { +// CHECK: struct __attribute__((swift_name("SuccessfullyRenamedB"))) RenamedAgainInAPINotesB { + +void test(OverriddenTypes *overridden) { + int *ip1 = global_int_ptr; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'double (*)(int, int)'}} + + int *ip2 = global_int_fun( // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'char *'}} + ip2, // expected-warning{{incompatible pointer types passing 'int *' to parameter of type 'double *'}} + ip2); // expected-warning{{incompatible pointer types passing 'int *' to parameter of type 'float *'}} + + int *ip3 = [overridden // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'char *'}} + methodToMangle: ip3 // expected-warning{{incompatible pointer types sending 'int *' to parameter of type 'double *'}} + second: ip3]; // expected-warning{{incompatible pointer types sending 'int *' to parameter of type 'float *'}} + + int *ip4 = overridden.intPropertyToMangle; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'double *'}} +} + +// expected-note@SomeKit/SomeKit.h:42{{passing argument to parameter 'ptr' here}} +// expected-note@SomeKit/SomeKit.h:42{{passing argument to parameter 'ptr2' here}} +// expected-note@SomeKit/SomeKit.h:48{{passing argument to parameter 'ptr1' here}} +// expected-note@SomeKit/SomeKit.h:48{{passing argument to parameter 'ptr2' here}} diff --git a/clang/test/APINotes/versioned-multi.c b/clang/test/APINotes/versioned-multi.c new file mode 100644 index 0000000..48c51fd --- /dev/null +++ b/clang/test/APINotes/versioned-multi.c @@ -0,0 +1,69 @@ +// RUN: rm -rf %t && mkdir -p %t + +// Build and check the unversioned module file. 
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Unversioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-UNVERSIONED %s + +// Build and check the various versions. +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned3 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned3/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-3 %s + +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned4 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=4 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned4/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-4 %s + +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned5 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=5 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned5/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-5 %s + +#import <VersionedKit/VersionedKit.h> + +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4; +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34; +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34Notes 
__attribute__((swift_name("MultiVersionedTypedef34Notes_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45; +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345; +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_NEW"))); +// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_NEW"))); + +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4 __attribute__((swift_name("MultiVersionedTypedef4_4"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_4"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_4"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34 __attribute__((swift_name("MultiVersionedTypedef34_3"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_3"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_3"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_4"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_4"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_4"))); +// CHECK-VERSIONED-3: typedef int 
MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_3"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_3"))); +// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_3"))); + +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4 __attribute__((swift_name("MultiVersionedTypedef4_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34 __attribute__((swift_name("MultiVersionedTypedef34_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_4"))); +// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_4"))); + +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4; +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4Notes 
__attribute__((swift_name("MultiVersionedTypedef4Notes_NEW"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34; +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_NEW"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_5"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_5"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_5"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_5"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_5"))); +// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_5"))); diff --git a/clang/test/APINotes/versioned.m b/clang/test/APINotes/versioned.m new file mode 100644 index 0000000..61cc8c3 --- /dev/null +++ b/clang/test/APINotes/versioned.m @@ -0,0 +1,187 @@ +// RUN: rm -rf %t && mkdir -p %t + +// Build and check the unversioned module file. 
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Unversioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-UNVERSIONED %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-UNVERSIONED-DUMP %s + +// Build and check the versioned module file. +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED %s +// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-VERSIONED-DUMP %s + +#import <VersionedKit/VersionedKit.h> + +// CHECK-UNVERSIONED: void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(x:y:)"))); +// CHECK-VERSIONED: void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(a:b:)"))); + +// CHECK-DUMP-LABEL: Dumping moveToPointDUMP +// CHECK-VERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}} +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "moveTo(x:y:)" +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "moveTo(a:b:)" +// CHECK-UNVERSIONED-DUMP: 
SwiftNameAttr {{.+}} "moveTo(x:y:)" +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}} +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "moveTo(a:b:)" +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping unversionedRenameDUMP +// CHECK-DUMP: in VersionedKit unversionedRenameDUMP +// CHECK-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 0 IsReplacedByActive{{$}} +// CHECK-DUMP-NEXT: SwiftNameAttr {{.+}} "unversionedRename_HEADER()" +// CHECK-DUMP-NEXT: SwiftNameAttr {{.+}} "unversionedRename_NOTES()" +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping TestGenericDUMP +// CHECK-VERSIONED-DUMP: SwiftImportAsNonGenericAttr {{.+}} <<invalid sloc>> +// CHECK-UNVERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}} +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftImportAsNonGenericAttr {{.+}} <<invalid sloc>> +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping Swift3RenamedOnlyDUMP +// CHECK-DUMP: in VersionedKit Swift3RenamedOnlyDUMP +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}} IsReplacedByActive{{$}} +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Name" +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}} +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift3Name" +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping Swift3RenamedAlsoDUMP +// CHECK-DUMP: in VersionedKit Swift3RenamedAlsoDUMP +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}} +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <line:{{.+}}, col:{{.+}}> "Swift4Name" +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Also" +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <line:{{.+}}, col:{{.+}}> "Swift4Name" +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}} +// CHECK-UNVERSIONED-DUMP-NEXT: 
SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift3Also" +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-LABEL: Dumping Swift4RenamedDUMP +// CHECK-DUMP: in VersionedKit Swift4RenamedDUMP +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 4 {{[0-9]+}} IsReplacedByActive{{$}} +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift4Name" +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 4{{$}} +// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift4Name" +// CHECK-DUMP-NOT: Attr + +// CHECK-DUMP-NOT: Dumping + +// CHECK-UNVERSIONED: void acceptClosure(void (^block)(void) __attribute__((noescape))); +// CHECK-VERSIONED: void acceptClosure(void (^block)(void)); + +// CHECK-UNVERSIONED: void privateFunc(void) __attribute__((swift_private)); + +// CHECK-UNVERSIONED: typedef double MyDoubleWrapper __attribute__((swift_wrapper("struct"))); + +// CHECK-UNVERSIONED: enum __attribute__((ns_error_domain(MyErrorDomain))) MyErrorCode { +// CHECK-UNVERSIONED-NEXT: MyErrorCodeFailed = 1 +// CHECK-UNVERSIONED-NEXT: }; + +// CHECK-UNVERSIONED: __attribute__((swift_bridge("MyValueType"))) +// CHECK-UNVERSIONED: @interface MyReferenceType + +// CHECK-VERSIONED: void privateFunc(void); + +// CHECK-VERSIONED: typedef double MyDoubleWrapper; + +// CHECK-VERSIONED: enum MyErrorCode { +// CHECK-VERSIONED-NEXT: MyErrorCodeFailed = 1 +// CHECK-VERSIONED-NEXT: }; + +// CHECK-VERSIONED-NOT: __attribute__((swift_bridge("MyValueType"))) +// CHECK-VERSIONED: @interface MyReferenceType + +// CHECK-UNVERSIONED: __attribute__((swift_objc_members) +// CHECK-UNVERSIONED-NEXT: @interface TestProperties +// CHECK-VERSIONED-NOT: __attribute__((swift_objc_members) +// CHECK-VERSIONED: @interface TestProperties + +// CHECK-UNVERSIONED-LABEL: enum __attribute__((flag_enum)) FlagEnum { +// CHECK-UNVERSIONED-NEXT: FlagEnumA = 1, +// CHECK-UNVERSIONED-NEXT: FlagEnumB = 2 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: 
enum __attribute__((flag_enum)) NewlyFlagEnum { +// CHECK-UNVERSIONED-NEXT: NewlyFlagEnumA = 1, +// CHECK-UNVERSIONED-NEXT: NewlyFlagEnumB = 2 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((flag_enum)) APINotedFlagEnum { +// CHECK-UNVERSIONED-NEXT: APINotedFlagEnumA = 1, +// CHECK-UNVERSIONED-NEXT: APINotedFlagEnumB = 2 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) OpenEnum { +// CHECK-UNVERSIONED-NEXT: OpenEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) NewlyOpenEnum { +// CHECK-UNVERSIONED-NEXT: NewlyOpenEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) NewlyClosedEnum { +// CHECK-UNVERSIONED-NEXT: NewlyClosedEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) ClosedToOpenEnum { +// CHECK-UNVERSIONED-NEXT: ClosedToOpenEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) OpenToClosedEnum { +// CHECK-UNVERSIONED-NEXT: OpenToClosedEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) APINotedOpenEnum { +// CHECK-UNVERSIONED-NEXT: APINotedOpenEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) APINotedClosedEnum { +// CHECK-UNVERSIONED-NEXT: APINotedClosedEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; + +// CHECK-VERSIONED-LABEL: enum __attribute__((flag_enum)) FlagEnum { +// CHECK-VERSIONED-NEXT: FlagEnumA = 1, +// CHECK-VERSIONED-NEXT: FlagEnumB = 2 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum NewlyFlagEnum { +// CHECK-VERSIONED-NEXT: NewlyFlagEnumA = 1, +// CHECK-VERSIONED-NEXT: NewlyFlagEnumB = 2 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum 
__attribute__((flag_enum)) APINotedFlagEnum { +// CHECK-VERSIONED-NEXT: APINotedFlagEnumA = 1, +// CHECK-VERSIONED-NEXT: APINotedFlagEnumB = 2 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) OpenEnum { +// CHECK-VERSIONED-NEXT: OpenEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum NewlyOpenEnum { +// CHECK-VERSIONED-NEXT: NewlyOpenEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum NewlyClosedEnum { +// CHECK-VERSIONED-NEXT: NewlyClosedEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) ClosedToOpenEnum { +// CHECK-VERSIONED-NEXT: ClosedToOpenEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) OpenToClosedEnum { +// CHECK-VERSIONED-NEXT: OpenToClosedEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) APINotedOpenEnum { +// CHECK-VERSIONED-NEXT: APINotedOpenEnumA = 1 +// CHECK-VERSIONED-NEXT: }; +// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) APINotedClosedEnum { +// CHECK-VERSIONED-NEXT: APINotedClosedEnumA = 1 +// CHECK-VERSIONED-NEXT: }; + +// These don't actually have versioned information, so we just check them once. 
+// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) SoonToBeCFEnum { +// CHECK-UNVERSIONED-NEXT: SoonToBeCFEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) SoonToBeNSEnum { +// CHECK-UNVERSIONED-NEXT: SoonToBeNSEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) __attribute__((flag_enum)) SoonToBeCFOptions { +// CHECK-UNVERSIONED-NEXT: SoonToBeCFOptionsA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) __attribute__((flag_enum)) SoonToBeNSOptions { +// CHECK-UNVERSIONED-NEXT: SoonToBeNSOptionsA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) SoonToBeCFClosedEnum { +// CHECK-UNVERSIONED-NEXT: SoonToBeCFClosedEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) SoonToBeNSClosedEnum { +// CHECK-UNVERSIONED-NEXT: SoonToBeNSClosedEnumA = 1 +// CHECK-UNVERSIONED-NEXT: }; +// CHECK-UNVERSIONED-LABEL: enum UndoAllThatHasBeenDoneToMe { +// CHECK-UNVERSIONED-NEXT: UndoAllThatHasBeenDoneToMeA = 1 +// CHECK-UNVERSIONED-NEXT: }; diff --git a/clang/test/APINotes/yaml-convert-diags.c b/clang/test/APINotes/yaml-convert-diags.c new file mode 100644 index 0000000..1d352dc --- /dev/null +++ b/clang/test/APINotes/yaml-convert-diags.c @@ -0,0 +1,6 @@ +// RUN: rm -rf %t +// RUN: not %clang_cc1 -fsyntax-only -fapinotes %s -I %S/Inputs/BrokenHeaders2 2>&1 | FileCheck %s + +#include "SomeBrokenLib.h" + +// CHECK: error: multiple definitions of global function 'do_something_with_pointers' diff --git a/clang/test/APINotes/yaml-parse-diags.c b/clang/test/APINotes/yaml-parse-diags.c new file mode 100644 index 0000000..3ae39cc --- /dev/null +++ b/clang/test/APINotes/yaml-parse-diags.c @@ -0,0 +1,6 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 
-fsyntax-only -fapinotes %s -I %S/Inputs/BrokenHeaders -verify + +#include "SomeBrokenLib.h" + +// expected-error@APINotes.apinotes:4{{unknown key 'Nu llabilityOfRet'}} diff --git a/clang/test/APINotes/yaml-reader-errors.m b/clang/test/APINotes/yaml-reader-errors.m new file mode 100644 index 0000000..9e5ee34 --- /dev/null +++ b/clang/test/APINotes/yaml-reader-errors.m @@ -0,0 +1,5 @@ +// RUN: rm -rf %t +// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fapinotes -fapinotes-modules -fmodules-cache-path=%t -I %S/Inputs/yaml-reader-errors/ -fsyntax-only %s > %t.err 2>&1 +// RUN: FileCheck %S/Inputs/yaml-reader-errors/UIKit.apinotes < %t.err + +@import UIKit; diff --git a/clang/test/CodeGen/CSKY/csky-abi.c b/clang/test/CodeGen/CSKY/csky-abi.c index 2e54937..29ed661 100644 --- a/clang/test/CodeGen/CSKY/csky-abi.c +++ b/clang/test/CodeGen/CSKY/csky-abi.c @@ -185,13 +185,13 @@ void f_va_caller(void) { // CHECK: [[VA:%.*]] = alloca ptr, align 4 // CHECK: [[V:%.*]] = alloca i32, align 4 // CHECK: store ptr %fmt, ptr [[FMT_ADDR]], align 4 -// CHECK: call void @llvm.va_start(ptr [[VA]]) +// CHECK: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK: [[TMP1:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 // CHECK: store i32 [[TMP1]], ptr [[V]], align 4 -// CHECK: call void @llvm.va_end(ptr [[VA]]) +// CHECK: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 // CHECK: ret i32 [[TMP2]] // CHECK: } @@ -210,13 +210,13 @@ int f_va_1(char *fmt, ...) 
{ // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[V:%.*]] = alloca double, align 4 // CHECK-NEXT: store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARGP_CUR]], align 4 // CHECK-NEXT: store double [[TMP4]], ptr [[V]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[V]], align 4 // CHECK-NEXT: ret double [[TMP5]] double f_va_2(char *fmt, ...) { @@ -236,7 +236,7 @@ double f_va_2(char *fmt, ...) { // CHECK-NEXT: [[W:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[X:%.*]] = alloca double, align 4 // CHECK-NEXT: store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -252,7 +252,7 @@ double f_va_2(char *fmt, ...) { // CHECK-NEXT: store ptr [[ARGP_NEXT5]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP11:%.*]] = load double, ptr [[ARGP_CUR4]], align 4 // CHECK-NEXT: store double [[TMP11]], ptr [[X]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[V]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = load double, ptr [[X]], align 4 // CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]] @@ -279,7 +279,7 @@ double f_va_3(char *fmt, ...) 
{ // CHECK-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4 // CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -302,7 +302,7 @@ double f_va_3(char *fmt, ...) { // CHECK-NEXT: [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR8]], i32 16 // CHECK-NEXT: store ptr [[ARGP_NEXT9]], ptr [[VA]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[ARGP_CUR8]], i32 16, i1 false) -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) int f_va_4(char *fmt, ...) { __builtin_va_list va; diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d.c b/clang/test/CodeGen/LoongArch/abi-lp64d.c index 66b480a..fc7f1ea 100644 --- a/clang/test/CodeGen/LoongArch/abi-lp64d.c +++ b/clang/test/CodeGen/LoongArch/abi-lp64d.c @@ -449,13 +449,13 @@ void f_va_caller(void) { // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 8 // CHECK-NEXT: store i32 [[TMP0]], ptr [[V]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 
// CHECK-NEXT: ret i32 [[TMP1]] int f_va_int(char *fmt, ...) { diff --git a/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c b/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c index 0318242..b3f1e93 100644 --- a/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c +++ b/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c @@ -17,7 +17,7 @@ vector double vector_varargs(int count, ...) { } // CHECK: %arg_list = alloca ptr -// CHECK: call void @llvm.va_start(ptr %arg_list) +// CHECK: call void @llvm.va_start.p0(ptr %arg_list) // AIX32: for.body: // AIX32-NEXT: %argp.cur = load ptr, ptr %arg_list, align 4 @@ -41,4 +41,4 @@ vector double vector_varargs(int count, ...) { // CHECK: for.end: -// CHECK: call void @llvm.va_end(ptr %arg_list) +// CHECK: call void @llvm.va_end.p0(ptr %arg_list) diff --git a/clang/test/CodeGen/PowerPC/aix-vaargs.c b/clang/test/CodeGen/PowerPC/aix-vaargs.c index 8b8417d..724ba656 100644 --- a/clang/test/CodeGen/PowerPC/aix-vaargs.c +++ b/clang/test/CodeGen/PowerPC/aix-vaargs.c @@ -35,7 +35,7 @@ void testva (int n, ...) { // CHECK-NEXT: %v = alloca i32, align 4 // CHECK-NEXT: store i32 %n, ptr %n.addr, align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr %ap) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr %ap) // AIX32-NEXT: %argp.cur = load ptr, ptr %ap, align 4 // AIX32-NEXT: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 16 @@ -48,7 +48,7 @@ void testva (int n, ...) { // AIX32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 %t, ptr align 4 %argp.cur, i32 16, i1 false) // AIX64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %t, ptr align 8 %argp.cur, i64 16, i1 false) -// CHECK-NEXT: call void @llvm.va_copy(ptr %ap2, ptr %ap) +// CHECK-NEXT: call void @llvm.va_copy.p0(ptr %ap2, ptr %ap) // AIX32-NEXT: %argp.cur1 = load ptr, ptr %ap2, align 4 // AIX32-NEXT: %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i32 4 @@ -62,14 +62,14 @@ void testva (int n, ...) 
{ // AIX64-NEXT: %1 = load i32, ptr %0, align 4 // AIX64-NEXT: store i32 %1, ptr %v, align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr %ap2) -// CHECK-NEXT: call void @llvm.va_end(ptr %ap) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr %ap2) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr %ap) // CHECK-NEXT: ret void -// CHECK: declare void @llvm.va_start(ptr) +// CHECK: declare void @llvm.va_start.p0(ptr) // AIX32: declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) // AIX64: declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) -// CHECK: declare void @llvm.va_copy(ptr, ptr) -// CHECK: declare void @llvm.va_end(ptr) +// CHECK: declare void @llvm.va_copy.p0(ptr, ptr) +// CHECK: declare void @llvm.va_end.p0(ptr) diff --git a/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c b/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c index 396614f..2f5459d 100644 --- a/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c +++ b/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c @@ -31,7 +31,7 @@ void foo_ls(ldbl128_s); // OMP-TARGET: call void @foo_ld(ppc_fp128 noundef %[[V3]]) // OMP-HOST-LABEL: define{{.*}} void @omp( -// OMP-HOST: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]]) +// OMP-HOST: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]]) // OMP-HOST: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]], align 8 // OMP-HOST: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15 // OMP-HOST: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16) @@ -49,13 +49,13 @@ void omp(int n, ...) 
{ } // IEEE-LABEL: define{{.*}} void @f128 -// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]]) +// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]]) // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]] // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16) // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[ALIGN]], align 16 // IEEE: call void @foo_fq(fp128 noundef %[[V4]]) -// IEEE: call void @llvm.va_end(ptr %[[AP]]) +// IEEE: call void @llvm.va_end.p0(ptr %[[AP]]) void f128(int n, ...) { va_list ap; va_start(ap, n); @@ -64,20 +64,20 @@ void f128(int n, ...) { } // IEEE-LABEL: define{{.*}} void @long_double -// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]]) +// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]]) // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]] // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16) // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[ALIGN]], align 16 // IEEE: call void @foo_ld(fp128 noundef %[[V4]]) -// IEEE: call void @llvm.va_end(ptr %[[AP]]) +// IEEE: call void @llvm.va_end.p0(ptr %[[AP]]) // IBM-LABEL: define{{.*}} void @long_double -// IBM: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]]) +// IBM: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]]) // IBM: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]] // IBM: %[[V4:[0-9a-zA-Z_.]+]] = load ppc_fp128, ptr %[[CUR]], align 8 // IBM: call void @foo_ld(ppc_fp128 noundef %[[V4]]) -// IBM: call void @llvm.va_end(ptr %[[AP]]) +// IBM: call void @llvm.va_end.p0(ptr %[[AP]]) void long_double(int n, ...) { va_list ap; va_start(ap, n); @@ -86,7 +86,7 @@ void long_double(int n, ...) 
{ } // IEEE-LABEL: define{{.*}} void @long_double_struct -// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]]) +// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]]) // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]] // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16) @@ -96,7 +96,7 @@ void long_double(int n, ...) { // IEEE: %[[COERCE:[0-9a-zA-Z_.]+]] = getelementptr inbounds %struct.ldbl128_s, ptr %[[TMP]], i32 0, i32 0 // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[COERCE]], align 16 // IEEE: call void @foo_ls(fp128 inreg %[[V4]]) -// IEEE: call void @llvm.va_end(ptr %[[AP]]) +// IEEE: call void @llvm.va_end.p0(ptr %[[AP]]) void long_double_struct(int n, ...) { va_list ap; va_start(ap, n); diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c index 35d6973..b303d71 100644 --- a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c +++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c @@ -2,6 +2,28 @@ // RUN: not %clang_cc1 -triple riscv64 -target-feature +zifencei -target-feature +m -target-feature +a \ // RUN: -emit-llvm %s 2>&1 | FileCheck %s +#include <riscv_vector.h> + +void test_builtin() { +// CHECK: error: '__builtin_rvv_vsetvli' needs target feature zve32x + __riscv_vsetvl_e8m8(1); +} + +void test_rvv_i32_type() { +// CHECK: error: RISC-V type 'vint32m1_t' (aka '__rvv_int32m1_t') requires the 'zve32x' extension + vint32m1_t v; +} + +void test_rvv_f32_type() { +// CHECK: error: RISC-V type 'vfloat32m1_t' (aka '__rvv_float32m1_t') requires the 'zve32f' extension + vfloat32m1_t v; +} + +void test_rvv_f64_type() { +// CHECK: error: RISC-V type 'vfloat64m1_t' (aka '__rvv_float64m1_t') requires the 'zve64d' extension + vfloat64m1_t v; +} + // CHECK: error: duplicate 'arch=' in the 'target' attribute string; 
__attribute__((target("arch=rv64gc;arch=rv64gc_zbb"))) void testMultiArchSelectLast() {} // CHECK: error: duplicate 'cpu=' in the 'target' attribute string; diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c index f216eaf..1f86821 100644 --- a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c +++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c @@ -4,6 +4,8 @@ // RUN: -target-feature -relax -target-feature -zfa \ // RUN: -emit-llvm %s -o - | FileCheck %s +#include <riscv_vector.h> + // CHECK-LABEL: define dso_local void @testDefault // CHECK-SAME: () #0 { void testDefault() {} @@ -35,6 +37,34 @@ testAttrFullArchAndAttrCpu() {} // CHECK-SAME: () #8 { __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {} +__attribute__((target("arch=+zve32x"))) +void test_builtin_w_zve32x() { +// CHECK-LABEL: test_builtin_w_zve32x +// CHECK-SAME: #9 + __riscv_vsetvl_e8m8(1); +} + +__attribute__((target("arch=+zve32x"))) +void test_rvv_i32_type_w_zve32x() { +// CHECK-LABEL: test_rvv_i32_type_w_zve32x +// CHECK-SAME: #9 + vint32m1_t v; +} + +__attribute__((target("arch=+zve32f"))) +void test_rvv_f32_type_w_zve32f() { +// CHECK-LABEL: test_rvv_f32_type_w_zve32f +// CHECK-SAME: #11 + vfloat32m1_t v; +} + +__attribute__((target("arch=+zve64d"))) +void test_rvv_f64_type_w_zve64d() { +// CHECK-LABEL: test_rvv_f64_type_w_zve64d +// CHECK-SAME: #12 + vfloat64m1_t v; +} + //. 
// CHECK: attributes #0 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zifencei,-relax,-zbb,-zfa" } // CHECK: attributes #1 = { {{.*}}"target-cpu"="rocket-rv64" "target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zbb,-zfa" "tune-cpu"="generic-rv64" } @@ -46,3 +76,6 @@ __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {} // CHECK: attributes #6 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+a,+m,+save-restore,+zbb,+zifencei,-relax,-zfa" } // CHECK: attributes #7 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+m,+save-restore,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } // CHECK: attributes #8 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+a,+c,+d,+f,+m,+save-restore,+zicsr,+zifencei,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } +// CHECK: attributes #9 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zicsr,+zifencei,+zve32x,+zvl32b,-relax,-zbb,-zfa" } +// CHECK: attributes #11 = { {{.*}}"target-features"="+64bit,+a,+f,+m,+save-restore,+zicsr,+zifencei,+zve32f,+zve32x,+zvl32b,-relax,-zbb,-zfa" } +// CHECK: attributes #12 = { {{.*}}"target-features"="+64bit,+a,+d,+f,+m,+save-restore,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl32b,+zvl64b,-relax,-zbb,-zfa" } diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c new file mode 100644 index 0000000..072d8a8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c @@ -0,0 +1,34 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s +// RUN: %clang_cc1 -std=c23 -triple riscv64 -target-feature +v \ +// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s + +#include <riscv_vector.h> + +// CHECK-LLVM: call riscv_vector_cc 
<vscale x 2 x i32> @bar +vint32m1_t __attribute__((riscv_vector_cc)) bar(vint32m1_t input); +vint32m1_t test_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = bar(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} + +// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @bar +[[riscv::vector_cc]] vint32m1_t bar(vint32m1_t input); +vint32m1_t test_vector_cc_attr2(vint32m1_t input, int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = bar(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} + +// CHECK-LLVM: call <vscale x 2 x i32> @baz +vint32m1_t baz(vint32m1_t input); +vint32m1_t test_no_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = baz(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp new file mode 100644 index 0000000..c01aeb2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp @@ -0,0 +1,32 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -std=c++11 -triple riscv64 -target-feature +v \ +// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s + +#include <riscv_vector.h> + +// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @_Z3baru15__rvv_int32m1_t +vint32m1_t __attribute__((riscv_vector_cc)) bar(vint32m1_t input); +vint32m1_t test_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = bar(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} + +// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @_Z3baru15__rvv_int32m1_t +[[riscv::vector_cc]] vint32m1_t bar(vint32m1_t input); +vint32m1_t test_vector_cc_attr2(vint32m1_t input, 
int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = bar(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} + +// CHECK-LLVM: call <vscale x 2 x i32> @_Z3bazu15__rvv_int32m1_t +vint32m1_t baz(vint32m1_t input); +vint32m1_t test_no_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) { + vint32m1_t val = __riscv_vle32_v_i32m1(base, vl); + vint32m1_t ret = baz(input); + __riscv_vse32_v_i32m1(base, val, vl); + return ret; +} diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c new file mode 100644 index 0000000..5c35901 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 %s -std=c23 -triple riscv64 -target-feature +v -verify + +__attribute__((riscv_vector_cc)) int var; // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'int'}} + +__attribute__((riscv_vector_cc)) void func(); +__attribute__((riscv_vector_cc(1))) void func_invalid(); // expected-error {{'riscv_vector_cc' attribute takes no arguments}} + +void test_no_attribute(int); // expected-note {{previous declaration is here}} +void __attribute__((riscv_vector_cc)) test_no_attribute(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}} + +[[riscv::vector_cc]] int var2; // expected-warning {{'vector_cc' only applies to function types; type here is 'int'}} + +[[riscv::vector_cc]] void func2(); +[[riscv::vector_cc(1)]] void func_invalid2(); // expected-error {{'vector_cc' attribute takes no arguments}} + +void test_no_attribute2(int); // expected-note {{previous declaration is here}} +[[riscv::vector_cc]] void test_no_attribute2(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}} diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp 
b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp new file mode 100644 index 0000000..264bb7d --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -triple riscv64 -target-feature +v -verify + +__attribute__((riscv_vector_cc)) int var; // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'int'}} + +__attribute__((riscv_vector_cc)) void func(); +__attribute__((riscv_vector_cc(1))) void func_invalid(); // expected-error {{'riscv_vector_cc' attribute takes no arguments}} + +void test_no_attribute(int); // expected-note {{previous declaration is here}} +void __attribute__((riscv_vector_cc)) test_no_attribute(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}} + +class test_cc { + __attribute__((riscv_vector_cc)) void member_func(); +}; + +void test_lambda() { + __attribute__((riscv_vector_cc)) auto lambda = []() { // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'auto'}} + }; +} + +[[riscv::vector_cc]] int var2; // expected-warning {{'vector_cc' only applies to function types; type here is 'int'}} + +[[riscv::vector_cc]] void func2(); +[[riscv::vector_cc(1)]] void func_invalid2(); // expected-error {{'vector_cc' attribute takes no arguments}} + +void test_no_attribute2(int); // expected-note {{previous declaration is here}} +[[riscv::vector_cc]] void test_no_attribute2(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}} + +class test_cc2 { + [[riscv::vector_cc]] void member_func(); +}; + +void test_lambda2() { + [[riscv::vector_cc]] auto lambda = []() { // expected-warning {{'vector_cc' only applies to function types; type here is 'auto'}} + }; +} diff --git a/clang/test/CodeGen/RISCV/riscv32-vararg.c b/clang/test/CodeGen/RISCV/riscv32-vararg.c index 1c4e41f2..00e04eb 100644 --- 
a/clang/test/CodeGen/RISCV/riscv32-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv32-vararg.c @@ -80,13 +80,13 @@ void f_va_caller(void) { // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 // CHECK-NEXT: store i32 [[TMP0]], ptr [[V]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -111,7 +111,7 @@ int f_va_1(char *fmt, ...) { // CHECK-ILP32F-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-ILP32F-NEXT: [[V:%.*]] = alloca double, align 8 // CHECK-ILP32F-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32F-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 // CHECK-ILP32F-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) @@ -119,7 +119,7 @@ int f_va_1(char *fmt, ...) 
{ // CHECK-ILP32F-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], align 8 // CHECK-ILP32F-NEXT: store double [[TMP1]], ptr [[V]], align 8 -// CHECK-ILP32F-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[TMP2:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32F-NEXT: ret double [[TMP2]] // @@ -130,7 +130,7 @@ int f_va_1(char *fmt, ...) { // CHECK-ILP32D-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-ILP32D-NEXT: [[V:%.*]] = alloca double, align 8 // CHECK-ILP32D-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32D-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 // CHECK-ILP32D-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) @@ -138,7 +138,7 @@ int f_va_1(char *fmt, ...) { // CHECK-ILP32D-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], align 8 // CHECK-ILP32D-NEXT: store double [[TMP1]], ptr [[V]], align 8 -// CHECK-ILP32D-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[TMP2:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32D-NEXT: ret double [[TMP2]] // @@ -149,13 +149,13 @@ int f_va_1(char *fmt, ...) 
{ // CHECK-ILP32E-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-ILP32E-NEXT: [[V:%.*]] = alloca double, align 8 // CHECK-ILP32E-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32E-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8 // CHECK-ILP32E-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[TMP0:%.*]] = load double, ptr [[ARGP_CUR]], align 4 // CHECK-ILP32E-NEXT: store double [[TMP0]], ptr [[V]], align 8 -// CHECK-ILP32E-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[TMP1:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32E-NEXT: ret double [[TMP1]] // @@ -180,7 +180,7 @@ double f_va_2(char *fmt, ...) { // CHECK-ILP32F-NEXT: [[W:%.*]] = alloca i32, align 4 // CHECK-ILP32F-NEXT: [[X:%.*]] = alloca double, align 8 // CHECK-ILP32F-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32F-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 // CHECK-ILP32F-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) @@ -200,7 +200,7 @@ double f_va_2(char *fmt, ...) 
{ // CHECK-ILP32F-NEXT: store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[TMP4:%.*]] = load double, ptr [[ARGP_CUR3_ALIGNED]], align 8 // CHECK-ILP32F-NEXT: store double [[TMP4]], ptr [[X]], align 8 -// CHECK-ILP32F-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[TMP5:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32F-NEXT: [[TMP6:%.*]] = load double, ptr [[X]], align 8 // CHECK-ILP32F-NEXT: [[ADD:%.*]] = fadd double [[TMP5]], [[TMP6]] @@ -215,7 +215,7 @@ double f_va_2(char *fmt, ...) { // CHECK-ILP32D-NEXT: [[W:%.*]] = alloca i32, align 4 // CHECK-ILP32D-NEXT: [[X:%.*]] = alloca double, align 8 // CHECK-ILP32D-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32D-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 // CHECK-ILP32D-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) @@ -235,7 +235,7 @@ double f_va_2(char *fmt, ...) { // CHECK-ILP32D-NEXT: store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[TMP4:%.*]] = load double, ptr [[ARGP_CUR3_ALIGNED]], align 8 // CHECK-ILP32D-NEXT: store double [[TMP4]], ptr [[X]], align 8 -// CHECK-ILP32D-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[TMP5:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32D-NEXT: [[TMP6:%.*]] = load double, ptr [[X]], align 8 // CHECK-ILP32D-NEXT: [[ADD:%.*]] = fadd double [[TMP5]], [[TMP6]] @@ -250,7 +250,7 @@ double f_va_2(char *fmt, ...) 
{ // CHECK-ILP32E-NEXT: [[W:%.*]] = alloca i32, align 4 // CHECK-ILP32E-NEXT: [[X:%.*]] = alloca double, align 8 // CHECK-ILP32E-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32E-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8 // CHECK-ILP32E-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -266,7 +266,7 @@ double f_va_2(char *fmt, ...) { // CHECK-ILP32E-NEXT: store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[TMP2:%.*]] = load double, ptr [[ARGP_CUR3]], align 4 // CHECK-ILP32E-NEXT: store double [[TMP2]], ptr [[X]], align 8 -// CHECK-ILP32E-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[TMP3:%.*]] = load double, ptr [[V]], align 8 // CHECK-ILP32E-NEXT: [[TMP4:%.*]] = load double, ptr [[X]], align 8 // CHECK-ILP32E-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], [[TMP4]] @@ -296,7 +296,7 @@ double f_va_3(char *fmt, ...) { // CHECK-ILP32F-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4 // CHECK-ILP32F-NEXT: [[RET:%.*]] = alloca i32, align 4 // CHECK-ILP32F-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32F-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-ILP32F-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -321,7 +321,7 @@ double f_va_3(char *fmt, ...) 
{ // CHECK-ILP32F-NEXT: store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4 // CHECK-ILP32F-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4 // CHECK-ILP32F-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false) -// CHECK-ILP32F-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32F-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32F-NEXT: [[TMP4:%.*]] = load i32, ptr [[V]], align 4 // CHECK-ILP32F-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128 // CHECK-ILP32F-NEXT: [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16 @@ -384,7 +384,7 @@ double f_va_3(char *fmt, ...) { // CHECK-ILP32D-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4 // CHECK-ILP32D-NEXT: [[RET:%.*]] = alloca i32, align 4 // CHECK-ILP32D-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32D-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-ILP32D-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -409,7 +409,7 @@ double f_va_3(char *fmt, ...) { // CHECK-ILP32D-NEXT: store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4 // CHECK-ILP32D-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4 // CHECK-ILP32D-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false) -// CHECK-ILP32D-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32D-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32D-NEXT: [[TMP4:%.*]] = load i32, ptr [[V]], align 4 // CHECK-ILP32D-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128 // CHECK-ILP32D-NEXT: [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16 @@ -472,7 +472,7 @@ double f_va_3(char *fmt, ...) 
{ // CHECK-ILP32E-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4 // CHECK-ILP32E-NEXT: [[RET:%.*]] = alloca i32, align 4 // CHECK-ILP32E-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-ILP32E-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-ILP32E-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -497,7 +497,7 @@ double f_va_3(char *fmt, ...) { // CHECK-ILP32E-NEXT: store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4 // CHECK-ILP32E-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4 // CHECK-ILP32E-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false) -// CHECK-ILP32E-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-ILP32E-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-ILP32E-NEXT: [[TMP4:%.*]] = load i32, ptr [[V]], align 4 // CHECK-ILP32E-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128 // CHECK-ILP32E-NEXT: [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16 diff --git a/clang/test/CodeGen/RISCV/riscv64-vararg.c b/clang/test/CodeGen/RISCV/riscv64-vararg.c index 634cde6..efdffa2 100644 --- a/clang/test/CodeGen/RISCV/riscv64-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv64-vararg.c @@ -135,13 +135,13 @@ void f_va_caller(void) { // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 8 // 
CHECK-NEXT: store i32 [[TMP0]], ptr [[V]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -166,7 +166,7 @@ int f_va_1(char *fmt, ...) { // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V:%.*]] = alloca fp128, align 16 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15 // CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16) @@ -174,7 +174,7 @@ int f_va_1(char *fmt, ...) { // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load fp128, ptr [[ARGP_CUR_ALIGNED]], align 16 // CHECK-NEXT: store fp128 [[TMP1]], ptr [[V]], align 16 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP2:%.*]] = load fp128, ptr [[V]], align 16 // CHECK-NEXT: ret fp128 [[TMP2]] // @@ -199,7 +199,7 @@ long double f_va_2(char *fmt, ...) { // CHECK-NEXT: [[W:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[X:%.*]] = alloca fp128, align 16 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15 // CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16) @@ -219,7 +219,7 @@ long double f_va_2(char *fmt, ...) 
{ // CHECK-NEXT: store ptr [[ARGP_NEXT4]], ptr [[VA]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load fp128, ptr [[ARGP_CUR3_ALIGNED]], align 16 // CHECK-NEXT: store fp128 [[TMP4]], ptr [[X]], align 16 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP5:%.*]] = load fp128, ptr [[V]], align 16 // CHECK-NEXT: [[TMP6:%.*]] = load fp128, ptr [[X]], align 16 // CHECK-NEXT: [[ADD:%.*]] = fadd fp128 [[TMP5]], [[TMP6]] @@ -248,7 +248,7 @@ long double f_va_3(char *fmt, ...) { // CHECK-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 8 // CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 8 @@ -267,7 +267,7 @@ long double f_va_3(char *fmt, ...) 
{ // CHECK-NEXT: store ptr [[ARGP_NEXT6]], ptr [[VA]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGP_CUR5]], align 8 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[LS]], ptr align 8 [[TMP1]], i64 32, i1 false) -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TINY]], ptr [[TS]], i32 0, i32 0 // CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 // CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i64 diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c index ecf090a..bad6850 100644 --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple riscv32 -target-feature +zbb -verify %s -o - +// RUN: %clang_cc1 -triple riscv32 -target-feature +zbb -S -verify %s -o - unsigned int orc_b_64(unsigned int a) { - return __builtin_riscv_orc_b_64(a); // expected-error {{builtin requires: 'RV64'}} + return __builtin_riscv_orc_b_64(a); // expected-error {{'__builtin_riscv_orc_b_64' needs target feature zbb,64bit}} } diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c index d2e3e76..a256bf7 100644 --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c @@ -1,14 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple riscv64 -target-feature +zbkb -verify %s -o - +// RUN: %clang_cc1 -triple riscv64 -target-feature +zbkb -S -verify %s -o - #include <stdint.h> -uint32_t zip(uint32_t rs1) +uint32_t zip_unzip(uint32_t rs1) { - return 
__builtin_riscv_zip_32(rs1); // expected-error {{builtin requires: 'RV32'}} -} - -uint32_t unzip(uint32_t rs1) -{ - return __builtin_riscv_unzip_32(rs1); // expected-error {{builtin requires: 'RV32'}} + (void)__builtin_riscv_zip_32(rs1); // expected-error {{'__builtin_riscv_zip_32' needs target feature zbkb,32bit}} + return __builtin_riscv_unzip_32(rs1); // expected-error {{'__builtin_riscv_unzip_32' needs target feature zbkb,32bit}} } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c index 6ec9b05..ecb6c5f 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c @@ -11,7 +11,7 @@ // CHECK-RV64V-NEXT: ret i32 [[CONV]] // -// CHECK-RV64-ERR: error: builtin requires at least one of the following extensions: 'Zve32x' +// CHECK-RV64-ERR: error: '__builtin_rvv_vsetvli' needs target feature zve32x int test() { return __builtin_rvv_vsetvli(1, 0, 0); diff --git a/clang/test/CodeGen/WebAssembly/wasm-varargs.c b/clang/test/CodeGen/WebAssembly/wasm-varargs.c index c475de1..e794857 100644 --- a/clang/test/CodeGen/WebAssembly/wasm-varargs.c +++ b/clang/test/CodeGen/WebAssembly/wasm-varargs.c @@ -10,13 +10,13 @@ // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 // CHECK-NEXT: store i32 [[TMP0]], ptr [[V]], align 4 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) 
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -38,7 +38,7 @@ int test_i32(char *fmt, ...) { // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[V:%.*]] = alloca i64, align 8 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 // CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) @@ -46,7 +46,7 @@ int test_i32(char *fmt, ...) { // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARGP_CUR_ALIGNED]], align 8 // CHECK-NEXT: store i64 [[TMP1]], ptr [[V]], align 8 -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[V]], align 8 // CHECK-NEXT: ret i64 [[TMP2]] // @@ -73,13 +73,13 @@ struct S { // CHECK-NEXT: [[FMT_ADDR:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 [[TMP0]], i32 12, i1 false) -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: ret void // struct S test_struct(char *fmt, ...) 
{ @@ -102,7 +102,7 @@ struct Z {}; // CHECK-NEXT: [[VA:%.*]] = alloca ptr, align 4 // CHECK-NEXT: [[U:%.*]] = alloca [[STRUCT_Z:%.*]], align 1 // CHECK-NEXT: store ptr [[FMT]], ptr [[FMT_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) // CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4 // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 0 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VA]], align 4 @@ -112,7 +112,7 @@ struct Z {}; // CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[VA]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR1]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 [[TMP0]], i32 12, i1 false) -// CHECK-NEXT: call void @llvm.va_end(ptr [[VA]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VA]]) // CHECK-NEXT: ret void // struct S test_empty_struct(char *fmt, ...) { diff --git a/clang/test/CodeGen/X86/va-arg-sse.c b/clang/test/CodeGen/X86/va-arg-sse.c index e040b0e..b7d00da 100644 --- a/clang/test/CodeGen/X86/va-arg-sse.c +++ b/clang/test/CodeGen/X86/va-arg-sse.c @@ -21,7 +21,7 @@ struct S a[5]; // CHECK-NEXT: store i32 0, ptr [[J]], align 4 // CHECK-NEXT: store i32 0, ptr [[K]], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: store ptr getelementptr inbounds ([5 x %struct.S], ptr @a, i64 0, i64 2), ptr [[P]], align 8 // CHECK-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0 // CHECK-NEXT: [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY2]], i32 0, i32 1 @@ -52,7 +52,7 @@ struct S a[5]; // CHECK-NEXT: [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ 
[[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARG]], ptr align 4 [[VAARG_ADDR]], i64 12, i1 false) // CHECK-NEXT: [[ARRAYDECAY3:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_end(ptr [[ARRAYDECAY3]]) +// CHECK-NEXT: call void @llvm.va_end.p0(ptr [[ARRAYDECAY3]]) // CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[P]], align 8 // CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[TMP15]], null // CHECK-NEXT: br i1 [[TOBOOL]], label [[LAND_LHS_TRUE:%.*]], label [[IF_END:%.*]] diff --git a/clang/test/CodeGen/X86/x86_64-vaarg.c b/clang/test/CodeGen/X86/x86_64-vaarg.c index a18ba83..07c6df1 100644 --- a/clang/test/CodeGen/X86/x86_64-vaarg.c +++ b/clang/test/CodeGen/X86/x86_64-vaarg.c @@ -13,7 +13,7 @@ typedef struct { struct {} a; } empty; // CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_EMPTY]], align 1 // CHECK-NEXT: store i32 [[Z]], ptr [[Z_ADDR]], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL]], ptr align 1 [[TMP]], i64 0, i1 false) // CHECK-NEXT: ret void @@ -37,7 +37,7 @@ typedef struct { // CHECK-NEXT: [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16 // CHECK-NEXT: store i32 [[Z]], ptr [[Z_ADDR]], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 // 
CHECK-NEXT: [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 1 // CHECK-NEXT: [[FP_OFFSET:%.*]] = load i32, ptr [[FP_OFFSET_P]], align 4 diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed.c b/clang/test/CodeGen/aarch64-ABI-align-packed.c index 2b029f6..13c68fe 100644 --- a/clang/test/CodeGen/aarch64-ABI-align-packed.c +++ b/clang/test/CodeGen/aarch64-ABI-align-packed.c @@ -73,7 +73,7 @@ __attribute__((noinline)) void named_arg_non_packed_struct(double d0, double d1, // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6:[0-9]+]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_non_packed_struct(double d0, double d1, double d2, double d3, @@ -128,7 +128,7 @@ __attribute__((noinline)) void named_arg_packed_struct(double d0, double d1, dou // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_packed_struct(double d0, double d1, double d2, double d3, @@ -183,7 +183,7 @@ __attribute__((noinline)) void named_arg_packed_member(double d0, double d1, dou // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void 
@llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_packed_member(double d0, double d1, double d2, double d3, @@ -238,7 +238,7 @@ __attribute__((noinline)) void named_arg_aligned_struct_8(double d0, double d1, // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_aligned_struct_8(double d0, double d1, double d2, double d3, @@ -293,7 +293,7 @@ __attribute__((noinline)) void named_arg_aligned_member_8(double d0, double d1, // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_aligned_member_8(double d0, double d1, double d2, double d3, @@ -348,7 +348,7 @@ __attribute__((noinline)) void named_arg_pragma_packed_struct_8(double d0, doubl // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_pragma_packed_struct_8(double d0, double d1, double d2, double d3, @@ 
-403,7 +403,7 @@ __attribute__((noinline)) void named_arg_pragma_packed_struct_4(double d0, doubl // CHECK-NEXT: entry: // CHECK-NEXT: [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[VL]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[VL]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]] // CHECK-NEXT: ret void void variadic_pragma_packed_struct_4(double d0, double d1, double d2, double d3, diff --git a/clang/test/CodeGen/aarch64-varargs.c b/clang/test/CodeGen/aarch64-varargs.c index 44b8702..ee4e88e 100644 --- a/clang/test/CodeGen/aarch64-varargs.c +++ b/clang/test/CodeGen/aarch64-varargs.c @@ -837,7 +837,7 @@ void check_start(int n, ...) { va_list the_list; va_start(the_list, n); // CHECK: [[THE_LIST:%[a-z_0-9]+]] = alloca %struct.__va_list -// CHECK: call void @llvm.va_start(ptr [[THE_LIST]]) +// CHECK: call void @llvm.va_start.p0(ptr [[THE_LIST]]) } typedef struct {} empty; diff --git a/clang/test/CodeGen/arm-varargs.c b/clang/test/CodeGen/arm-varargs.c index f754c7f..ab4ac46 100644 --- a/clang/test/CodeGen/arm-varargs.c +++ b/clang/test/CodeGen/arm-varargs.c @@ -264,5 +264,5 @@ void check_start(int n, ...) 
{ va_list the_list; va_start(the_list, n); // CHECK: [[THE_LIST:%[a-z0-9._]+]] = alloca %struct.__va_list -// CHECK: call void @llvm.va_start(ptr [[THE_LIST]]) +// CHECK: call void @llvm.va_start.p0(ptr [[THE_LIST]]) } diff --git a/clang/test/CodeGen/hexagon-linux-vararg.c b/clang/test/CodeGen/hexagon-linux-vararg.c index 033e72a..84945e8 100644 --- a/clang/test/CodeGen/hexagon-linux-vararg.c +++ b/clang/test/CodeGen/hexagon-linux-vararg.c @@ -9,7 +9,7 @@ struct AAA { int d; }; -// CHECK: call void @llvm.va_start(ptr %arraydecay) +// CHECK: call void @llvm.va_start.p0(ptr %arraydecay) // CHECK: %arraydecay1 = getelementptr inbounds [1 x %struct.__va_list_tag], // ptr %ap, i32 0, i32 0 // CHECK: br label %vaarg.maybe_reg diff --git a/clang/test/CodeGen/mips-varargs.c b/clang/test/CodeGen/mips-varargs.c index 052aedd..029f000 100644 --- a/clang/test/CodeGen/mips-varargs.c +++ b/clang/test/CodeGen/mips-varargs.c @@ -29,7 +29,7 @@ int test_i32(char *fmt, ...) { // ALL: [[V:%.*]] = alloca i32, align 4 // NEW: [[PROMOTION_TEMP:%.*]] = alloca i32, align 4 // -// ALL: call void @llvm.va_start(ptr %va) +// ALL: call void @llvm.va_start.p0(ptr %va) // ALL: [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]] // O32: [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T:i32]] [[$CHUNKSIZE:4]] // NEW: [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T:i32|i64]] [[$CHUNKSIZE:8]] @@ -45,7 +45,7 @@ int test_i32(char *fmt, ...) { // NEW: [[ARG:%.+]] = load i32, ptr [[PROMOTION_TEMP]], align 4 // ALL: store i32 [[ARG]], ptr [[V]], align 4 // -// ALL: call void @llvm.va_end(ptr %va) +// ALL: call void @llvm.va_end.p0(ptr %va) // ALL: } long long test_i64(char *fmt, ...) { @@ -61,7 +61,7 @@ long long test_i64(char *fmt, ...) { // ALL-LABEL: define{{.*}} i64 @test_i64(ptr{{.*}} %fmt, ...) 
// // ALL: %va = alloca ptr, align [[$PTRALIGN]] -// ALL: call void @llvm.va_start(ptr %va) +// ALL: call void @llvm.va_start.p0(ptr %va) // ALL: [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]] // // i64 is 8-byte aligned, while this is within O32's stack alignment there's no @@ -74,7 +74,7 @@ long long test_i64(char *fmt, ...) { // // ALL: [[ARG:%.+]] = load i64, ptr [[AP_CUR]], align 8 // -// ALL: call void @llvm.va_end(ptr %va) +// ALL: call void @llvm.va_end.p0(ptr %va) // ALL: } char *test_ptr(char *fmt, ...) { @@ -92,7 +92,7 @@ char *test_ptr(char *fmt, ...) { // ALL: %va = alloca ptr, align [[$PTRALIGN]] // ALL: [[V:%.*]] = alloca ptr, align [[$PTRALIGN]] // N32: [[AP_CAST:%.+]] = alloca ptr, align 4 -// ALL: call void @llvm.va_start(ptr %va) +// ALL: call void @llvm.va_start.p0(ptr %va) // ALL: [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]] // ALL: [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T]] [[$CHUNKSIZE]] // ALL: store ptr [[AP_NEXT]], ptr %va, align [[$PTRALIGN]] @@ -109,7 +109,7 @@ char *test_ptr(char *fmt, ...) { // N64: [[ARG:%.+]] = load ptr, ptr [[AP_CUR]], align [[$PTRALIGN]] // ALL: store ptr [[ARG]], ptr [[V]], align [[$PTRALIGN]] // -// ALL: call void @llvm.va_end(ptr %va) +// ALL: call void @llvm.va_end.p0(ptr %va) // ALL: } int test_v4i32(char *fmt, ...) { @@ -128,7 +128,7 @@ int test_v4i32(char *fmt, ...) { // // ALL: %va = alloca ptr, align [[$PTRALIGN]] // ALL: [[V:%.+]] = alloca <4 x i32>, align 16 -// ALL: call void @llvm.va_start(ptr %va) +// ALL: call void @llvm.va_start.p0(ptr %va) // ALL: [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]] // // Vectors are 16-byte aligned, however the O32 ABI has a maximum alignment of @@ -152,7 +152,7 @@ int test_v4i32(char *fmt, ...) 
{ // N32: [[ARG:%.+]] = load <4 x i32>, ptr [[AP_CUR]], align 16 // ALL: store <4 x i32> [[ARG]], ptr [[V]], align 16 // -// ALL: call void @llvm.va_end(ptr %va) +// ALL: call void @llvm.va_end.p0(ptr %va) // ALL: [[VECEXT:%.+]] = extractelement <4 x i32> {{.*}}, i32 0 // ALL: ret i32 [[VECEXT]] // ALL: } diff --git a/clang/test/CodeGen/pr53127.cpp b/clang/test/CodeGen/pr53127.cpp index 97fe129..5a52b48 100644 --- a/clang/test/CodeGen/pr53127.cpp +++ b/clang/test/CodeGen/pr53127.cpp @@ -34,7 +34,7 @@ void operator delete(void*); // CHECK-NEXT: br i1 [[CALL6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] // CHECK: cond.true7: // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: br label [[COND_END9:%.*]] // CHECK: cond.false8: // CHECK-NEXT: br label [[COND_END9]] @@ -44,7 +44,7 @@ void operator delete(void*); // CHECK: cond.true11: // CHECK-NEXT: [[ARRAYDECAY12:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L]], i64 0, i64 0 // CHECK-NEXT: [[ARRAYDECAY13:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L2]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_copy(ptr [[ARRAYDECAY12]], ptr [[ARRAYDECAY13]]) +// CHECK-NEXT: call void @llvm.va_copy.p0(ptr [[ARRAYDECAY12]], ptr [[ARRAYDECAY13]]) // CHECK-NEXT: br label [[COND_END15:%.*]] // CHECK: cond.false14: // CHECK-NEXT: br label [[COND_END15]] diff --git a/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c b/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c new file mode 100644 index 0000000..b087da34 --- /dev/null +++ b/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c @@ -0,0 +1,46 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple spirv64-unknown-unknown 
-fcuda-is-device -emit-llvm -o - %s | FileCheck %s + +struct x { + double b; + long a; +}; + +// CHECK-LABEL: define spir_func void @testva( +// CHECK-SAME: i32 noundef [[N:%.*]], ...) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AP:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_X:%.*]], align 8 +// CHECK-NEXT: [[AP2:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[VARET:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr [[N_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[AP_ASCAST:%.*]] = addrspacecast ptr [[AP]] to ptr addrspace(4) +// CHECK-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4) +// CHECK-NEXT: [[AP2_ASCAST:%.*]] = addrspacecast ptr [[AP2]] to ptr addrspace(4) +// CHECK-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr [[V]] to ptr addrspace(4) +// CHECK-NEXT: [[VARET_ASCAST:%.*]] = addrspacecast ptr [[VARET]] to ptr addrspace(4) +// CHECK-NEXT: store i32 [[N]], ptr addrspace(4) [[N_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.va_start.p4(ptr addrspace(4) [[AP_ASCAST]]) +// CHECK-NEXT: [[TMP0:%.*]] = va_arg ptr addrspace(4) [[AP_ASCAST]], ptr +// CHECK-NEXT: call void @llvm.memcpy.p4.p0.i64(ptr addrspace(4) align 8 [[T_ASCAST]], ptr align 8 [[TMP0]], i64 16, i1 false) +// CHECK-NEXT: call void @llvm.va_copy.p4(ptr addrspace(4) [[AP2_ASCAST]], ptr addrspace(4) [[AP_ASCAST]]) +// CHECK-NEXT: [[TMP1:%.*]] = va_arg ptr addrspace(4) [[AP2_ASCAST]], i32 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[VARET_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VARET_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[V_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.va_end.p4(ptr addrspace(4) [[AP2_ASCAST]]) +// CHECK-NEXT: call void @llvm.va_end.p4(ptr addrspace(4) [[AP_ASCAST]]) +// 
CHECK-NEXT: ret void + +void testva(int n, ...) { + __builtin_va_list ap; + __builtin_va_start(ap, n); + struct x t = __builtin_va_arg(ap, struct x); + __builtin_va_list ap2; + __builtin_va_copy(ap2, ap); + int v = __builtin_va_arg(ap2, int); + __builtin_va_end(ap2); + __builtin_va_end(ap); +} diff --git a/clang/test/CodeGen/xcore-abi.c b/clang/test/CodeGen/xcore-abi.c index 4dd0f22..bb8d2fe 100644 --- a/clang/test/CodeGen/xcore-abi.c +++ b/clang/test/CodeGen/xcore-abi.c @@ -28,7 +28,7 @@ void testva (int n, ...) { // CHECK: [[AP:%[a-z0-9]+]] = alloca ptr, align 4 // CHECK: [[V5:%[a-z0-9]+]] = alloca %struct.x, align 4 // CHECK: [[TMP:%[a-z0-9]+]] = alloca [4 x i32], align 4 - // CHECK: call void @llvm.va_start(ptr [[AP]]) + // CHECK: call void @llvm.va_start.p0(ptr [[AP]]) char* v1 = va_arg (ap, char*); f(v1); diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp index 5a4270a..a1d17c8 100644 --- a/clang/test/CodeGenCXX/ext-int.cpp +++ b/clang/test/CodeGenCXX/ext-int.cpp @@ -159,9 +159,9 @@ void TakesVarargs(int i, ...) { // WIN: %[[ARGS:.+]] = alloca ptr __builtin_va_start(args, i); // LIN64: %[[STARTAD:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]] - // LIN64: call void @llvm.va_start(ptr %[[STARTAD]]) - // LIN32: call void @llvm.va_start(ptr %[[ARGS]]) - // WIN: call void @llvm.va_start(ptr %[[ARGS]]) + // LIN64: call void @llvm.va_start.p0(ptr %[[STARTAD]]) + // LIN32: call void @llvm.va_start.p0(ptr %[[ARGS]]) + // WIN: call void @llvm.va_start.p0(ptr %[[ARGS]]) _BitInt(92) A = __builtin_va_arg(args, _BitInt(92)); // LIN64: %[[AD1:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]] @@ -302,9 +302,9 @@ void TakesVarargs(int i, ...) 
{ __builtin_va_end(args); // LIN64: %[[ENDAD:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]] - // LIN64: call void @llvm.va_end(ptr %[[ENDAD]]) - // LIN32: call void @llvm.va_end(ptr %[[ARGS]]) - // WIN: call void @llvm.va_end(ptr %[[ARGS]]) + // LIN64: call void @llvm.va_end.p0(ptr %[[ENDAD]]) + // LIN32: call void @llvm.va_end.p0(ptr %[[ARGS]]) + // WIN: call void @llvm.va_end.p0(ptr %[[ARGS]]) } void typeid_tests() { // LIN: define{{.*}} void @_Z12typeid_testsv() diff --git a/clang/test/CodeGenCXX/ibm128-declarations.cpp b/clang/test/CodeGenCXX/ibm128-declarations.cpp index 5ee4f35..e0187e2 100644 --- a/clang/test/CodeGenCXX/ibm128-declarations.cpp +++ b/clang/test/CodeGenCXX/ibm128-declarations.cpp @@ -107,13 +107,13 @@ int main(void) { // CHECK: define dso_local noundef ppc_fp128 @_Z10func_vaargiz(i32 noundef signext %n, ...) // CHECK: entry: // CHECK: store i32 %n, ptr %n.addr, align 4 -// CHECK: call void @llvm.va_start(ptr %ap) +// CHECK: call void @llvm.va_start.p0(ptr %ap) // CHECK: %argp.cur = load ptr, ptr %ap, align 8 // CHECK: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 16 // CHECK: store ptr %argp.next, ptr %ap, align 8 // CHECK: %0 = load ppc_fp128, ptr %argp.cur, align 8 // CHECK: store ppc_fp128 %0, ptr %r, align 16 -// CHECK: call void @llvm.va_end(ptr %ap) +// CHECK: call void @llvm.va_end.p0(ptr %ap) // CHECK: %1 = load ppc_fp128, ptr %r, align 16 // CHECK: ret ppc_fp128 %1 // CHECK: } diff --git a/clang/test/CodeGenCXX/x86_64-vaarg.cpp b/clang/test/CodeGenCXX/x86_64-vaarg.cpp index d221c18..985a0cc 100644 --- a/clang/test/CodeGenCXX/x86_64-vaarg.cpp +++ b/clang/test/CodeGenCXX/x86_64-vaarg.cpp @@ -11,7 +11,7 @@ typedef struct { struct {} a; } empty; // CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_EMPTY]], align 1 // CHECK-NEXT: store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 -// CHECK-NEXT: call 
void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL]], ptr align 1 [[TMP]], i64 1, i1 false) // CHECK-NEXT: ret void @@ -34,7 +34,7 @@ typedef struct { // CHECK-NEXT: [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16 // CHECK-NEXT: store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 -// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]]) +// CHECK-NEXT: call void @llvm.va_start.p0(ptr [[ARRAYDECAY]]) // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0 // CHECK-NEXT: [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 1 // CHECK-NEXT: [[FP_OFFSET:%.*]] = load i32, ptr [[FP_OFFSET_P]], align 4 diff --git a/clang/test/Driver/aarch64-sve.c b/clang/test/Driver/aarch64-sve.c index f34b270..4a33c2e 100644 --- a/clang/test/Driver/aarch64-sve.c +++ b/clang/test/Driver/aarch64-sve.c @@ -6,12 +6,11 @@ // RUN: %clang --target=aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-NOSVE %s // GENERICV8A-NOSVE-NOT: "-target-feature" "+sve" -// The 32-bit floating point matrix multiply extension is enabled by default -// for armv8.6-a targets (or later) with SVE, and can optionally be enabled for -// any target from armv8.2a onwards (we don't enforce not using it with earlier -// targets). +// The 32-bit floating point matrix multiply extension is an optional feature +// that can be used for any target from armv8.2a and onwards. This can be +// enabled using the `+f32mm` option.`. 
// RUN: %clang --target=aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=NO-F32MM %s -// RUN: %clang --target=aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s +// RUN: %clang --target=aarch64 -march=armv8.6a+sve+f32mm -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s // RUN: %clang --target=aarch64 -march=armv8.5a+f32mm -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s // NO-F32MM-NOT: "-target-feature" "+f32mm" // F32MM: "-target-feature" "+f32mm" diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h new file mode 100644 index 0000000..993d5d4 --- /dev/null +++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h @@ -0,0 +1,3 @@ +#ifndef PUBLIC_UMBRELLA_HEADER_FIRST +#error "Public umbrella header was not included first!" +#endif diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h new file mode 100644 index 0000000..2599ff1 --- /dev/null +++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h @@ -0,0 +1 @@ +#define PUBLIC_UMBRELLA_HEADER_FIRST diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h new file mode 100644 index 0000000..557209b --- /dev/null +++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h @@ -0,0 +1,3 @@ +#ifndef PRIVATE_UMBRELLA_HEADER_FIRST +#error "Private umbrella header was not included first!" 
+#endif diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h new file mode 100644 index 0000000..fd5b49b --- /dev/null +++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h @@ -0,0 +1 @@ +#define PRIVATE_UMBRELLA_HEADER_FIRST diff --git a/clang/test/InstallAPI/umbrella-headers-unix.test b/clang/test/InstallAPI/umbrella-headers-unix.test new file mode 100644 index 0000000..4611877 --- /dev/null +++ b/clang/test/InstallAPI/umbrella-headers-unix.test @@ -0,0 +1,40 @@ +// UNSUPPORTED: system-windows + +; RUN: rm -rf %t +; RUN: split-file %s %t +; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json +; RUN: mkdir %t/Frameworks/ +; RUN: cp -r %S/Inputs/Umbrella/Umbrella.framework %t/Frameworks/ + +// Only validate path based input that rely on regex matching on unix based file systems. 
+; RUN: clang-installapi --target=arm64-apple-macosx13 \ +; RUN: -install_name /System/Library/Frameworks/Umbrella2.framework/Versions/A/Umbrella \ +; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \ +; RUN: --public-umbrella-header=%t/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h \ +; RUN: -private-umbrella-header \ +; RUN: %t/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h \ +; RUN: -o %t/output.tbd 2>&1 | FileCheck -allow-empty %s + +; CHECK-NOT: error +; CHECK-NOT: warning + +;--- inputs.json.in +{ + "headers": [ { + "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/AAA.h", + "type" : "public" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h", + "type" : "public" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/AAA_Private.h", + "type" : "private" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h", + "type" : "private" + }], + "version": "3" +} diff --git a/clang/test/InstallAPI/umbrella-headers.test b/clang/test/InstallAPI/umbrella-headers.test new file mode 100644 index 0000000..ce9c506 --- /dev/null +++ b/clang/test/InstallAPI/umbrella-headers.test @@ -0,0 +1,48 @@ +; RUN: rm -rf %t +; RUN: split-file %s %t +; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json +; RUN: cp -r %S/Inputs/Umbrella/Umbrella.framework %t/Frameworks/ + +// Check base filename matches. +; RUN: clang-installapi --target=arm64-apple-macosx13 \ +; RUN: -install_name /System/Library/Frameworks/Umbrella.framework/Versions/A/Umbrella \ +; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \ +; RUN: --public-umbrella-header=SpecialUmbrella.h \ +; RUN: --private-umbrella-header=SpecialPrivateUmbrella.h \ +; RUN: -o %t/output.tbd 2>&1 | FileCheck -allow-empty %s + +// Try missing umbrella header argument. 
+; RUN: not clang-installapi --target=arm64-apple-macosx13 \ +; RUN: -install_name /System/Library/Frameworks/Umbrella.framework/Versions/A/Umbrella \ +; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \ +; RUN: --public-umbrella-header=Ignore.h \ +; RUN: -o %t/output.tbd 2>&1 | FileCheck %s -check-prefix=ERR + +; ERR: error: public umbrella header file not found in input: 'Ignore.h' + +; CHECK-NOT: error +; CHECK-NOT: warning + +;--- Frameworks/Umbrella.framework/Headers/Ignore.h +#error "This header should be ignored" + +;--- inputs.json.in +{ + "headers": [ { + "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/AAA.h", + "type" : "public" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h", + "type" : "public" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/AAA_Private.h", + "type" : "private" + }, + { + "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h", + "type" : "private" + }], + "version": "3" +} diff --git a/clang/test/Modules/codegen.test b/clang/test/Modules/codegen.test index 7760205..0af630a 100644 --- a/clang/test/Modules/codegen.test +++ b/clang/test/Modules/codegen.test @@ -26,7 +26,7 @@ USE: $_Z4instIiEvv = comdat any USE: $_Z10always_inlv = comdat any FOO: $_ZN13implicit_dtorD2Ev = comdat any FOO: define weak_odr void @_Z2f1PKcz(ptr noundef %fmt, ...) #{{[0-9]+}} comdat -FOO: call void @llvm.va_start(ptr %{{[a-zA-Z0-9]*}}) +FOO: call void @llvm.va_start.p0(ptr %{{[a-zA-Z0-9]*}}) Test that implicit special members are emitted into the FOO module if they're ODR used there, otherwise emit them linkonce_odr as usual in the use. 
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 9f8a8bd..85762b7 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -196,7 +196,7 @@ // CHECK-8_6-NOT: __ARM_FEATURE_SHA3 1 // CHECK-8_6-NOT: __ARM_FEATURE_SM4 1 -// RUN: %clang -target aarch64-none-linux-gnu -march=armv8.6-a+sve -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE-8_6 %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8.6-a+sve+f32mm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE-8_6 %s // CHECK-SVE-8_6: __ARM_FEATURE_SVE 1 // CHECK-SVE-8_6: __ARM_FEATURE_SVE_BF16 1 // CHECK-SVE-8_6: __ARM_FEATURE_SVE_MATMUL_FP32 1 diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index b7ea0d0..787cc80 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++20 -verify %s +// RUN: %clang_cc1 -std=c++20 -ferror-limit 0 -verify %s namespace PR47043 { template<typename T> concept True = true; @@ -1114,3 +1114,11 @@ void foo() { } } // namespace GH64808 + +namespace GH86757_1 { +template <typename...> concept b = false; +template <typename> concept c = b<>; +template <typename d> concept f = c< d >; +template <f> struct e; // expected-note {{}} +template <f d> struct e<d>; // expected-error {{class template partial specialization is not more specialized than the primary template}} +} diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td index ab9e1fe..71532c9 100644 --- a/clang/tools/clang-installapi/InstallAPIOpts.td +++ b/clang/tools/clang-installapi/InstallAPIOpts.td @@ -61,3 +61,15 @@ def exclude_private_header : Separate<["-"], "exclude-private-header">, HelpText<"Exclude private header from parsing">; def exclude_private_header_EQ : Joined<["--"], "exclude-private-header=">, 
Alias<exclude_private_header>; +def public_umbrella_header : Separate<["-"], "public-umbrella-header">, + MetaVarName<"<path>">, HelpText<"Specify the public umbrella header location">; +def public_umbrella_header_EQ : Joined<["--"], "public-umbrella-header=">, + Alias<public_umbrella_header>; +def private_umbrella_header : Separate<["-"], "private-umbrella-header">, + MetaVarName<"<path>">, HelpText<"Specify the private umbrella header location">; +def private_umbrella_header_EQ : Joined<["--"], "private-umbrella-header=">, + Alias<private_umbrella_header>; +def project_umbrella_header : Separate<["-"], "project-umbrella-header">, + MetaVarName<"<path>">, HelpText<"Specify the project umbrella header location">; +def project_umbrella_header_EQ : Joined<["--"], "project-umbrella-header=">, + Alias<project_umbrella_header>; diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 4f79c62..8e4a1b0 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -270,6 +270,16 @@ Options::processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args) { OPT_exclude_project_header)) return {}; + // Handle umbrella headers. + if (const Arg *A = ParsedArgs.getLastArg(OPT_public_umbrella_header)) + DriverOpts.PublicUmbrellaHeader = A->getValue(); + + if (const Arg *A = ParsedArgs.getLastArg(OPT_private_umbrella_header)) + DriverOpts.PrivateUmbrellaHeader = A->getValue(); + + if (const Arg *A = ParsedArgs.getLastArg(OPT_project_umbrella_header)) + DriverOpts.ProjectUmbrellaHeader = A->getValue(); + /// Any unclaimed arguments should be forwarded to the clang driver. 
std::vector<const char *> ClangDriverArgs(ParsedArgs.size()); for (const Arg *A : ParsedArgs) { @@ -323,6 +333,15 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM, } } +static const Regex Rule("(.+)/(.+)\\.framework/"); +static StringRef getFrameworkNameFromInstallName(StringRef InstallName) { + SmallVector<StringRef, 3> Match; + Rule.match(InstallName, &Match); + if (Match.empty()) + return ""; + return Match.back(); +} + InstallAPIContext Options::createContext() { InstallAPIContext Ctx; Ctx.FM = FM; @@ -339,6 +358,11 @@ InstallAPIContext Options::createContext() { Ctx.OutputLoc = DriverOpts.OutputPath; Ctx.LangMode = FEOpts.LangMode; + // Attempt to find umbrella headers by capturing framework name. + StringRef FrameworkName; + if (!LinkerOpts.IsDylib) + FrameworkName = getFrameworkNameFromInstallName(LinkerOpts.InstallName); + // Process inputs. for (const std::string &ListPath : DriverOpts.FileLists) { auto Buffer = FM->getBufferForFile(ListPath); @@ -357,8 +381,7 @@ InstallAPIContext Options::createContext() { assert(Type != HeaderType::Unknown && "Missing header type."); for (const StringRef Path : Headers) { if (!FM->getOptionalFileRef(Path)) { - Diags->Report(diag::err_no_such_header_file) - << Path << (unsigned)Type - 1; + Diags->Report(diag::err_no_such_header_file) << Path << (unsigned)Type; return false; } SmallString<PATH_MAX> FullPath(Path); @@ -382,6 +405,7 @@ InstallAPIContext Options::createContext() { std::vector<std::unique_ptr<HeaderGlob>> ExcludedHeaderGlobs; std::set<FileEntryRef> ExcludedHeaderFiles; auto ParseGlobs = [&](const PathSeq &Paths, HeaderType Type) { + assert(Type != HeaderType::Unknown && "Missing header type."); for (const StringRef Path : Paths) { auto Glob = HeaderGlob::create(Path, Type); if (Glob) @@ -424,6 +448,57 @@ InstallAPIContext Options::createContext() { if (!Glob->didMatch()) Diags->Report(diag::warn_glob_did_not_match) << Glob->str(); + // Mark any explicit or inferred umbrella headers. 
If one exists, move + // that to the beginning of the input headers. + auto MarkandMoveUmbrellaInHeaders = [&](llvm::Regex &Regex, + HeaderType Type) -> bool { + auto It = find_if(Ctx.InputHeaders, [&Regex, Type](const HeaderFile &H) { + return (H.getType() == Type) && Regex.match(H.getPath()); + }); + + if (It == Ctx.InputHeaders.end()) + return false; + It->setUmbrellaHeader(); + + // Because there can be an umbrella header per header type, + // find the first non umbrella header to swap position with. + auto BeginPos = find_if(Ctx.InputHeaders, [](const HeaderFile &H) { + return !H.isUmbrellaHeader(); + }); + if (BeginPos != Ctx.InputHeaders.end() && BeginPos < It) + std::swap(*BeginPos, *It); + return true; + }; + + auto FindUmbrellaHeader = [&](StringRef HeaderPath, HeaderType Type) -> bool { + assert(Type != HeaderType::Unknown && "Missing header type."); + if (!HeaderPath.empty()) { + auto EscapedString = Regex::escape(HeaderPath); + Regex UmbrellaRegex(EscapedString); + if (!MarkandMoveUmbrellaInHeaders(UmbrellaRegex, Type)) { + Diags->Report(diag::err_no_such_umbrella_header_file) + << HeaderPath << (unsigned)Type; + return false; + } + } else if (!FrameworkName.empty() && (Type != HeaderType::Project)) { + auto UmbrellaName = "/" + Regex::escape(FrameworkName); + if (Type == HeaderType::Public) + UmbrellaName += "\\.h"; + else + UmbrellaName += "[_]?Private\\.h"; + Regex UmbrellaRegex(UmbrellaName); + MarkandMoveUmbrellaInHeaders(UmbrellaRegex, Type); + } + return true; + }; + if (!FindUmbrellaHeader(DriverOpts.PublicUmbrellaHeader, + HeaderType::Public) || + !FindUmbrellaHeader(DriverOpts.PrivateUmbrellaHeader, + HeaderType::Private) || + !FindUmbrellaHeader(DriverOpts.ProjectUmbrellaHeader, + HeaderType::Project)) + return Ctx; + // Parse binary dylib and initialize verifier. 
if (DriverOpts.DylibToVerify.empty()) { Ctx.Verifier = std::make_unique<DylibVerifier>(); diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h index c18309f..3671e4c 100644 --- a/clang/tools/clang-installapi/Options.h +++ b/clang/tools/clang-installapi/Options.h @@ -31,6 +31,15 @@ struct DriverOptions { /// \brief Path to input file lists (JSON). llvm::MachO::PathSeq FileLists; + /// \brief Path to public umbrella header. + std::string PublicUmbrellaHeader; + + /// \brief Path to private umbrella header. + std::string PrivateUmbrellaHeader; + + /// \brief Path to project umbrella header. + std::string ProjectUmbrellaHeader; + /// \brief Paths of extra public headers. PathSeq ExtraPublicHeaders; diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp index 292d524..991767d 100644 --- a/clang/tools/libclang/CXType.cpp +++ b/clang/tools/libclang/CXType.cpp @@ -680,6 +680,7 @@ CXCallingConv clang_getFunctionTypeCallingConv(CXType X) { TCALLINGCONV(PreserveAll); TCALLINGCONV(M68kRTD); TCALLINGCONV(PreserveNone); + TCALLINGCONV(RISCVVectorCall); case CC_SpirFunction: return CXCallingConv_Unexposed; case CC_AMDGPUKernelCall: return CXCallingConv_Unexposed; case CC_OpenCLKernel: return CXCallingConv_Unexposed; diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 8513174..5e41ef9 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -334,10 +334,6 @@ void RVVEmitter::createHeader(raw_ostream &OS) { OS << "#include <stdint.h>\n"; OS << "#include <stddef.h>\n\n"; - OS << "#ifndef __riscv_vector\n"; - OS << "#error \"Vector intrinsics require the vector extension.\"\n"; - OS << "#endif\n\n"; - OS << "#ifdef __cplusplus\n"; OS << "extern \"C\" {\n"; OS << "#endif\n\n"; diff --git a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp index e068c48..3e41f67 100644 
--- a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp @@ -141,7 +141,7 @@ TEST(ScudoStringsTest, CapacityIncreaseFails) { // Test requires that the default length is at least 6 characters. scudo::uptr MaxSize = Str.capacity(); - EXPECT_LE(6, MaxSize); + EXPECT_LE(6u, MaxSize); for (size_t i = 0; i < MaxSize - 5; i++) { Str.append("B"); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index c0c603f..d31d6a5 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -51,7 +51,7 @@ public: Fortran::semantics::SemanticsContext &semaCtx, const Fortran::parser::OmpClauseList &clauses) : converter(converter), semaCtx(semaCtx), - clauses(makeList(clauses, semaCtx)) {} + clauses(makeClauses(clauses, semaCtx)) {} // 'Unique' clauses: They can appear at most once in the clause list. bool processCollapse( diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index f48e84f..853dcd7 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -347,7 +347,7 @@ Aligned make(const parser::OmpClause::Aligned &inp, return Aligned{{ /*Alignment=*/maybeApply(makeExprFn(semaCtx), t1), - /*List=*/makeList(t0, semaCtx), + /*List=*/makeObjects(t0, semaCtx), }}; } @@ -362,7 +362,7 @@ Allocate make(const parser::OmpClause::Allocate &inp, return Allocate{{/*AllocatorSimpleModifier=*/std::nullopt, /*AllocatorComplexModifier=*/std::nullopt, /*AlignModifier=*/std::nullopt, - /*List=*/makeList(t1, semaCtx)}}; + /*List=*/makeObjects(t1, semaCtx)}}; } using Tuple = decltype(Allocate::t); @@ -374,7 +374,7 @@ Allocate make(const parser::OmpClause::Allocate &inp, return {/*AllocatorSimpleModifier=*/makeExpr(v.v, semaCtx), /*AllocatorComplexModifier=*/std::nullopt, /*AlignModifier=*/std::nullopt, - /*List=*/makeList(t1, semaCtx)}; + /*List=*/makeObjects(t1, semaCtx)}; }, // 
complex-modifier + align-modifier [&](const wrapped::AllocateModifier::ComplexModifier &v) -> Tuple { @@ -384,14 +384,14 @@ Allocate make(const parser::OmpClause::Allocate &inp, /*AllocatorSimpleModifier=*/std::nullopt, /*AllocatorComplexModifier=*/Allocator{makeExpr(s0.v, semaCtx)}, /*AlignModifier=*/Align{makeExpr(s1.v, semaCtx)}, - /*List=*/makeList(t1, semaCtx)}; + /*List=*/makeObjects(t1, semaCtx)}; }, // align-modifier [&](const wrapped::AllocateModifier::Align &v) -> Tuple { return {/*AllocatorSimpleModifier=*/std::nullopt, /*AllocatorComplexModifier=*/std::nullopt, /*AlignModifier=*/Align{makeExpr(v.v, semaCtx)}, - /*List=*/makeList(t1, semaCtx)}; + /*List=*/makeObjects(t1, semaCtx)}; }, }, t0->u)}; @@ -450,13 +450,13 @@ Collapse make(const parser::OmpClause::Collapse &inp, Copyin make(const parser::OmpClause::Copyin &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Copyin{/*List=*/makeList(inp.v, semaCtx)}; + return Copyin{/*List=*/makeObjects(inp.v, semaCtx)}; } Copyprivate make(const parser::OmpClause::Copyprivate &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Copyprivate{/*List=*/makeList(inp.v, semaCtx)}; + return Copyprivate{/*List=*/makeObjects(inp.v, semaCtx)}; } Default make(const parser::OmpClause::Default &inp, @@ -641,7 +641,7 @@ Doacross make(const parser::OmpClause::Doacross &inp, Enter make(const parser::OmpClause::Enter &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Enter{makeList(/*List=*/inp.v, semaCtx)}; + return Enter{makeObjects(/*List=*/inp.v, semaCtx)}; } Exclusive make(const parser::OmpClause::Exclusive &inp, @@ -671,7 +671,7 @@ Final make(const parser::OmpClause::Final &inp, Firstprivate make(const parser::OmpClause::Firstprivate &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Firstprivate{/*List=*/makeList(inp.v, semaCtx)}; + return 
Firstprivate{/*List=*/makeObjects(inp.v, semaCtx)}; } // Flush: empty @@ -681,7 +681,7 @@ From make(const parser::OmpClause::From &inp, // inp.v -> parser::OmpObjectList return From{{/*Expectation=*/std::nullopt, /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, - /*LocatorList=*/makeList(inp.v, semaCtx)}}; + /*LocatorList=*/makeObjects(inp.v, semaCtx)}}; } // Full: empty @@ -696,7 +696,7 @@ Grainsize make(const parser::OmpClause::Grainsize &inp, HasDeviceAddr make(const parser::OmpClause::HasDeviceAddr &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return HasDeviceAddr{/*List=*/makeList(inp.v, semaCtx)}; + return HasDeviceAddr{/*List=*/makeObjects(inp.v, semaCtx)}; } Hint make(const parser::OmpClause::Hint &inp, @@ -762,20 +762,20 @@ InReduction make(const parser::OmpClause::InReduction &inp, auto &t1 = std::get<parser::OmpObjectList>(inp.v.t); return InReduction{ {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)}, - /*List=*/makeList(t1, semaCtx)}}; + /*List=*/makeObjects(t1, semaCtx)}}; } IsDevicePtr make(const parser::OmpClause::IsDevicePtr &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return IsDevicePtr{/*List=*/makeList(inp.v, semaCtx)}; + return IsDevicePtr{/*List=*/makeObjects(inp.v, semaCtx)}; } Lastprivate make(const parser::OmpClause::Lastprivate &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList return Lastprivate{{/*LastprivateModifier=*/std::nullopt, - /*List=*/makeList(inp.v, semaCtx)}}; + /*List=*/makeObjects(inp.v, semaCtx)}}; } Linear make(const parser::OmpClause::Linear &inp, @@ -817,7 +817,7 @@ Linear make(const parser::OmpClause::Linear &inp, Link make(const parser::OmpClause::Link &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Link{/*List=*/makeList(inp.v, semaCtx)}; + return Link{/*List=*/makeObjects(inp.v, semaCtx)}; } Map make(const parser::OmpClause::Map &inp, @@ -844,7 +844,7 @@ Map 
make(const parser::OmpClause::Map &inp, if (!t0) { return Map{{/*MapType=*/std::nullopt, /*MapTypeModifiers=*/std::nullopt, /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, - /*LocatorList=*/makeList(t1, semaCtx)}}; + /*LocatorList=*/makeObjects(t1, semaCtx)}}; } auto &s0 = std::get<std::optional<parser::OmpMapType::Always>>(t0->t); @@ -857,7 +857,7 @@ Map make(const parser::OmpClause::Map &inp, return Map{{/*MapType=*/convert1(s1), /*MapTypeModifiers=*/maybeList, /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, - /*LocatorList=*/makeList(t1, semaCtx)}}; + /*LocatorList=*/makeObjects(t1, semaCtx)}}; } // Match: incomplete @@ -980,7 +980,7 @@ Priority make(const parser::OmpClause::Priority &inp, Private make(const parser::OmpClause::Private &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Private{/*List=*/makeList(inp.v, semaCtx)}; + return Private{/*List=*/makeObjects(inp.v, semaCtx)}; } ProcBind make(const parser::OmpClause::ProcBind &inp, @@ -1010,7 +1010,7 @@ Reduction make(const parser::OmpClause::Reduction &inp, return Reduction{ {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)}, /*ReductionModifier=*/std::nullopt, - /*List=*/makeList(t1, semaCtx)}}; + /*List=*/makeObjects(t1, semaCtx)}}; } // Relaxed: empty @@ -1104,7 +1104,7 @@ Severity make(const parser::OmpClause::Severity &inp, Shared make(const parser::OmpClause::Shared &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return Shared{/*List=*/makeList(inp.v, semaCtx)}; + return Shared{/*List=*/makeObjects(inp.v, semaCtx)}; } // Simd: empty @@ -1128,7 +1128,7 @@ TaskReduction make(const parser::OmpClause::TaskReduction &inp, auto &t1 = std::get<parser::OmpObjectList>(inp.v.t); return TaskReduction{ {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)}, - /*List=*/makeList(t1, semaCtx)}}; + /*List=*/makeObjects(t1, semaCtx)}}; } ThreadLimit make(const parser::OmpClause::ThreadLimit &inp, @@ -1145,7 +1145,7 @@ 
To make(const parser::OmpClause::To &inp, // inp.v -> parser::OmpObjectList return To{{/*Expectation=*/std::nullopt, /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, - /*LocatorList=*/makeList(inp.v, semaCtx)}}; + /*LocatorList=*/makeObjects(inp.v, semaCtx)}}; } // UnifiedAddress: empty @@ -1175,13 +1175,13 @@ Use make(const parser::OmpClause::Use &inp, UseDeviceAddr make(const parser::OmpClause::UseDeviceAddr &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return UseDeviceAddr{/*List=*/makeList(inp.v, semaCtx)}; + return UseDeviceAddr{/*List=*/makeObjects(inp.v, semaCtx)}; } UseDevicePtr make(const parser::OmpClause::UseDevicePtr &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpObjectList - return UseDevicePtr{/*List=*/makeList(inp.v, semaCtx)}; + return UseDevicePtr{/*List=*/makeObjects(inp.v, semaCtx)}; } UsesAllocators make(const parser::OmpClause::UsesAllocators &inp, @@ -1205,8 +1205,8 @@ Clause makeClause(const Fortran::parser::OmpClause &cls, cls.u); } -List<Clause> makeList(const parser::OmpClauseList &clauses, - semantics::SemanticsContext &semaCtx) { +List<Clause> makeClauses(const parser::OmpClauseList &clauses, + semantics::SemanticsContext &semaCtx) { return makeList(clauses.v, [&](const parser::OmpClause &s) { return makeClause(s, semaCtx); }); diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index af13182..3e77642 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -88,8 +88,8 @@ List<ResultTy> makeList(ContainerTy &&container, FunctionTy &&func) { return v; } -inline ObjectList makeList(const parser::OmpObjectList &objects, - semantics::SemanticsContext &semaCtx) { +inline ObjectList makeObjects(const parser::OmpObjectList &objects, + semantics::SemanticsContext &semaCtx) { return makeList(objects.v, makeObjectFn(semaCtx)); } @@ -256,8 +256,8 @@ Clause makeClause(llvm::omp::Clause id, Specific &&specific, Clause 
makeClause(const Fortran::parser::OmpClause &cls, semantics::SemanticsContext &semaCtx); -List<Clause> makeList(const parser::OmpClauseList &clauses, - semantics::SemanticsContext &semaCtx); +List<Clause> makeClauses(const parser::OmpClauseList &clauses, + semantics::SemanticsContext &semaCtx); } // namespace Fortran::lower::omp #endif // FORTRAN_LOWER_OPENMP_CLAUSES_H diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 226abe9..1cbc825 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -89,7 +89,7 @@ public: Fortran::lower::SymMap *symTable = nullptr) : hasLastPrivateOp(false), converter(converter), firOpBuilder(converter.getFirOpBuilder()), - clauses(omp::makeList(opClauseList, semaCtx)), eval(eval), + clauses(omp::makeClauses(opClauseList, semaCtx)), eval(eval), useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {} // Privatisation is split into two steps. 
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0cf2a8f9..5defffd 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1254,7 +1254,7 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo( if (const auto *objectList{ Fortran::parser::Unwrap<Fortran::parser::OmpObjectList>(spec.u)}) { - ObjectList objects{makeList(*objectList, semaCtx)}; + ObjectList objects{makeObjects(*objectList, semaCtx)}; // Case: declare target(func, var1, var2) gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, symbolAndClause); @@ -2352,7 +2352,7 @@ void Fortran::lower::genOpenMPReduction( const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - List<Clause> clauses{makeList(clauseList, semaCtx)}; + List<Clause> clauses{makeClauses(clauseList, semaCtx)}; for (const Clause &clause : clauses) { if (const auto &reductionClause = diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst index e6fc6df..22a18b7 100644 --- a/libc/docs/dev/code_style.rst +++ b/libc/docs/dev/code_style.rst @@ -55,7 +55,7 @@ We define two kinds of macros: * ``src/__support/macros/config.h`` - Important compiler and platform features. Such macros can be used to produce portable code by parameterizing compilation based on the presence or lack of a given - feature. e.g., ``LIBC_HAS_BUILTIN`` + feature. e.g., ``LIBC_HAS_FEATURE`` * ``src/__support/macros/attributes.h`` - Attributes for functions, types, and variables. 
e.g., ``LIBC_UNUSED`` * ``src/__support/macros/optimization.h`` - Portable macros for performance diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index f76285b..84d01fe 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -18,7 +18,6 @@ add_header_library( .limits .type_traits libc.src.__support.macros.attributes - libc.src.__support.macros.config libc.src.__support.macros.sanitizer ) @@ -157,7 +156,6 @@ add_header_library( DEPENDS libc.include.llvm-libc-macros.stdfix_macros libc.src.__support.macros.attributes - libc.src.__support.macros.config libc.src.__support.macros.properties.types ) diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h index b74cb598..5e42894 100644 --- a/libc/src/__support/CPP/atomic.h +++ b/libc/src/__support/CPP/atomic.h @@ -71,10 +71,11 @@ public: T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_load_n)) - return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope)); - else - return __atomic_load_n(&val, int(mem_ord)); +#if __has_builtin(__scoped_atomic_load_n) + return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope)); +#else + return __atomic_load_n(&val, int(mem_ord)); +#endif } // Atomic store. 
@@ -85,10 +86,11 @@ public: void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_store_n)) - __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope)); - else - __atomic_store_n(&val, rhs, int(mem_ord)); +#if __has_builtin(__scoped_atomic_store_n) + __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope)); +#else + __atomic_store_n(&val, rhs, int(mem_ord)); +#endif } // Atomic compare exchange @@ -101,47 +103,51 @@ public: T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_exchange_n)) - return __scoped_atomic_exchange_n(&val, desired, int(mem_ord), - (int)(mem_scope)); - else - return __atomic_exchange_n(&val, desired, int(mem_ord)); +#if __has_builtin(__scoped_atomic_exchange_n) + return __scoped_atomic_exchange_n(&val, desired, int(mem_ord), + (int)(mem_scope)); +#else + return __atomic_exchange_n(&val, desired, int(mem_ord)); +#endif } T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_add)) - return __scoped_atomic_fetch_add(&val, increment, int(mem_ord), - (int)(mem_scope)); - else - return __atomic_fetch_add(&val, increment, int(mem_ord)); +#if __has_builtin(__scoped_atomic_fetch_add) + return __scoped_atomic_fetch_add(&val, increment, int(mem_ord), + (int)(mem_scope)); +#else + return __atomic_fetch_add(&val, increment, int(mem_ord)); +#endif } T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_or)) - return __scoped_atomic_fetch_or(&val, mask, int(mem_ord), - (int)(mem_scope)); - else - return __atomic_fetch_or(&val, mask, 
int(mem_ord)); +#if __has_builtin(__scoped_atomic_fetch_or) + return __scoped_atomic_fetch_or(&val, mask, int(mem_ord), (int)(mem_scope)); +#else + return __atomic_fetch_or(&val, mask, int(mem_ord)); +#endif } T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_and)) - return __scoped_atomic_fetch_and(&val, mask, int(mem_ord), - (int)(mem_scope)); - else - return __atomic_fetch_and(&val, mask, int(mem_ord)); +#if __has_builtin(__scoped_atomic_fetch_and) + return __scoped_atomic_fetch_and(&val, mask, int(mem_ord), + (int)(mem_scope)); +#else + return __atomic_fetch_and(&val, mask, int(mem_ord)); +#endif } T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { - if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_sub)) - return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord), - (int)(mem_scope)); - else - return __atomic_fetch_sub(&val, decrement, int(mem_ord)); +#if __has_builtin(__scoped_atomic_fetch_sub) + return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord), + (int)(mem_scope)); +#else + return __atomic_fetch_sub(&val, decrement, int(mem_ord)); +#endif } // Set the value without using an atomic operation. This is useful @@ -166,7 +172,7 @@ LIBC_INLINE void atomic_thread_fence([[maybe_unused]] MemoryOrder mem_ord) { // except no instructions for memory ordering are issued. Only reordering of // the instructions by the compiler is suppressed as order instructs. LIBC_INLINE void atomic_signal_fence([[maybe_unused]] MemoryOrder mem_ord) { -#if LIBC_HAS_BUILTIN(__atomic_signal_fence) +#if __has_builtin(__atomic_signal_fence) __atomic_signal_fence(static_cast<int>(mem_ord)); #else // if the builtin is not ready, use asm as a full compiler barrier. 
diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h index 3f2fbec..80f50fd 100644 --- a/libc/src/__support/CPP/bit.h +++ b/libc/src/__support/CPP/bit.h @@ -14,14 +14,13 @@ #include "src/__support/CPP/limits.h" // numeric_limits #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN #include "src/__support/macros/sanitizer.h" #include <stdint.h> namespace LIBC_NAMESPACE::cpp { -#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline) +#if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif @@ -36,20 +35,20 @@ LIBC_INLINE constexpr cpp::enable_if_t< To> bit_cast(const From &from) { MSAN_UNPOISON(&from, sizeof(From)); -#if LIBC_HAS_BUILTIN(__builtin_bit_cast) +#if __has_builtin(__builtin_bit_cast) return __builtin_bit_cast(To, from); #else To to; char *dst = reinterpret_cast<char *>(&to); const char *src = reinterpret_cast<const char *>(&from); -#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline) +#if __has_builtin(__builtin_memcpy_inline) __builtin_memcpy_inline(dst, src, sizeof(To)); #else for (unsigned i = 0; i < sizeof(To); ++i) dst[i] = src[i]; -#endif // LIBC_HAS_BUILTIN(__builtin_memcpy_inline) +#endif // __has_builtin(__builtin_memcpy_inline) return to; -#endif // LIBC_HAS_BUILTIN(__builtin_bit_cast) +#endif // __has_builtin(__builtin_bit_cast) } template <typename T> @@ -94,7 +93,7 @@ countr_zero(T value) { } return zero_bits; } -#if LIBC_HAS_BUILTIN(__builtin_ctzs) +#if __has_builtin(__builtin_ctzs) ADD_SPECIALIZATION(countr_zero, unsigned short, __builtin_ctzs) #endif ADD_SPECIALIZATION(countr_zero, unsigned int, __builtin_ctz) @@ -124,7 +123,7 @@ countl_zero(T value) { } return zero_bits; } -#if LIBC_HAS_BUILTIN(__builtin_clzs) +#if __has_builtin(__builtin_clzs) ADD_SPECIALIZATION(countl_zero, unsigned short, __builtin_clzs) #endif ADD_SPECIALIZATION(countl_zero, unsigned int, __builtin_clz) @@ -242,6 +241,14 @@ 
LIBC_INLINE constexpr To bit_or_static_cast(const From &from) { /// Count number of 1's aka population count or Hamming weight. /// /// Only unsigned integral types are allowed. +// clang-19+, gcc-14+ +#if __has_builtin(__builtin_popcountg) +template <typename T> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int> +popcount(T value) { + return __builtin_popcountg(value); +} +#else // !__has_builtin(__builtin_popcountg) template <typename T> [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int> popcount(T value) { @@ -261,7 +268,7 @@ ADD_SPECIALIZATION(unsigned short, __builtin_popcount) ADD_SPECIALIZATION(unsigned, __builtin_popcount) ADD_SPECIALIZATION(unsigned long, __builtin_popcountl) ADD_SPECIALIZATION(unsigned long long, __builtin_popcountll) -// TODO: 128b specializations? +#endif // __builtin_popcountg #undef ADD_SPECIALIZATION } // namespace LIBC_NAMESPACE::cpp diff --git a/libc/src/__support/CPP/type_traits/add_pointer.h b/libc/src/__support/CPP/type_traits/add_pointer.h index 72a764b..1257033 100644 --- a/libc/src/__support/CPP/type_traits/add_pointer.h +++ b/libc/src/__support/CPP/type_traits/add_pointer.h @@ -10,7 +10,6 @@ #include "src/__support/CPP/type_traits/remove_reference.h" #include "src/__support/CPP/type_traits/type_identity.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { diff --git a/libc/src/__support/CPP/type_traits/decay.h b/libc/src/__support/CPP/type_traits/decay.h index a018286..f1a1200 100644 --- a/libc/src/__support/CPP/type_traits/decay.h +++ b/libc/src/__support/CPP/type_traits/decay.h @@ -9,7 +9,6 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_DECAY_H #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" #include "src/__support/CPP/type_traits/add_pointer.h" #include "src/__support/CPP/type_traits/conditional.h" diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h 
b/libc/src/__support/CPP/type_traits/is_destructible.h index d47de1c..f94fe30 100644 --- a/libc/src/__support/CPP/type_traits/is_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_destructible.h @@ -16,12 +16,11 @@ #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/CPP/type_traits/type_identity.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_destructible -#if LIBC_HAS_BUILTIN(__is_destructible) +#if __has_builtin(__is_destructible) template <typename T> struct is_destructible : bool_constant<__is_destructible(T)> {}; #else diff --git a/libc/src/__support/CPP/type_traits/is_function.h b/libc/src/__support/CPP/type_traits/is_function.h index 557b322..0eba586 100644 --- a/libc/src/__support/CPP/type_traits/is_function.h +++ b/libc/src/__support/CPP/type_traits/is_function.h @@ -12,12 +12,11 @@ #include "src/__support/CPP/type_traits/is_const.h" #include "src/__support/CPP/type_traits/is_reference.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_function -#if LIBC_HAS_BUILTIN(__is_function) +#if __has_builtin(__is_function) template <typename T> struct is_function : integral_constant<bool, __is_function(T)> {}; #else diff --git a/libc/src/__support/CPP/type_traits/is_lvalue_reference.h b/libc/src/__support/CPP/type_traits/is_lvalue_reference.h index f52e303..1dff57f 100644 --- a/libc/src/__support/CPP/type_traits/is_lvalue_reference.h +++ b/libc/src/__support/CPP/type_traits/is_lvalue_reference.h @@ -12,12 +12,11 @@ #include "src/__support/CPP/type_traits/false_type.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_lvalue_reference -#if LIBC_HAS_BUILTIN(__is_lvalue_reference) +#if __has_builtin(__is_lvalue_reference) template <typename T> struct 
is_lvalue_reference : bool_constant<__is_lvalue_reference(T)> {}; #else diff --git a/libc/src/__support/CPP/type_traits/is_reference.h b/libc/src/__support/CPP/type_traits/is_reference.h index c017028..bbfb2b7 100644 --- a/libc/src/__support/CPP/type_traits/is_reference.h +++ b/libc/src/__support/CPP/type_traits/is_reference.h @@ -12,12 +12,11 @@ #include "src/__support/CPP/type_traits/false_type.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_reference -#if LIBC_HAS_BUILTIN(__is_reference) +#if __has_builtin(__is_reference) template <typename T> struct is_reference : bool_constant<__is_reference(T)> {}; #else template <typename T> struct is_reference : public false_type {}; diff --git a/libc/src/__support/CPP/type_traits/is_rvalue_reference.h b/libc/src/__support/CPP/type_traits/is_rvalue_reference.h index f0487e4..3efbbe6 100644 --- a/libc/src/__support/CPP/type_traits/is_rvalue_reference.h +++ b/libc/src/__support/CPP/type_traits/is_rvalue_reference.h @@ -12,12 +12,11 @@ #include "src/__support/CPP/type_traits/false_type.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_rvalue_reference -#if LIBC_HAS_BUILTIN(__is_rvalue_reference) +#if __has_builtin(__is_rvalue_reference) template <typename T> struct is_rvalue_reference : bool_constant<__is_rvalue_reference(T)> {}; #else diff --git a/libc/src/__support/CPP/type_traits/is_trivially_copyable.h b/libc/src/__support/CPP/type_traits/is_trivially_copyable.h index 0c3fdcc..b4c825d 100644 --- a/libc/src/__support/CPP/type_traits/is_trivially_copyable.h +++ b/libc/src/__support/CPP/type_traits/is_trivially_copyable.h @@ -9,7 +9,6 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H #include 
"src/__support/CPP/type_traits/integral_constant.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { diff --git a/libc/src/__support/CPP/type_traits/is_trivially_destructible.h b/libc/src/__support/CPP/type_traits/is_trivially_destructible.h index 3345149..37e0e86 100644 --- a/libc/src/__support/CPP/type_traits/is_trivially_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_trivially_destructible.h @@ -11,12 +11,11 @@ #include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/type_traits/is_destructible.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" namespace LIBC_NAMESPACE::cpp { // is_trivially_destructible -#if LIBC_HAS_BUILTIN(__is_trivially_destructible) +#if __has_builtin(__is_trivially_destructible) template <typename T> struct is_trivially_destructible : public bool_constant<__is_trivially_destructible(T)> {}; @@ -25,7 +24,7 @@ template <typename T> struct is_trivially_destructible : public bool_constant<cpp::is_destructible_v<T> &&__has_trivial_destructor( T)> {}; -#endif // LIBC_HAS_BUILTIN(__is_trivially_destructible) +#endif // __has_builtin(__is_trivially_destructible) template <typename T> LIBC_INLINE_VAR constexpr bool is_trivially_destructible_v = is_trivially_destructible<T>::value; diff --git a/libc/src/__support/CPP/type_traits/remove_all_extents.h b/libc/src/__support/CPP/type_traits/remove_all_extents.h index bff6341..5941b82 100644 --- a/libc/src/__support/CPP/type_traits/remove_all_extents.h +++ b/libc/src/__support/CPP/type_traits/remove_all_extents.h @@ -9,14 +9,13 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_REMOVE_ALL_EXTENTS_H #include "src/__support/CPP/type_traits/type_identity.h" -#include "src/__support/macros/config.h" #include <stddef.h> // size_t namespace LIBC_NAMESPACE::cpp { // remove_all_extents -#if LIBC_HAS_BUILTIN(__remove_all_extents) +#if __has_builtin(__remove_all_extents) template <typename T> using 
remove_all_extents_t = __remove_all_extents(T); template <typename T> struct remove_all_extents : cpp::type_identity<remove_all_extents_t<T>> {}; diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index 0f43502..ff155a1 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -6,7 +6,6 @@ add_header_library( libc.include.fenv libc.include.math libc.src.__support.macros.attributes - libc.src.__support.macros.config libc.src.errno.errno ) diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h index a6a533d..6086d5d 100644 --- a/libc/src/__support/FPUtil/FEnvImpl.h +++ b/libc/src/__support/FPUtil/FEnvImpl.h @@ -11,7 +11,6 @@ #include "include/llvm-libc-macros/math-macros.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN #include "src/__support/macros/properties/architectures.h" #include "src/errno/libc_errno.h" #include <fenv.h> diff --git a/libc/src/__support/FPUtil/gpu/FMA.h b/libc/src/__support/FPUtil/gpu/FMA.h index 86bc860..ef1cd26 100644 --- a/libc/src/__support/FPUtil/gpu/FMA.h +++ b/libc/src/__support/FPUtil/gpu/FMA.h @@ -10,12 +10,12 @@ #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GPU_FMA_H #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/config.h" -// These intrinsics map to the FMA instrunctions in the target ISA for the GPU. +// These intrinsics map to the FMA instructions in the target ISA for the GPU. // The default rounding mode generated from these will be to the nearest even. 
-static_assert(LIBC_HAS_BUILTIN(__builtin_fma), "FMA builtins must be defined"); -static_assert(LIBC_HAS_BUILTIN(__builtin_fmaf), "FMA builtins must be defined"); +#if !__has_builtin(__builtin_fma) || !__has_builtin(__builtin_fmaf) +#error "FMA builtins must be defined" +#endif namespace LIBC_NAMESPACE { namespace fputil { diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index df01e08..282efdb 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -1082,6 +1082,17 @@ bit_cast(const UInt<Bits> &from) { return cpp::bit_cast<To>(from.val); } +// Specialization of cpp::popcount ('bit.h') for BigInt. +template <typename T> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +popcount(T value) { + int bits = 0; + for (auto word : value.val) + if (word) + bits += popcount(word); + return bits; +} + // Specialization of cpp::has_single_bit ('bit.h') for BigInt. template <typename T> [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, bool> @@ -1218,6 +1229,49 @@ LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T> mask_leading_ones() { return out; } +// Specialization of count_zeros ('math_extras.h') for BigInt. +template <typename T> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +count_zeros(T value) { + return cpp::popcount(~value); +} + +// Specialization of first_leading_zero ('math_extras.h') for BigInt. +template <typename T> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +first_leading_zero(T value) { + return value == cpp::numeric_limits<T>::max() ? 0 + : cpp::countl_one(value) + 1; +} + +// Specialization of first_leading_one ('math_extras.h') for BigInt. +template <typename T> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +first_leading_one(T value) { + return first_leading_zero(~value); +} + +// Specialization of first_trailing_zero ('math_extras.h') for BigInt. 
+template <typename T> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +first_trailing_zero(T value) { + return value == cpp::numeric_limits<T>::max() ? 0 + : cpp::countr_zero(~value) + 1; +} + +// Specialization of first_trailing_one ('math_extras.h') for BigInt. +template <typename T> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int> +first_trailing_one(T value) { + return value == cpp::numeric_limits<T>::max() ? 0 + : cpp::countr_zero(value) + 1; +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_UINT_H diff --git a/libc/src/__support/macros/config.h b/libc/src/__support/macros/config.h index 6666c13..3f200f0 100644 --- a/libc/src/__support/macros/config.h +++ b/libc/src/__support/macros/config.h @@ -13,24 +13,6 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_CONFIG_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_CONFIG_H -// LIBC_HAS_BUILTIN() -// -// Checks whether the compiler supports a Clang Feature Checking Macro, and if -// so, checks whether it supports the provided builtin function "x" where x -// is one of the functions noted in -// https://clang.llvm.org/docs/LanguageExtensions.html -// -// Note: Use this macro to avoid an extra level of #ifdef __has_builtin check. -// http://releases.llvm.org/3.3/tools/clang/docs/LanguageExtensions.html - -// Compiler builtin-detection. -// clang.llvm.org/docs/LanguageExtensions.html#has-builtin -#ifdef __has_builtin -#define LIBC_HAS_BUILTIN(x) __has_builtin(x) -#else -#define LIBC_HAS_BUILTIN(x) 0 -#endif - // Compiler feature-detection. 
// clang.llvm.org/docs/LanguageExtensions.html#has-feature-and-has-extension #ifdef __has_feature diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h index ae97efc..59886ca 100644 --- a/libc/src/__support/macros/optimization.h +++ b/libc/src/__support/macros/optimization.h @@ -11,7 +11,6 @@ #define LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H #include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN #include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG // We use a template to implement likely/unlikely to make sure that we don't diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h index fc66c20..bd9b62b 100644 --- a/libc/src/__support/macros/sanitizer.h +++ b/libc/src/__support/macros/sanitizer.h @@ -47,8 +47,7 @@ // Functions to unpoison memory //----------------------------------------------------------------------------- -#if defined(LIBC_HAVE_MEMORY_SANITIZER) && \ - LIBC_HAS_BUILTIN(__builtin_constant_p) +#if defined(LIBC_HAVE_MEMORY_SANITIZER) && __has_builtin(__builtin_constant_p) // Only perform MSAN unpoison in non-constexpr context. 
#include <sanitizer/msan_interface.h> #define MSAN_UNPOISON(addr, size) \ diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index 28ee1be..70a8800 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -14,7 +14,6 @@ #include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits #include "src/__support/CPP/type_traits.h" // is_unsigned_v #include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN namespace LIBC_NAMESPACE { @@ -61,7 +60,7 @@ add_with_carry(T a, T b, T carry_in) { return add_with_carry_const<T>(a, b, carry_in); } -#if LIBC_HAS_BUILTIN(__builtin_addc) +#if __has_builtin(__builtin_addc) // https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins template <> @@ -129,7 +128,7 @@ add_with_carry<unsigned long long>(unsigned long long a, unsigned long long b, } } -#endif // LIBC_HAS_BUILTIN(__builtin_addc) +#endif // __has_builtin(__builtin_addc) // Subtract with borrow template <typename T> struct DiffBorrow { @@ -157,7 +156,7 @@ sub_with_borrow(T a, T b, T borrow_in) { return sub_with_borrow_const<T>(a, b, borrow_in); } -#if LIBC_HAS_BUILTIN(__builtin_subc) +#if __has_builtin(__builtin_subc) // https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins template <> @@ -225,7 +224,7 @@ sub_with_borrow<unsigned long long>(unsigned long long a, unsigned long long b, } } -#endif // LIBC_HAS_BUILTIN(__builtin_subc) +#endif // __has_builtin(__builtin_subc) template <typename T> [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int> diff --git a/libc/src/__support/memory_size.h b/libc/src/__support/memory_size.h index 7bd16a1..491123b 100644 --- a/libc/src/__support/memory_size.h +++ b/libc/src/__support/memory_size.h @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE { namespace internal { template <class T> LIBC_INLINE bool mul_overflow(T a, T b, T *res) { 
-#if LIBC_HAS_BUILTIN(__builtin_mul_overflow) +#if __has_builtin(__builtin_mul_overflow) return __builtin_mul_overflow(a, b, res); #else T max = cpp::numeric_limits<T>::max(); diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h index 5239329..ba4f4b8 100644 --- a/libc/src/string/memory_utils/generic/builtin.h +++ b/libc/src/string/memory_utils/generic/builtin.h @@ -10,16 +10,16 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_BUILTIN_H #include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN #include "src/string/memory_utils/utils.h" // Ptr, CPtr #include <stddef.h> // size_t namespace LIBC_NAMESPACE { -static_assert(LIBC_HAS_BUILTIN(__builtin_memcpy), "Builtin not defined"); -static_assert(LIBC_HAS_BUILTIN(__builtin_memset), "Builtin not defined"); -static_assert(LIBC_HAS_BUILTIN(__builtin_memmove), "Builtin not defined"); +#if !__has_builtin(__builtin_memcpy) || !__has_builtin(__builtin_memset) || \ + !__has_builtin(__builtin_memmove) +#error "Builtin not defined" +#endif [[maybe_unused]] LIBC_INLINE void inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) { diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 79526d1..b3e1a26 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -14,7 +14,6 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/endian.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN #include "src/__support/macros/properties/architectures.h" #include <stddef.h> // size_t @@ -71,11 +70,11 @@ LIBC_INLINE bool is_disjoint(const void *p1, const void *p2, size_t size) { return sdiff >= 0 ? 
size <= udiff : size <= neg_udiff; } -#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline) +#if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif -#if LIBC_HAS_BUILTIN(__builtin_memset_inline) +#if __has_builtin(__builtin_memset_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE #endif diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp index cee5b90..875b47e 100644 --- a/libc/test/src/__support/CPP/bit_test.cpp +++ b/libc/test/src/__support/CPP/bit_test.cpp @@ -15,13 +15,6 @@ namespace LIBC_NAMESPACE::cpp { -using UnsignedTypesNoBigInt = testing::TypeList< -#if defined(LIBC_TYPES_HAS_INT128) - __uint128_t, -#endif // LIBC_TYPES_HAS_INT128 - unsigned char, unsigned short, unsigned int, unsigned long, - unsigned long long>; - using UnsignedTypes = testing::TypeList< #if defined(LIBC_TYPES_HAS_INT128) __uint128_t, @@ -228,7 +221,7 @@ TEST(LlvmLibcBitTest, Rotr) { rotr<uint64_t>(0x12345678deadbeefULL, -19)); } -TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypesNoBigInt) { +TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypes) { EXPECT_EQ(popcount(T(0)), 0); for (int i = 0; i != cpp::numeric_limits<T>::digits; ++i) EXPECT_EQ(popcount<T>(cpp::numeric_limits<T>::max() >> i), diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp index e6422488..e88b3e1 100644 --- a/libc/test/src/__support/math_extras_test.cpp +++ b/libc/test/src/__support/math_extras_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/UInt128.h" // UInt128 +#include "src/__support/UInt128.h" // UInt<128> #include "src/__support/integer_literals.h" #include "src/__support/math_extras.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using UnsignedTypesNoBigInt = testing::TypeList< __uint128_t, #endif // LIBC_TYPES_HAS_INT128 unsigned char, unsigned short, unsigned int, unsigned long, - 
unsigned long long>; + unsigned long long, UInt<128>>; TEST(LlvmLibcBlockMathExtrasTest, mask_trailing_ones) { EXPECT_EQ(0_u8, (mask_leading_ones<uint8_t, 0>())); diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 90af156..46ad98f 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -6,6 +6,11 @@ // //===----------------------------------------------------------------------===// +// Workaround for missing __has_builtin in < GCC 10. +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + #include "llvmlibc_rpc_server.h" #include "src/__support/RPC/rpc.h" diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h index 4c38154..0890b89 100644 --- a/libcxx/include/__algorithm/copy.h +++ b/libcxx/include/__algorithm/copy.h @@ -32,7 +32,7 @@ template <class, class _InIter, class _Sent, class _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter); template <class _AlgPolicy> -struct __copy_loop { +struct __copy_impl { template <class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { @@ -94,9 +94,7 @@ struct __copy_loop { __local_first = _Traits::__begin(++__segment_iterator); } } -}; -struct __copy_trivial { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> @@ -108,7 +106,7 @@ struct __copy_trivial { template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter> pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __copy(_InIter __first, _Sent __last, _OutIter __result) { - return std::__dispatch_copy_or_move<_AlgPolicy, __copy_loop<_AlgPolicy>, __copy_trivial>( + return std::__copy_move_unwrap_iters<__copy_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h index 591dd21..73dc846 100644 --- a/libcxx/include/__algorithm/copy_backward.h +++ b/libcxx/include/__algorithm/copy_backward.h @@ -33,7 +33,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter> __copy_backward(_InIter __first, _Sent __last, _OutIter __result); template <class _AlgPolicy> -struct __copy_backward_loop { +struct __copy_backward_impl { template <class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { @@ -104,9 +104,7 @@ struct __copy_backward_loop { __local_last = _Traits::__end(__segment_iterator); } } -}; -struct __copy_backward_trivial { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> @@ -118,7 +116,7 @@ struct __copy_backward_trivial { template <class _AlgPolicy, class _BidirectionalIterator1, class _Sentinel, class _BidirectionalIterator2> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2> __copy_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result) { - return std::__dispatch_copy_or_move<_AlgPolicy, __copy_backward_loop<_AlgPolicy>, __copy_backward_trivial>( + return std::__copy_move_unwrap_iters<__copy_backward_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h index 845967b..12a26c6 100644 --- a/libcxx/include/__algorithm/copy_move_common.h +++ b/libcxx/include/__algorithm/copy_move_common.h @@ -81,30 +81,17 @@ __copy_backward_trivial_impl(_In* __first, _In* __last, _Out* __result) { // Iterator unwrapping and dispatching to the correct overload. -template <class _F1, class _F2> -struct __overload : _F1, _F2 { - using _F1::operator(); - using _F2::operator(); -}; - -template <class _InIter, class _Sent, class _OutIter, class = void> -struct __can_rewrap : false_type {}; - -template <class _InIter, class _Sent, class _OutIter> -struct __can_rewrap<_InIter, - _Sent, - _OutIter, - // Note that sentinels are always copy-constructible. 
- __enable_if_t< is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value > > - : true_type {}; +template <class _InIter, class _OutIter> +struct __can_rewrap + : integral_constant<bool, is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value> {}; template <class _Algorithm, class _InIter, class _Sent, class _OutIter, - __enable_if_t<__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0> + __enable_if_t<__can_rewrap<_InIter, _OutIter>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter> -__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) { +__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) { auto __range = std::__unwrap_range(__first, std::move(__last)); auto __result = _Algorithm()(std::move(__range.first), std::move(__range.second), std::__unwrap_iter(__out_first)); return std::make_pair(std::__rewrap_range<_Sent>(std::move(__first), std::move(__result.first)), @@ -115,24 +102,12 @@ template <class _Algorithm, class _InIter, class _Sent, class _OutIter, - __enable_if_t<!__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0> + __enable_if_t<!__can_rewrap<_InIter, _OutIter>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter> -__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) { +__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) { return _Algorithm()(std::move(__first), std::move(__last), std::move(__out_first)); } -template <class _AlgPolicy, - class _NaiveAlgorithm, - class _OptimizedAlgorithm, - class _InIter, - class _Sent, - class _OutIter> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter> -__dispatch_copy_or_move(_InIter __first, _Sent __last, _OutIter __out_first) { - using _Algorithm = __overload<_NaiveAlgorithm, _OptimizedAlgorithm>; - return 
std::__unwrap_and_dispatch<_Algorithm>(std::move(__first), std::move(__last), std::move(__out_first)); -} - _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h index bf574b5..1716d43 100644 --- a/libcxx/include/__algorithm/move.h +++ b/libcxx/include/__algorithm/move.h @@ -34,7 +34,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIte __move(_InIter __first, _Sent __last, _OutIter __result); template <class _AlgPolicy> -struct __move_loop { +struct __move_impl { template <class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { @@ -95,9 +95,7 @@ struct __move_loop { __local_first = _Traits::__begin(++__segment_iterator); } } -}; -struct __move_trivial { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template <class _In, class _Out, __enable_if_t<__can_lower_move_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> @@ -109,7 +107,7 @@ struct __move_trivial { template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __move(_InIter __first, _Sent __last, _OutIter __result) { - return std::__dispatch_copy_or_move<_AlgPolicy, __move_loop<_AlgPolicy>, __move_trivial>( + return std::__copy_move_unwrap_iters<__move_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index 6bb7c91..4beb7bdb 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -33,7 +33,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1 __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result); template <class _AlgPolicy> -struct __move_backward_loop { +struct __move_backward_impl { template <class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { @@ -104,9 +104,7 @@ struct __move_backward_loop { __local_last = _Traits::__end(--__segment_iterator); } } -}; -struct __move_backward_trivial { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template <class _In, class _Out, __enable_if_t<__can_lower_move_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> @@ -122,7 +120,7 @@ __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _Bidirectiona std::is_copy_constructible<_BidirectionalIterator1>::value, "Iterators must be copy constructible."); - return std::__dispatch_copy_or_move<_AlgPolicy, __move_backward_loop<_AlgPolicy>, __move_backward_trivial>( + return std::__copy_move_unwrap_iters<__move_backward_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index c06a9ed..cbca37e 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -329,7 +329,7 @@ constexpr bool test() { { [[maybe_unused]] std::same_as<unsigned long int> decltype(auto) _ = std::saturate_cast<unsigned long int>(sBigMax); } assert(std::saturate_cast<unsigned long int>( sBigMin) == 0UL); // saturated assert(std::saturate_cast<unsigned long int>( sZero) == 0UL); - assert(std::saturate_cast<unsigned long int>( sBigMax) == ULONG_MAX); // saturated + assert(std::saturate_cast<unsigned long int>( sBigMax) == (sizeof(UIntT) > sizeof(unsigned long int) ? 
ULONG_MAX : LONG_MAX)); // saturated depending on underlying types { [[maybe_unused]] std::same_as<unsigned long int> decltype(auto) _ = std::saturate_cast<unsigned long int>(uBigMax); } assert(std::saturate_cast<unsigned long int>( uZero) == 0UL); diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 8f85929..917f88f 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -54,6 +54,7 @@ enum class EmitKind { Obj, LLVM, ASM }; struct Export { StringRef name; // N in /export:N or /export:E=N StringRef extName; // E in /export:E=N + StringRef exportAs; // E in /export:N,EXPORTAS,E StringRef aliasTarget; // GNU specific: N in "alias == N" Symbol *sym = nullptr; uint16_t ordinal = 0; @@ -73,10 +74,9 @@ struct Export { StringRef exportName; // Name in DLL bool operator==(const Export &e) const { - return (name == e.name && extName == e.extName && - aliasTarget == e.aliasTarget && - ordinal == e.ordinal && noname == e.noname && - data == e.data && isPrivate == e.isPrivate); + return (name == e.name && extName == e.extName && exportAs == e.exportAs && + aliasTarget == e.aliasTarget && ordinal == e.ordinal && + noname == e.noname && data == e.data && isPrivate == e.isPrivate); } }; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 1814929..2b1d4ab 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -945,6 +945,7 @@ void LinkerDriver::createImportLibrary(bool asLib) { e2.Name = std::string(e1.name); e2.SymbolName = std::string(e1.symbolName); e2.ExtName = std::string(e1.extName); + e2.ExportAs = std::string(e1.exportAs); e2.AliasTarget = std::string(e1.aliasTarget); e2.Ordinal = e1.ordinal; e2.Noname = e1.noname; @@ -1044,6 +1045,7 @@ void LinkerDriver::parseModuleDefs(StringRef path) { e2.name = saver().save(e1.Name); e2.extName = saver().save(e1.ExtName); } + e2.exportAs = saver().save(e1.ExportAs); e2.aliasTarget = saver().save(e1.AliasTarget); e2.ordinal = e1.Ordinal; e2.noname = e1.Noname; diff --git a/lld/COFF/DriverUtils.cpp 
b/lld/COFF/DriverUtils.cpp index 0fa4769..b4ff31a 100644 --- a/lld/COFF/DriverUtils.cpp +++ b/lld/COFF/DriverUtils.cpp @@ -585,7 +585,8 @@ Export LinkerDriver::parseExport(StringRef arg) { } } - // Optional parameters "[,@ordinal[,NONAME]][,DATA][,PRIVATE]" + // Optional parameters + // "[,@ordinal[,NONAME]][,DATA][,PRIVATE][,EXPORTAS,exportname]" while (!rest.empty()) { StringRef tok; std::tie(tok, rest) = rest.split(","); @@ -607,6 +608,13 @@ Export LinkerDriver::parseExport(StringRef arg) { e.isPrivate = true; continue; } + if (tok.equals_insensitive("exportas")) { + if (!rest.empty() && !rest.contains(',')) + e.exportAs = rest; + else + error("invalid EXPORTAS value: " + rest); + break; + } if (tok.starts_with("@")) { int32_t ord; if (tok.substr(1).getAsInteger(0, ord)) @@ -683,7 +691,9 @@ void LinkerDriver::fixupExports() { } for (Export &e : ctx.config.exports) { - if (!e.forwardTo.empty()) { + if (!e.exportAs.empty()) { + e.exportName = e.exportAs; + } else if (!e.forwardTo.empty()) { e.exportName = undecorate(ctx, e.name); } else { e.exportName = undecorate(ctx, e.extName.empty() ? e.name : e.extName); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 33c5013..92f2e20 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1659,10 +1659,17 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { // original section/value pairs. For non-GOT non-PLT relocation case below, we // may alter section/value, so create a copy of the symbol to make // section/value fixed. + // + // Prior to Android V, there was a bug that caused RELR relocations to be + // applied after packed relocations. This meant that resolvers referenced by + // IRELATIVE relocations in the packed relocation section would read + // unrelocated globals with RELR relocations when + // --pack-relative-relocs=android+relr is enabled. Work around this by placing + // IRELATIVE in .rela.plt. 
auto *directSym = makeDefined(cast<Defined>(sym)); directSym->allocateAux(); - addPltEntry(*in.iplt, *in.igotPlt, *mainPart->relaDyn, target->iRelativeRel, - *directSym); + auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn; + addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym); sym.allocateAux(); symAux.back().pltIdx = symAux[directSym->auxIdx].pltIdx; diff --git a/lld/test/COFF/exportas.test b/lld/test/COFF/exportas.test index c0295c3..d70547c 100644 --- a/lld/test/COFF/exportas.test +++ b/lld/test/COFF/exportas.test @@ -9,6 +9,77 @@ RUN: lld-link -out:out1.dll -dll -noentry test.obj test.lib RUN: llvm-readobj --coff-imports out1.dll | FileCheck --check-prefix=IMPORT %s IMPORT: Symbol: expfunc +Pass -export argument with EXPORTAS. + +RUN: llvm-mc -filetype=obj -triple=x86_64-windows func.s -o func.obj +RUN: lld-link -out:out2.dll -dll -noentry func.obj -export:func,EXPORTAS,expfunc +RUN: llvm-readobj --coff-exports out2.dll | FileCheck --check-prefix=EXPORT %s +EXPORT: Name: expfunc + +RUN: llvm-readobj out2.lib | FileCheck --check-prefix=IMPLIB %s +IMPLIB: Name type: export as +IMPLIB-NEXT: Export name: expfunc +IMPLIB-NEXT: Symbol: __imp_func +IMPLIB-NEXT: Symbol: func + +Use .drectve section with EXPORTAS. + +RUN: llvm-mc -filetype=obj -triple=x86_64-windows drectve.s -o drectve.obj +RUN: lld-link -out:out3.dll -dll -noentry func.obj drectve.obj +RUN: llvm-readobj --coff-exports out3.dll | FileCheck --check-prefix=EXPORT %s +RUN: llvm-readobj out3.lib | FileCheck --check-prefix=IMPLIB %s + +Use a .def file with EXPORTAS. + +RUN: lld-link -out:out4.dll -dll -noentry func.obj -def:test.def +RUN: llvm-readobj --coff-exports out4.dll | FileCheck --check-prefix=EXPORT %s +RUN: llvm-readobj out4.lib | FileCheck --check-prefix=IMPLIB %s + +Use a .def file with EXPORTAS in a forwarding export. 
+ +RUN: lld-link -out:out5.dll -dll -noentry func.obj -def:test2.def +RUN: llvm-readobj --coff-exports out5.dll | FileCheck --check-prefix=FORWARD-EXPORT %s +FORWARD-EXPORT: Export { +FORWARD-EXPORT-NEXT: Ordinal: 1 +FORWARD-EXPORT-NEXT: Name: expfunc +FORWARD-EXPORT-NEXT: ForwardedTo: otherdll.otherfunc +FORWARD-EXPORT-NEXT: } + +RUN: llvm-readobj out5.lib | FileCheck --check-prefix=FORWARD-IMPLIB %s +FORWARD-IMPLIB: Name type: export as +FORWARD-IMPLIB-NEXT: Export name: expfunc +FORWARD-IMPLIB-NEXT: Symbol: __imp_func +FORWARD-IMPLIB-NEXT: Symbol: func + +Pass -export argument with EXPORTAS in a forwarding export. + +RUN: lld-link -out:out6.dll -dll -noentry func.obj -export:func=otherdll.otherfunc,EXPORTAS,expfunc +RUN: llvm-readobj --coff-exports out6.dll | FileCheck --check-prefix=FORWARD-EXPORT %s +RUN: llvm-readobj out6.lib | FileCheck --check-prefix=FORWARD-IMPLIB %s + +Pass -export argument with EXPORTAS in a data export. + +RUN: lld-link -out:out7.dll -dll -noentry func.obj -export:func,DATA,@5,EXPORTAS,expfunc +RUN: llvm-readobj --coff-exports out7.dll | FileCheck --check-prefix=ORD %s +ORD: Ordinal: 5 +ORD-NEXT: Name: expfunc + +RUN: llvm-readobj out7.lib | FileCheck --check-prefix=ORD-IMPLIB %s +ORD-IMPLIB: Type: data +ORD-IMPLIB-NEXT: Name type: export as +ORD-IMPLIB-NEXT: Export name: expfunc +ORD-IMPLIB-NEXT: Symbol: __imp_func + +Check invalid EXPORTAS syntax. 
+ +RUN: not lld-link -out:err1.dll -dll -noentry func.obj -export:func,EXPORTAS, 2>&1 | \ +RUN: FileCheck --check-prefix=ERR1 %s +ERR1: error: invalid EXPORTAS value: {{$}} + +RUN: not lld-link -out:err2.dll -dll -noentry func.obj -export:func,EXPORTAS,expfunc,DATA 2>&1 | \ +RUN: FileCheck --check-prefix=ERR2 %s +ERR2: error: invalid EXPORTAS value: expfunc,DATA + #--- test.s .section ".test", "rd" .rva __imp_func @@ -17,3 +88,20 @@ IMPORT: Symbol: expfunc LIBRARY test.dll EXPORTS func EXPORTAS expfunc + +#--- test2.def +LIBRARY test.dll +EXPORTS + func=otherdll.otherfunc EXPORTAS expfunc + +#--- func.s + .text + .globl func + .p2align 2, 0x0 +func: + movl $1, %eax + retq + +#--- drectve.s + .section .drectve, "yn" + .ascii " -export:func,EXPORTAS,expfunc" diff --git a/lld/test/ELF/pack-dyn-relocs-ifunc.s b/lld/test/ELF/pack-dyn-relocs-ifunc.s new file mode 100644 index 0000000..6168d06 --- /dev/null +++ b/lld/test/ELF/pack-dyn-relocs-ifunc.s @@ -0,0 +1,49 @@ +# REQUIRES: aarch64 +## Prior to Android V, there was a bug that caused RELR relocations to be +## applied after packed relocations. This meant that resolvers referenced by +## IRELATIVE relocations in the packed relocation section would read unrelocated +## globals when --pack-relative-relocs=android+relr is enabled. Work around this +## by placing IRELATIVE in .rela.plt. 
+ +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-android a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-android b.s -o b.o +# RUN: ld.lld -shared b.o -o b.so +# RUN: ld.lld -pie --pack-dyn-relocs=android+relr -z separate-loadable-segments a.o b.so -o a +# RUN: llvm-readobj -r a | FileCheck %s +# RUN: llvm-objdump -d a | FileCheck %s --check-prefix=ASM + +# CHECK: .relr.dyn { +# CHECK-NEXT: 0x30000 R_AARCH64_RELATIVE - +# CHECK-NEXT: } +# CHECK: .rela.plt { +# CHECK-NEXT: 0x30020 R_AARCH64_JUMP_SLOT bar 0x0 +# CHECK-NEXT: 0x30028 R_AARCH64_IRELATIVE - 0x10000 +# CHECK-NEXT: } + +# ASM: <.iplt>: +# ASM-NEXT: adrp x16, 0x30000 +# ASM-NEXT: ldr x17, [x16, #0x28] +# ASM-NEXT: add x16, x16, #0x28 +# ASM-NEXT: br x17 + +#--- a.s +.text +.type foo, %gnu_indirect_function +.globl foo +foo: + ret + +.globl _start +_start: + bl foo + bl bar + +.data +.balign 8 +.quad .data + +#--- b.s +.globl bar +bar: + ret diff --git a/lldb/include/lldb/Symbol/UnwindTable.h b/lldb/include/lldb/Symbol/UnwindTable.h index f0ce704..26826e5 100644 --- a/lldb/include/lldb/Symbol/UnwindTable.h +++ b/lldb/include/lldb/Symbol/UnwindTable.h @@ -57,6 +57,10 @@ public: ArchSpec GetArchitecture(); + /// Called after a SymbolFile has been added to a Module to add any new + /// unwind sections that may now be available. 
+ void Update(); + private: void Dump(Stream &s); diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp index a520523..9c105b3 100644 --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -1009,6 +1009,8 @@ SymbolFile *Module::GetSymbolFile(bool can_create, Stream *feedback_strm) { m_symfile_up.reset( SymbolVendor::FindPlugin(shared_from_this(), feedback_strm)); m_did_load_symfile = true; + if (m_unwind_table) + m_unwind_table->Update(); } } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 5f67658..1164bc6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -693,6 +693,7 @@ llvm::DWARFDebugAbbrev *SymbolFileDWARF::DebugAbbrev() { if (debug_abbrev_data.GetByteSize() == 0) return nullptr; + ElapsedTime elapsed(m_parse_time); auto abbr = std::make_unique<llvm::DWARFDebugAbbrev>(debug_abbrev_data.GetAsLLVM()); llvm::Error error = abbr->parse(); diff --git a/lldb/source/Symbol/UnwindTable.cpp b/lldb/source/Symbol/UnwindTable.cpp index 3c1a518..11bedf3 100644 --- a/lldb/source/Symbol/UnwindTable.cpp +++ b/lldb/source/Symbol/UnwindTable.cpp @@ -84,6 +84,51 @@ void UnwindTable::Initialize() { } } +void UnwindTable::Update() { + if (!m_initialized) + return Initialize(); + + std::lock_guard<std::mutex> guard(m_mutex); + + ObjectFile *object_file = m_module.GetObjectFile(); + if (!object_file) + return; + + if (!m_object_file_unwind_up) + m_object_file_unwind_up = object_file->CreateCallFrameInfo(); + + SectionList *sl = m_module.GetSectionList(); + if (!sl) + return; + + SectionSP sect = sl->FindSectionByType(eSectionTypeEHFrame, true); + if (!m_eh_frame_up && sect) { + m_eh_frame_up = std::make_unique<DWARFCallFrameInfo>( + *object_file, sect, DWARFCallFrameInfo::EH); + } + + sect = sl->FindSectionByType(eSectionTypeDWARFDebugFrame, true); + if (!m_debug_frame_up && 
sect) { + m_debug_frame_up = std::make_unique<DWARFCallFrameInfo>( + *object_file, sect, DWARFCallFrameInfo::DWARF); + } + + sect = sl->FindSectionByType(eSectionTypeCompactUnwind, true); + if (!m_compact_unwind_up && sect) { + m_compact_unwind_up = + std::make_unique<CompactUnwindInfo>(*object_file, sect); + } + + sect = sl->FindSectionByType(eSectionTypeARMexidx, true); + if (!m_arm_unwind_up && sect) { + SectionSP sect_extab = sl->FindSectionByType(eSectionTypeARMextab, true); + if (sect_extab.get()) { + m_arm_unwind_up = + std::make_unique<ArmUnwindInfo>(*object_file, sect, sect_extab); + } + } +} + UnwindTable::~UnwindTable() = default; std::optional<AddressRange> diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 3af62f5..03a74f2 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1800,7 +1800,6 @@ void StackFrame::DumpUsingSettingsFormat(Stream *strm, bool show_unique, return; ExecutionContext exe_ctx(shared_from_this()); - StreamString s; const FormatEntity::Entry *frame_format = nullptr; Target *target = exe_ctx.GetTargetPtr(); diff --git a/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test b/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test new file mode 100644 index 0000000..5420213 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test @@ -0,0 +1,27 @@ +# TODO: When it's possible to run "image show-unwind" without a running +# process, we can remove the unsupported line below, and hard-code an ELF +# triple in the test. 
+# UNSUPPORTED: system-windows, system-darwin + +# RUN: cd %T +# RUN: %clang_host %S/Inputs/target-symbols-add-unwind.c -g \ +# RUN: -fno-unwind-tables -fno-asynchronous-unwind-tables \ +# RUN: -o target-symbols-add-unwind.debug +# RUN: llvm-objcopy --strip-debug target-symbols-add-unwind.debug \ +# RUN: target-symbols-add-unwind.stripped +# RUN: %lldb target-symbols-add-unwind.stripped -s %s -o quit | FileCheck %s + +process launch --stop-at-entry +image show-unwind -n main +# CHECK-LABEL: image show-unwind -n main +# CHECK-NOT: debug_frame UnwindPlan: + +target symbols add -s target-symbols-add-unwind.stripped target-symbols-add-unwind.debug +# CHECK-LABEL: target symbols add +# CHECK: symbol file {{.*}} has been added to {{.*}} + +image show-unwind -n main +# CHECK-LABEL: image show-unwind -n main +# CHECK: debug_frame UnwindPlan: +# CHECK-NEXT: This UnwindPlan originally sourced from DWARF CFI +# CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes. diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 030b769..a4be315 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -12815,10 +12815,11 @@ Variable argument support is defined in LLVM with the functions. These functions are related to the similarly named macros defined in the ``<stdarg.h>`` header file. -All of these functions operate on arguments that use a target-specific +All of these functions take as arguments pointers to a target-specific value type "``va_list``". The LLVM assembly language reference manual does not define what this type is, so all transformations should be -prepared to handle these functions regardless of the type used. +prepared to handle these functions regardless of the type used. The intrinsics +are overloaded, and can be used for pointers to different address spaces. This example shows how the :ref:`va_arg <i_va_arg>` instruction and the variable argument handling intrinsic functions are used. 
@@ -12835,24 +12836,24 @@ variable argument handling intrinsic functions are used. define i32 @test(i32 %X, ...) { ; Initialize variable argument processing %ap = alloca %struct.va_list - call void @llvm.va_start(ptr %ap) + call void @llvm.va_start.p0(ptr %ap) ; Read a single integer argument %tmp = va_arg ptr %ap, i32 ; Demonstrate usage of llvm.va_copy and llvm.va_end %aq = alloca ptr - call void @llvm.va_copy(ptr %aq, ptr %ap) - call void @llvm.va_end(ptr %aq) + call void @llvm.va_copy.p0(ptr %aq, ptr %ap) + call void @llvm.va_end.p0(ptr %aq) ; Stop processing of arguments. - call void @llvm.va_end(ptr %ap) + call void @llvm.va_end.p0(ptr %ap) ret i32 %tmp } - declare void @llvm.va_start(ptr) - declare void @llvm.va_copy(ptr, ptr) - declare void @llvm.va_end(ptr) + declare void @llvm.va_start.p0(ptr) + declare void @llvm.va_copy.p0(ptr, ptr) + declare void @llvm.va_end.p0(ptr) .. _int_va_start: @@ -12864,7 +12865,8 @@ Syntax: :: - declare void @llvm.va_start(ptr <arglist>) + declare void @llvm.va_start.p0(ptr <arglist>) + declare void @llvm.va_start.p5(ptr addrspace(5) <arglist>) Overview: """"""""" @@ -12896,7 +12898,8 @@ Syntax: :: - declare void @llvm.va_end(ptr <arglist>) + declare void @llvm.va_end.p0(ptr <arglist>) + declare void @llvm.va_end.p5(ptr addrspace(5) <arglist>) Overview: """"""""" @@ -12929,7 +12932,8 @@ Syntax: :: - declare void @llvm.va_copy(ptr <destarglist>, ptr <srcarglist>) + declare void @llvm.va_copy.p0(ptr <destarglist>, ptr <srcarglist>) + declare void @llvm.va_copy.p5(ptr addrspace(5) <destarglist>, ptr addrspace(5) <srcarglist>) Overview: """"""""" @@ -12942,6 +12946,7 @@ Arguments: The first argument is a pointer to a ``va_list`` element to initialize. The second argument is a pointer to a ``va_list`` element to copy from. +The address spaces of the two arguments must match. 
Semantics: """""""""" diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index c2b1a9d..7588048 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -76,6 +76,7 @@ Changes to the AMDGPU Backend Changes to the ARM Backend -------------------------- +* FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options. Changes to the AVR Backend -------------------------- diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 5863a8d..65ccb1b 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -181,6 +181,7 @@ enum Kind { kw_tailcc, kw_m68k_rtdcc, kw_graalcc, + kw_riscv_vector_cc, // Attributes: kw_attributes, diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def index e70b58d..d8927c6 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.def +++ b/llvm/include/llvm/BinaryFormat/Dwarf.def @@ -1040,6 +1040,7 @@ HANDLE_DW_CC(0xca, LLVM_PreserveAll) HANDLE_DW_CC(0xcb, LLVM_X86RegCall) HANDLE_DW_CC(0xcc, LLVM_M68kRTD) HANDLE_DW_CC(0xcd, LLVM_PreserveNone) +HANDLE_DW_CC(0xce, LLVM_RISCVVectorCall) // From GCC source code (include/dwarf2.h): This DW_CC_ value is not currently // generated by any toolchain. It is used internally to GDB to indicate OpenCL // C functions that have been compiled with the IBM XL C for OpenCL compiler and diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h index ef8aaf5..a05d1a4 100644 --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -264,6 +264,9 @@ namespace CallingConv { /// except that the first parameter is mapped to x9. ARM64EC_Thunk_Native = 109, + /// Calling convention used for RISC-V V-extension. + RISCV_VectorCall = 110, + /// The highest possible ID. Must be some 2^k - 1. 
MaxID = 1023 }; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index d0ef9c2..7649024 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -700,10 +700,13 @@ class MSBuiltin<string name> { //===--------------- Variable Argument Handling Intrinsics ----------------===// // -def int_vastart : DefaultAttrsIntrinsic<[], [llvm_ptr_ty], [], "llvm.va_start">; -def int_vacopy : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], - "llvm.va_copy">; -def int_vaend : DefaultAttrsIntrinsic<[], [llvm_ptr_ty], [], "llvm.va_end">; +def int_vastart : DefaultAttrsIntrinsic<[], + [llvm_anyptr_ty], [], "llvm.va_start">; +def int_vacopy : DefaultAttrsIntrinsic<[], + [llvm_anyptr_ty, LLVMMatchType<0>], [], + "llvm.va_copy">; +def int_vaend : DefaultAttrsIntrinsic<[], + [llvm_anyptr_ty], [], "llvm.va_end">; //===------------------- Garbage Collection Intrinsics --------------------===// // diff --git a/llvm/include/llvm/IR/Verifier.h b/llvm/include/llvm/IR/Verifier.h index b25f8eb..b7db6e0 100644 --- a/llvm/include/llvm/IR/Verifier.h +++ b/llvm/include/llvm/IR/Verifier.h @@ -77,6 +77,7 @@ public: /// Visit an instruction and return true if it is valid, return false if an /// invalid TBAA is attached. bool visitTBAAMetadata(Instruction &I, const MDNode *MD); + bool visitTBAAStructMetadata(Instruction &I, const MDNode *MD); }; /// Check a function for errors, useful for use when debugging a diff --git a/llvm/include/llvm/Object/GOFF.h b/llvm/include/llvm/Object/GOFF.h index 9176245..9fb8876 100644 --- a/llvm/include/llvm/Object/GOFF.h +++ b/llvm/include/llvm/Object/GOFF.h @@ -73,6 +73,26 @@ protected: } }; +class TXTRecord : public Record { +public: + /// \brief Maximum length of data; any more must go in continuation. 
+ static const uint8_t TXTMaxDataLength = 56; + + static Error getData(const uint8_t *Record, SmallString<256> &CompleteData); + + static void getElementEsdId(const uint8_t *Record, uint32_t &EsdId) { + get<uint32_t>(Record, 4, EsdId); + } + + static void getOffset(const uint8_t *Record, uint32_t &Offset) { + get<uint32_t>(Record, 12, Offset); + } + + static void getDataLength(const uint8_t *Record, uint16_t &Length) { + get<uint16_t>(Record, 22, Length); + } +}; + class HDRRecord : public Record { public: static Error getData(const uint8_t *Record, SmallString<256> &CompleteData); diff --git a/llvm/include/llvm/Object/GOFFObjectFile.h b/llvm/include/llvm/Object/GOFFObjectFile.h index 7e1ceb9..6871641 100644 --- a/llvm/include/llvm/Object/GOFFObjectFile.h +++ b/llvm/include/llvm/Object/GOFFObjectFile.h @@ -29,7 +29,10 @@ namespace llvm { namespace object { class GOFFObjectFile : public ObjectFile { + friend class GOFFSymbolRef; + IndexedMap<const uint8_t *> EsdPtrs; // Indexed by EsdId. + SmallVector<const uint8_t *, 256> TextPtrs; mutable DenseMap<uint32_t, std::pair<size_t, std::unique_ptr<char[]>>> EsdNamesCache; @@ -38,7 +41,7 @@ class GOFFObjectFile : public ObjectFile { // (EDID, 0) code, r/o data section // (EDID,PRID) r/w data section SmallVector<SectionEntryImpl, 256> SectionList; - mutable DenseMap<uint32_t, std::string> SectionDataCache; + mutable DenseMap<uint32_t, SmallVector<uint8_t>> SectionDataCache; public: Expected<StringRef> getSymbolName(SymbolRef Symbol) const; @@ -66,6 +69,10 @@ public: return true; } + bool isSectionNoLoad(DataRefImpl Sec) const; + bool isSectionReadOnlyData(DataRefImpl Sec) const; + bool isSectionZeroInit(DataRefImpl Sec) const; + private: // SymbolRef. 
Expected<StringRef> getSymbolName(DataRefImpl Symb) const override; @@ -75,27 +82,24 @@ private: Expected<uint32_t> getSymbolFlags(DataRefImpl Symb) const override; Expected<SymbolRef::Type> getSymbolType(DataRefImpl Symb) const override; Expected<section_iterator> getSymbolSection(DataRefImpl Symb) const override; + uint64_t getSymbolSize(DataRefImpl Symb) const; const uint8_t *getSymbolEsdRecord(DataRefImpl Symb) const; bool isSymbolUnresolved(DataRefImpl Symb) const; bool isSymbolIndirect(DataRefImpl Symb) const; // SectionRef. - void moveSectionNext(DataRefImpl &Sec) const override {} - virtual Expected<StringRef> getSectionName(DataRefImpl Sec) const override { - return StringRef(); - } - uint64_t getSectionAddress(DataRefImpl Sec) const override { return 0; } - uint64_t getSectionSize(DataRefImpl Sec) const override { return 0; } + void moveSectionNext(DataRefImpl &Sec) const override; + virtual Expected<StringRef> getSectionName(DataRefImpl Sec) const override; + uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionSize(DataRefImpl Sec) const override; virtual Expected<ArrayRef<uint8_t>> - getSectionContents(DataRefImpl Sec) const override { - return ArrayRef<uint8_t>(); - } - uint64_t getSectionIndex(DataRefImpl Sec) const override { return 0; } - uint64_t getSectionAlignment(DataRefImpl Sec) const override { return 0; } + getSectionContents(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override { return Sec.d.a; } + uint64_t getSectionAlignment(DataRefImpl Sec) const override; bool isSectionCompressed(DataRefImpl Sec) const override { return false; } - bool isSectionText(DataRefImpl Sec) const override { return false; } - bool isSectionData(DataRefImpl Sec) const override { return false; } + bool isSectionText(DataRefImpl Sec) const override; + bool isSectionData(DataRefImpl Sec) const override; bool isSectionBSS(DataRefImpl Sec) const override { return false; } bool 
isSectionVirtual(DataRefImpl Sec) const override { return false; } relocation_iterator section_rel_begin(DataRefImpl Sec) const override { @@ -109,6 +113,7 @@ private: const uint8_t *getSectionPrEsdRecord(DataRefImpl &Sec) const; const uint8_t *getSectionEdEsdRecord(uint32_t SectionIndex) const; const uint8_t *getSectionPrEsdRecord(uint32_t SectionIndex) const; + uint32_t getSectionDefEsdId(DataRefImpl &Sec) const; // RelocationRef. void moveRelocationNext(DataRefImpl &Rel) const override {} @@ -122,6 +127,29 @@ private: SmallVectorImpl<char> &Result) const override {} }; +class GOFFSymbolRef : public SymbolRef { +public: + GOFFSymbolRef(const SymbolRef &B) : SymbolRef(B) { + assert(isa<GOFFObjectFile>(SymbolRef::getObject())); + } + + const GOFFObjectFile *getObject() const { + return cast<GOFFObjectFile>(BasicSymbolRef::getObject()); + } + + Expected<uint32_t> getSymbolGOFFFlags() const { + return getObject()->getSymbolFlags(getRawDataRefImpl()); + } + + Expected<SymbolRef::Type> getSymbolGOFFType() const { + return getObject()->getSymbolType(getRawDataRefImpl()); + } + + uint64_t getSize() const { + return getObject()->getSymbolSize(getRawDataRefImpl()); + } +}; + } // namespace object } // namespace llvm diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 02f64fc..2301a27 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -640,6 +640,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(tailcc); KEYWORD(m68k_rtdcc); KEYWORD(graalcc); + KEYWORD(riscv_vector_cc); KEYWORD(cc); KEYWORD(c); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index f0be021..41d48e5 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2143,6 +2143,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'tailcc' /// ::= 'm68k_rtdcc' /// ::= 'graalcc' +/// ::= 'riscv_vector_cc' /// ::= 'cc' UINT /// bool LLParser::parseOptionalCallingConv(unsigned 
&CC) { @@ -2213,6 +2214,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { case lltok::kw_tailcc: CC = CallingConv::Tail; break; case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break; case lltok::kw_graalcc: CC = CallingConv::GRAAL; break; + case lltok::kw_riscv_vector_cc: + CC = CallingConv::RISCV_VectorCall; + break; case lltok::kw_cc: { Lex.Lex(); return parseUInt32(CC); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 39ee95d..36abe27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2555,7 +2555,8 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into /// a shift and add with a different constant. -static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { +static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Expecting add or sub"); @@ -2583,7 +2584,6 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { // Eliminate the 'not' by adjusting the shift and add/sub constant: // add (srl (not X), 31), C --> add (sra X, 31), (C + 1) // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1) - SDLoc DL(N); if (SDValue NewC = DAG.FoldConstantArithmetic( IsAdd ? 
ISD::ADD : ISD::SUB, DL, VT, {ConstantOp, DAG.getConstant(1, DL, VT)})) { @@ -2878,7 +2878,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (SDValue V = foldAddSubBoolOfMaskedVal(N, DL, DAG)) return V; - if (SDValue V = foldAddSubOfSignBit(N, DAG)) + if (SDValue V = foldAddSubOfSignBit(N, DL, DAG)) return V; // Try to match AVGFLOOR fixedwidth pattern @@ -3877,14 +3877,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (SDValue V = foldAddSubBoolOfMaskedVal(N, DL, DAG)) return V; - if (SDValue V = foldAddSubOfSignBit(N, DAG)) + if (SDValue V = foldAddSubOfSignBit(N, DL, DAG)) return V; // Try to match AVGCEIL fixedwidth pattern if (SDValue V = foldSubToAvg(N, DL)) return V; - if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N))) + if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, DL)) return V; if (SDValue V = foldSubToUSubSat(VT, N, DL)) @@ -3949,7 +3949,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) if (C->getAPIntValue() == (BitWidth - 1)) - return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); + return DAG.getNode(ISD::ABS, DL, VT, S0); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b1f6fd6..e10b8bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -118,14 +118,7 @@ private: void LegalizeLoadOps(SDNode *Node); void LegalizeStoreOps(SDNode *Node); - /// Some targets cannot handle a variable - /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it - /// is necessary to spill the vector being inserted into to memory, perform - /// the insert there, and then read the result back. 
- SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, - const SDLoc &dl); - SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, - const SDLoc &dl); + SDValue ExpandINSERT_VECTOR_ELT(SDValue Op); /// Return a vector shuffle operation which /// performs the same shuffe in terms of order or result bytes, but on a type @@ -378,45 +371,12 @@ SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { return Result; } -/// Some target cannot handle a variable insertion index for the -/// INSERT_VECTOR_ELT instruction. In this case, it -/// is necessary to spill the vector being inserted into to memory, perform -/// the insert there, and then read the result back. -SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, - SDValue Val, - SDValue Idx, - const SDLoc &dl) { - // If the target doesn't support this, we have to spill the input vector - // to a temporary stack slot, update the element, then reload it. This is - // badness. We could also load the value into a vector register (either - // with a "move to register" or "extload into register" instruction, then - // permute it into place, if the idx is a constant and if the idx is - // supported by the target. - EVT VT = Vec.getValueType(); - EVT EltVT = VT.getVectorElementType(); - SDValue StackPtr = DAG.CreateStackTemporary(VT); - - int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - - // Store the vector. - SDValue Ch = DAG.getStore( - DAG.getEntryNode(), dl, Vec, StackPtr, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); - - SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Idx); - - // Store the scalar value. - Ch = DAG.getTruncStore( - Ch, dl, Val, StackPtr2, - MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT); - // Load the updated vector. 
- return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), SPFI)); -} +SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Op) { + SDValue Vec = Op.getOperand(0); + SDValue Val = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + SDLoc dl(Op); -SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, - SDValue Idx, - const SDLoc &dl) { if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) { // SCALAR_TO_VECTOR requires that the type of the value being inserted // match the element type of the vector being created, except for @@ -438,7 +398,7 @@ SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps); } } - return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl); + return ExpandInsertToVectorThroughStack(Op); } SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { @@ -1486,7 +1446,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { // Store the value to a temporary stack slot, then LOAD the returned part. EVT VecVT = Vec.getValueType(); - EVT SubVecVT = Part.getValueType(); + EVT PartVT = Part.getValueType(); SDValue StackPtr = DAG.CreateStackTemporary(VecVT); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = @@ -1496,13 +1456,24 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Then store the inserted part. - SDValue SubStackPtr = - TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx); + if (PartVT.isVector()) { + SDValue SubStackPtr = + TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, PartVT, Idx); + + // Store the subvector. 
+ Ch = DAG.getStore( + Ch, dl, Part, SubStackPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + } else { + SDValue SubStackPtr = + TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); - // Store the subvector. - Ch = DAG.getStore( - Ch, dl, Part, SubStackPtr, - MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + // Store the scalar value. + Ch = DAG.getTruncStore( + Ch, dl, Part, SubStackPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), + VecVT.getVectorElementType()); + } // Finally, load the updated vector. return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo); @@ -3416,9 +3387,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(ExpandSCALAR_TO_VECTOR(Node)); break; case ISD::INSERT_VECTOR_ELT: - Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0), - Node->getOperand(1), - Node->getOperand(2), dl)); + Results.push_back(ExpandINSERT_VECTOR_ELT(SDValue(Node, 0))); break; case ISD::VECTOR_SHUFFLE: { SmallVector<int, 32> NewMask; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9990556..b16e78d 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2073,7 +2073,8 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { // FreeBSD has "__stack_chk_guard" defined externally on libc.so if (M.getDirectAccessExternalData() && !TM.getTargetTriple().isWindowsGNUEnvironment() && - !TM.getTargetTriple().isOSFreeBSD() && + !(TM.getTargetTriple().isPPC64() && + TM.getTargetTriple().isOSFreeBSD()) && (!TM.getTargetTriple().isOSDarwin() || TM.getRelocationModel() == Reloc::Static)) GV->setDSOLocal(true); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 38c191a..84690f0 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -363,6 +363,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case 
CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break; case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break; case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break; + case CallingConv::RISCV_VectorCall: + Out << "riscv_vector_cc"; + break; } } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 33f3584..e165725 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5096,6 +5096,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa)) TBAAVerifyHelper.visitTBAAMetadata(I, TBAA); + if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa_struct)) + TBAAVerifyHelper.visitTBAAStructMetadata(I, TBAA); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_noalias)) visitAliasScopeListMetadata(MD); if (MDNode *MD = I.getMetadata(LLVMContext::MD_alias_scope)) @@ -7419,6 +7422,35 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { return true; } +bool TBAAVerifier::visitTBAAStructMetadata(Instruction &I, const MDNode *MD) { + CheckTBAA(MD->getNumOperands() % 3 == 0, + "tbaa.struct operands must occur in groups of three", &I, MD); + + // Each group of three operands must consist of two integers and a + // tbaa node. Moreover, the regions described by the offset and size + // operands must be non-overlapping. 
+ std::optional<APInt> NextFree; + for (unsigned int Idx = 0; Idx < MD->getNumOperands(); Idx += 3) { + auto *OffsetCI = + mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx)); + CheckTBAA(OffsetCI, "Offset must be a constant integer", &I, MD); + + auto *SizeCI = + mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx + 1)); + CheckTBAA(SizeCI, "Size must be a constant integer", &I, MD); + + MDNode *TBAA = dyn_cast_or_null<MDNode>(MD->getOperand(Idx + 2)); + CheckTBAA(TBAA, "TBAA tag missing", &I, MD); + visitTBAAMetadata(I, TBAA); + + bool NonOverlapping = !NextFree || NextFree->ule(OffsetCI->getValue()); + CheckTBAA(NonOverlapping, "Overlapping tbaa.struct regions", &I, MD); + + NextFree = OffsetCI->getValue() + SizeCI->getValue(); + } + return true; +} + char VerifierLegacyPass::ID = 0; INITIALIZE_PASS(VerifierLegacyPass, "verify", "Module Verifier", false, false) diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index 8224a14..477c5bf 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -690,12 +690,12 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, if (ImportType == IMPORT_CODE && isArm64EC(M)) { if (std::optional<std::string> MangledName = getArm64ECMangledFunctionName(Name)) { - if (ExportName.empty()) { + if (!E.Noname && ExportName.empty()) { NameType = IMPORT_NAME_EXPORTAS; ExportName.swap(Name); } Name = std::move(*MangledName); - } else if (ExportName.empty()) { + } else if (!E.Noname && ExportName.empty()) { NameType = IMPORT_NAME_EXPORTAS; ExportName = std::move(*getArm64ECDemangledFunctionName(Name)); } diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp index 76a1355..2845d93 100644 --- a/llvm/lib/Object/GOFFObjectFile.cpp +++ b/llvm/lib/Object/GOFFObjectFile.cpp @@ -168,6 +168,11 @@ GOFFObjectFile::GOFFObjectFile(MemoryBufferRef Object, Error &Err) LLVM_DEBUG(dbgs() << " -- ESD " << EsdId << "\n"); break; } + 
case GOFF::RT_TXT: + // Save TXT records. + TextPtrs.emplace_back(I); + LLVM_DEBUG(dbgs() << " -- TXT\n"); + break; case GOFF::RT_END: LLVM_DEBUG(dbgs() << " -- END (GOFF record type) unhandled\n"); break; @@ -364,6 +369,13 @@ GOFFObjectFile::getSymbolSection(DataRefImpl Symb) const { std::to_string(SymEdId)); } +uint64_t GOFFObjectFile::getSymbolSize(DataRefImpl Symb) const { + const uint8_t *Record = getSymbolEsdRecord(Symb); + uint32_t Length; + ESDRecord::getLength(Record, Length); + return Length; +} + const uint8_t *GOFFObjectFile::getSectionEdEsdRecord(DataRefImpl &Sec) const { SectionEntryImpl EsdIds = SectionList[Sec.d.a]; const uint8_t *EsdRecord = EsdPtrs[EsdIds.d.a]; @@ -394,6 +406,154 @@ GOFFObjectFile::getSectionPrEsdRecord(uint32_t SectionIndex) const { return EsdRecord; } +uint32_t GOFFObjectFile::getSectionDefEsdId(DataRefImpl &Sec) const { + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + uint32_t Length; + ESDRecord::getLength(EsdRecord, Length); + if (Length == 0) { + const uint8_t *PrEsdRecord = getSectionPrEsdRecord(Sec); + if (PrEsdRecord) + EsdRecord = PrEsdRecord; + } + + uint32_t DefEsdId; + ESDRecord::getEsdId(EsdRecord, DefEsdId); + LLVM_DEBUG(dbgs() << "Got def EsdId: " << DefEsdId << '\n'); + return DefEsdId; +} + +void GOFFObjectFile::moveSectionNext(DataRefImpl &Sec) const { + Sec.d.a++; + if ((Sec.d.a) >= SectionList.size()) + Sec.d.a = 0; +} + +Expected<StringRef> GOFFObjectFile::getSectionName(DataRefImpl Sec) const { + DataRefImpl EdSym; + SectionEntryImpl EsdIds = SectionList[Sec.d.a]; + EdSym.d.a = EsdIds.d.a; + Expected<StringRef> Name = getSymbolName(EdSym); + if (Name) { + StringRef Res = *Name; + LLVM_DEBUG(dbgs() << "Got section: " << Res << '\n'); + LLVM_DEBUG(dbgs() << "Final section name: " << Res << '\n'); + Name = Res; + } + return Name; +} + +uint64_t GOFFObjectFile::getSectionAddress(DataRefImpl Sec) const { + uint32_t Offset; + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + 
ESDRecord::getOffset(EsdRecord, Offset); + return Offset; +} + +uint64_t GOFFObjectFile::getSectionSize(DataRefImpl Sec) const { + uint32_t Length; + uint32_t DefEsdId = getSectionDefEsdId(Sec); + const uint8_t *EsdRecord = EsdPtrs[DefEsdId]; + ESDRecord::getLength(EsdRecord, Length); + LLVM_DEBUG(dbgs() << "Got section size: " << Length << '\n'); + return static_cast<uint64_t>(Length); +} + +// Unravel TXT records and expand fill characters to produce +// a contiguous sequence of bytes. +Expected<ArrayRef<uint8_t>> +GOFFObjectFile::getSectionContents(DataRefImpl Sec) const { + if (SectionDataCache.count(Sec.d.a)) { + auto &Buf = SectionDataCache[Sec.d.a]; + return ArrayRef<uint8_t>(Buf); + } + uint64_t SectionSize = getSectionSize(Sec); + uint32_t DefEsdId = getSectionDefEsdId(Sec); + + const uint8_t *EdEsdRecord = getSectionEdEsdRecord(Sec); + bool FillBytePresent; + ESDRecord::getFillBytePresent(EdEsdRecord, FillBytePresent); + uint8_t FillByte = '\0'; + if (FillBytePresent) + ESDRecord::getFillByteValue(EdEsdRecord, FillByte); + + // Initialize section with fill byte. + SmallVector<uint8_t> Data(SectionSize, FillByte); + + // Replace section with content from text records. 
+ for (const uint8_t *TxtRecordInt : TextPtrs) { + const uint8_t *TxtRecordPtr = TxtRecordInt; + uint32_t TxtEsdId; + TXTRecord::getElementEsdId(TxtRecordPtr, TxtEsdId); + LLVM_DEBUG(dbgs() << "Got txt EsdId: " << TxtEsdId << '\n'); + + if (TxtEsdId != DefEsdId) + continue; + + uint32_t TxtDataOffset; + TXTRecord::getOffset(TxtRecordPtr, TxtDataOffset); + + uint16_t TxtDataSize; + TXTRecord::getDataLength(TxtRecordPtr, TxtDataSize); + + LLVM_DEBUG(dbgs() << "Record offset " << TxtDataOffset << ", data size " + << TxtDataSize << "\n"); + + SmallString<256> CompleteData; + CompleteData.reserve(TxtDataSize); + if (Error Err = TXTRecord::getData(TxtRecordPtr, CompleteData)) + return std::move(Err); + assert(CompleteData.size() == TxtDataSize && "Wrong length of data"); + std::copy(CompleteData.data(), CompleteData.data() + TxtDataSize, + Data.begin() + TxtDataOffset); + } + SectionDataCache[Sec.d.a] = Data; + return ArrayRef<uint8_t>(Data); +} + +uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const { + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + GOFF::ESDAlignment Pow2Alignment; + ESDRecord::getAlignment(EsdRecord, Pow2Alignment); + return 1ULL << static_cast<uint64_t>(Pow2Alignment); +} + +bool GOFFObjectFile::isSectionText(DataRefImpl Sec) const { + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + GOFF::ESDExecutable Executable; + ESDRecord::getExecutable(EsdRecord, Executable); + return Executable == GOFF::ESD_EXE_CODE; +} + +bool GOFFObjectFile::isSectionData(DataRefImpl Sec) const { + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + GOFF::ESDExecutable Executable; + ESDRecord::getExecutable(EsdRecord, Executable); + return Executable == GOFF::ESD_EXE_DATA; +} + +bool GOFFObjectFile::isSectionNoLoad(DataRefImpl Sec) const { + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + GOFF::ESDLoadingBehavior LoadingBehavior; + ESDRecord::getLoadingBehavior(EsdRecord, LoadingBehavior); + return LoadingBehavior == 
GOFF::ESD_LB_NoLoad; +} + +bool GOFFObjectFile::isSectionReadOnlyData(DataRefImpl Sec) const { + if (!isSectionData(Sec)) + return false; + + const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec); + GOFF::ESDLoadingBehavior LoadingBehavior; + ESDRecord::getLoadingBehavior(EsdRecord, LoadingBehavior); + return LoadingBehavior == GOFF::ESD_LB_Initial; +} + +bool GOFFObjectFile::isSectionZeroInit(DataRefImpl Sec) const { + // GOFF uses fill characters and fill characters are applied + // on getSectionContents() - so we say false to zero init. + return false; +} + section_iterator GOFFObjectFile::section_begin() const { DataRefImpl Sec; moveSectionNext(Sec); @@ -476,6 +636,13 @@ Error ESDRecord::getData(const uint8_t *Record, return getContinuousData(Record, DataSize, 72, CompleteData); } +Error TXTRecord::getData(const uint8_t *Record, + SmallString<256> &CompleteData) { + uint16_t Length; + getDataLength(Record, Length); + return getContinuousData(Record, Length, 24, CompleteData); +} + Error ENDRecord::getData(const uint8_t *Record, SmallString<256> &CompleteData) { uint16_t Length = getNameLength(Record); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 72e8b59..052b231 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -22,6 +22,7 @@ #include "AMDKernelCodeT.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600AsmPrinter.h" #include "SIMachineFunctionInfo.h" @@ -428,38 +429,43 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( return KernelCodeProperties; } -amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( - const MachineFunction &MF, - const SIProgramInfo &PI) const { +MCKernelDescriptor +AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, + const SIProgramInfo 
&PI) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + MCContext &Ctx = MF.getContext(); - amdhsa::kernel_descriptor_t KernelDescriptor; - memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); + MCKernelDescriptor KernelDescriptor; assert(isUInt<32>(PI.ScratchSize)); assert(isUInt<32>(PI.getComputePGMRSrc1(STM))); assert(isUInt<32>(PI.getComputePGMRSrc2())); - KernelDescriptor.group_segment_fixed_size = PI.LDSSize; - KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; + KernelDescriptor.group_segment_fixed_size = + MCConstantExpr::create(PI.LDSSize, Ctx); + KernelDescriptor.private_segment_fixed_size = + MCConstantExpr::create(PI.ScratchSize, Ctx); Align MaxKernArgAlign; - KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); + KernelDescriptor.kernarg_size = MCConstantExpr::create( + STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); - KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM); - KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(); - KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + KernelDescriptor.compute_pgm_rsrc1 = + MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx); + KernelDescriptor.compute_pgm_rsrc2 = + MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx); + KernelDescriptor.kernel_code_properties = + MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx); assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); - if (STM.hasGFX90AInsts()) - KernelDescriptor.compute_pgm_rsrc3 = - CurrentProgramInfo.ComputePGMRSrc3GFX90A; + KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create( + STM.hasGFX90AInsts() ? 
CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx); - if (AMDGPU::hasKernargPreload(STM)) - KernelDescriptor.kernarg_preload = - static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs()); + KernelDescriptor.kernarg_preload = MCConstantExpr::create( + AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0, + Ctx); return KernelDescriptor; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 79326cd..b8b2718 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -28,15 +28,12 @@ class MCCodeEmitter; class MCOperand; namespace AMDGPU { +struct MCKernelDescriptor; namespace HSAMD { class MetadataStreamer; } } // namespace AMDGPU -namespace amdhsa { -struct kernel_descriptor_t; -} - class AMDGPUAsmPrinter final : public AsmPrinter { private: unsigned CodeObjectVersion; @@ -75,9 +72,9 @@ private: uint16_t getAmdhsaKernelCodeProperties( const MachineFunction &MF) const; - amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor( - const MachineFunction &MF, - const SIProgramInfo &PI) const; + AMDGPU::MCKernelDescriptor + getAmdhsaKernelDescriptor(const MachineFunction &MF, + const SIProgramInfo &PI) const; void initTargetStreamer(Module &M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 84b4ccc..5aa35bec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -657,6 +657,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) { return true; IRBuilder<> B(CI); + if (CI->isStrictFP()) + B.setIsFPConstrained(true); if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) { // Under unsafe-math, evaluate calls if possible. 
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4648df1..294fc68 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8,6 +8,7 @@ #include "AMDKernelCodeT.h" #include "MCTargetDesc/AMDGPUMCExpr.h" +#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" @@ -5417,7 +5418,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (getParser().parseIdentifier(KernelName)) return true; - kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI()); + AMDGPU::MCKernelDescriptor KD = + AMDGPU::MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor( + &getSTI(), getContext()); StringSet<> Seen; @@ -5457,89 +5460,111 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError(".amdhsa_ directives cannot be repeated"); SMLoc ValStart = getLoc(); - int64_t IVal; - if (getParser().parseAbsoluteExpression(IVal)) + const MCExpr *ExprVal; + if (getParser().parseExpression(ExprVal)) return true; SMLoc ValEnd = getLoc(); SMRange ValRange = SMRange(ValStart, ValEnd); - if (IVal < 0) - return OutOfRangeError(ValRange); - + int64_t IVal = 0; uint64_t Val = IVal; + bool EvaluatableExpr; + if ((EvaluatableExpr = ExprVal->evaluateAsAbsolute(IVal))) { + if (IVal < 0) + return OutOfRangeError(ValRange); + Val = IVal; + } #define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \ - if (!isUInt<ENTRY##_WIDTH>(VALUE)) \ + if (!isUInt<ENTRY##_WIDTH>(Val)) \ return OutOfRangeError(RANGE); \ - AMDHSA_BITS_SET(FIELD, ENTRY, VALUE); + AMDGPU::MCKernelDescriptor::bits_set(FIELD, VALUE, ENTRY##_SHIFT, ENTRY, \ + getContext()); + +// Some fields use the parsed value immediately which requires the expression to +// be solvable. 
+#define EXPR_RESOLVE_OR_ERROR(RESOLVED) \ + if (!(RESOLVED)) \ + return Error(IDRange.Start, "directive should have resolvable expression", \ + IDRange); if (ID == ".amdhsa_group_segment_fixed_size") { - if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val)) + if (!isUInt<sizeof(kernel_descriptor_t::group_segment_fixed_size) * + CHAR_BIT>(Val)) return OutOfRangeError(ValRange); - KD.group_segment_fixed_size = Val; + KD.group_segment_fixed_size = ExprVal; } else if (ID == ".amdhsa_private_segment_fixed_size") { - if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val)) + if (!isUInt<sizeof(kernel_descriptor_t::private_segment_fixed_size) * + CHAR_BIT>(Val)) return OutOfRangeError(ValRange); - KD.private_segment_fixed_size = Val; + KD.private_segment_fixed_size = ExprVal; } else if (ID == ".amdhsa_kernarg_size") { - if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val)) + if (!isUInt<sizeof(kernel_descriptor_t::kernarg_size) * CHAR_BIT>(Val)) return OutOfRangeError(ValRange); - KD.kernarg_size = Val; + KD.kernarg_size = ExprVal; } else if (ID == ".amdhsa_user_sgpr_count") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); ExplicitUserSGPRCount = Val; } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (hasArchitectedFlatScratch()) return Error(IDRange.Start, "directive is not supported with architected flat scratch", IDRange); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, - Val, ValRange); + ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (!hasKernargPreload()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); if (Val > getMaxNumUserSGPRs()) return OutOfRangeError(ValRange); - PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val, + PARSE_BITS_ENTRY(KD.kernarg_preload, 
KERNARG_PRELOAD_SPEC_LENGTH, ExprVal, ValRange); if (Val) { ImpliedUserSGPRCount += Val; PreloadLength = Val; } } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (!hasKernargPreload()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); if (Val >= 1024) return OutOfRangeError(ValRange); - PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val, + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, ExprVal, ValRange); if (Val) PreloadOffset = Val; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, - Val, ValRange); + ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 2; @@ -5548,34 +5573,39 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive is not supported with architected flat scratch", IDRange); + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, - 
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, - ValRange); + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, + ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, - Val, ValRange); + ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); EnableWavefrontSize32 = Val; PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, - Val, ValRange); + KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, ExprVal, + ValRange); } else if (ID == ".amdhsa_uses_dynamic_stack") { PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, Val, ValRange); + KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, ExprVal, + ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, "directive is not supported with architected flat scratch", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, ExprVal, + ValRange); } else if (ID == ".amdhsa_enable_private_segment") { if (!hasArchitectedFlatScratch()) return Error( @@ -5583,42 +5613,48 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { "directive is not supported without architected flat scratch", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, ExprVal, + ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 
Val, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, ExprVal, ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, ExprVal, ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, ExprVal, ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_info") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, ExprVal, ValRange); } else if (ID == ".amdhsa_system_vgpr_workitem_id") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val, + COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal, ValRange); } else if (ID == ".amdhsa_next_free_vgpr") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); VGPRRange = ValRange; NextFreeVGPR = Val; } else if (ID == ".amdhsa_next_free_sgpr") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); SGPRRange = ValRange; NextFreeSGPR = Val; } else if (ID == ".amdhsa_accum_offset") { if (!isGFX90A()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); AccumOffset = Val; } else if (ID == ".amdhsa_reserve_vcc") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); ReserveVCC = Val; } else if (ID == ".amdhsa_reserve_flat_scratch") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 7) return Error(IDRange.Start, "directive requires gfx7+", IDRange); if (hasArchitectedFlatScratch()) @@ -5638,97 +5674,105 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { IDRange); } else if (ID == ".amdhsa_float_round_mode_32") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange); + 
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, ExprVal, + ValRange); } else if (ID == ".amdhsa_float_round_mode_16_64") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange); + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, ExprVal, + ValRange); } else if (ID == ".amdhsa_float_denorm_mode_32") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange); + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, ExprVal, + ValRange); } else if (ID == ".amdhsa_float_denorm_mode_16_64") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val, + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, ExprVal, ValRange); } else if (ID == ".amdhsa_dx10_clamp") { if (IVersion.Major >= 12) return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Val, + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, ExprVal, ValRange); } else if (ID == ".amdhsa_ieee_mode") { if (IVersion.Major >= 12) return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Val, + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, ExprVal, ValRange); } else if (ID == ".amdhsa_fp16_overflow") { if (IVersion.Major < 9) return Error(IDRange.Start, "directive requires gfx9+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, ExprVal, ValRange); } else if (ID == ".amdhsa_tg_split") { if (!isGFX90A()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val, - ValRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + ExprVal, ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { if 
(IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal, ValRange); } else if (ID == ".amdhsa_memory_ordered") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, ExprVal, ValRange); } else if (ID == ".amdhsa_forward_progress") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, ExprVal, ValRange); } else if (ID == ".amdhsa_shared_vgpr_count") { + EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10 || IVersion.Major >= 12) return Error(IDRange.Start, "directive requires gfx10 or gfx11", IDRange); SharedVGPRCount = Val; PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, - COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, Val, + COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val, - ValRange); + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_denorm_src") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, - Val, ValRange); + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val, - ValRange); + 
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_overflow") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, - Val, ValRange); + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_underflow") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, - Val, ValRange); + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_inexact") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, - Val, ValRange); + ExprVal, ValRange); } else if (ID == ".amdhsa_exception_int_div_zero") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, - Val, ValRange); + ExprVal, ValRange); } else if (ID == ".amdhsa_round_robin_scheduling") { if (IVersion.Major < 12) return Error(IDRange.Start, "directive requires gfx12+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, Val, + COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, ExprVal, ValRange); } else { return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange); @@ -5755,15 +5799,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( VGPRBlocks)) return OutOfRangeError(VGPRRange); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks); + AMDGPU::MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()), + COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT, + COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext()); if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( SGPRBlocks)) return OutOfRangeError(SGPRRange); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, - 
SGPRBlocks); + AMDGPU::MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()), + COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT, + COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext()); if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount) return TokError("amdgpu_user_sgpr_count smaller than than implied by " @@ -5774,11 +5821,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) return TokError("too many user SGPRs enabled"); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, - UserSGPRCount); - - if (PreloadLength && KD.kernarg_size && - (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size)) + AMDGPU::MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc2, MCConstantExpr::create(UserSGPRCount, getContext()), + COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT, + COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, getContext()); + + int64_t IVal = 0; + if (!KD.kernarg_size->evaluateAsAbsolute(IVal)) + return TokError("Kernarg size should be resolvable"); + uint64_t kernarg_size = IVal; + if (PreloadLength && kernarg_size && + (PreloadLength * 4 + PreloadOffset * 4 > kernarg_size)) return TokError("Kernarg preload length + offset is larger than the " "kernarg segment size"); @@ -5790,8 +5843,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { "increments of 4"); if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4)) return TokError("accum_offset exceeds total VGPR allocation"); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, - (AccumOffset / 4 - 1)); + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc3, + MCConstantExpr::create(AccumOffset / 4 - 1, getContext()), + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext()); } if (IVersion.Major >= 10 && IVersion.Major < 12) { diff --git 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp new file mode 100644 index 0000000..77e7e30 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp @@ -0,0 +1,98 @@ +//===--- AMDHSAKernelDescriptor.h -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCKernelDescriptor.h" +#include "AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/TargetParser/TargetParser.h" + +using namespace llvm; +using namespace llvm::AMDGPU; + +MCKernelDescriptor +MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI, + MCContext &Ctx) { + IsaVersion Version = getIsaVersion(STI->getCPU()); + + MCKernelDescriptor KD; + const MCExpr *ZeroMCExpr = MCConstantExpr::create(0, Ctx); + const MCExpr *OneMCExpr = MCConstantExpr::create(1, Ctx); + + KD.group_segment_fixed_size = ZeroMCExpr; + KD.private_segment_fixed_size = ZeroMCExpr; + KD.compute_pgm_rsrc1 = ZeroMCExpr; + KD.compute_pgm_rsrc2 = ZeroMCExpr; + KD.compute_pgm_rsrc3 = ZeroMCExpr; + KD.kernarg_size = ZeroMCExpr; + KD.kernel_code_properties = ZeroMCExpr; + KD.kernarg_preload = ZeroMCExpr; + + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, + MCConstantExpr::create(amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE, Ctx), + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Ctx); + if (Version.Major < 12) { + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, OneMCExpr, + 
amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Ctx); + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Ctx); + } + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc2, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Ctx); + if (Version.Major >= 10) { + if (STI->getFeatureBits().test(FeatureWavefrontSize32)) + MCKernelDescriptor::bits_set( + KD.kernel_code_properties, OneMCExpr, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, Ctx); + if (!STI->getFeatureBits().test(FeatureCuMode)) + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Ctx); + + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Ctx); + } + if (AMDGPU::isGFX90A(*STI) && STI->getFeatureBits().test(FeatureTgSplit)) + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc3, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx); + return KD; +} + +void MCKernelDescriptor::bits_set(const MCExpr *&Dst, const MCExpr *Value, + uint32_t Shift, uint32_t Mask, + MCContext &Ctx) { + auto Sft = MCConstantExpr::create(Shift, Ctx); + auto Msk = MCConstantExpr::create(Mask, Ctx); + Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); + Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Sft, Ctx), + Ctx); +} + +const MCExpr *MCKernelDescriptor::bits_get(const MCExpr *Src, uint32_t Shift, + uint32_t Mask, MCContext &Ctx) { + auto Sft = 
MCConstantExpr::create(Shift, Ctx); + auto Msk = MCConstantExpr::create(Mask, Ctx); + return MCBinaryExpr::createLShr(MCBinaryExpr::createAnd(Src, Msk, Ctx), Sft, + Ctx); +} diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h new file mode 100644 index 0000000..26958ac --- /dev/null +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h @@ -0,0 +1,54 @@ +//===--- AMDGPUMCKernelDescriptor.h ---------------------------*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// AMDHSA kernel descriptor MCExpr struct for use in MC layer. Uses +/// AMDHSAKernelDescriptor.h for sizes and constants. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H + +#include "llvm/Support/AMDHSAKernelDescriptor.h" + +namespace llvm { +class MCExpr; +class MCContext; +class MCSubtargetInfo; +namespace AMDGPU { + +struct MCKernelDescriptor { + const MCExpr *group_segment_fixed_size = nullptr; + const MCExpr *private_segment_fixed_size = nullptr; + const MCExpr *kernarg_size = nullptr; + const MCExpr *compute_pgm_rsrc3 = nullptr; + const MCExpr *compute_pgm_rsrc1 = nullptr; + const MCExpr *compute_pgm_rsrc2 = nullptr; + const MCExpr *kernel_code_properties = nullptr; + const MCExpr *kernarg_preload = nullptr; + + static MCKernelDescriptor + getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI, MCContext &Ctx); + // MCExpr for: + // Dst = Dst & ~Mask + // Dst = Dst | (Value << Shift) + static void bits_set(const MCExpr *&Dst, const 
MCExpr *Value, uint32_t Shift, + uint32_t Mask, MCContext &Ctx); + + // MCExpr for: + // return (Src & Mask) >> Shift + static const MCExpr *bits_get(const MCExpr *Src, uint32_t Shift, + uint32_t Mask, MCContext &Ctx); +}; + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 4742b0b..3006fcd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetStreamer.h" +#include "AMDGPUMCKernelDescriptor.h" #include "AMDGPUPTNote.h" #include "AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" @@ -307,94 +308,142 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, + const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); + const MCAsmInfo *MAI = getContext().getAsmInfo(); OS << "\t.amdhsa_kernel " << KernelName << '\n'; -#define PRINT_FIELD(STREAM, DIRECTIVE, KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \ - STREAM << "\t\t" << DIRECTIVE << " " \ - << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n'; - - OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size - << '\n'; - OS << "\t\t.amdhsa_private_segment_fixed_size " - << KD.private_segment_fixed_size << '\n'; - OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; - - PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD, - compute_pgm_rsrc2, - 
amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT); + auto PrintField = [&](const MCExpr *Expr, uint32_t Shift, uint32_t Mask, + StringRef Directive) { + int64_t IVal; + OS << "\t\t" << Directive << ' '; + const MCExpr *pgm_rsrc1_bits = + MCKernelDescriptor::bits_get(Expr, Shift, Mask, getContext()); + if (pgm_rsrc1_bits->evaluateAsAbsolute(IVal)) + OS << static_cast<uint64_t>(IVal); + else + pgm_rsrc1_bits->print(OS, MAI); + OS << '\n'; + }; + + OS << "\t\t.amdhsa_group_segment_fixed_size "; + KD.group_segment_fixed_size->print(OS, MAI); + OS << '\n'; + + OS << "\t\t.amdhsa_private_segment_fixed_size "; + KD.private_segment_fixed_size->print(OS, MAI); + OS << '\n'; + + OS << "\t\t.amdhsa_kernarg_size "; + KD.kernarg_size->print(OS, MAI); + OS << '\n'; + + PrintField( + KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, ".amdhsa_user_sgpr_count"); if (!hasArchitectedFlatScratch(STI)) - PRINT_FIELD( - OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_queue_ptr", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + PrintField( + KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, + ".amdhsa_user_sgpr_private_segment_buffer"); + PrintField(KD.kernel_code_properties, + 
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, + ".amdhsa_user_sgpr_dispatch_ptr"); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, + ".amdhsa_user_sgpr_queue_ptr"); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + ".amdhsa_user_sgpr_kernarg_segment_ptr"); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, + ".amdhsa_user_sgpr_dispatch_id"); if (!hasArchitectedFlatScratch(STI)) - PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, + ".amdhsa_user_sgpr_flat_scratch_init"); if (hasKernargPreload(STI)) { - PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length ", KD, - kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset ", KD, - kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET); + PrintField(KD.kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH_SHIFT, + amdhsa::KERNARG_PRELOAD_SPEC_LENGTH, + ".amdhsa_user_sgpr_kernarg_preload_length"); + PrintField(KD.kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET_SHIFT, + amdhsa::KERNARG_PRELOAD_SPEC_OFFSET, + ".amdhsa_user_sgpr_kernarg_preload_offset"); } - PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + PrintField( + KD.kernel_code_properties, + 
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, + ".amdhsa_user_sgpr_private_segment_size"); if (IVersion.Major >= 10) - PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + ".amdhsa_wavefront_size32"); if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) - PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); - PRINT_FIELD(OS, - (hasArchitectedFlatScratch(STI) - ? ".amdhsa_enable_private_segment" - : ".amdhsa_system_sgpr_private_segment_wavefront_offset"), - KD, compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); - PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); - PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); - PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); - PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_info", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); - PRINT_FIELD(OS, ".amdhsa_system_vgpr_workitem_id", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, + ".amdhsa_uses_dynamic_stack"); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, + (hasArchitectedFlatScratch(STI) + ? 
".amdhsa_enable_private_segment" + : ".amdhsa_system_sgpr_private_segment_wavefront_offset")); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, + ".amdhsa_system_sgpr_workgroup_id_x"); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, + ".amdhsa_system_sgpr_workgroup_id_y"); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, + ".amdhsa_system_sgpr_workgroup_id_z"); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, + ".amdhsa_system_sgpr_workgroup_info"); + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, + ".amdhsa_system_vgpr_workitem_id"); // These directives are required. OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; - if (AMDGPU::isGFX90A(STI)) - OS << "\t\t.amdhsa_accum_offset " << - (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4 - << '\n'; + if (AMDGPU::isGFX90A(STI)) { + // MCExpr equivalent of taking the (accum_offset + 1) * 4. 
+ const MCExpr *accum_bits = MCKernelDescriptor::bits_get( + KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext()); + accum_bits = MCBinaryExpr::createAdd( + accum_bits, MCConstantExpr::create(1, getContext()), getContext()); + accum_bits = MCBinaryExpr::createMul( + accum_bits, MCConstantExpr::create(4, getContext()), getContext()); + OS << "\t\t.amdhsa_accum_offset "; + int64_t IVal; + if (accum_bits->evaluateAsAbsolute(IVal)) { + OS << static_cast<uint64_t>(IVal); + } else { + accum_bits->print(OS, MAI); + } + OS << '\n'; + } if (!ReserveVCC) OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; @@ -411,74 +460,105 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( break; } - PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); - PRINT_FIELD(OS, ".amdhsa_float_round_mode_16_64", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); - PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_32", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); - PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, + ".amdhsa_float_round_mode_32"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, + ".amdhsa_float_round_mode_16_64"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, + ".amdhsa_float_denorm_mode_32"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, + 
".amdhsa_float_denorm_mode_16_64"); if (IVersion.Major < 12) { - PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP); - PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, + ".amdhsa_dx10_clamp"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, + ".amdhsa_ieee_mode"); + } + if (IVersion.Major >= 9) { + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, + ".amdhsa_fp16_overflow"); } - if (IVersion.Major >= 9) - PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL); if (AMDGPU::isGFX90A(STI)) - PRINT_FIELD(OS, ".amdhsa_tg_split", KD, - compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT); + PrintField(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split"); if (IVersion.Major >= 10) { - PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); - PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED); - PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, + ".amdhsa_workgroup_processor_mode"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, + 
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, + ".amdhsa_memory_ordered"); + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, + ".amdhsa_forward_progress"); } if (IVersion.Major >= 10 && IVersion.Major < 12) { - PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT); + PrintField(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, + ".amdhsa_shared_vgpr_count"); } - if (IVersion.Major >= 12) - PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN); - PRINT_FIELD( - OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); - PRINT_FIELD(OS, ".amdhsa_exception_fp_denorm_src", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); - PRINT_FIELD( - OS, ".amdhsa_exception_fp_ieee_div_zero", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); - PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_overflow", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); - PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_underflow", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); - PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_inexact", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); - PRINT_FIELD(OS, ".amdhsa_exception_int_div_zero", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); -#undef PRINT_FIELD + if (IVersion.Major >= 12) { + PrintField(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_SHIFT, + 
amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, + ".amdhsa_round_robin_scheduling"); + } + PrintField( + KD.compute_pgm_rsrc2, + amdhsa:: + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, + ".amdhsa_exception_fp_ieee_invalid_op"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, + ".amdhsa_exception_fp_denorm_src"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa:: + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, + ".amdhsa_exception_fp_ieee_div_zero"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, + ".amdhsa_exception_fp_ieee_overflow"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, + ".amdhsa_exception_fp_ieee_underflow"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, + ".amdhsa_exception_fp_ieee_inexact"); + PrintField( + KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, + ".amdhsa_exception_int_div_zero"); OS << "\t.end_amdhsa_kernel\n"; } @@ -835,7 +915,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + const MCKernelDescriptor 
&KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); @@ -853,7 +933,7 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( // Kernel descriptor symbol's type and size are fixed. KernelDescriptorSymbol->setType(ELF::STT_OBJECT); KernelDescriptorSymbol->setSize( - MCConstantExpr::create(sizeof(KernelDescriptor), Context)); + MCConstantExpr::create(sizeof(amdhsa::kernel_descriptor_t), Context)); // The visibility of the kernel code symbol must be protected or less to allow // static relocations from the kernel descriptor to be used. @@ -861,31 +941,43 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED); Streamer.emitLabel(KernelDescriptorSymbol); - Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size); - Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size); - Streamer.emitInt32(KernelDescriptor.kernarg_size); - - for (uint8_t Res : KernelDescriptor.reserved0) - Streamer.emitInt8(Res); + Streamer.emitValue( + KernelDescriptor.group_segment_fixed_size, + sizeof(amdhsa::kernel_descriptor_t::group_segment_fixed_size)); + Streamer.emitValue( + KernelDescriptor.private_segment_fixed_size, + sizeof(amdhsa::kernel_descriptor_t::private_segment_fixed_size)); + Streamer.emitValue(KernelDescriptor.kernarg_size, + sizeof(amdhsa::kernel_descriptor_t::kernarg_size)); + + for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved0); ++i) + Streamer.emitInt8(0u); // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The // expression being created is: // (start of kernel code) - (start of kernel descriptor) // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64. 
- Streamer.emitValue(MCBinaryExpr::createSub( - MCSymbolRefExpr::create( - KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context), - MCSymbolRefExpr::create( - KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context), - Context), - sizeof(KernelDescriptor.kernel_code_entry_byte_offset)); - for (uint8_t Res : KernelDescriptor.reserved1) - Streamer.emitInt8(Res); - Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc3); - Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1); - Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2); - Streamer.emitInt16(KernelDescriptor.kernel_code_properties); - Streamer.emitInt16(KernelDescriptor.kernarg_preload); - for (uint8_t Res : KernelDescriptor.reserved3) - Streamer.emitInt8(Res); + Streamer.emitValue( + MCBinaryExpr::createSub( + MCSymbolRefExpr::create(KernelCodeSymbol, + MCSymbolRefExpr::VK_AMDGPU_REL64, Context), + MCSymbolRefExpr::create(KernelDescriptorSymbol, + MCSymbolRefExpr::VK_None, Context), + Context), + sizeof(amdhsa::kernel_descriptor_t::kernel_code_entry_byte_offset)); + for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved1); ++i) + Streamer.emitInt8(0u); + Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc3, + sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc3)); + Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc1, + sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc1)); + Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc2, + sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc2)); + Streamer.emitValue( + KernelDescriptor.kernel_code_properties, + sizeof(amdhsa::kernel_descriptor_t::kernel_code_properties)); + Streamer.emitValue(KernelDescriptor.kernarg_preload, + sizeof(amdhsa::kernel_descriptor_t::kernarg_preload)); + for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved3); ++i) + Streamer.emitInt8(0u); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 
5aa80ff..706897a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -22,15 +22,13 @@ class MCSymbol; class formatted_raw_ostream; namespace AMDGPU { + +struct MCKernelDescriptor; namespace HSAMD { struct Metadata; } } // namespace AMDGPU -namespace amdhsa { -struct kernel_descriptor_t; -} - class AMDGPUTargetStreamer : public MCTargetStreamer { AMDGPUPALMetadata PALMetadata; @@ -94,10 +92,11 @@ public: return true; } - virtual void EmitAmdhsaKernelDescriptor( - const MCSubtargetInfo &STI, StringRef KernelName, - const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {} + virtual void + EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, + const AMDGPU::MCKernelDescriptor &KernelDescriptor, + uint64_t NextVGPR, uint64_t NextSGPR, + bool ReserveVCC, bool ReserveFlatScr) {} static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -150,10 +149,11 @@ public: bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled) override; - void EmitAmdhsaKernelDescriptor( - const MCSubtargetInfo &STI, StringRef KernelName, - const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; + void + EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, + const AMDGPU::MCKernelDescriptor &KernelDescriptor, + uint64_t NextVGPR, uint64_t NextSGPR, + bool ReserveVCC, bool ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -205,10 +205,11 @@ public: bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled) override; - void EmitAmdhsaKernelDescriptor( - const MCSubtargetInfo &STI, StringRef KernelName, - const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - 
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; + void + EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, + const AMDGPU::MCKernelDescriptor &KernelDescriptor, + uint64_t NextVGPR, uint64_t NextSGPR, + bool ReserveVCC, bool ReserveFlatScr) override; }; } #endif diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt index 0842a58..14a02b6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_component_library(LLVMAMDGPUDesc AMDGPUMCExpr.cpp AMDGPUMCTargetDesc.cpp AMDGPUTargetStreamer.cpp + AMDGPUMCKernelDescriptor.cpp R600InstPrinter.cpp R600MCCodeEmitter.cpp R600MCTargetDesc.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 7bb84d7..5d44396 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1221,47 +1221,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, } } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( - const MCSubtargetInfo *STI) { - IsaVersion Version = getIsaVersion(STI->getCPU()); - - amdhsa::kernel_descriptor_t KD; - memset(&KD, 0, sizeof(KD)); - - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, - amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); - if (Version.Major >= 12) { - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, 0); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF, 0); - } else { - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, 1); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, 1); - } - AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, - 
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); - if (Version.Major >= 10) { - AMDHSA_BITS_SET(KD.kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, - STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, - STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, 1); - } - if (AMDGPU::isGFX90A(*STI)) { - AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, - STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0); - } - return KD; -} - bool isGroupSegment(const GlobalValue *GV) { return GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f4f9a78..943588f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -34,10 +34,6 @@ class StringRef; class Triple; class raw_ostream; -namespace amdhsa { -struct kernel_descriptor_t; -} - namespace AMDGPU { struct IsaVersion; @@ -855,9 +851,6 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( - const MCSubtargetInfo *STI); - bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 5c113cc..e8d2cba 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -958,6 +958,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, unsigned NotImm = ~Imm & 0xffff; if (LastVPTReg != 
0 && LastVPTReg != VPR && LastVPTImm == Imm) { + MRI->clearKillFlags(LastVPTReg); Instr.getOperand(PIdx + 1).setReg(LastVPTReg); if (MRI->use_empty(VPR)) { DeadInstructions.insert(Copy); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 0f4ece64..047c673 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -612,11 +612,11 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { static void findTemporariesForLR(const BitVector &GPRsNoLRSP, const BitVector &PopFriendly, - const LivePhysRegs &UsedRegs, unsigned &PopReg, + const LiveRegUnits &UsedRegs, unsigned &PopReg, unsigned &TmpReg, MachineRegisterInfo &MRI) { PopReg = TmpReg = 0; for (auto Reg : GPRsNoLRSP.set_bits()) { - if (UsedRegs.available(MRI, Reg)) { + if (UsedRegs.available(Reg)) { // Remember the first pop-friendly register and exit. if (PopFriendly.test(Reg)) { PopReg = Reg; @@ -684,7 +684,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // Look for a temporary register to use. // First, compute the liveness information. const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - LivePhysRegs UsedRegs(TRI); + LiveRegUnits UsedRegs(TRI); UsedRegs.addLiveOuts(MBB); // The semantic of pristines changed recently and now, // the callee-saved registers that are touched in the function @@ -710,11 +710,6 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, unsigned TemporaryReg = 0; BitVector PopFriendly = TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID)); - // R7 may be used as a frame pointer, hence marked as not generally - // allocatable, however there's no reason to not use it as a temporary for - // restoring LR. 
- if (STI.getFramePointerReg() == ARM::R7) - PopFriendly.set(ARM::R7); assert(PopFriendly.any() && "No allocatable pop-friendly register?!"); // Rebuild the GPRs from the high registers because they are removed diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index 11b716f..ad06f47 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -26,6 +26,19 @@ def CSR_ILP32D_LP64D : CalleeSavedRegs<(add CSR_ILP32_LP64, F8_D, F9_D, (sequence "F%u_D", 18, 27))>; +defvar CSR_V = (add (sequence "V%u", 1, 7), (sequence "V%u", 24, 31), + V2M2, V4M2, V6M2, V24M2, V26M2, V28M2, V30M2, + V4M4, V24M4, V28M4, V24M8); + +def CSR_ILP32_LP64_V + : CalleeSavedRegs<(add CSR_ILP32_LP64, CSR_V)>; + +def CSR_ILP32F_LP64F_V + : CalleeSavedRegs<(add CSR_ILP32F_LP64F, CSR_V)>; + +def CSR_ILP32D_LP64D_V + : CalleeSavedRegs<(add CSR_ILP32D_LP64D, CSR_V)>; + // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 39f2b3f..39075c81 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -388,6 +388,21 @@ getUnmanagedCSI(const MachineFunction &MF, return NonLibcallCSI; } +static SmallVector<CalleeSavedInfo, 8> +getRVVCalleeSavedInfo(const MachineFunction &MF, + const std::vector<CalleeSavedInfo> &CSI) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + SmallVector<CalleeSavedInfo, 8> RVVCSI; + + for (auto &CS : CSI) { + int FI = CS.getFrameIdx(); + if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) + RVVCSI.push_back(CS); + } + + return RVVCSI; +} + void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -590,6 +605,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // directives. 
for (const auto &Entry : CSI) { int FrameIdx = Entry.getFrameIdx(); + if (FrameIdx >= 0 && + MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) + continue; + int64_t Offset = MFI.getObjectOffset(FrameIdx); Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( @@ -726,7 +745,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo()); - // Skip to before the restores of callee-saved registers + // Skip to before the restores of scalar callee-saved registers // FIXME: assumes exactly one instruction is used to restore each // callee-saved register. auto LastFrameDestroy = MBBI; @@ -1029,15 +1048,24 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const { MachineFrameInfo &MFI = MF.getFrameInfo(); // Create a buffer of RVV objects to allocate. SmallVector<int, 8> ObjectsToAllocate; - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - unsigned StackID = MFI.getStackID(I); - if (StackID != TargetStackID::ScalableVector) - continue; - if (MFI.isDeadObjectIndex(I)) - continue; + auto pushRVVObjects = [&](int FIBegin, int FIEnd) { + for (int I = FIBegin, E = FIEnd; I != E; ++I) { + unsigned StackID = MFI.getStackID(I); + if (StackID != TargetStackID::ScalableVector) + continue; + if (MFI.isDeadObjectIndex(I)) + continue; - ObjectsToAllocate.push_back(I); - } + ObjectsToAllocate.push_back(I); + } + }; + // First push RVV Callee Saved object, then push RVV stack object + std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo(); + const auto &RVVCSI = getRVVCalleeSavedInfo(MF, CSI); + if (!RVVCSI.empty()) + pushRVVObjects(RVVCSI[0].getFrameIdx(), + RVVCSI[RVVCSI.size() - 1].getFrameIdx() + 1); + pushRVVObjects(0, MFI.getObjectIndexEnd() - RVVCSI.size()); // The minimum alignment is 16 bytes. 
Align RVVStackAlign(16); @@ -1487,13 +1515,19 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( // Manually spill values not spilled by libcall & Push/Pop. const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); - for (auto &CS : UnmanagedCSI) { - // Insert the spill to the stack frame. - Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), - RC, TRI, Register()); - } + const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI); + + auto storeRegToStackSlot = [&](decltype(UnmanagedCSI) CSInfo) { + for (auto &CS : CSInfo) { + // Insert the spill to the stack frame. + Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), + CS.getFrameIdx(), RC, TRI, Register()); + } + }; + storeRegToStackSlot(UnmanagedCSI); + storeRegToStackSlot(RVVCSI); return true; } @@ -1511,19 +1545,26 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( DL = MI->getDebugLoc(); // Manually restore values not restored by libcall & Push/Pop. - // Keep the same order as in the prologue. There is no need to reverse the - // order in the epilogue. In addition, the return address will be restored - // first in the epilogue. It increases the opportunity to avoid the - // load-to-use data hazard between loading RA and return by RA. - // loadRegFromStackSlot can insert multiple instructions. + // Reverse the restore order in epilog. In addition, the return + // address will be restored first in the epilogue. It increases + // the opportunity to avoid the load-to-use data hazard between + // loading RA and return by RA. loadRegFromStackSlot can insert + // multiple instructions. 
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); - for (auto &CS : UnmanagedCSI) { - Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); - assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); - } + const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI); + + auto loadRegFromStackSlot = [&](decltype(UnmanagedCSI) CSInfo) { + for (auto &CS : CSInfo) { + Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, + Register()); + assert(MI != MBB.begin() && + "loadRegFromStackSlot didn't insert any code!"); + } + }; + loadRegFromStackSlot(RVVCSI); + loadRegFromStackSlot(UnmanagedCSI); RISCVMachineFunctionInfo *RVFI = MF->getInfo<RISCVMachineFunctionInfo>(); if (RVFI->isPushable(*MF)) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ca78648c..564fda6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18724,6 +18724,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::Fast: case CallingConv::SPIR_KERNEL: case CallingConv::GRAAL: + case CallingConv::RISCV_VectorCall: break; case CallingConv::GHC: if (Subtarget.isRVE()) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 74d6532..11c3f2d 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -71,6 +71,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { : CSR_Interrupt_SaveList; } + bool HasVectorCSR = + MF->getFunction().getCallingConv() == CallingConv::RISCV_VectorCall; + switch (Subtarget.getTargetABI()) { default: llvm_unreachable("Unrecognized ABI"); @@ -79,12 +82,18 @@ 
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_ILP32E_LP64E_SaveList; case RISCVABI::ABI_ILP32: case RISCVABI::ABI_LP64: + if (HasVectorCSR) + return CSR_ILP32_LP64_V_SaveList; return CSR_ILP32_LP64_SaveList; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: + if (HasVectorCSR) + return CSR_ILP32F_LP64F_V_SaveList; return CSR_ILP32F_LP64F_SaveList; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: + if (HasVectorCSR) + return CSR_ILP32D_LP64D_V_SaveList; return CSR_ILP32D_LP64D_SaveList; } } @@ -665,12 +674,18 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF, return CSR_ILP32E_LP64E_RegMask; case RISCVABI::ABI_ILP32: case RISCVABI::ABI_LP64: + if (CC == CallingConv::RISCV_VectorCall) + return CSR_ILP32_LP64_V_RegMask; return CSR_ILP32_LP64_RegMask; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: + if (CC == CallingConv::RISCV_VectorCall) + return CSR_ILP32F_LP64F_V_RegMask; return CSR_ILP32F_LP64F_RegMask; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: + if (CC == CallingConv::RISCV_VectorCall) + return CSR_ILP32D_LP64D_V_RegMask; return CSR_ILP32D_LP64D_RegMask; } } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 000d01b..38cdf3c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -909,23 +909,33 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsTypeLegal) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); + int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - // FIXME: Need to consider vsetvli and lmul. 
int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) - (int)Log2_32(Src->getScalarSizeInBits()); switch (ISD) { case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - if (Src->getScalarSizeInBits() == 1) { + case ISD::ZERO_EXTEND: { + const unsigned SrcEltSize = Src->getScalarSizeInBits(); + if (SrcEltSize == 1) { // We do not use vsext/vzext to extend from mask vector. // Instead we use the following instructions to extend from mask vector: // vmv.v.i v8, 0 // vmerge.vim v8, v8, -1, v0 - return 2; + return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM}, + DstLT.second, CostKind); } - return 1; + if ((PowDiff < 1) || (PowDiff > 3)) + return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8}; + unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8}; + unsigned Op = + (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1]; + return getRISCVInstructionCost(Op, DstLT.second, CostKind); + } case ISD::TRUNCATE: if (Dst->getScalarSizeInBits() == 1) { // We do not use several vncvt to truncate to mask vector. So we could diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9bad38f..9d98d31 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43995,6 +43995,50 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, Extract->getOperand(1)); } +// If this extract is from a loaded vector value and will be used as an +// integer, that requires a potentially expensive XMM -> GPR transfer. +// Additionally, if we can convert to a scalar integer load, that will likely +// be folded into a subsequent integer op. +// Note: SrcVec might not have a VecVT type, but it must be the same size. +// Note: Unlike the related fold for this in DAGCombiner, this is not limited +// to a single-use of the loaded vector. 
For the reasons above, we +// expect this to be profitable even if it creates an extra load. +static SDValue +combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, + const SDLoc &dl, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + "Only EXTRACT_VECTOR_ELT supported so far"); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + + bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { + return Use->getOpcode() == ISD::STORE || + Use->getOpcode() == ISD::INSERT_VECTOR_ELT || + Use->getOpcode() == ISD::SCALAR_TO_VECTOR; + }); + + auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec); + if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() && + VecVT.getVectorElementType() == VT && + VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() && + DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) { + SDValue NewPtr = TLI.getVectorElementPointer( + DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl)); + unsigned PtrOff = VT.getSizeInBits() * Idx / 8; + MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); + Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); + SDValue Load = + DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, + LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(LoadVec, Load); + return Load; + } + + return SDValue(); +} + // Attempt to peek through a target shuffle and extract the scalar from the // source. 
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, @@ -44191,6 +44235,11 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx)) return DAG.getZExtOrTrunc(V, dl, VT); + if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT) + if (SDValue V = combineExtractFromVectorLoad( + N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI)) + return V; + return SDValue(); } @@ -44600,6 +44649,12 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget)) return V; + if (CIdx) + if (SDValue V = combineExtractFromVectorLoad( + N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(), + dl, DAG, DCI)) + return V; + // Attempt to extract a i1 element by using MOVMSK to extract the signbits // and then testing the relevant element. // @@ -44645,34 +44700,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } } - // If this extract is from a loaded vector value and will be used as an - // integer, that requires a potentially expensive XMM -> GPR transfer. - // Additionally, if we can convert to a scalar integer load, that will likely - // be folded into a subsequent integer op. - // Note: Unlike the related fold for this in DAGCombiner, this is not limited - // to a single-use of the loaded vector. For the reasons above, we - // expect this to be profitable even if it creates an extra load. 
- bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { - return Use->getOpcode() == ISD::STORE || - Use->getOpcode() == ISD::INSERT_VECTOR_ELT || - Use->getOpcode() == ISD::SCALAR_TO_VECTOR; - }); - auto *LoadVec = dyn_cast<LoadSDNode>(InputVector); - if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && - SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && - !LikelyUsedAsVector && LoadVec->isSimple()) { - SDValue NewPtr = - TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); - unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8; - MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); - Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); - SDValue Load = - DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, - LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); - DAG.makeEquivalentMemoryOrdering(LoadVec, Load); - return Load; - } - return SDValue(); } @@ -48273,7 +48300,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // We do not split for SSE at all, but we need to split vectors for AVX1 and // AVX2. - if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && + if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) { SDValue LoX, HiX; std::tie(LoX, HiX) = splitVector(X, DAG, DL); diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index e36832f..7109946 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -186,11 +186,6 @@ void AArch64::ExtensionSet::enable(ArchExtKind E) { // Special cases for dependencies which vary depending on the base // architecture version. 
if (BaseArch) { - // +sve implies +f32mm if the base architecture is v8.6A+ or v9.1A+ - // It isn't the case in general that sve implies both f64mm and f32mm - if (E == AEK_SVE && BaseArch->is_superset(ARMV8_6A)) - enable(AEK_F32MM); - // +fp16 implies +fp16fml for v8.4A+, but not v9.0-A+ if (E == AEK_FP16 && BaseArch->is_superset(ARMV8_4A) && !BaseArch->is_superset(ARMV9A)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index af238a4..8c698e5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -611,6 +611,18 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { Y->getType() == Z->getType()) return createPowiExpr(I, *this, X, Y, Z); + // powi(X, Y) / X --> powi(X, Y-1) + // This is legal when (Y - 1) can't wraparound, in which case reassoc and nnan + // are required. + // TODO: Multi-use may be also better off creating Powi(x,y-1) + if (I.hasAllowReassoc() && I.hasNoNaNs() && + match(Op0, m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>( + m_Specific(Op1), m_Value(Y))))) && + willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) { + Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType()); + return createPowiExpr(I, *this, Op1, Y, NegOne); + } + return nullptr; } @@ -1904,20 +1916,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return replaceInstUsesWith(I, Pow); } - // powi(X, Y) / X --> powi(X, Y-1) - // This is legal when (Y - 1) can't wraparound, in which case reassoc and nnan - // are required. 
- // TODO: Multi-use may be also better off creating Powi(x,y-1) - if (I.hasAllowReassoc() && I.hasNoNaNs() && - match(Op0, m_OneUse(m_Intrinsic<Intrinsic::powi>(m_Specific(Op1), - m_Value(Y)))) && - willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) { - Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType()); - Value *Y1 = Builder.CreateAdd(Y, NegOne); - Type *Types[] = {Op1->getType(), Y1->getType()}; - Value *Pow = Builder.CreateIntrinsic(Intrinsic::powi, Types, {Op1, Y1}, &I); - return replaceInstUsesWith(I, Pow); - } + if (Instruction *FoldedPowi = foldPowiReassoc(I)) + return FoldedPowi; return nullptr; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fbf1cb6..e1f26b9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11926,7 +11926,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Builder.SetCurrentDebugLocation(PH->getDebugLoc()); Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); if (VecTy != Vec->getType()) { - assert((getOperandEntry(E, I)->State == TreeEntry::NeedToGather || + assert((It != MinBWs.end() || + getOperandEntry(E, I)->State == TreeEntry::NeedToGather || MinBWs.contains(getOperandEntry(E, I))) && "Expected item in MinBWs."); Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I)); diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index bd26c19..14da9a3 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -16,74 +16,74 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> -; RV32-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = 
sext <16 x i32> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x 
i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> -; RV32-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = sext 
<256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext 
<32 x i1> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 
= sext <128 x i16> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> ; RV32-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64> @@ -96,73 +96,73 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16> +; RV32-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x 
i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x 
i1> undef to <vscale x 16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for 
instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32> ; RV32-NEXT: Cost Model: 
Invalid cost for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 
128 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -179,74 +179,74 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x 
i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 
= sext <8 x i1> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8> 
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> -; RV64-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x 
i8> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to 
<64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for 
instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64> @@ -259,73 +259,73 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale 
x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = sext <vscale 
x 64 x i1> undef to <vscale x 64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32> +; RV64-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> 
undef to <vscale x 128 x i128> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -522,74 +522,74 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> 
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = zext 
<8 x i16> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: 
%v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> -; RV32-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> +; RV32-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> +; RV32-NEXT: Cost 
Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: 
%v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64> @@ -602,73 +602,73 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = 
zext <vscale x 8 x i8> undef to <vscale x 8 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = 
zext <vscale x 16 x i16> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 
for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8> +; RV32-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to 
<vscale x 64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = 
zext <vscale x 64 x i1> undef to <vscale x 64 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for 
instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32> ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -685,74 +685,74 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for 
instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated 
cost of 11 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> +; RV64-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> +; RV64-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> 
undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64> @@ -765,73 +765,73 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = 
zext <vscale x 8 x i8> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 
x i1> undef to <vscale x 8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: 
%nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 
16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 
= zext <vscale x 32 x i1> undef to <vscale x 32 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid 
cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32> ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll index 80efe912..30cb32c 100644 --- 
a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll @@ -1141,7 +1141,7 @@ define signext i32 @vreduce_add_nxv4i32(<vscale x 4 x i32> %v) { define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) { ; CHECK-LABEL: 'vwreduce_add_nxv4i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %red ; @@ -1157,7 +1157,7 @@ define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) { define signext i32 @vwreduce_uadd_nxv4i16(<vscale x 4 x i16> %v) { ; CHECK-LABEL: 'vwreduce_uadd_nxv4i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %red ; @@ -1445,7 +1445,7 @@ define i64 @vreduce_add_nxv2i64(<vscale x 2 x i64> %v) { define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) { ; CHECK-LABEL: 'vwreduce_add_nxv2i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 
@llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red ; @@ -1461,7 +1461,7 @@ define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) { define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) { ; CHECK-LABEL: 'vwreduce_uadd_nxv2i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red ; @@ -1597,7 +1597,7 @@ define i64 @vreduce_add_nxv4i64(<vscale x 4 x i64> %v) { define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) { ; CHECK-LABEL: 'vwreduce_add_nxv4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red ; @@ -1613,7 +1613,7 @@ define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) { define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) { ; CHECK-LABEL: 'vwreduce_uadd_nxv4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 
@llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red ; diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll index 225bad6..aa7a90b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll @@ -12,12 +12,12 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x 
i1> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 @@ -66,12 +66,12 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1 @@ -120,12 +120,12 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v2i8_x = extractelement <2 x i8> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x @@ -177,12 +177,12 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 +; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 +; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 +; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 ; 
RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 @@ -231,12 +231,12 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1 @@ -285,12 +285,12 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = 
extractelement <4 x i8> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x @@ -341,13 +341,13 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_0 = extractelement <2 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 @@ -395,13 +395,13 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1 @@ -449,13 +449,13 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x @@ -506,13 +506,13 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_0 = extractelement <2 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 @@ -560,13 +560,13 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 +; RV64ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 6 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1 @@ -614,13 +614,13 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x 
-; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll index 
5387c8d..6e1ae02 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll @@ -12,12 +12,12 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 ; RV32V-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 @@ -66,12 +66,12 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1 @@ -120,12 +120,12 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> 
undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x @@ -177,12 +177,12 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 +; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 +; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 +; RV64V-NEXT: Cost Model: Found 
an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 @@ -231,12 +231,12 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = 
insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1 @@ -285,12 +285,12 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, 
i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x @@ -341,13 +341,13 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 @@ -395,13 +395,13 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 +; RV32ZVE64X-NEXT: 
Cost Model: Found an estimated cost of 12 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1 @@ -449,13 +449,13 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 
%x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x @@ -506,13 
+506,13 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> 
undef, i1 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 @@ -560,13 +560,13 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 
undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1 @@ -614,13 +614,13 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 
undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll index 46bf315..b763198 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll @@ -197,7 +197,7 @@ define void @broadcast_fixed() #0{ ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll index b1f4abf..2190e2f 100644 --- a/llvm/test/Bitcode/compatibility-3.6.ll +++ b/llvm/test/Bitcode/compatibility-3.6.ll @@ -1061,16 +1061,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1178,11 +1178,11 @@ define void @intrinsics.codegen() { ; CHECK: attributes #27 = { uwtable } ; CHECK: attributes #28 = { "cpu"="cortex-a8" } ; CHECK: attributes #29 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #30 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #31 = { nounwind memory(argmem: read) } -; CHECK: attributes #32 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #33 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #34 = { nocallback nounwind } +; CHECK: attributes #30 = { nounwind memory(argmem: read) } +; CHECK: attributes #31 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #32 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #33 = { nocallback nounwind } +; CHECK: attributes #34 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #36 = { builtin } diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll index 91e55f6..7e59b5c 100644 --- a/llvm/test/Bitcode/compatibility-3.7.ll +++ b/llvm/test/Bitcode/compatibility-3.7.ll @@ -1092,16 +1092,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1241,11 +1241,11 @@ define void @misc.metadata() { ; CHECK: attributes #30 = { uwtable } ; CHECK: attributes #31 = { "cpu"="cortex-a8" } ; CHECK: attributes #32 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #33 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #34 = { nounwind memory(argmem: read) } -; CHECK: attributes #35 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #37 = { nocallback nounwind } +; CHECK: attributes #33 = { nounwind memory(argmem: read) } +; CHECK: attributes #34 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #36 = { nocallback nounwind } +; CHECK: attributes #37 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #39 = { builtin } diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll index aa4d8b1..ebd1f2f 100644 --- a/llvm/test/Bitcode/compatibility-3.8.ll +++ b/llvm/test/Bitcode/compatibility-3.8.ll @@ -1247,16 +1247,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1551,11 +1551,11 @@ normal: ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #42 = { builtin } diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll index e3c84f6..c34f04c 100644 --- a/llvm/test/Bitcode/compatibility-3.9.ll +++ b/llvm/test/Bitcode/compatibility-3.9.ll @@ -1318,16 +1318,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1624,11 +1624,11 @@ declare void @f.writeonly() writeonly ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #41 = { memory(write) } ; CHECK: attributes #42 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #43 = { builtin } diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll index 06cb8420..05bffda 100644 --- a/llvm/test/Bitcode/compatibility-4.0.ll +++ b/llvm/test/Bitcode/compatibility-4.0.ll @@ -1318,16 +1318,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1649,11 +1649,11 @@ define i8** @constexpr() { ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #41 = { memory(write) } ; CHECK: attributes #42 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #43 = { builtin } diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll index f9ae558..0c87228 100644 --- a/llvm/test/Bitcode/compatibility-5.0.ll +++ b/llvm/test/Bitcode/compatibility-5.0.ll @@ -1330,16 +1330,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1664,11 +1664,11 @@ define i8** @constexpr() { ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #41 = { memory(write) } ; CHECK: attributes #42 = { speculatable } ; CHECK: attributes #43 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll index 1458e1b..44c6808 100644 --- a/llvm/test/Bitcode/compatibility-6.0.ll +++ b/llvm/test/Bitcode/compatibility-6.0.ll @@ -1340,16 +1340,16 @@ define void @instructions.va_arg(i8* %v, ...) 
{ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - ; CHECK: call void @llvm.va_start(ptr %ap2) + ; CHECK: call void @llvm.va_start.p0(ptr %ap2) va_arg i8* %ap2, i32 ; CHECK: va_arg ptr %ap2, i32 call void @llvm.va_copy(i8* %v, i8* %ap2) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2) call void @llvm.va_end(i8* %ap2) - ; CHECK: call void @llvm.va_end(ptr %ap2) + ; CHECK: call void @llvm.va_end.p0(ptr %ap2) ret void } @@ -1674,11 +1674,11 @@ define i8** @constexpr() { ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #41 = { memory(write) } ; CHECK: attributes #42 = { speculatable } ; CHECK: attributes #43 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index fa8b052..b374924 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1648,16 +1648,16 @@ define void @instructions.va_arg(ptr %v, ...) 
{ %ap = alloca ptr call void @llvm.va_start(ptr %ap) - ; CHECK: call void @llvm.va_start(ptr %ap) + ; CHECK: call void @llvm.va_start.p0(ptr %ap) va_arg ptr %ap, i32 ; CHECK: va_arg ptr %ap, i32 call void @llvm.va_copy(ptr %v, ptr %ap) - ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap) + ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap) call void @llvm.va_end(ptr %ap) - ; CHECK: call void @llvm.va_end(ptr %ap) + ; CHECK: call void @llvm.va_end.p0(ptr %ap) ret void } @@ -2091,12 +2091,12 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) } ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn } -; CHECK: attributes #37 = { nounwind memory(argmem: read) } -; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) } -; CHECK: attributes #40 = { nocallback nounwind } -; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +; CHECK: attributes #36 = { nounwind memory(argmem: read) } +; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) } +; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) } +; CHECK: attributes #39 = { nocallback nounwind } +; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn } ; CHECK: attributes #42 = { memory(write) } ; CHECK: attributes #43 = { speculatable } ; CHECK: attributes #44 = { strictfp } diff --git a/llvm/test/Bitcode/thinlto-function-summary.ll b/llvm/test/Bitcode/thinlto-function-summary.ll index 799759e..13c6611 100644 --- 
a/llvm/test/Bitcode/thinlto-function-summary.ll +++ b/llvm/test/Bitcode/thinlto-function-summary.ll @@ -13,9 +13,9 @@ ; "variadic" ; BC-NEXT: <FUNCTION op0=46 op1=8 ; "llvm.va_start" -; BC-NEXT: <FUNCTION op0=54 op1=13 +; BC-NEXT: <FUNCTION op0=54 op1=16 ; "f" -; BC-NEXT: <ALIAS op0=67 op1=1 +; BC-NEXT: <ALIAS op0=70 op1=1 ; BC: <GLOBALVAL_SUMMARY_BLOCK ; BC-NEXT: <VERSION ; BC-NEXT: <FLAGS @@ -26,7 +26,7 @@ ; BC-NEXT: <ALIAS {{.*}} op0=6 op1=0 op2=3 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK ; BC: <STRTAB_BLOCK -; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_startf{{.*}}' +; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_start.p{{[0-9]+}}f{{.*}}' ; RUN: opt -passes=name-anon-globals -module-summary < %s | llvm-dis | FileCheck %s diff --git a/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll b/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll index fad7b8e..fd3f500 100644 --- a/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll +++ b/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll @@ -10,7 +10,7 @@ define i32 @varArgIntrinsic(i32 %X, ...) { %ap = alloca i8* %ap2 = bitcast i8** %ap to i8* -; CHECK: call void @llvm.va_start(ptr %ap2) +; CHECK: call void @llvm.va_start.p0(ptr %ap2) call void @llvm.va_start(i8* %ap2) ; CHECK-NEXT: %tmp = va_arg ptr %ap, i32 @@ -19,12 +19,12 @@ define i32 @varArgIntrinsic(i32 %X, ...) 
{ %aq = alloca i8* %aq2 = bitcast i8** %aq to i8* -; CHECK: call void @llvm.va_copy(ptr %aq2, ptr %ap2) +; CHECK: call void @llvm.va_copy.p0(ptr %aq2, ptr %ap2) call void @llvm.va_copy(i8* %aq2, i8* %ap2) -; CHECK-NEXT: call void @llvm.va_end(ptr %aq2) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr %aq2) call void @llvm.va_end(i8* %aq2) -; CHECK-NEXT: call void @llvm.va_end(ptr %ap2) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr %ap2) call void @llvm.va_end(i8* %ap2) ret i32 %tmp } diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll index 089e171..c9fd2d3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll @@ -518,4 +518,6 @@ attributes #5 = { nobuiltin } !1 = !{!"omnipotent char", !2} !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"short", !1} -!4 = !{i64 0, i64 4, !0, i64 4, i64 2, !3, i64 8, i64 4, !0, i64 12, i64 2, !3, i64 16, i64 4, !0, i64 20, i64 2, !3} +!4 = !{i64 0, i64 4, !5, i64 4, i64 2, !6, i64 8, i64 4, !5, i64 12, i64 2, !6, i64 16, i64 4, !5, i64 20, i64 2, !6} +!5 = !{!0, !0, i64 0} +!6 = !{!3, !3, i64 0} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll index 942f459..8ddaf24 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll @@ -808,7 +808,7 @@ define float @test_pown_fast_f32_nobuiltin(float %x, i32 %y) { ; CHECK-LABEL: define float @test_pown_fast_f32_nobuiltin ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret float [[CALL]] ; entry: @@ -820,11 +820,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 { ; CHECK-LABEL: define float 
@test_pown_fast_f32_strictfp ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) -; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) +; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]] +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]] +; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]] ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index 2ffa647..2e64a34 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -896,7 +896,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 { ; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp( ; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) +; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]] ; CHECK-NEXT: ret float [[__ROOTN2RSQRT]] ; 
entry: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 76ec1cc..99d02ff 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -358,65 +358,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; 
GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -450,69 +391,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { } define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -549,71 +427,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr 
addrspace(1) %out, float %i } define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, 
v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -648,73 +461,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { } define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN2: ; 
%bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -752,80 +498,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in } define amdgpu_gfx 
void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 
exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -876,84 +548,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: 
global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: 
v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1007,83 +601,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace } define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: 
global_atomic_xchg_f32_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1134,87 +651,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre } define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index d137f47..380ce7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -372,65 
+372,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: 
s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -464,69 +405,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { } define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -563,71 +441,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double % } define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 
-; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], 
v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -663,73 +476,6 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { } define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -768,80 +514,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double % } define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: 
global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; 
GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,84 +568,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: 
s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: 
v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1029,83 +623,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace } define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: 
v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1160,87 +677,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr } define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: 
global_atomic_xchg_f64_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; 
GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll b/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll new file mode 100644 index 0000000..e050240 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +; Check that we perform binary arithmetic in a narrower type where possible, via +; combineBinOpOfZExt or otherwise. 
+ +define <vscale x 8 x i32> @add(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: add: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %add = add <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %add +} + +define <vscale x 8 x i32> @sub(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: sub: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vwsubu.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %sub = sub <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %sub +} + +define <vscale x 8 x i32> @mul(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: mul: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vwmulu.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %mul = mul <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %mul +} + +define <vscale x 8 x i32> @sdiv(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 +; CHECK-NEXT: vzext.vf4 v16, v9 +; CHECK-NEXT: vdivu.vv v8, v12, v16 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %sdiv = sdiv <vscale x 8 x i32> %a.zext, %b.zext + ret 
<vscale x 8 x i32> %sdiv +} + +define <vscale x 8 x i32> @udiv(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 +; CHECK-NEXT: vzext.vf4 v16, v9 +; CHECK-NEXT: vdivu.vv v8, v12, v16 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %udiv = udiv <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %udiv +} + +define <vscale x 8 x i32> @srem(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: srem: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 +; CHECK-NEXT: vzext.vf4 v16, v9 +; CHECK-NEXT: vremu.vv v8, v12, v16 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %srem = srem <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %srem +} + +define <vscale x 8 x i32> @urem(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: urem: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 +; CHECK-NEXT: vzext.vf4 v16, v9 +; CHECK-NEXT: vremu.vv v8, v12, v16 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %urem = urem <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %urem +} + +define <vscale x 8 x i32> @and(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: and: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %shl = and <vscale x 8 x i32> %a.zext, %b.zext + 
ret <vscale x 8 x i32> %shl +} + +define <vscale x 8 x i32> @or(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vor.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %or = or <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %or +} + +define <vscale x 8 x i32> @xor(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { +; CHECK-LABEL: xor: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vxor.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v8, v12 +; CHECK-NEXT: ret + %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> + %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> + %xor = xor <vscale x 8 x i32> %a.zext, %b.zext + ret <vscale x 8 x i32> %xor +} diff --git a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll new file mode 100644 index 0000000..84936d8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m -mattr=+v -O2 < %s \ +; RUN: | FileCheck --check-prefix=SPILL-O2 %s + +define <vscale x 1 x i32> @test_vector_std(<vscale x 1 x i32> %va) nounwind { +; SPILL-O2-LABEL: test_vector_std: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: sub sp, sp, a0 +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; 
SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + + ret <vscale x 1 x i32> %va +} + +define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee(<vscale x 1 x i32> %va) nounwind { +; SPILL-O2-LABEL: test_vector_callee: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 4 +; SPILL-O2-NEXT: sub sp, sp, a0 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 4 +; SPILL-O2-NEXT: sub a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 13 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 4 +; SPILL-O2-NEXT: sub a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; 
SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 13 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 4 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + + ret <vscale x 1 x i32> %va +} diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index a4aef57..571e2df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1187,3 +1187,19 @@ define <vscale x 2 x i32> @vmerge_larger_vl_false_becomes_tail(<vscale x 2 x i32 %b = call <vscale x 2 x i32> @llvm.riscv.vmerge.nxv2i32.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i32> %false, <vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i64 3) ret <vscale x 2 x i32> %b } + +; Test widening pseudos with their TIED variant (passthru same as first op). 
+define <vscale x 2 x i64> @vpmerge_vwsub.w_tied(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 zeroext %vl) { +; CHECK-LABEL: vpmerge_vwsub.w_tied: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vwsub.wv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %vl.zext = zext i32 %vl to i64 + %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.w.nxv2i64.nxv2i32(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %passthru, <vscale x 2 x i32> %y, i64 %vl.zext) + %b = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru, i32 %vl) + ret <vscale x 2 x i64> %b +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir index f28311e..f9b175e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode -run-pass arm-mve-vpt-opts %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode -run-pass arm-mve-vpt-opts -verify-machineinstrs %s -o - | FileCheck %s --- name: vcmp_with_opposite_cond @@ -1021,3 +1021,26 @@ body: | %16:mqpr = MVE_VORR %15, %15, 1, %10, $noreg, undef %16 %17:mqpr = MVE_VORR %16, %16, 1, %11, $noreg, undef %17 ... 
+--- +name: reuse_kill_flags +alignment: 4 +body: | + bb.0: + ; CHECK-LABEL: name: reuse_kill_flags + ; CHECK: [[t2MOVi:%[0-9]+]]:tgpreven = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vccr = COPY [[t2MOVi]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mqpr = IMPLICIT_DEF + ; CHECK-NEXT: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR [[DEF]], [[DEF]], 1, [[COPY]], $noreg, undef [[MVE_VORR]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mqpr = IMPLICIT_DEF + ; CHECK-NEXT: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[DEF1]], [[DEF1]], 1, killed [[COPY]], $noreg, undef [[MVE_VORR1]] + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit [[DEF1]] + %0:tgpreven = t2MOVi 0, 14, $noreg, $noreg + %1:vccr = COPY %0:tgpreven + %2:mqpr = IMPLICIT_DEF + %3:mqpr = MVE_VORR %2:mqpr, %2:mqpr, 1, killed %1, $noreg, undef %3 + %4:vccr = COPY %0:tgpreven + %5:mqpr = IMPLICIT_DEF + %6:mqpr = MVE_VORR %5:mqpr, %5:mqpr, 1, killed %4, $noreg, undef %6 + tBX_RET 14 /* CC::al */, $noreg, implicit %5:mqpr + +... diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index 9bb7fec..7a8ddf5 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -80,3 +80,33 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16 %trunc = trunc <16 x i16> %shuffle to <16 x i8> ret <16 x i8> %trunc } + +define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: combine_pavgw_demandedelts: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13] +; SSE-NEXT: pavgw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_pavgw_demandedelts: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13] +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pavgw_demandedelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %s0 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + %avg = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %s0, <8 x i16> %a1) + %shuffle = shufflevector <8 x i16> %avg, <8 x i16> poison, <8 x i32> zeroinitializer + ret <8 x i16> %shuffle +} + diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index e3e1cdc..022b25a 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -10,20 +10,13 @@ define i32 @t(ptr %val) nounwind { ; X86-SSE2-LABEL: t: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: movl 8(%eax), %eax ; X86-SSE2-NEXT: retl ; -; X64-SSSE3-LABEL: t: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] -; X64-SSSE3-NEXT: movd %xmm0, %eax -; X64-SSSE3-NEXT: retq -; -; X64-AVX-LABEL: t: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movl 8(%rdi), %eax -; X64-AVX-NEXT: retq +; X64-LABEL: t: +; X64: # %bb.0: +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: retq %tmp2 = load <2 x i64>, ptr %val, align 16 ; <<2 x i64>> [#uses=1] %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1] %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1] @@ -83,11 +76,9 @@ bb: define i64 @t4(ptr %a) { ; X86-SSE2-LABEL: t4: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqa (%eax), %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%ecx), %eax +; X86-SSE2-NEXT: movl 4(%ecx), %edx ; X86-SSE2-NEXT: retl ; ; X64-LABEL: t4: @@ -286,35 +277,25 @@ entry: define i32 @PR85419(ptr %p0) { ; X86-SSE2-LABEL: PR85419: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqa (%eax), %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm1, %ecx -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: orl (%eax), %ecx -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%ecx), %edx +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: orl 4(%ecx), %edx +; X86-SSE2-NEXT: je .LBB8_2 +; X86-SSE2-NEXT: # %bb.1: +; X86-SSE2-NEXT: movl 8(%ecx), %eax +; X86-SSE2-NEXT: .LBB8_2: ; X86-SSE2-NEXT: retl ; -; X64-SSSE3-LABEL: PR85419: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: xorl %ecx, %ecx -; X64-SSSE3-NEXT: cmpq $0, (%rdi) -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] -; X64-SSSE3-NEXT: movd %xmm0, %eax -; X64-SSSE3-NEXT: cmovel %ecx, %eax -; X64-SSSE3-NEXT: retq -; -; X64-AVX-LABEL: PR85419: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: cmpq $0, (%rdi) -; X64-AVX-NEXT: je .LBB8_2 -; X64-AVX-NEXT: # %bb.1: -; X64-AVX-NEXT: movl 8(%rdi), %eax -; X64-AVX-NEXT: .LBB8_2: -; X64-AVX-NEXT: retq +; X64-LABEL: PR85419: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq $0, (%rdi) +; X64-NEXT: je .LBB8_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: .LBB8_2: +; X64-NEXT: retq %load = load <2 x i64>, ptr %p0, align 16 %vecext.i = extractelement <2 x i64> %load, i64 0 %cmp = icmp eq i64 %vecext.i, 0 @@ -443,35 +424,35 @@ define i32 @main() nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: andl $-32, 
%esp ; X86-SSE2-NEXT: subl $64, %esp -; X86-SSE2-NEXT: movdqa zero, %xmm0 -; X86-SSE2-NEXT: movaps n1+16, %xmm1 -; X86-SSE2-NEXT: movaps n1, %xmm2 -; X86-SSE2-NEXT: movaps %xmm2, zero -; X86-SSE2-NEXT: movaps %xmm1, zero+16 -; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] -; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm1, (%esp) -; X86-SSE2-NEXT: movdqa (%esp), %xmm1 -; X86-SSE2-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm2, %eax -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movaps n1+16, %xmm0 +; X86-SSE2-NEXT: movaps n1, %xmm1 +; X86-SSE2-NEXT: movl zero+4, %ecx +; X86-SSE2-NEXT: movl zero+8, %eax +; X86-SSE2-NEXT: movaps %xmm1, zero +; X86-SSE2-NEXT: movaps %xmm0, zero+16 +; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = [2,2,2,2] +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: movdqa (%esp), %xmm0 +; X86-SSE2-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movd %xmm1, %esi ; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: divl %ecx -; X86-SSE2-NEXT: movl %eax, %ecx +; X86-SSE2-NEXT: divl %esi +; X86-SSE2-NEXT: movl %eax, %esi ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %esi +; X86-SSE2-NEXT: movd %xmm0, %edi +; X86-SSE2-NEXT: movl %ecx, %eax ; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: divl %esi -; X86-SSE2-NEXT: addl %ecx, %eax -; X86-SSE2-NEXT: leal -4(%ebp), %esp +; X86-SSE2-NEXT: divl %edi +; X86-SSE2-NEXT: addl %esi, %eax +; X86-SSE2-NEXT: leal -8(%ebp), %esp ; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; @@ -481,31 +462,29 @@ define i32 @main() nounwind { ; X64-SSSE3-NEXT: movq %rsp, %rbp 
; X64-SSSE3-NEXT: andq $-32, %rsp ; X64-SSSE3-NEXT: subq $64, %rsp -; X64-SSSE3-NEXT: movdqa zero(%rip), %xmm0 ; X64-SSSE3-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-SSSE3-NEXT: movaps (%rax), %xmm1 -; X64-SSSE3-NEXT: movaps 16(%rax), %xmm2 -; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) -; X64-SSSE3-NEXT: movaps %xmm2, zero+16(%rip) -; X64-SSSE3-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] -; X64-SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movaps %xmm1, (%rsp) -; X64-SSSE3-NEXT: movdqa (%rsp), %xmm1 -; X64-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSSE3-NEXT: movd %xmm2, %eax -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSSE3-NEXT: movd %xmm2, %ecx +; X64-SSSE3-NEXT: movaps (%rax), %xmm0 +; X64-SSSE3-NEXT: movaps 16(%rax), %xmm1 +; X64-SSSE3-NEXT: movl zero+4(%rip), %ecx +; X64-SSSE3-NEXT: movl zero+8(%rip), %eax +; X64-SSSE3-NEXT: movaps %xmm0, zero(%rip) +; X64-SSSE3-NEXT: movaps %xmm1, zero+16(%rip) +; X64-SSSE3-NEXT: movaps {{.*#+}} xmm0 = [2,2,2,2] +; X64-SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; X64-SSSE3-NEXT: movaps %xmm0, (%rsp) +; X64-SSSE3-NEXT: movdqa (%rsp), %xmm0 +; X64-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSSE3-NEXT: movd %xmm1, %esi ; X64-SSSE3-NEXT: xorl %edx, %edx -; X64-SSSE3-NEXT: divl %ecx -; X64-SSSE3-NEXT: movl %eax, %ecx +; X64-SSSE3-NEXT: divl %esi +; X64-SSSE3-NEXT: movl %eax, %esi ; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-SSSE3-NEXT: movd %xmm0, %eax -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-SSSE3-NEXT: movd %xmm0, %esi +; X64-SSSE3-NEXT: movd %xmm0, %edi +; X64-SSSE3-NEXT: movl %ecx, %eax ; X64-SSSE3-NEXT: xorl %edx, %edx -; X64-SSSE3-NEXT: divl %esi -; X64-SSSE3-NEXT: addl %ecx, %eax +; X64-SSSE3-NEXT: divl %edi +; X64-SSSE3-NEXT: addl %esi, %eax ; X64-SSSE3-NEXT: movq %rbp, %rsp ; X64-SSSE3-NEXT: popq %rbp ; X64-SSSE3-NEXT: retq diff 
--git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index 68dcfa7..e825328 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-linux-unknown | FileCheck %s --check-prefix=CHECK-64 -; RUN: llc < %s -mtriple=i386-linux-unknown | FileCheck %s --check-prefix=CHECK-32 +; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-64 +; RUN: llc < %s -mtriple=i386-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-32 ; Test that a large stack offset uses a single add/sub instruction to ; adjust the stack pointer. diff --git a/llvm/test/CodeGen/X86/huge-stack-offset2.ll b/llvm/test/CodeGen/X86/huge-stack-offset2.ll index 3bf0260..053643eb 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset2.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s --check-prefix=CHECK ; Test how we handle pathologically large stack frames when RAX is live through ; the prologue and epilogue. 
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 898b34e..6aa0a81 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -12,7 +12,7 @@ ; vXf64 ; -define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) { +define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) nounwind { ; SSE-LABEL: store_v1f64_v1i64: ; SSE: ## %bb.0: ; SSE-NEXT: testq %rdi, %rdi @@ -46,7 +46,7 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) ret void } -define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) { +define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) nounwind { ; SSE-LABEL: store_v2f64_v2i64: ; SSE: ## %bb.0: ; SSE-NEXT: movmskpd %xmm0, %eax @@ -106,7 +106,7 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) ret void } -define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) { +define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) nounwind { ; SSE2-LABEL: store_v4f64_v4i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] @@ -222,7 +222,7 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) ; vXf32 ; -define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) nounwind { ; SSE2-LABEL: store_v2f32_v2i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -314,7 +314,7 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) ret void } -define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i32> %mask) { +define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i32> %mask) nounwind { ; SSE2-LABEL: store_v4f32_v4i32: ; SSE2: ## 
%bb.0: ; SSE2-NEXT: movmskps %xmm2, %eax @@ -425,7 +425,7 @@ define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i3 ret void } -define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i32> %mask) { +define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i32> %mask) nounwind { ; SSE2-LABEL: store_v8f32_v8i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: packssdw %xmm5, %xmm4 @@ -605,7 +605,7 @@ define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i3 ret void } -define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 x i32> %mask) { +define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 x i32> %mask) nounwind { ; SSE2-LABEL: store_v16f32_v16i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 @@ -914,7 +914,7 @@ define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 ; vXi64 ; -define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) { +define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) nounwind { ; SSE2-LABEL: store_v2i64_v2i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: movmskpd %xmm0, %eax @@ -998,7 +998,7 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) { ret void } -define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) { +define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) nounwind { ; SSE2-LABEL: store_v4i64_v4i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] @@ -1122,7 +1122,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) { ; vXi32 ; -define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) { +define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) nounwind { ; SSE-LABEL: store_v1i32_v1i32: ; SSE: ## %bb.0: ; SSE-NEXT: testl %edi, %edi @@ -1156,7 +1156,7 @@ define void 
@store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) { ret void } -define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) nounwind { ; SSE2-LABEL: store_v2i32_v2i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -1256,7 +1256,7 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { ret void } -define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind { ; SSE2-LABEL: store_v4i32_v4i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -1370,7 +1370,7 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { ret void } -define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) { +define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) nounwind { ; SSE2-LABEL: store_v8i32_v8i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -1560,7 +1560,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) { ; vXi16 ; -define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) { +define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) nounwind { ; SSE2-LABEL: store_v8i16_v8i16: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -1907,7 +1907,7 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) { ret void } -define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val) { +define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val) nounwind { ; SSE2-LABEL: store_v16i16_v16i16: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -2676,7 +2676,7 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val ; vXi8 ; -define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, 
<16 x i8> %val) { +define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) nounwind { ; SSE2-LABEL: store_v16i8_v16i8: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -3273,7 +3273,7 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) { ret void } -define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) { +define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) nounwind { ; SSE2-LABEL: store_v32i8_v32i8: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -4670,7 +4670,7 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) { ;;; Stores with Constant Masks -define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind { ; SSE-LABEL: mstore_constmask_v4i32_v4i32: ; SSE: ## %bb.0: ; SSE-NEXT: movups %xmm1, (%rdi) @@ -4693,7 +4693,7 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i3 ; Make sure we are able to detect all ones constant mask after type legalization ; to avoid masked stores. -define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) { +define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) nounwind { ; SSE2-LABEL: mstore_constmask_allones_split: ; SSE2: ## %bb.0: ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 @@ -4810,7 +4810,7 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 ; When only one element of the mask is set, reduce to a scalar store. 
-define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) { +define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) nounwind { ; SSE-LABEL: one_mask_bit_set1: ; SSE: ## %bb.0: ; SSE-NEXT: movss %xmm0, (%rdi) @@ -4832,7 +4832,7 @@ define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) { ; Choose a different element to show that the correct address offset is produced. -define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) { +define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) nounwind { ; SSE2-LABEL: one_mask_bit_set2: ; SSE2: ## %bb.0: ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] @@ -4860,7 +4860,7 @@ define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) { ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. -define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) { +define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) nounwind { ; SSE-LABEL: one_mask_bit_set3: ; SSE: ## %bb.0: ; SSE-NEXT: movlps %xmm1, 16(%rdi) @@ -4886,7 +4886,7 @@ define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) { ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. -define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) { +define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind { ; SSE-LABEL: one_mask_bit_set4: ; SSE: ## %bb.0: ; SSE-NEXT: movhps %xmm1, 24(%rdi) @@ -4912,7 +4912,7 @@ define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) { ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. 
-define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) { +define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) nounwind { ; SSE-LABEL: one_mask_bit_set5: ; SSE: ## %bb.0: ; SSE-NEXT: movlps %xmm3, 48(%rdi) @@ -4944,7 +4944,7 @@ define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) { } ; Try one elt in each half of a vector that needs to split -define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) { +define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind { ; SSE2-LABEL: one_mask_bit_set6: ; SSE2: ## %bb.0: ; SSE2-NEXT: movlps %xmm3, 48(%rdi) @@ -4999,7 +4999,7 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ret void } -define void @top_bits_unset_stack() { +define void @top_bits_unset_stack() nounwind { ; SSE-LABEL: top_bits_unset_stack: ; SSE: ## %bb.0: ## %entry ; SSE-NEXT: xorps %xmm0, %xmm0 @@ -5047,7 +5047,6 @@ define void @top_bits_unset_stack() { ; X86-AVX512-LABEL: top_bits_unset_stack: ; X86-AVX512: ## %bb.0: ## %entry ; X86-AVX512-NEXT: subl $76, %esp -; X86-AVX512-NEXT: .cfi_def_cfa_offset 80 ; X86-AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 ; X86-AVX512-NEXT: movb $63, %al ; X86-AVX512-NEXT: kmovd %eax, %k1 @@ -5064,7 +5063,7 @@ entry: ; SimplifyDemandedBits eliminates an ashr here. 
-define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <4 x i32> %masksrc) { +define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <4 x i32> %masksrc) nounwind { ; SSE-LABEL: masked_store_bool_mask_demand_trunc_sext: ; SSE: ## %bb.0: ; SSE-NEXT: pslld $31, %xmm2 @@ -5160,7 +5159,7 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, < ; PR26697 -define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %mask) { +define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %mask) nounwind { ; SSE2-LABEL: one_mask_bit_set1_variable: ; SSE2: ## %bb.0: ; SSE2-NEXT: movmskps %xmm1, %eax @@ -5267,7 +5266,7 @@ define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> % ; This needs to be widened to v4i32. ; This used to assert in type legalization. PR38436 ; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask. -define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) { +define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { ; SSE2-LABEL: widen_masked_store: ; SSE2: ## %bb.0: ; SSE2-NEXT: andb $1, %sil @@ -5448,7 +5447,7 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) { ret void } -define void @zero_mask(ptr %addr, <2 x double> %val) { +define void @zero_mask(ptr %addr, <2 x double> %val) nounwind { ; SSE-LABEL: zero_mask: ; SSE: ## %bb.0: ; SSE-NEXT: retq @@ -5464,7 +5463,7 @@ define void @zero_mask(ptr %addr, <2 x double> %val) { ret void } -define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) { +define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) nounwind { ; SSE2-LABEL: PR11210: ; SSE2: ## %bb.0: ; SSE2-NEXT: movmskps %xmm2, %eax @@ -5638,492 +5637,248 @@ define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) ret void } -define void 
@store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) { -; SSE2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: -; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm7 -; SSE2-NEXT: movdqa 64(%rdi), %xmm8 -; SSE2-NEXT: movl 80(%rsi), %eax -; SSE2-NEXT: movl 64(%rsi), %r8d -; SSE2-NEXT: movl 48(%rsi), %r9d -; SSE2-NEXT: movl 32(%rsi), %r10d -; SSE2-NEXT: movl 16(%rsi), %r11d -; SSE2-NEXT: movdqa 80(%rsi), %xmm0 -; SSE2-NEXT: movdqa 64(%rsi), %xmm1 -; SSE2-NEXT: movdqa 48(%rsi), %xmm2 -; SSE2-NEXT: movdqa 32(%rsi), %xmm3 -; SSE2-NEXT: movdqa 16(%rsi), %xmm4 -; SSE2-NEXT: movdqa (%rsi), %xmm5 -; SSE2-NEXT: packssdw 48(%rdi), %xmm7 -; SSE2-NEXT: packssdw 16(%rdi), %xmm6 -; SSE2-NEXT: packsswb %xmm7, %xmm6 -; SSE2-NEXT: packssdw 80(%rdi), %xmm8 -; SSE2-NEXT: packsswb %xmm8, %xmm8 -; SSE2-NEXT: pmovmskb %xmm6, %edi -; SSE2-NEXT: andl $21845, %edi ## imm = 0x5555 -; SSE2-NEXT: pmovmskb %xmm8, %ecx -; SSE2-NEXT: andl $85, %ecx -; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: orl %edi, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: jne LBB31_1 -; SSE2-NEXT: ## %bb.2: ## %else -; SSE2-NEXT: testb $2, %cl -; SSE2-NEXT: jne LBB31_3 -; SSE2-NEXT: LBB31_4: ## %else2 -; SSE2-NEXT: testb $4, %cl -; SSE2-NEXT: jne LBB31_5 -; SSE2-NEXT: LBB31_6: ## %else4 -; SSE2-NEXT: testb $8, %cl -; SSE2-NEXT: jne LBB31_7 -; SSE2-NEXT: LBB31_8: ## %else6 -; SSE2-NEXT: testb $16, %cl -; SSE2-NEXT: jne LBB31_9 -; SSE2-NEXT: LBB31_10: ## %else8 -; SSE2-NEXT: testb $32, %cl -; SSE2-NEXT: jne LBB31_11 -; SSE2-NEXT: LBB31_12: ## %else10 -; SSE2-NEXT: testb $64, %cl -; SSE2-NEXT: jne LBB31_13 -; SSE2-NEXT: LBB31_14: ## %else12 -; SSE2-NEXT: testb %cl, %cl -; SSE2-NEXT: js LBB31_15 -; SSE2-NEXT: LBB31_16: ## %else14 -; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 -; SSE2-NEXT: jne LBB31_17 -; SSE2-NEXT: LBB31_18: ## %else16 -; SSE2-NEXT: testl $512, %ecx ## imm = 0x200 -; SSE2-NEXT: jne LBB31_19 -; SSE2-NEXT: 
LBB31_20: ## %else18 -; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400 -; SSE2-NEXT: jne LBB31_21 -; SSE2-NEXT: LBB31_22: ## %else20 -; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800 -; SSE2-NEXT: jne LBB31_23 -; SSE2-NEXT: LBB31_24: ## %else22 -; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 -; SSE2-NEXT: jne LBB31_25 -; SSE2-NEXT: LBB31_26: ## %else24 -; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000 -; SSE2-NEXT: jne LBB31_27 -; SSE2-NEXT: LBB31_28: ## %else26 -; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000 -; SSE2-NEXT: jne LBB31_29 -; SSE2-NEXT: LBB31_30: ## %else28 -; SSE2-NEXT: testw %cx, %cx -; SSE2-NEXT: js LBB31_31 -; SSE2-NEXT: LBB31_32: ## %else30 -; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 -; SSE2-NEXT: jne LBB31_33 -; SSE2-NEXT: LBB31_34: ## %else32 -; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000 -; SSE2-NEXT: jne LBB31_35 -; SSE2-NEXT: LBB31_36: ## %else34 -; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000 -; SSE2-NEXT: jne LBB31_37 -; SSE2-NEXT: LBB31_38: ## %else36 -; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000 -; SSE2-NEXT: jne LBB31_39 -; SSE2-NEXT: LBB31_40: ## %else38 -; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 -; SSE2-NEXT: jne LBB31_41 -; SSE2-NEXT: LBB31_42: ## %else40 -; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000 -; SSE2-NEXT: jne LBB31_43 -; SSE2-NEXT: LBB31_44: ## %else42 -; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000 -; SSE2-NEXT: je LBB31_46 -; SSE2-NEXT: LBB31_45: ## %cond.store43 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movl %eax, 88(%rdx) -; SSE2-NEXT: LBB31_46: ## %else44 -; SSE2-NEXT: movb $1, %al -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne LBB31_48 -; SSE2-NEXT: ## %bb.47: ## %cond.store45 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movl %eax, 92(%rdx) -; SSE2-NEXT: LBB31_48: ## %else46 -; SSE2-NEXT: retq -; SSE2-NEXT: LBB31_1: ## %cond.store -; SSE2-NEXT: movl (%rsi), %esi -; SSE2-NEXT: movl 
%esi, (%rdx) -; SSE2-NEXT: testb $2, %cl -; SSE2-NEXT: je LBB31_4 -; SSE2-NEXT: LBB31_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE2-NEXT: movd %xmm6, %esi -; SSE2-NEXT: movl %esi, 4(%rdx) -; SSE2-NEXT: testb $4, %cl -; SSE2-NEXT: je LBB31_6 -; SSE2-NEXT: LBB31_5: ## %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; SSE2-NEXT: movd %xmm6, %esi -; SSE2-NEXT: movl %esi, 8(%rdx) -; SSE2-NEXT: testb $8, %cl -; SSE2-NEXT: je LBB31_8 -; SSE2-NEXT: LBB31_7: ## %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; SSE2-NEXT: movd %xmm5, %esi -; SSE2-NEXT: movl %esi, 12(%rdx) -; SSE2-NEXT: testb $16, %cl -; SSE2-NEXT: je LBB31_10 -; SSE2-NEXT: LBB31_9: ## %cond.store7 -; SSE2-NEXT: movl %r11d, 16(%rdx) -; SSE2-NEXT: testb $32, %cl -; SSE2-NEXT: je LBB31_12 -; SSE2-NEXT: LBB31_11: ## %cond.store9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; SSE2-NEXT: movd %xmm5, %esi -; SSE2-NEXT: movl %esi, 20(%rdx) -; SSE2-NEXT: testb $64, %cl -; SSE2-NEXT: je LBB31_14 -; SSE2-NEXT: LBB31_13: ## %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE2-NEXT: movd %xmm5, %esi -; SSE2-NEXT: movl %esi, 24(%rdx) -; SSE2-NEXT: testb %cl, %cl -; SSE2-NEXT: jns LBB31_16 -; SSE2-NEXT: LBB31_15: ## %cond.store13 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE2-NEXT: movd %xmm4, %esi -; SSE2-NEXT: movl %esi, 28(%rdx) -; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 -; SSE2-NEXT: je LBB31_18 -; SSE2-NEXT: LBB31_17: ## %cond.store15 -; SSE2-NEXT: movl %r10d, 32(%rdx) -; SSE2-NEXT: testl $512, %ecx ## imm = 0x200 -; SSE2-NEXT: je LBB31_20 -; SSE2-NEXT: LBB31_19: ## %cond.store17 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; SSE2-NEXT: movd %xmm4, %esi -; SSE2-NEXT: movl %esi, 36(%rdx) -; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400 -; SSE2-NEXT: je LBB31_22 -; SSE2-NEXT: LBB31_21: ## %cond.store19 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSE2-NEXT: movd %xmm4, %esi -; SSE2-NEXT: movl %esi, 
40(%rdx) -; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800 -; SSE2-NEXT: je LBB31_24 -; SSE2-NEXT: LBB31_23: ## %cond.store21 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: movl %esi, 44(%rdx) -; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 -; SSE2-NEXT: je LBB31_26 -; SSE2-NEXT: LBB31_25: ## %cond.store23 -; SSE2-NEXT: movl %r9d, 48(%rdx) -; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000 -; SSE2-NEXT: je LBB31_28 -; SSE2-NEXT: LBB31_27: ## %cond.store25 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: movl %esi, 52(%rdx) -; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000 -; SSE2-NEXT: je LBB31_30 -; SSE2-NEXT: LBB31_29: ## %cond.store27 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: movl %esi, 56(%rdx) -; SSE2-NEXT: testw %cx, %cx -; SSE2-NEXT: jns LBB31_32 -; SSE2-NEXT: LBB31_31: ## %cond.store29 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: movl %esi, 60(%rdx) -; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 -; SSE2-NEXT: je LBB31_34 -; SSE2-NEXT: LBB31_33: ## %cond.store31 -; SSE2-NEXT: movl %r8d, 64(%rdx) -; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000 -; SSE2-NEXT: je LBB31_36 -; SSE2-NEXT: LBB31_35: ## %cond.store33 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: movl %esi, 68(%rdx) -; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000 -; SSE2-NEXT: je LBB31_38 -; SSE2-NEXT: LBB31_37: ## %cond.store35 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: movl %esi, 72(%rdx) -; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000 -; SSE2-NEXT: je LBB31_40 -; SSE2-NEXT: LBB31_39: ## %cond.store37 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: movl %esi, 76(%rdx) -; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 -; SSE2-NEXT: je LBB31_42 -; 
SSE2-NEXT: LBB31_41: ## %cond.store39 -; SSE2-NEXT: movl %eax, 80(%rdx) -; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000 -; SSE2-NEXT: je LBB31_44 -; SSE2-NEXT: LBB31_43: ## %cond.store41 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movl %eax, 84(%rdx) -; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000 -; SSE2-NEXT: jne LBB31_45 -; SSE2-NEXT: jmp LBB31_46 -; -; SSE4-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: -; SSE4: ## %bb.0: -; SSE4-NEXT: pushq %rbp -; SSE4-NEXT: .cfi_def_cfa_offset 16 -; SSE4-NEXT: pushq %r15 -; SSE4-NEXT: .cfi_def_cfa_offset 24 -; SSE4-NEXT: pushq %r14 -; SSE4-NEXT: .cfi_def_cfa_offset 32 -; SSE4-NEXT: pushq %r13 -; SSE4-NEXT: .cfi_def_cfa_offset 40 -; SSE4-NEXT: pushq %r12 -; SSE4-NEXT: .cfi_def_cfa_offset 48 -; SSE4-NEXT: pushq %rbx -; SSE4-NEXT: .cfi_def_cfa_offset 56 -; SSE4-NEXT: .cfi_offset %rbx, -56 -; SSE4-NEXT: .cfi_offset %r12, -48 -; SSE4-NEXT: .cfi_offset %r13, -40 -; SSE4-NEXT: .cfi_offset %r14, -32 -; SSE4-NEXT: .cfi_offset %r15, -24 -; SSE4-NEXT: .cfi_offset %rbp, -16 -; SSE4-NEXT: movdqa (%rdi), %xmm1 -; SSE4-NEXT: movdqa 32(%rdi), %xmm2 -; SSE4-NEXT: movdqa 64(%rdi), %xmm0 -; SSE4-NEXT: movl 92(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 88(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 84(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 80(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 76(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 72(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 68(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 64(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte 
Spill -; SSE4-NEXT: movl 60(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 56(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 52(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: packssdw 48(%rdi), %xmm2 -; SSE4-NEXT: packssdw 16(%rdi), %xmm1 -; SSE4-NEXT: packsswb %xmm2, %xmm1 -; SSE4-NEXT: packssdw 80(%rdi), %xmm0 -; SSE4-NEXT: packsswb %xmm0, %xmm0 -; SSE4-NEXT: pmovmskb %xmm1, %eax -; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 -; SSE4-NEXT: pmovmskb %xmm0, %edi -; SSE4-NEXT: andl $85, %edi -; SSE4-NEXT: shll $16, %edi -; SSE4-NEXT: orl %eax, %edi -; SSE4-NEXT: movl 48(%rsi), %r13d -; SSE4-NEXT: testb $1, %dil -; SSE4-NEXT: movl 44(%rsi), %eax -; SSE4-NEXT: movl 40(%rsi), %ecx -; SSE4-NEXT: movl 36(%rsi), %r8d -; SSE4-NEXT: movl 32(%rsi), %r9d -; SSE4-NEXT: movl 28(%rsi), %r10d -; SSE4-NEXT: movl 24(%rsi), %r11d -; SSE4-NEXT: movl 20(%rsi), %ebx -; SSE4-NEXT: movl 16(%rsi), %ebp -; SSE4-NEXT: movl 12(%rsi), %r14d -; SSE4-NEXT: movl 8(%rsi), %r15d -; SSE4-NEXT: movl 4(%rsi), %r12d -; SSE4-NEXT: jne LBB31_1 -; SSE4-NEXT: ## %bb.2: ## %else -; SSE4-NEXT: testb $2, %dil -; SSE4-NEXT: jne LBB31_3 -; SSE4-NEXT: LBB31_4: ## %else2 -; SSE4-NEXT: testb $4, %dil -; SSE4-NEXT: jne LBB31_5 -; SSE4-NEXT: LBB31_6: ## %else4 -; SSE4-NEXT: testb $8, %dil -; SSE4-NEXT: jne LBB31_7 -; SSE4-NEXT: LBB31_8: ## %else6 -; SSE4-NEXT: testb $16, %dil -; SSE4-NEXT: jne LBB31_9 -; SSE4-NEXT: LBB31_10: ## %else8 -; SSE4-NEXT: testb $32, %dil -; SSE4-NEXT: jne LBB31_11 -; SSE4-NEXT: LBB31_12: ## %else10 -; SSE4-NEXT: testb $64, %dil -; SSE4-NEXT: jne LBB31_13 -; SSE4-NEXT: LBB31_14: ## %else12 -; SSE4-NEXT: testb %dil, %dil -; SSE4-NEXT: js LBB31_15 -; SSE4-NEXT: LBB31_16: ## %else14 -; SSE4-NEXT: testl $256, %edi ## imm = 0x100 -; SSE4-NEXT: jne LBB31_17 -; SSE4-NEXT: LBB31_18: ## %else16 -; SSE4-NEXT: testl $512, %edi ## imm = 0x200 -; SSE4-NEXT: 
jne LBB31_19 -; SSE4-NEXT: LBB31_20: ## %else18 -; SSE4-NEXT: testl $1024, %edi ## imm = 0x400 -; SSE4-NEXT: jne LBB31_21 -; SSE4-NEXT: LBB31_22: ## %else20 -; SSE4-NEXT: testl $2048, %edi ## imm = 0x800 -; SSE4-NEXT: jne LBB31_23 -; SSE4-NEXT: LBB31_24: ## %else22 -; SSE4-NEXT: testl $4096, %edi ## imm = 0x1000 -; SSE4-NEXT: jne LBB31_25 -; SSE4-NEXT: LBB31_26: ## %else24 -; SSE4-NEXT: testl $8192, %edi ## imm = 0x2000 -; SSE4-NEXT: jne LBB31_27 -; SSE4-NEXT: LBB31_28: ## %else26 -; SSE4-NEXT: testl $16384, %edi ## imm = 0x4000 -; SSE4-NEXT: jne LBB31_29 -; SSE4-NEXT: LBB31_30: ## %else28 -; SSE4-NEXT: testw %di, %di -; SSE4-NEXT: js LBB31_31 -; SSE4-NEXT: LBB31_32: ## %else30 -; SSE4-NEXT: testl $65536, %edi ## imm = 0x10000 -; SSE4-NEXT: jne LBB31_33 -; SSE4-NEXT: LBB31_34: ## %else32 -; SSE4-NEXT: testl $131072, %edi ## imm = 0x20000 -; SSE4-NEXT: jne LBB31_35 -; SSE4-NEXT: LBB31_36: ## %else34 -; SSE4-NEXT: testl $262144, %edi ## imm = 0x40000 -; SSE4-NEXT: jne LBB31_37 -; SSE4-NEXT: LBB31_38: ## %else36 -; SSE4-NEXT: testl $524288, %edi ## imm = 0x80000 -; SSE4-NEXT: jne LBB31_39 -; SSE4-NEXT: LBB31_40: ## %else38 -; SSE4-NEXT: testl $1048576, %edi ## imm = 0x100000 -; SSE4-NEXT: jne LBB31_41 -; SSE4-NEXT: LBB31_42: ## %else40 -; SSE4-NEXT: testl $2097152, %edi ## imm = 0x200000 -; SSE4-NEXT: jne LBB31_43 -; SSE4-NEXT: LBB31_44: ## %else42 -; SSE4-NEXT: testl $4194304, %edi ## imm = 0x400000 -; SSE4-NEXT: je LBB31_46 -; SSE4-NEXT: LBB31_45: ## %cond.store43 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 88(%rdx) -; SSE4-NEXT: LBB31_46: ## %else44 -; SSE4-NEXT: movb $1, %al -; SSE4-NEXT: testb %al, %al -; SSE4-NEXT: jne LBB31_48 -; SSE4-NEXT: ## %bb.47: ## %cond.store45 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 92(%rdx) -; SSE4-NEXT: LBB31_48: ## %else46 -; SSE4-NEXT: popq %rbx -; SSE4-NEXT: popq %r12 -; SSE4-NEXT: popq %r13 -; SSE4-NEXT: popq %r14 -; SSE4-NEXT: popq 
%r15 -; SSE4-NEXT: popq %rbp -; SSE4-NEXT: retq -; SSE4-NEXT: LBB31_1: ## %cond.store -; SSE4-NEXT: movl (%rsi), %esi -; SSE4-NEXT: movl %esi, (%rdx) -; SSE4-NEXT: testb $2, %dil -; SSE4-NEXT: je LBB31_4 -; SSE4-NEXT: LBB31_3: ## %cond.store1 -; SSE4-NEXT: movl %r12d, 4(%rdx) -; SSE4-NEXT: testb $4, %dil -; SSE4-NEXT: je LBB31_6 -; SSE4-NEXT: LBB31_5: ## %cond.store3 -; SSE4-NEXT: movl %r15d, 8(%rdx) -; SSE4-NEXT: testb $8, %dil -; SSE4-NEXT: je LBB31_8 -; SSE4-NEXT: LBB31_7: ## %cond.store5 -; SSE4-NEXT: movl %r14d, 12(%rdx) -; SSE4-NEXT: testb $16, %dil -; SSE4-NEXT: je LBB31_10 -; SSE4-NEXT: LBB31_9: ## %cond.store7 -; SSE4-NEXT: movl %ebp, 16(%rdx) -; SSE4-NEXT: testb $32, %dil -; SSE4-NEXT: je LBB31_12 -; SSE4-NEXT: LBB31_11: ## %cond.store9 -; SSE4-NEXT: movl %ebx, 20(%rdx) -; SSE4-NEXT: testb $64, %dil -; SSE4-NEXT: je LBB31_14 -; SSE4-NEXT: LBB31_13: ## %cond.store11 -; SSE4-NEXT: movl %r11d, 24(%rdx) -; SSE4-NEXT: testb %dil, %dil -; SSE4-NEXT: jns LBB31_16 -; SSE4-NEXT: LBB31_15: ## %cond.store13 -; SSE4-NEXT: movl %r10d, 28(%rdx) -; SSE4-NEXT: testl $256, %edi ## imm = 0x100 -; SSE4-NEXT: je LBB31_18 -; SSE4-NEXT: LBB31_17: ## %cond.store15 -; SSE4-NEXT: movl %r9d, 32(%rdx) -; SSE4-NEXT: testl $512, %edi ## imm = 0x200 -; SSE4-NEXT: je LBB31_20 -; SSE4-NEXT: LBB31_19: ## %cond.store17 -; SSE4-NEXT: movl %r8d, 36(%rdx) -; SSE4-NEXT: testl $1024, %edi ## imm = 0x400 -; SSE4-NEXT: je LBB31_22 -; SSE4-NEXT: LBB31_21: ## %cond.store19 -; SSE4-NEXT: movl %ecx, 40(%rdx) -; SSE4-NEXT: testl $2048, %edi ## imm = 0x800 -; SSE4-NEXT: je LBB31_24 -; SSE4-NEXT: LBB31_23: ## %cond.store21 -; SSE4-NEXT: movl %eax, 44(%rdx) -; SSE4-NEXT: testl $4096, %edi ## imm = 0x1000 -; SSE4-NEXT: je LBB31_26 -; SSE4-NEXT: LBB31_25: ## %cond.store23 -; SSE4-NEXT: movl %r13d, 48(%rdx) -; SSE4-NEXT: testl $8192, %edi ## imm = 0x2000 -; SSE4-NEXT: je LBB31_28 -; SSE4-NEXT: LBB31_27: ## %cond.store25 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl 
%eax, 52(%rdx) -; SSE4-NEXT: testl $16384, %edi ## imm = 0x4000 -; SSE4-NEXT: je LBB31_30 -; SSE4-NEXT: LBB31_29: ## %cond.store27 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 56(%rdx) -; SSE4-NEXT: testw %di, %di -; SSE4-NEXT: jns LBB31_32 -; SSE4-NEXT: LBB31_31: ## %cond.store29 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 60(%rdx) -; SSE4-NEXT: testl $65536, %edi ## imm = 0x10000 -; SSE4-NEXT: je LBB31_34 -; SSE4-NEXT: LBB31_33: ## %cond.store31 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 64(%rdx) -; SSE4-NEXT: testl $131072, %edi ## imm = 0x20000 -; SSE4-NEXT: je LBB31_36 -; SSE4-NEXT: LBB31_35: ## %cond.store33 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 68(%rdx) -; SSE4-NEXT: testl $262144, %edi ## imm = 0x40000 -; SSE4-NEXT: je LBB31_38 -; SSE4-NEXT: LBB31_37: ## %cond.store35 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 72(%rdx) -; SSE4-NEXT: testl $524288, %edi ## imm = 0x80000 -; SSE4-NEXT: je LBB31_40 -; SSE4-NEXT: LBB31_39: ## %cond.store37 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 76(%rdx) -; SSE4-NEXT: testl $1048576, %edi ## imm = 0x100000 -; SSE4-NEXT: je LBB31_42 -; SSE4-NEXT: LBB31_41: ## %cond.store39 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 80(%rdx) -; SSE4-NEXT: testl $2097152, %edi ## imm = 0x200000 -; SSE4-NEXT: je LBB31_44 -; SSE4-NEXT: LBB31_43: ## %cond.store41 -; SSE4-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload -; SSE4-NEXT: movl %eax, 84(%rdx) -; SSE4-NEXT: testl $4194304, %edi ## imm = 0x400000 -; SSE4-NEXT: jne LBB31_45 -; SSE4-NEXT: jmp LBB31_46 +define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) nounwind { +; SSE-LABEL: 
store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: +; SSE: ## %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movl 92(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 88(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 84(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 80(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 76(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 72(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 68(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 64(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 60(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 56(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: movl 52(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE-NEXT: packssdw 48(%rdi), %xmm2 +; SSE-NEXT: packssdw 16(%rdi), %xmm1 +; SSE-NEXT: packsswb %xmm2, %xmm1 +; SSE-NEXT: packssdw 80(%rdi), %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: andl $21845, %eax ## imm = 0x5555 +; SSE-NEXT: pmovmskb %xmm0, %edi +; SSE-NEXT: andl $85, %edi +; SSE-NEXT: shll $16, %edi +; SSE-NEXT: orl %eax, %edi +; SSE-NEXT: movl 48(%rsi), %r13d +; SSE-NEXT: testb $1, %dil +; SSE-NEXT: movl 44(%rsi), %eax +; SSE-NEXT: movl 40(%rsi), %ecx +; SSE-NEXT: movl 36(%rsi), %r8d +; SSE-NEXT: movl 32(%rsi), %r9d +; SSE-NEXT: movl 
28(%rsi), %r10d +; SSE-NEXT: movl 24(%rsi), %r11d +; SSE-NEXT: movl 20(%rsi), %ebx +; SSE-NEXT: movl 16(%rsi), %ebp +; SSE-NEXT: movl 12(%rsi), %r14d +; SSE-NEXT: movl 8(%rsi), %r15d +; SSE-NEXT: movl 4(%rsi), %r12d +; SSE-NEXT: jne LBB31_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %dil +; SSE-NEXT: jne LBB31_3 +; SSE-NEXT: LBB31_4: ## %else2 +; SSE-NEXT: testb $4, %dil +; SSE-NEXT: jne LBB31_5 +; SSE-NEXT: LBB31_6: ## %else4 +; SSE-NEXT: testb $8, %dil +; SSE-NEXT: jne LBB31_7 +; SSE-NEXT: LBB31_8: ## %else6 +; SSE-NEXT: testb $16, %dil +; SSE-NEXT: jne LBB31_9 +; SSE-NEXT: LBB31_10: ## %else8 +; SSE-NEXT: testb $32, %dil +; SSE-NEXT: jne LBB31_11 +; SSE-NEXT: LBB31_12: ## %else10 +; SSE-NEXT: testb $64, %dil +; SSE-NEXT: jne LBB31_13 +; SSE-NEXT: LBB31_14: ## %else12 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: js LBB31_15 +; SSE-NEXT: LBB31_16: ## %else14 +; SSE-NEXT: testl $256, %edi ## imm = 0x100 +; SSE-NEXT: jne LBB31_17 +; SSE-NEXT: LBB31_18: ## %else16 +; SSE-NEXT: testl $512, %edi ## imm = 0x200 +; SSE-NEXT: jne LBB31_19 +; SSE-NEXT: LBB31_20: ## %else18 +; SSE-NEXT: testl $1024, %edi ## imm = 0x400 +; SSE-NEXT: jne LBB31_21 +; SSE-NEXT: LBB31_22: ## %else20 +; SSE-NEXT: testl $2048, %edi ## imm = 0x800 +; SSE-NEXT: jne LBB31_23 +; SSE-NEXT: LBB31_24: ## %else22 +; SSE-NEXT: testl $4096, %edi ## imm = 0x1000 +; SSE-NEXT: jne LBB31_25 +; SSE-NEXT: LBB31_26: ## %else24 +; SSE-NEXT: testl $8192, %edi ## imm = 0x2000 +; SSE-NEXT: jne LBB31_27 +; SSE-NEXT: LBB31_28: ## %else26 +; SSE-NEXT: testl $16384, %edi ## imm = 0x4000 +; SSE-NEXT: jne LBB31_29 +; SSE-NEXT: LBB31_30: ## %else28 +; SSE-NEXT: testw %di, %di +; SSE-NEXT: js LBB31_31 +; SSE-NEXT: LBB31_32: ## %else30 +; SSE-NEXT: testl $65536, %edi ## imm = 0x10000 +; SSE-NEXT: jne LBB31_33 +; SSE-NEXT: LBB31_34: ## %else32 +; SSE-NEXT: testl $131072, %edi ## imm = 0x20000 +; SSE-NEXT: jne LBB31_35 +; SSE-NEXT: LBB31_36: ## %else34 +; SSE-NEXT: testl $262144, %edi ## imm = 0x40000 +; SSE-NEXT: jne 
LBB31_37 +; SSE-NEXT: LBB31_38: ## %else36 +; SSE-NEXT: testl $524288, %edi ## imm = 0x80000 +; SSE-NEXT: jne LBB31_39 +; SSE-NEXT: LBB31_40: ## %else38 +; SSE-NEXT: testl $1048576, %edi ## imm = 0x100000 +; SSE-NEXT: jne LBB31_41 +; SSE-NEXT: LBB31_42: ## %else40 +; SSE-NEXT: testl $2097152, %edi ## imm = 0x200000 +; SSE-NEXT: jne LBB31_43 +; SSE-NEXT: LBB31_44: ## %else42 +; SSE-NEXT: testl $4194304, %edi ## imm = 0x400000 +; SSE-NEXT: je LBB31_46 +; SSE-NEXT: LBB31_45: ## %cond.store43 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 88(%rdx) +; SSE-NEXT: LBB31_46: ## %else44 +; SSE-NEXT: movb $1, %al +; SSE-NEXT: testb %al, %al +; SSE-NEXT: jne LBB31_48 +; SSE-NEXT: ## %bb.47: ## %cond.store45 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 92(%rdx) +; SSE-NEXT: LBB31_48: ## %else46 +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; SSE-NEXT: LBB31_1: ## %cond.store +; SSE-NEXT: movl (%rsi), %esi +; SSE-NEXT: movl %esi, (%rdx) +; SSE-NEXT: testb $2, %dil +; SSE-NEXT: je LBB31_4 +; SSE-NEXT: LBB31_3: ## %cond.store1 +; SSE-NEXT: movl %r12d, 4(%rdx) +; SSE-NEXT: testb $4, %dil +; SSE-NEXT: je LBB31_6 +; SSE-NEXT: LBB31_5: ## %cond.store3 +; SSE-NEXT: movl %r15d, 8(%rdx) +; SSE-NEXT: testb $8, %dil +; SSE-NEXT: je LBB31_8 +; SSE-NEXT: LBB31_7: ## %cond.store5 +; SSE-NEXT: movl %r14d, 12(%rdx) +; SSE-NEXT: testb $16, %dil +; SSE-NEXT: je LBB31_10 +; SSE-NEXT: LBB31_9: ## %cond.store7 +; SSE-NEXT: movl %ebp, 16(%rdx) +; SSE-NEXT: testb $32, %dil +; SSE-NEXT: je LBB31_12 +; SSE-NEXT: LBB31_11: ## %cond.store9 +; SSE-NEXT: movl %ebx, 20(%rdx) +; SSE-NEXT: testb $64, %dil +; SSE-NEXT: je LBB31_14 +; SSE-NEXT: LBB31_13: ## %cond.store11 +; SSE-NEXT: movl %r11d, 24(%rdx) +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: jns LBB31_16 +; SSE-NEXT: LBB31_15: ## %cond.store13 +; SSE-NEXT: movl 
%r10d, 28(%rdx) +; SSE-NEXT: testl $256, %edi ## imm = 0x100 +; SSE-NEXT: je LBB31_18 +; SSE-NEXT: LBB31_17: ## %cond.store15 +; SSE-NEXT: movl %r9d, 32(%rdx) +; SSE-NEXT: testl $512, %edi ## imm = 0x200 +; SSE-NEXT: je LBB31_20 +; SSE-NEXT: LBB31_19: ## %cond.store17 +; SSE-NEXT: movl %r8d, 36(%rdx) +; SSE-NEXT: testl $1024, %edi ## imm = 0x400 +; SSE-NEXT: je LBB31_22 +; SSE-NEXT: LBB31_21: ## %cond.store19 +; SSE-NEXT: movl %ecx, 40(%rdx) +; SSE-NEXT: testl $2048, %edi ## imm = 0x800 +; SSE-NEXT: je LBB31_24 +; SSE-NEXT: LBB31_23: ## %cond.store21 +; SSE-NEXT: movl %eax, 44(%rdx) +; SSE-NEXT: testl $4096, %edi ## imm = 0x1000 +; SSE-NEXT: je LBB31_26 +; SSE-NEXT: LBB31_25: ## %cond.store23 +; SSE-NEXT: movl %r13d, 48(%rdx) +; SSE-NEXT: testl $8192, %edi ## imm = 0x2000 +; SSE-NEXT: je LBB31_28 +; SSE-NEXT: LBB31_27: ## %cond.store25 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 52(%rdx) +; SSE-NEXT: testl $16384, %edi ## imm = 0x4000 +; SSE-NEXT: je LBB31_30 +; SSE-NEXT: LBB31_29: ## %cond.store27 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 56(%rdx) +; SSE-NEXT: testw %di, %di +; SSE-NEXT: jns LBB31_32 +; SSE-NEXT: LBB31_31: ## %cond.store29 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 60(%rdx) +; SSE-NEXT: testl $65536, %edi ## imm = 0x10000 +; SSE-NEXT: je LBB31_34 +; SSE-NEXT: LBB31_33: ## %cond.store31 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 64(%rdx) +; SSE-NEXT: testl $131072, %edi ## imm = 0x20000 +; SSE-NEXT: je LBB31_36 +; SSE-NEXT: LBB31_35: ## %cond.store33 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 68(%rdx) +; SSE-NEXT: testl $262144, %edi ## imm = 0x40000 +; SSE-NEXT: je LBB31_38 +; SSE-NEXT: LBB31_37: ## %cond.store35 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 72(%rdx) +; 
SSE-NEXT: testl $524288, %edi ## imm = 0x80000 +; SSE-NEXT: je LBB31_40 +; SSE-NEXT: LBB31_39: ## %cond.store37 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 76(%rdx) +; SSE-NEXT: testl $1048576, %edi ## imm = 0x100000 +; SSE-NEXT: je LBB31_42 +; SSE-NEXT: LBB31_41: ## %cond.store39 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 80(%rdx) +; SSE-NEXT: testl $2097152, %edi ## imm = 0x200000 +; SSE-NEXT: je LBB31_44 +; SSE-NEXT: LBB31_43: ## %cond.store41 +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; SSE-NEXT: movl %eax, 84(%rdx) +; SSE-NEXT: testl $4194304, %edi ## imm = 0x400000 +; SSE-NEXT: jne LBB31_45 +; SSE-NEXT: jmp LBB31_46 ; ; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; AVX1: ## %bb.0: @@ -6266,7 +6021,7 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge } ; From https://reviews.llvm.org/rGf8d9097168b7#1165311 -define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) #0 { +define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind { ; SSE2-LABEL: undefshuffle: ; SSE2: ## %bb.0: ## %else ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll index 426f4ee..6a5770a 100644 --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s 
--check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) @@ -71,28 +71,12 @@ define i1 @parseHeaders2_scalar_or(ptr %ptr) nounwind { } define i1 @parseHeaders2_scalar_and(ptr %ptr) nounwind { -; SSE2-LABEL: parseHeaders2_scalar_and: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, (%rdi) -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: parseHeaders2_scalar_and: -; SSE41: # %bb.0: -; SSE41-NEXT: movq (%rdi), %rax -; SSE41-NEXT: testq %rax, 8(%rdi) -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq -; -; AVX-LABEL: parseHeaders2_scalar_and: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: testq %rax, 8(%rdi) -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; CHECK-LABEL: parseHeaders2_scalar_and: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: testq %rax, 8(%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %vload = load <2 x i64>, ptr %ptr, align 8 %v1 = extractelement <2 x i64> %vload, i32 0 %v2 = extractelement <2 x i64> %vload, i32 1 diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2187c65..97c3c204 100644 --- 
a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -60,36 +60,30 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: .LBB0_2: # %vector.body ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movdqu 1024(%rdx,%rdi), %xmm5 -; CHECK-NEXT: movdqu 1040(%rdx,%rdi), %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; CHECK-NEXT: movq %xmm5, %r8 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; CHECK-NEXT: movq %xmm5, %r9 -; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi -; CHECK-NEXT: movq %rcx, %r10 -; CHECK-NEXT: sbbq %r9, %r10 -; CHECK-NEXT: setge %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: andl $1, %r9d -; CHECK-NEXT: negq %r9 -; CHECK-NEXT: movq %r9, %xmm5 ; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi -; CHECK-NEXT: movq %rcx, %r9 -; CHECK-NEXT: sbbq %r8, %r9 +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8 +; CHECK-NEXT: setge %r8b +; CHECK-NEXT: movzbl %r8b, %r8d +; CHECK-NEXT: andl $1, %r8d +; CHECK-NEXT: negq %r8 +; CHECK-NEXT: movq %r8, %xmm5 +; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8 ; CHECK-NEXT: setge %r8b ; CHECK-NEXT: movzbl %r8b, %r8d ; CHECK-NEXT: andl $1, %r8d ; CHECK-NEXT: negq %r8 ; CHECK-NEXT: movq %r8, %xmm6 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; CHECK-NEXT: movdqa %xmm1, %xmm5 -; CHECK-NEXT: psllq %xmm4, %xmm5 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; CHECK-NEXT: movdqa %xmm1, %xmm6 +; CHECK-NEXT: psllq %xmm4, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; CHECK-NEXT: movdqa %xmm1, %xmm8 ; CHECK-NEXT: psllq %xmm7, %xmm8 -; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; CHECK-NEXT: andpd %xmm6, %xmm8 +; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; CHECK-NEXT: andpd %xmm5, %xmm8 ; CHECK-NEXT: orpd %xmm8, %xmm3 ; CHECK-NEXT: paddq %xmm2, %xmm4 ; CHECK-NEXT: addq 
$32, %rdi diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 2610f43..62051d1 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1983,91 +1983,75 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movzwl 16(%eax), %edx ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE-NEXT: movdqa (%eax), %xmm3 -; X86-SSE-NEXT: movdqa (%ecx), %xmm0 -; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1 -; X86-SSE-NEXT: pxor %xmm5, %xmm5 -; X86-SSE-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %edi -; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp -; X86-SSE-NEXT: pextrw $1, %xmm3, %esi -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X86-SSE-NEXT: movd %xmm3, %ecx +; X86-SSE-NEXT: movdqa (%eax), %xmm2 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: pextrw $7, %xmm2, %eax +; X86-SSE-NEXT: pextrw $4, %xmm2, %esi +; X86-SSE-NEXT: pextrw $1, %xmm2, %edi +; X86-SSE-NEXT: pextrw $0, %xmm2, %ebx +; X86-SSE-NEXT: pextrw $3, %xmm2, %ebp +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl 28(%ecx) +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-SSE-NEXT: movd %xmm3, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: divl 24(%ecx) ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: 
pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; X86-SSE-NEXT: movd %xmm5, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; X86-SSE-NEXT: movd %xmm5, %ecx +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm5 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X86-SSE-NEXT: divl 16(%ecx) +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl 20(%ecx) +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: divl 16(%edi) +; X86-SSE-NEXT: divl 4(%ecx) ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE-NEXT: movd %xmm1, %ecx +; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; X86-SSE-NEXT: divl (%ecx) +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl (%edi) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %esi, %eax -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebx, %eax +; X86-SSE-NEXT: divl 12(%ecx) +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE-NEXT: movd %xmm2, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: divl 8(%ecx) ; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X86-SSE-NEXT: movd %xmm0, %ecx -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl 32(%edi) +; X86-SSE-NEXT: divl 32(%ecx) ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; 
X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) -; X86-SSE-NEXT: movdqa %xmm3, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) ; X86-SSE-NEXT: movdqa %xmm0, (%eax) ; X86-SSE-NEXT: addl $4, %esp ; X86-SSE-NEXT: popl %esi @@ -2204,91 +2188,76 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl 16(%rdi), %ecx -; X64-SSE-NEXT: movdqa (%rdi), %xmm3 -; X64-SSE-NEXT: movdqa (%rsi), %xmm0 -; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1 -; X64-SSE-NEXT: pxor %xmm5, %xmm5 -; X64-SSE-NEXT: movdqa %xmm3, %xmm2 -; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d -; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d -; X64-SSE-NEXT: pextrw $1, %xmm3, %edi -; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d -; X64-SSE-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-SSE-NEXT: movd %xmm3, %r11d -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; X64-SSE-NEXT: movd %xmm5, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; X64-SSE-NEXT: movd %xmm5, %r11d +; X64-SSE-NEXT: movdqa (%rdi), %xmm2 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: pextrw $7, %xmm2, %eax +; X64-SSE-NEXT: pextrw $4, %xmm2, %edi +; X64-SSE-NEXT: pextrw $1, %xmm2, %r8d +; X64-SSE-NEXT: pextrw $0, %xmm2, %r9d +; X64-SSE-NEXT: pextrw $3, %xmm2, %r10d +; X64-SSE-NEXT: punpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm5 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: divl 28(%rsi) +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X64-SSE-NEXT: movd %xmm3, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl 16(%rsi) +; X64-SSE-NEXT: divl 24(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-SSE-NEXT: movd %xmm1, %r8d -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %r8d -; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; X64-SSE-NEXT: movl %r10d, %eax +; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl (%rsi) +; X64-SSE-NEXT: divl 16(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %r8d -; X64-SSE-NEXT: movl %edi, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: movd %xmm0, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %r8d -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-SSE-NEXT: movd %xmm2, %edi +; X64-SSE-NEXT: divl 20(%rsi) +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; 
X64-SSE-NEXT: divl 4(%rsi) +; X64-SSE-NEXT: movd %edx, %xmm0 ; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-SSE-NEXT: movd %xmm0, %edi +; X64-SSE-NEXT: divl (%rsi) +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl 12(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X64-SSE-NEXT: movd %xmm2, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl 8(%rsi) +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-SSE-NEXT: movl %eax, (%rax) -; X64-SSE-NEXT: movdqa %xmm3, (%rax) ; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: movdqa %xmm3, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/stack-protector.ll b/llvm/test/CodeGen/X86/stack-protector.ll index a277f9f..f4f3ae4 100644 --- a/llvm/test/CodeGen/X86/stack-protector.ll +++ b/llvm/test/CodeGen/X86/stack-protector.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-I386 %s ; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-X64 %s ; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-KERNEL-X64 %s +; RUN: llc -code-model=kernel -mtriple=x86_64-unknown-freebsd < %s -o - | FileCheck --check-prefix=FREEBSD-KERNEL-X64 %s ; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | FileCheck --check-prefix=DARWIN-X64 %s ; RUN: llc -mtriple=amd64-pc-openbsd < %s -o - | FileCheck --check-prefix=OPENBSD-AMD64 %s ; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s @@ -75,6 +76,10 @@ entry: ; LINUX-X64: mov{{l|q}} %fs: ; LINUX-X64: callq __stack_chk_fail +; FREEBSD-KERNEL-X64-LABEL: test1b: +; FREEBSD-KERNEL-X64-NOT: mov{{l|q}} __stack_chk_guard@GOTPCREL +; FREEBSD-KERNEL-X64: callq __stack_chk_fail + ; LINUX-KERNEL-X64-LABEL: test1b: ; LINUX-KERNEL-X64: mov{{l|q}} %gs: ; LINUX-KERNEL-X64: callq __stack_chk_fail @@ -118,6 +123,10 @@ entry: ; LINUX-X64: mov{{l|q}} %fs: ; LINUX-X64: callq __stack_chk_fail +; FREEBSD-KERNEL-X64-LABEL: test1c: +; FREEBSD-KERNEL-X64: mov{{l|q}} __stack_chk_guard(%rip) 
+; FREEBSD-KERNEL-X64: callq __stack_chk_fail + ; LINUX-KERNEL-X64-LABEL: test1c: ; LINUX-KERNEL-X64: mov{{l|q}} %gs: ; LINUX-KERNEL-X64: callq __stack_chk_fail diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 99a3821..f2240a9 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -1101,17 +1101,13 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in define void @indices_convert() { ; SSE3-LABEL: indices_convert: ; SSE3: # %bb.0: # %bb -; SSE3-NEXT: movdqa (%rax), %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps (%rax), %xmm0 +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movl (%rax), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $3, %eax -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE3-NEXT: movd %xmm1, %ecx -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: andl $3, %ecx ; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1120,17 +1116,13 @@ define void @indices_convert() { ; ; SSSE3-LABEL: indices_convert: ; SSSE3: # %bb.0: # %bb -; SSSE3-NEXT: movdqa (%rax), %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps (%rax), %xmm0 +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movl (%rax), %eax +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; 
SSSE3-NEXT: andl $3, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 7bbcdee..e26de4b 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2911,23 +2911,12 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { ; define <2 x double> @sitofp_load_2i64_to_2f64(ptr%a) { -; SSE2-LABEL: sitofp_load_2i64_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_2i64_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE-LABEL: sitofp_load_2i64_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm1 +; SSE-NEXT: cvtsi2sdq (%rdi), %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; VEX-LABEL: sitofp_load_2i64_to_2f64: ; VEX: # %bb.0: @@ -3093,35 +3082,16 @@ define <2 x double> @sitofp_load_2i8_to_2f64(ptr%a) { } define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { -; SSE2-LABEL: sitofp_load_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; 
SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq 16(%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: cvtsi2sdq 24(%rdi), %xmm2 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2sdq 16(%rdi), %xmm1 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE41-NEXT: retq +; SSE-LABEL: sitofp_load_4i64_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm1 +; SSE-NEXT: cvtsi2sdq (%rdi), %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: cvtsi2sdq 24(%rdi), %xmm2 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2sdq 16(%rdi), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: retq ; ; VEX-LABEL: sitofp_load_4i64_to_4f64: ; VEX: # %bb.0: @@ -3865,22 +3835,14 @@ define <4 x double> @uitofp_load_4i8_to_4f64(ptr%a) { define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: sitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_4i64_to_4f32: @@ -4015,39 +3977,24 @@ define <4 x float> @sitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: sitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: xorps %xmm4, %xmm4 -; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; 
SSE2-NEXT: cvtsi2ssq 56(%rdi), %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: cvtsi2ssq 40(%rdi), %xmm3 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq 32(%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_8i64_to_8f32: @@ -4256,70 +4203,64 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) { define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: jmp .LBB83_3 ; SSE2-NEXT: .LBB83_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_3: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB83_6 ; SSE2-NEXT: .LBB83_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; 
SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB83_6: +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: js .LBB83_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 +; SSE2-NEXT: jmp .LBB83_9 +; SSE2-NEXT: .LBB83_7: ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: shrq %rdx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB83_6: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB83_7 -; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB83_9 -; SSE2-NEXT: .LBB83_7: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_9: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_10 ; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: jmp .LBB83_12 ; SSE2-NEXT: .LBB83_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_12: ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -4591,8 +4532,7 @@ define <4 x float> 
@uitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: uitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_1 ; SSE2-NEXT: # %bb.2: @@ -4606,127 +4546,114 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_3: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_6 ; SSE2-NEXT: .LBB87_4: -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shrq %rdx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_6: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB87_7 ; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB87_9 -; SSE2-NEXT: .LBB87_7: +; SSE2-NEXT: cvtsi2ss %rcx, %xmm3 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: jns .LBB87_11 +; SSE2-NEXT: .LBB87_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB87_9: -; SSE2-NEXT: movq 48(%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE2-NEXT: movq 
%xmm3, %rcx -; SSE2-NEXT: testq %rcx, %rcx -; SSE2-NEXT: js .LBB87_10 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 ; SSE2-NEXT: jmp .LBB87_12 -; SSE2-NEXT: .LBB87_10: +; SSE2-NEXT: .LBB87_7: ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: shrq %rdx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 -; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB87_10 +; SSE2-NEXT: .LBB87_11: +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: .LBB87_12: -; SSE2-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-NEXT: movq 56(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_13 ; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 ; SSE2-NEXT: jmp .LBB87_15 ; SSE2-NEXT: .LBB87_13: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 ; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm5, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 48(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_16 ; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: jmp .LBB87_18 ; SSE2-NEXT: .LBB87_16: -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shrq %rdx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 
; SSE2-NEXT: .LBB87_18: -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: movdqa 32(%rdi), %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movq 40(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_19 ; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: jmp .LBB87_21 ; SSE2-NEXT: .LBB87_19: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_21: -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: movq 32(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_22 ; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_24 ; SSE2-NEXT: .LBB87_22: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_24: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_8i64_to_8f32: diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll index 96ac4b6..9133b32 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll @@ -758,7 +758,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef %t, i32 noundef ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -808,7 +808,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef %t, i32 noundef ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -851,7 +851,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 
32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -901,7 +901,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -936,7 +936,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -986,7 +986,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void 
@llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1021,7 +1021,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1071,7 +1071,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1106,7 +1106,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(fp128 noundef %t, i32 nound ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1156,7 +1156,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(fp128 noundef %t, i32 nound ; CHECK-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1191,7 +1191,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1241,7 +1241,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1276,7 +1276,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz([2 x i64] %t.coe ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; 
CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1326,7 +1326,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz([2 x i64] %t.coe ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1361,7 +1361,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz([2 x double] a ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1411,7 +1411,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz([2 x double] a ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1446,7 +1446,7 
@@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz([4 x double] alignst ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1496,7 +1496,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz([4 x double] alignst ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1531,7 +1531,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz([2 x i64] %t.co ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1581,7 +1581,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz([2 x i64] %t.co ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; 
CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1616,7 +1616,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz([2 x fp128] ali ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1666,7 +1666,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz([2 x fp128] ali ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1701,7 +1701,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz([4 x fp128] ali ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = 
ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1751,7 +1751,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz([4 x fp128] ali ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll b/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll index 1535fcc..e0b5907 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll @@ -39,7 +39,7 @@ define i64 @foo(i64 %guard, ...) #1 { ; Only 56 bytes of the register save area is copied, because of ; "use-soft-float". 
-; CHECK: call void @llvm.va_start(ptr %vl) +; CHECK: call void @llvm.va_start.p0(ptr %vl) ; CHECK: [[VlAddr:%.*]] = ptrtoint ptr %vl to i64 ; CHECK: [[RegSaveAreaAddrAddr:%.*]] = add i64 [[VlAddr]], 24 ; CHECK: [[RegSaveAreaAddr:%.*]] = inttoptr i64 [[RegSaveAreaAddrAddr]] to ptr diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll index aff4d2c..2051015 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll @@ -560,7 +560,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef signext %t, i32 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -580,7 +580,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef signext %t, i32 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -623,7 +623,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: 
call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -643,7 +643,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -678,7 +678,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -698,7 +698,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr 
nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -733,7 +733,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -753,7 +753,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -788,7 +788,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(x86_fp80 noundef %t, i32 no ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -808,7 +808,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(x86_fp80 noundef %t, 
i32 no ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -843,7 +843,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -863,7 +863,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -898,7 +898,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz(i64 %t.coerce0, ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void 
@llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -918,7 +918,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz(i64 %t.coerce0, ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -953,7 +953,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz(double %t.coer ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -973,7 +973,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz(double %t.coer ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; 
CHECK-NEXT: ret void ; @@ -1008,7 +1008,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz(ptr noundef byval(%s ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1028,7 +1028,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz(ptr noundef byval(%s ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1063,7 +1063,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz(double %t.coerc ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1083,7 +1083,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz(double %t.coerc ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 
[[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1118,7 +1118,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz(ptr noundef byv ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1138,7 +1138,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz(ptr noundef byv ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; @@ -1173,7 +1173,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz(ptr noundef byv ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false) -; CHECK-NEXT: call void @llvm.va_start(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) 
; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr @@ -1193,7 +1193,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz(ptr noundef byv ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false) ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @_Z3usePv(ptr noundef nonnull [[ARGS]]) -; CHECK-NEXT: call void @llvm.va_end(ptr nonnull [[ARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll index 21f3311..f07f3ad 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll @@ -542,7 +542,7 @@ define void @VAStart(i32 %x, ...) 
sanitize_memory { ; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[TMP27]], 17592186044416, !dbg [[DBG11]] ; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr, !dbg [[DBG11]] ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 24, i1 false), !dbg [[DBG11]] -; CHECK-NEXT: call void @llvm.va_start(ptr [[VA]]), !dbg [[DBG11]] +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]), !dbg [[DBG11]] ; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG11]] ; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 16, !dbg [[DBG11]] ; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr, !dbg [[DBG11]] diff --git a/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s b/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s new file mode 100644 index 0000000..4623500 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s @@ -0,0 +1,27 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// OBJDUMP: 0000 00000000 0f000000 00000000 00000000 + +.text + +.p2align 8 +.type caller,@function +caller: + s_endpgm + +.rodata + +.p2align 6 +.amdhsa_kernel caller + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_private_segment_fixed_size max(7, callee1.private_seg_size, callee2.private_seg_size) +.end_amdhsa_kernel + +.set callee1.private_seg_size, 4 +.set callee2.private_seg_size, 15 + +// ASM: .amdhsa_private_segment_fixed_size max(7, callee1.private_seg_size, callee2.private_seg_size) diff --git a/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s b/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s new file mode 100644 index 0000000..fab3e89 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s @@ -0,0 +1,281 @@ +// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=ASM %s + +// Some expression 
currently require (immediately) solvable expressions, i.e., +// they don't depend on yet-unknown symbolic values. + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type user_sgpr_count,@function +user_sgpr_count: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_count + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_count defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_count + +.p2align 8 +.type user_sgpr_private_segment_buffer,@function +user_sgpr_private_segment_buffer: + s_endpgm + +.amdhsa_kernel user_sgpr_private_segment_buffer + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_private_segment_buffer defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer + +.p2align 8 +.type user_sgpr_kernarg_preload_length,@function +user_sgpr_kernarg_preload_length: + s_endpgm + +.amdhsa_kernel user_sgpr_kernarg_preload_length + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_kernarg_preload_length defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length defined_boolean + +.p2align 8 +.type user_sgpr_kernarg_preload_offset,@function +user_sgpr_kernarg_preload_offset: + s_endpgm + +.amdhsa_kernel user_sgpr_kernarg_preload_offset + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_kernarg_preload_offset defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset defined_boolean + +.p2align 8 +.type user_sgpr_dispatch_ptr,@function +user_sgpr_dispatch_ptr: + 
s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_dispatch_ptr + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_dispatch_ptr defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr + +.p2align 8 +.type user_sgpr_queue_ptr,@function +user_sgpr_queue_ptr: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_queue_ptr + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_queue_ptr defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr + +.p2align 8 +.type user_sgpr_kernarg_segment_ptr,@function +user_sgpr_kernarg_segment_ptr: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_kernarg_segment_ptr + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_kernarg_segment_ptr defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr + +.p2align 8 +.type user_sgpr_dispatch_id,@function +user_sgpr_dispatch_id: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_dispatch_id + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_dispatch_id defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id + +.p2align 8 +.type user_sgpr_flat_scratch_init,@function +user_sgpr_flat_scratch_init: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_flat_scratch_init + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_flat_scratch_init defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init + +.p2align 8 +.type 
user_sgpr_private_segment_size,@function +user_sgpr_private_segment_size: + s_endpgm + +.p2align 6 +.amdhsa_kernel user_sgpr_private_segment_size + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_user_sgpr_private_segment_size defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size + +.p2align 8 +.type wavefront_size32,@function +wavefront_size32: + s_endpgm + +.p2align 6 +.amdhsa_kernel wavefront_size32 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_wavefront_size32 defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_wavefront_size32 + +.p2align 8 +.type next_free_vgpr,@function +next_free_vgpr: + s_endpgm + +.p2align 6 +.amdhsa_kernel next_free_vgpr + .amdhsa_next_free_vgpr defined_boolean + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_next_free_vgpr + +.p2align 8 +.type next_free_sgpr,@function +next_free_sgpr: + s_endpgm + +.p2align 6 +.amdhsa_kernel next_free_sgpr + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr defined_boolean + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_next_free_sgpr + +.p2align 8 +.type accum_offset,@function +accum_offset: + s_endpgm + +.p2align 6 +.amdhsa_kernel accum_offset + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_accum_offset + +.p2align 8 +.type reserve_vcc,@function +reserve_vcc: + s_endpgm + +.p2align 6 +.amdhsa_kernel reserve_vcc + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_reserve_vcc defined_boolean 
+.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_reserve_vcc + +.p2align 8 +.type reserve_flat_scratch,@function +reserve_flat_scratch: + s_endpgm + +.p2align 6 +.amdhsa_kernel reserve_flat_scratch + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_reserve_flat_scratch defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_reserve_flat_scratch + +.p2align 8 +.type shared_vgpr_count,@function +shared_vgpr_count: + s_endpgm + +.p2align 6 +.amdhsa_kernel shared_vgpr_count + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 + .amdhsa_shared_vgpr_count defined_boolean +.end_amdhsa_kernel + +// ASM: error: directive should have resolvable expression +// ASM-NEXT: .amdhsa_shared_vgpr_count + +.set defined_boolean, 1 + +// ASM: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s new file mode 100644 index 0000000..95af59c --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s @@ -0,0 +1,190 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). 
+// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00f0afe4 801f007f 000c0000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 00f0afe4 801f007f 000c0000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_group_segment_fixed_size defined_value+2 + .amdhsa_private_segment_fixed_size defined_value+3 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact 
defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +.set defined_value, 41 +.set defined_2_bits, 3 +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_group_segment_fixed_size defined_value+1 + .amdhsa_private_segment_fixed_size defined_value+2 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later +// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2 +// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1)>>0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&32)>>5 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6 +// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 
(((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: 
.amdhsa_ieee_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered 
(((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_value, 41 +// ASM-NEXT: .no_dead_strip defined_value +// ASM-NEXT: .set defined_2_bits, 3 +// ASM-NEXT: .no_dead_strip defined_2_bits +// ASM-NEXT: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: .amdhsa_group_segment_fixed_size 42 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 43 +// ASM-NEXT: 
.amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 3 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s new file mode 100644 index 0000000..e1107fb --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s @@ -0,0 +1,186 @@ +// RUN: llvm-mc -triple 
amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). +// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00f0afe4 811f007f 000c0000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 00f0afe4 811f007f 000c0000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_group_segment_fixed_size defined_value+2 + .amdhsa_private_segment_fixed_size defined_value+3 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress 
defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_enable_private_segment defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +.set defined_value, 41 +.set defined_2_bits, 3 +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_group_segment_fixed_size defined_value+1 + .amdhsa_private_segment_fixed_size defined_value+2 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_enable_private_segment defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later +// ASM-NEXT: 
.amdhsa_group_segment_fixed_size defined_value+2 +// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6 +// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10 +// ASM-NEXT: .amdhsa_enable_private_segment (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 
(((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: 
.amdhsa_ieee_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered 
(((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_value, 41 +// ASM-NEXT: .no_dead_strip defined_value +// ASM-NEXT: .set defined_2_bits, 3 +// ASM-NEXT: .no_dead_strip defined_2_bits +// ASM-NEXT: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: 
.amdhsa_group_segment_fixed_size 42 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 43 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 3 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s new file mode 100644 index 0000000..449616d --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s @@ -0,0 +1,184 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=ASM %s +// 
RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). +// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00f02fe4 811f007f 000c0000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 00f02fe4 811f007f 000c0000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_group_segment_fixed_size defined_value+2 + .amdhsa_private_segment_fixed_size defined_value+3 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + 
.amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_round_robin_scheduling defined_boolean + .amdhsa_enable_private_segment defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +.set defined_value, 41 +.set defined_2_bits, 3 +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_group_segment_fixed_size defined_value+1 + .amdhsa_private_segment_fixed_size defined_value+2 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_workgroup_processor_mode defined_boolean + .amdhsa_memory_ordered defined_boolean + .amdhsa_forward_progress defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_round_robin_scheduling defined_boolean + .amdhsa_enable_private_segment defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later 
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2 +// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6 +// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10 +// ASM-NEXT: .amdhsa_enable_private_segment (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 
(((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 
(((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31 +// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: 
.amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero 
(((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_value, 41 +// ASM-NEXT: .no_dead_strip defined_value +// ASM-NEXT: .set defined_2_bits, 3 +// ASM-NEXT: .no_dead_strip defined_2_bits +// ASM-NEXT: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: .amdhsa_group_segment_fixed_size 42 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 43 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 3 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: 
.amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_round_robin_scheduling 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s new file mode 100644 index 0000000..c7e0544 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s @@ -0,0 +1,168 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). 
+// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00f0af00 801f007f 00080000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 00f0af00 801f007f 00080000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_group_segment_fixed_size defined_value+2 + .amdhsa_private_segment_fixed_size defined_value+3 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 
+.end_amdhsa_kernel + +.set defined_value, 41 +.set defined_2_bits, 3 +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_group_segment_fixed_size defined_value+1 + .amdhsa_private_segment_fixed_size defined_value+2 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later +// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2 +// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 
(((0&(~2048))|(defined_boolean<<11))&1)>>0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((0&(~2048))|(defined_boolean<<11))&2)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((0&(~2048))|(defined_boolean<<11))&4)>>2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((0&(~2048))|(defined_boolean<<11))&8)>>3 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((0&(~2048))|(defined_boolean<<11))&16)>>4 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((0&(~2048))|(defined_boolean<<11))&32)>>5 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((0&(~2048))|(defined_boolean<<11))&64)>>6 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp 
(((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_value, 41 +// ASM-NEXT: .no_dead_strip defined_value +// ASM-NEXT: .set defined_2_bits, 3 +// ASM-NEXT: .no_dead_strip defined_2_bits +// ASM-NEXT: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: .amdhsa_group_segment_fixed_size 42 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 43 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// 
ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 3 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s new file mode 100644 index 0000000..49a5015 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s @@ -0,0 +1,171 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefix=ASM %s + +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx801 -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). 
+// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00f0af00 801f007f 00080000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 00f0af00 801f007f 00080000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_group_segment_fixed_size defined_value+2 + .amdhsa_private_segment_fixed_size defined_value+3 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 
+.end_amdhsa_kernel + +.set defined_value, 41 +.set defined_2_bits, 3 +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_group_segment_fixed_size defined_value+1 + .amdhsa_private_segment_fixed_size defined_value+2 + .amdhsa_system_vgpr_workitem_id defined_2_bits + .amdhsa_float_round_mode_32 defined_2_bits + .amdhsa_float_round_mode_16_64 defined_2_bits + .amdhsa_float_denorm_mode_32 defined_2_bits + .amdhsa_float_denorm_mode_16_64 defined_2_bits + .amdhsa_system_sgpr_workgroup_id_x defined_boolean + .amdhsa_system_sgpr_workgroup_id_y defined_boolean + .amdhsa_system_sgpr_workgroup_id_z defined_boolean + .amdhsa_system_sgpr_workgroup_info defined_boolean + .amdhsa_exception_fp_ieee_invalid_op defined_boolean + .amdhsa_exception_fp_denorm_src defined_boolean + .amdhsa_exception_fp_ieee_div_zero defined_boolean + .amdhsa_exception_fp_ieee_overflow defined_boolean + .amdhsa_exception_fp_ieee_underflow defined_boolean + .amdhsa_exception_fp_ieee_inexact defined_boolean + .amdhsa_exception_int_div_zero defined_boolean + .amdhsa_uses_dynamic_stack defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later +// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2 +// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 
(((0&(~2048))|(defined_boolean<<11))&1)>>0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((0&(~2048))|(defined_boolean<<11))&2)>>1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((0&(~2048))|(defined_boolean<<11))&4)>>2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((0&(~2048))|(defined_boolean<<11))&8)>>3 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((0&(~2048))|(defined_boolean<<11))&16)>>4 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((0&(~2048))|(defined_boolean<<11))&32)>>5 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((0&(~2048))|(defined_boolean<<11))&64)>>6 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp 
(((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 
(((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_value, 41 +// ASM-NEXT: .no_dead_strip defined_value +// ASM-NEXT: .set defined_2_bits, 3 +// ASM-NEXT: .no_dead_strip defined_2_bits +// ASM-NEXT: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: .amdhsa_group_segment_fixed_size 42 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 43 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// 
ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 3 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 3 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s new file mode 100644 index 0000000..b7f8923 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s @@ -0,0 +1,148 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// When going from asm -> asm, the expressions should remain the same (i.e., symbolic). 
+// When going from asm -> obj, the expressions should get resolved (through fixups), + +// OBJDUMP: Contents of section .rodata +// expr_defined_later +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0030 0000ac04 81000000 00000000 00000000 +// expr_defined +// OBJDUMP-NEXT: 0040 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0070 0000ac04 81000000 00000000 00000000 + +.text +// ASM: .text + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type expr_defined_later,@function +expr_defined_later: + s_endpgm + +.p2align 8 +.type expr_defined,@function +expr_defined: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel expr_defined_later + .amdhsa_system_sgpr_private_segment_wavefront_offset defined_boolean + .amdhsa_dx10_clamp defined_boolean + .amdhsa_ieee_mode defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_tg_split defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +.set defined_boolean, 1 + +.p2align 6 +.amdhsa_kernel expr_defined + .amdhsa_system_sgpr_private_segment_wavefront_offset defined_boolean + .amdhsa_dx10_clamp defined_boolean + .amdhsa_ieee_mode defined_boolean + .amdhsa_fp16_overflow defined_boolean + .amdhsa_tg_split defined_boolean + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel expr_defined_later +// ASM-NEXT: .amdhsa_group_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1 +// ASM-NEXT: 
.amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset (((((((0&(~65536))|(defined_boolean<<16))&(~63))|(0<<0))&63)>>0)+1)*4 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14 +// 
ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26 +// ASM-NEXT: .amdhsa_tg_split (((((0&(~65536))|(defined_boolean<<16))&(~63))|(0<<0))&65536)>>16 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27 +// ASM-NEXT: 
.amdhsa_exception_fp_ieee_underflow (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30 +// ASM-NEXT: .end_amdhsa_kernel + +// ASM: .set defined_boolean, 1 +// ASM-NEXT: .no_dead_strip defined_boolean + +// ASM: .amdhsa_kernel expr_defined +// ASM-NEXT: .amdhsa_group_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 0 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 0 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 0 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 0 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_tg_split 1 +// ASM-NEXT: 
.amdhsa_exception_fp_ieee_invalid_op 0 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +// ASM-NEXT: .amdhsa_exception_int_div_zero 0 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/hsa-tg-split.s b/llvm/test/MC/AMDGPU/hsa-tg-split.s new file mode 100644 index 0000000..5a4d3e2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-tg-split.s @@ -0,0 +1,74 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack,+tgsplit < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack,+tgsplit -filetype=obj < %s > %t +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// OBJDUMP: Contents of section .rodata +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+" + +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.rodata +// ASM: .rodata + +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM-NEXT: .amdhsa_group_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 0 +// ASM-NEXT: .amdhsa_kernarg_size 0 +// ASM-NEXT: .amdhsa_user_sgpr_count 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +// 
ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 0 +// ASM-NEXT: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 0 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 0 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 0 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3 +// ASM-NEXT: .amdhsa_dx10_clamp 1 +// ASM-NEXT: .amdhsa_ieee_mode 1 +// ASM-NEXT: .amdhsa_fp16_overflow 0 +// ASM-NEXT: .amdhsa_tg_split 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +// ASM-NEXT: .amdhsa_exception_int_div_zero 0 +// ASM-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll b/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll index 1882107..4c5a448 100644 --- a/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll +++ b/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll @@ -23,7 +23,7 @@ define internal i32 @i(ptr inalloca(ptr) %a, ...) { ; CHECK-LABEL: define {{[^@]+}}@i ; CHECK-SAME: (ptr inalloca(ptr) [[A:%.*]], ...) 
unnamed_addr { ; CHECK-NEXT: [[AP:%.*]] = alloca ptr, align 4 -; CHECK-NEXT: call void @llvm.va_start(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[AP]]) ; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[AP]], align 4 ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 ; CHECK-NEXT: ret i32 [[L]] diff --git a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll index ef365d6..38dfd25 100644 --- a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll +++ b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll @@ -17,10 +17,10 @@ define i32 @func1(i32 %a, double %b, ptr %v, ...) nounwind { ; CHECK-NEXT: [[AP:%.*]] = alloca ptr, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @outlined_ir_func_0(i32 [[A:%.*]], ptr [[A_ADDR]], double [[B:%.*]], ptr [[B_ADDR]]) -; CHECK-NEXT: call void @llvm.va_start(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[AP]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg ptr [[AP]], i32 -; CHECK-NEXT: call void @llvm.va_copy(ptr [[V:%.*]], ptr [[AP]]) -; CHECK-NEXT: call void @llvm.va_end(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_copy.p0(ptr [[V:%.*]], ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[AP]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]]) ; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]]) ; CHECK-NEXT: [[TMP_RELOAD:%.*]] = load i32, ptr [[TMP_LOC]], align 4 @@ -52,10 +52,10 @@ define i32 @func2(i32 %a, double %b, ptr %v, ...) 
nounwind { ; CHECK-NEXT: [[AP:%.*]] = alloca ptr, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @outlined_ir_func_0(i32 [[A:%.*]], ptr [[A_ADDR]], double [[B:%.*]], ptr [[B_ADDR]]) -; CHECK-NEXT: call void @llvm.va_start(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[AP]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg ptr [[AP]], i32 -; CHECK-NEXT: call void @llvm.va_copy(ptr [[V:%.*]], ptr [[AP]]) -; CHECK-NEXT: call void @llvm.va_end(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_copy.p0(ptr [[V:%.*]], ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[AP]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]]) ; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]]) ; CHECK-NEXT: [[TMP_RELOAD:%.*]] = load i32, ptr [[TMP_LOC]], align 4 diff --git a/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll b/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll index 9f565de..2d52608 100644 --- a/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll +++ b/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll @@ -51,7 +51,7 @@ entry: ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; CHECK-NEXT: store double [[B]], ptr [[B_ADDR]], align 8 -; CHECK-NEXT: call void @llvm.va_start(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[AP]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg ptr [[AP]], i32 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]]) ; CHECK-NEXT: call void @outlined_ir_func_0(ptr [[V]], ptr [[AP]], i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]]) @@ -70,7 +70,7 @@ entry: ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; CHECK-NEXT: store double [[B]], ptr [[B_ADDR]], align 8 -; CHECK-NEXT: call void @llvm.va_start(ptr [[AP]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[AP]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg ptr [[AP]], i32 
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]]) ; CHECK-NEXT: call void @outlined_ir_func_0(ptr [[V]], ptr [[AP]], i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]]) @@ -84,8 +84,8 @@ entry: ; CHECK-NEXT: newFuncRoot: ; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] ; CHECK: entry_to_outline: -; CHECK-NEXT: call void @llvm.va_copy(ptr [[TMP0]], ptr [[TMP1]]) -; CHECK-NEXT: call void @llvm.va_end(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.va_copy.p0(ptr [[TMP0]], ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[TMP1]]) ; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]] diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 50b0e7a..2f264a2 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -141,4 +141,4 @@ attributes #1 = { argmemonly nounwind } !5 = distinct !{!5, !"some domain"} !6 = !{!7} !7 = distinct !{!7, !5, !"some scope 2"} -!8 = !{i64 0, i64 8, null} +!8 = !{i64 0, i64 8, !0} diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll index 43e34c8..6c0575e 100644 --- a/llvm/test/Transforms/InstCombine/powi.ll +++ b/llvm/test/Transforms/InstCombine/powi.ll @@ -313,7 +313,7 @@ define double @fdiv_pow_powi(double %x) { ; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc nnan double [[X:%.*]], [[X]] ; CHECK-NEXT: ret double [[DIV]] ; - %p1 = call double @llvm.powi.f64.i32(double %x, i32 3) + %p1 = call reassoc double @llvm.powi.f64.i32(double %x, i32 3) %div = fdiv reassoc nnan double %p1, %x ret double %div } @@ -323,7 +323,7 @@ define float @fdiv_powf_powi(float %x) { ; CHECK-NEXT: [[DIV:%.*]] = call reassoc nnan float @llvm.powi.f32.i32(float [[X:%.*]], i32 99) ; CHECK-NEXT: ret float 
[[DIV]] ; - %p1 = call float @llvm.powi.f32.i32(float %x, i32 100) + %p1 = call reassoc float @llvm.powi.f32.i32(float %x, i32 100) %div = fdiv reassoc nnan float %p1, %x ret float %div } @@ -347,10 +347,21 @@ define double @fdiv_pow_powi_multi_use(double %x) { define float @fdiv_powf_powi_missing_reassoc(float %x) { ; CHECK-LABEL: @fdiv_powf_powi_missing_reassoc( ; CHECK-NEXT: [[P1:%.*]] = call float @llvm.powi.f32.i32(float [[X:%.*]], i32 100) -; CHECK-NEXT: [[DIV:%.*]] = fdiv nnan float [[P1]], [[X]] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc nnan float [[P1]], [[X]] ; CHECK-NEXT: ret float [[DIV]] ; %p1 = call float @llvm.powi.f32.i32(float %x, i32 100) + %div = fdiv reassoc nnan float %p1, %x + ret float %div +} + +define float @fdiv_powf_powi_missing_reassoc1(float %x) { +; CHECK-LABEL: @fdiv_powf_powi_missing_reassoc1( +; CHECK-NEXT: [[P1:%.*]] = call reassoc float @llvm.powi.f32.i32(float [[X:%.*]], i32 100) +; CHECK-NEXT: [[DIV:%.*]] = fdiv nnan float [[P1]], [[X]] +; CHECK-NEXT: ret float [[DIV]] +; + %p1 = call reassoc float @llvm.powi.f32.i32(float %x, i32 100) %div = fdiv nnan float %p1, %x ret float %div } diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll index 996d2c0..d079c03 100644 --- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll +++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -75,7 +75,7 @@ entry: !1 = !{!"omnipotent char", !0} !2 = !{!5, !5, i64 0} !3 = !{i64 0, i64 4, !2} -!4 = !{i64 0, i64 8, null} +!4 = !{i64 0, i64 8, !2} !5 = !{!"float", !0} !6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2} !7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2} diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll index 0e7461c..82e9a2a 100644 --- a/llvm/test/Transforms/NewGVN/pr31483.ll +++ b/llvm/test/Transforms/NewGVN/pr31483.ll @@ -41,7 +41,7 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 { ; CHECK: bb22: ; CHECK-NEXT: br label 
[[BB2]] ; CHECK: bb23: -; CHECK-NEXT: call void @llvm.va_end(ptr [[TMP]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[TMP]]) ; CHECK-NEXT: ret i32 undef ; bb: diff --git a/llvm/test/Transforms/Reassociate/vaarg_movable.ll b/llvm/test/Transforms/Reassociate/vaarg_movable.ll index 337877a..4e45b21 100644 --- a/llvm/test/Transforms/Reassociate/vaarg_movable.ll +++ b/llvm/test/Transforms/Reassociate/vaarg_movable.ll @@ -10,13 +10,13 @@ define i32 @func(i32 %dummy, ...) { ; ; CHECK-LABEL: @func( ; CHECK-NEXT: [[VARARGS:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: call void @llvm.va_start(ptr [[VARARGS]]) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VARARGS]]) ; CHECK-NEXT: [[V0:%.*]] = va_arg ptr [[VARARGS]], i32 ; CHECK-NEXT: [[V1:%.*]] = va_arg ptr [[VARARGS]], i32 ; CHECK-NEXT: [[V0_NEG:%.*]] = sub i32 0, [[V0]] ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[V0_NEG]], 1 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[V1]] -; CHECK-NEXT: call void @llvm.va_end(ptr [[VARARGS]]) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VARARGS]]) ; CHECK-NEXT: ret i32 [[ADD]] ; %varargs = alloca ptr, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll new file mode 100644 index 0000000..f376ca7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; 
CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16> +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], <i32 16, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], <i32 16, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16> +; CHECK-NEXT: br i1 true, label [[BB3]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: store i32 [[TMP18]], ptr [[B]], align 16 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[C]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] 
= extractelement <4 x i16> [[TMP16]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = sext i16 [[TMP23]] to i32 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[B]], align 8 +; CHECK-NEXT: ret i32 0 +; +entry: + br i1 %tobool3.not, label %bb1, label %bb2 + +bb1: + %conv1.i.us = ashr i32 %0, 16 + %cmp2.i.us = icmp slt i32 %conv1.i.us, %0 + %sext26.us = zext i1 %cmp2.i.us to i32 + %conv1.i.us.5 = ashr i32 %0, 16 + %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0 + %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32 + %conv1.i.us.6 = ashr i32 %0, 16 + %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0 + %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32 + %conv1.i.us.7 = ashr i32 %0, 16 + %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0 + %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32 + br label %bb3 + +bb2: + %cmp2.i = icmp sgt i32 %0, 0 + %1 = zext i1 %cmp2.i to i32 + %cond.i = select i1 %tobool3.not, i32 %0, i32 %1 + %sext26 = shl i32 %cond.i, 16 + %conv13 = ashr i32 %sext26, 16 + %cmp2.i.5 = icmp sgt i32 %0, 0 + %2 = zext i1 %cmp2.i.5 to i32 + %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2 + %sext26.5 = shl i32 %cond.i.5, 16 + %conv13.5 = ashr i32 %sext26.5, 16 + %cmp2.i.6 = icmp sgt i32 %0, 0 + %3 = zext i1 %cmp2.i.6 to i32 + %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3 + %sext26.6 = shl i32 %cond.i.6, 16 + %conv13.6 = ashr i32 %sext26.6, 16 + %cmp2.i.7 = icmp sgt i32 %0, 0 + %4 = zext i1 %cmp2.i.7 to i32 + %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4 + %sext26.7 = shl i32 %cond.i.7, 16 + %conv13.7 = ashr i32 %sext26.7, 16 + br i1 true, label %bb3, label %bb2 + +bb3: + %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ] + %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ] + %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ] + %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ] + store i32 %conv13p, ptr %b, align 16 + store i32 %conv13.5p, ptr %a, align 8 + store i32 %conv13.6p, ptr %c, align 16 + store i32 %conv13.7p, ptr %b, align 8 + ret i32 0 +} 
diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll index 0fcd787..61034de 100644 --- a/llvm/test/Transforms/SROA/tbaa-struct3.ll +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -539,7 +539,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias !6 = !{!5, !5, i64 0} !7 = !{i64 0, i64 8, !6, i64 8, i64 4, !1} !8 = !{i64 0, i64 4, !1, i64 4, i64 8, !6} -!9 = !{i64 0, i64 8, !6, i64 4, i64 8, !1} +!9 = !{i64 0, i64 8, !6, i64 8, i64 8, !1} !10 = !{i64 0, i64 2, !1, i64 2, i64 2, !1} !11 = !{i64 0, i64 1, !1, i64 1, i64 3, !1} !12 = !{i64 0, i64 2, !1, i64 2, i64 6, !1} diff --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll index bbcdcb6..73ae66d 100644 --- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll +++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll @@ -836,5 +836,6 @@ define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) { !2 = !{ !"set2", !0 } !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} } !4 = !{ float 4.0 } -!5 = !{ i64 0, i64 8, null } +!5 = !{ i64 0, i64 8, !6 } +!6 = !{ !1, !1, i64 0 } !13 = distinct !{} diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll index db7c5f5..87a70cc 100644 --- a/llvm/test/Transforms/Scalarizer/basic.ll +++ b/llvm/test/Transforms/Scalarizer/basic.ll @@ -870,5 +870,6 @@ define <2 x float> @f25(<2 x float> %src) { !2 = !{ !"set2", !0 } !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} } !4 = !{ float 4.0 } -!5 = !{ i64 0, i64 8, null } +!5 = !{ i64 0, i64 8, !6 } +!6 = !{ !1, !1, i64 0 } !13 = distinct !{} diff --git a/llvm/test/Verifier/tbaa-struct.ll b/llvm/test/Verifier/tbaa-struct.ll index b8ddc7c..14c19a1 100644 --- a/llvm/test/Verifier/tbaa-struct.ll +++ b/llvm/test/Verifier/tbaa-struct.ll @@ -1,28 +1,36 @@ -; RUN: llvm-as < %s 2>&1 - -; FIXME: The verifer should reject the invalid !tbaa.struct nodes below. 
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s define void @test_overlapping_regions(ptr %a1) { +; CHECK: Overlapping tbaa.struct regions +; CHECK-NEXT: %ld = load i8, ptr %a1, align 1, !tbaa.struct !0 %ld = load i8, ptr %a1, align 1, !tbaa.struct !0 ret void } define void @test_size_not_integer(ptr %a1) { +; CHECK: Size must be a constant integer +; CHECK-NEXT: store i8 1, ptr %a1, align 1, !tbaa.struct !5 store i8 1, ptr %a1, align 1, !tbaa.struct !5 ret void } define void @test_offset_not_integer(ptr %a1, ptr %a2) { +; CHECK: Offset must be a constant integer +; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6 tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6 ret void } define void @test_tbaa_missing(ptr %a1, ptr %a2) { +; CHECK: TBAA tag missing +; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7 tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7 ret void } define void @test_tbaa_invalid(ptr %a1) { +; CHECK: Old-style TBAA is no longer allowed, use struct-path TBAA instead +; CHECK-NEXT: store i8 1, ptr %a1, align 1, !tbaa.struct !8 store i8 1, ptr %a1, align 1, !tbaa.struct !8 ret void } diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 9ce53fe..e9987d0 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -14,6 +14,8 @@ ARMAP-NEXT: Archive EC map ARMAP-NEXT: #expname in test.dll ARMAP-NEXT: #funcexp in test.dll ARMAP-NEXT: #mangledfunc in test.dll +ARMAP-NEXT: #manglednonamefunc in test.dll +ARMAP-NEXT: #nonamefunc in test.dll ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll @@ -23,13 +25,19 @@ 
ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll ARMAP-NEXT: __imp_aux_expname in test.dll ARMAP-NEXT: __imp_aux_funcexp in test.dll ARMAP-NEXT: __imp_aux_mangledfunc in test.dll +ARMAP-NEXT: __imp_aux_manglednonamefunc in test.dll +ARMAP-NEXT: __imp_aux_nonamefunc in test.dll ARMAP-NEXT: __imp_dataexp in test.dll ARMAP-NEXT: __imp_expname in test.dll ARMAP-NEXT: __imp_funcexp in test.dll ARMAP-NEXT: __imp_mangledfunc in test.dll +ARMAP-NEXT: __imp_manglednonamefunc in test.dll +ARMAP-NEXT: __imp_nonamefunc in test.dll ARMAP-NEXT: expname in test.dll ARMAP-NEXT: funcexp in test.dll ARMAP-NEXT: mangledfunc in test.dll +ARMAP-NEXT: manglednonamefunc in test.dll +ARMAP-NEXT: nonamefunc in test.dll ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s @@ -95,6 +103,25 @@ READOBJ-NEXT: Type: data READOBJ-NEXT: Name type: name READOBJ-NEXT: Export name: dataexp READOBJ-NEXT: Symbol: __imp_dataexp +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: ordinal +READOBJ-NEXT: Symbol: __imp_nonamefunc +READOBJ-NEXT: Symbol: nonamefunc +READOBJ-NEXT: Symbol: __imp_aux_nonamefunc +READOBJ-NEXT: Symbol: #nonamefunc +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: ordinal +READOBJ-NEXT: Symbol: __imp_manglednonamefunc +READOBJ-NEXT: Symbol: manglednonamefunc +READOBJ-NEXT: Symbol: __imp_aux_manglednonamefunc +READOBJ-NEXT: Symbol: #manglednonamefunc + Using -machine:arm64x gives the same output. 
RUN: llvm-lib -machine:arm64x -def:test.def -out:testx.lib @@ -112,22 +139,28 @@ RUN: llvm-nm --print-armap testx.lib | FileCheck -check-prefix=ARMAPX %s ARMAPX: Archive map ARMAPX-NEXT: #mangledfunc in test.dll +ARMAPX-NEXT: #manglednonamefunc in test.dll ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll ARMAPX-NEXT: __IMPORT_DESCRIPTOR_test in test.dll ARMAPX-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll ARMAPX-NEXT: __imp_#mangledfunc in test.dll +ARMAPX-NEXT: __imp_#manglednonamefunc in test.dll ARMAPX-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll ARMAPX-NEXT: __imp_dataexp in test.dll ARMAPX-NEXT: __imp_expname in test.dll ARMAPX-NEXT: __imp_funcexp in test.dll +ARMAPX-NEXT: __imp_nonamefunc in test.dll ARMAPX-NEXT: expname in test.dll ARMAPX-NEXT: funcexp in test.dll +ARMAPX-NEXT: nonamefunc in test.dll ARMAPX-NEXT: test_NULL_THUNK_DATA in test.dll ARMAPX-EMPTY: ARMAPX-NEXT: Archive EC map ARMAPX-NEXT: #expname in test.dll ARMAPX-NEXT: #funcexp in test.dll ARMAPX-NEXT: #mangledfunc in test.dll +ARMAPX-NEXT: #manglednonamefunc in test.dll +ARMAPX-NEXT: #nonamefunc in test.dll ARMAPX-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll ARMAPX-NEXT: __IMPORT_DESCRIPTOR_test in test.dll @@ -137,13 +170,19 @@ ARMAPX-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll ARMAPX-NEXT: __imp_aux_expname in test.dll ARMAPX-NEXT: __imp_aux_funcexp in test.dll ARMAPX-NEXT: __imp_aux_mangledfunc in test.dll +ARMAPX-NEXT: __imp_aux_manglednonamefunc in test.dll +ARMAPX-NEXT: __imp_aux_nonamefunc in test.dll ARMAPX-NEXT: __imp_dataexp in test.dll ARMAPX-NEXT: __imp_expname in test.dll ARMAPX-NEXT: __imp_funcexp in test.dll ARMAPX-NEXT: __imp_mangledfunc in test.dll +ARMAPX-NEXT: __imp_manglednonamefunc in test.dll +ARMAPX-NEXT: __imp_nonamefunc in test.dll ARMAPX-NEXT: expname in test.dll ARMAPX-NEXT: funcexp in test.dll ARMAPX-NEXT: mangledfunc in test.dll +ARMAPX-NEXT: manglednonamefunc in test.dll +ARMAPX-NEXT: nonamefunc in 
test.dll ARMAPX-NEXT: test_NULL_THUNK_DATA in test.dll RUN: llvm-readobj testx.lib | FileCheck -check-prefix=READOBJX %s @@ -211,6 +250,24 @@ READOBJX-NEXT: Export name: dataexp READOBJX-NEXT: Symbol: __imp_dataexp READOBJX-EMPTY: READOBJX-NEXT: File: test.dll +READOBJX-NEXT: Format: COFF-import-file-ARM64EC +READOBJX-NEXT: Type: code +READOBJX-NEXT: Name type: ordinal +READOBJX-NEXT: Symbol: __imp_nonamefunc +READOBJX-NEXT: Symbol: nonamefunc +READOBJX-NEXT: Symbol: __imp_aux_nonamefunc +READOBJX-NEXT: Symbol: #nonamefunc +READOBJX-EMPTY: +READOBJX-NEXT: File: test.dll +READOBJX-NEXT: Format: COFF-import-file-ARM64EC +READOBJX-NEXT: Type: code +READOBJX-NEXT: Name type: ordinal +READOBJX-NEXT: Symbol: __imp_manglednonamefunc +READOBJX-NEXT: Symbol: manglednonamefunc +READOBJX-NEXT: Symbol: __imp_aux_manglednonamefunc +READOBJX-NEXT: Symbol: #manglednonamefunc +READOBJX-EMPTY: +READOBJX-NEXT: File: test.dll READOBJX-NEXT: Format: COFF-import-file-ARM64 READOBJX-NEXT: Type: code READOBJX-NEXT: Name type: name @@ -248,6 +305,20 @@ READOBJX-NEXT: Type: data READOBJX-NEXT: Name type: name READOBJX-NEXT: Export name: dataexp READOBJX-NEXT: Symbol: __imp_dataexp +READOBJX-EMPTY: +READOBJX-NEXT: File: test.dll +READOBJX-NEXT: Format: COFF-import-file-ARM64 +READOBJX-NEXT: Type: code +READOBJX-NEXT: Name type: ordinal +READOBJX-NEXT: Symbol: __imp_nonamefunc +READOBJX-NEXT: Symbol: nonamefunc +READOBJX-EMPTY: +READOBJX-NEXT: File: test.dll +READOBJX-NEXT: Format: COFF-import-file-ARM64 +READOBJX-NEXT: Type: code +READOBJX-NEXT: Name type: ordinal +READOBJX-NEXT: Symbol: __imp_#manglednonamefunc +READOBJX-NEXT: Symbol: #manglednonamefunc RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test2.def -out:test2.lib @@ -266,6 +337,8 @@ ARMAPX2-NEXT: Archive EC map ARMAPX2-NEXT: #expname in test2.dll ARMAPX2-NEXT: #funcexp in test2.dll ARMAPX2-NEXT: #mangledfunc in test2.dll +ARMAPX2-NEXT: #manglednonamefunc in test2.dll +ARMAPX2-NEXT: #nonamefunc in test2.dll 
ARMAPX2-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test2.dll ARMAPX2-NEXT: ?test_cpp_func@@YAHPEAX@Z in test2.dll ARMAPX2-NEXT: __IMPORT_DESCRIPTOR_test2 in test2.dll @@ -275,13 +348,19 @@ ARMAPX2-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test2.dll ARMAPX2-NEXT: __imp_aux_expname in test2.dll ARMAPX2-NEXT: __imp_aux_funcexp in test2.dll ARMAPX2-NEXT: __imp_aux_mangledfunc in test2.dll +ARMAPX2-NEXT: __imp_aux_manglednonamefunc in test2.dll +ARMAPX2-NEXT: __imp_aux_nonamefunc in test2.dll ARMAPX2-NEXT: __imp_dataexp in test2.dll ARMAPX2-NEXT: __imp_expname in test2.dll ARMAPX2-NEXT: __imp_funcexp in test2.dll ARMAPX2-NEXT: __imp_mangledfunc in test2.dll +ARMAPX2-NEXT: __imp_manglednonamefunc in test2.dll +ARMAPX2-NEXT: __imp_nonamefunc in test2.dll ARMAPX2-NEXT: expname in test2.dll ARMAPX2-NEXT: funcexp in test2.dll ARMAPX2-NEXT: mangledfunc in test2.dll +ARMAPX2-NEXT: manglednonamefunc in test2.dll +ARMAPX2-NEXT: nonamefunc in test2.dll ARMAPX2-NEXT: test2_NULL_THUNK_DATA in test2.dll ARMAPX2: test2.dll: @@ -312,6 +391,18 @@ ARMAPX2-NEXT: test2.dll: ARMAPX2-NEXT: 00000000 D __imp_dataexp ARMAPX2-EMPTY: ARMAPX2-NEXT: test2.dll: +ARMAPX2-NEXT: 00000000 T #nonamefunc +ARMAPX2-NEXT: 00000000 T __imp_aux_nonamefunc +ARMAPX2-NEXT: 00000000 T __imp_nonamefunc +ARMAPX2-NEXT: 00000000 T nonamefunc +ARMAPX2-EMPTY: +ARMAPX2-NEXT: test2.dll: +ARMAPX2-NEXT: 00000000 T #manglednonamefunc +ARMAPX2-NEXT: 00000000 T __imp_aux_manglednonamefunc +ARMAPX2-NEXT: 00000000 T __imp_manglednonamefunc +ARMAPX2-NEXT: 00000000 T manglednonamefunc +ARMAPX2-EMPTY: +ARMAPX2-NEXT: test2.dll: ARMAPX2-NEXT: 00000000 T __imp_otherfunc ARMAPX2-NEXT: 00000000 T otherfunc @@ -406,6 +497,8 @@ EXPORTS ?test_cpp_func@@YAHPEAX@Z expname=impname dataexp DATA + nonamefunc @1 NONAME + #manglednonamefunc @2 NONAME #--- test2.def LIBRARY test2.dll diff --git a/llvm/unittests/Object/GOFFObjectFileTest.cpp b/llvm/unittests/Object/GOFFObjectFileTest.cpp index 734dac6..69f60d0 100644 --- 
a/llvm/unittests/Object/GOFFObjectFileTest.cpp +++ b/llvm/unittests/Object/GOFFObjectFileTest.cpp @@ -502,3 +502,100 @@ TEST(GOFFObjectFileTest, InvalidERSymbolType) { FailedWithMessage("ESD record 1 has unknown Executable type 0x03")); } } + +TEST(GOFFObjectFileTest, TXTConstruct) { + char GOFFData[GOFF::RecordLength * 6] = {}; + + // HDR record. + GOFFData[0] = 0x03; + GOFFData[1] = 0xF0; + GOFFData[50] = 0x01; + + // ESD record. + GOFFData[GOFF::RecordLength] = 0x03; + GOFFData[GOFF::RecordLength + 7] = 0x01; // ESDID. + GOFFData[GOFF::RecordLength + 71] = 0x05; // Size of symbol name. + GOFFData[GOFF::RecordLength + 72] = 0xa5; // Symbol name is v. + GOFFData[GOFF::RecordLength + 73] = 0x81; // Symbol name is a. + GOFFData[GOFF::RecordLength + 74] = 0x99; // Symbol name is r. + GOFFData[GOFF::RecordLength + 75] = 0x7b; // Symbol name is #. + GOFFData[GOFF::RecordLength + 76] = 0x83; // Symbol name is c. + + // ESD record. + GOFFData[GOFF::RecordLength * 2] = 0x03; + GOFFData[GOFF::RecordLength * 2 + 3] = 0x01; + GOFFData[GOFF::RecordLength * 2 + 7] = 0x02; // ESDID. + GOFFData[GOFF::RecordLength * 2 + 11] = 0x01; // Parent ESDID. + GOFFData[GOFF::RecordLength * 2 + 27] = 0x08; // Length. + GOFFData[GOFF::RecordLength * 2 + 40] = 0x01; // Name Space ID. + GOFFData[GOFF::RecordLength * 2 + 41] = 0x80; + GOFFData[GOFF::RecordLength * 2 + 60] = 0x04; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 61] = 0x04; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 63] = 0x0a; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 66] = 0x03; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 71] = 0x08; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 72] = 0xc3; // Symbol name is c. + GOFFData[GOFF::RecordLength * 2 + 73] = 0x6d; // Symbol name is _. + GOFFData[GOFF::RecordLength * 2 + 74] = 0xc3; // Symbol name is c. + GOFFData[GOFF::RecordLength * 2 + 75] = 0xd6; // Symbol name is o. 
+ GOFFData[GOFF::RecordLength * 2 + 76] = 0xc4; // Symbol name is D. + GOFFData[GOFF::RecordLength * 2 + 77] = 0xc5; // Symbol name is E. + GOFFData[GOFF::RecordLength * 2 + 78] = 0xf6; // Symbol name is 6. + GOFFData[GOFF::RecordLength * 2 + 79] = 0xf4; // Symbol name is 4. + + // ESD record. + GOFFData[GOFF::RecordLength * 3] = 0x03; + GOFFData[GOFF::RecordLength * 3 + 3] = 0x02; + GOFFData[GOFF::RecordLength * 3 + 7] = 0x03; // ESDID. + GOFFData[GOFF::RecordLength * 3 + 11] = 0x02; // Parent ESDID. + GOFFData[GOFF::RecordLength * 3 + 71] = 0x05; // Size of symbol name. + GOFFData[GOFF::RecordLength * 3 + 72] = 0xa5; // Symbol name is v. + GOFFData[GOFF::RecordLength * 3 + 73] = 0x81; // Symbol name is a. + GOFFData[GOFF::RecordLength * 3 + 74] = 0x99; // Symbol name is r. + GOFFData[GOFF::RecordLength * 3 + 75] = 0x7b; // Symbol name is #. + GOFFData[GOFF::RecordLength * 3 + 76] = 0x83; // Symbol name is c. + + // TXT record. + GOFFData[GOFF::RecordLength * 4] = 0x03; + GOFFData[GOFF::RecordLength * 4 + 1] = 0x10; + GOFFData[GOFF::RecordLength * 4 + 7] = 0x02; + GOFFData[GOFF::RecordLength * 4 + 23] = 0x08; // Data Length. + GOFFData[GOFF::RecordLength * 4 + 24] = 0x12; + GOFFData[GOFF::RecordLength * 4 + 25] = 0x34; + GOFFData[GOFF::RecordLength * 4 + 26] = 0x56; + GOFFData[GOFF::RecordLength * 4 + 27] = 0x78; + GOFFData[GOFF::RecordLength * 4 + 28] = 0x9a; + GOFFData[GOFF::RecordLength * 4 + 29] = 0xbc; + GOFFData[GOFF::RecordLength * 4 + 30] = 0xde; + GOFFData[GOFF::RecordLength * 4 + 31] = 0xf0; + + // END record. 
+ GOFFData[GOFF::RecordLength * 5] = 0x03; + GOFFData[GOFF::RecordLength * 5 + 1] = 0x40; + GOFFData[GOFF::RecordLength * 5 + 11] = 0x06; + + StringRef Data(GOFFData, GOFF::RecordLength * 6); + + Expected<std::unique_ptr<ObjectFile>> GOFFObjOrErr = + object::ObjectFile::createGOFFObjectFile( + MemoryBufferRef(Data, "dummyGOFF")); + + ASSERT_THAT_EXPECTED(GOFFObjOrErr, Succeeded()); + + GOFFObjectFile *GOFFObj = dyn_cast<GOFFObjectFile>((*GOFFObjOrErr).get()); + auto Symbols = GOFFObj->symbols(); + ASSERT_EQ(std::distance(Symbols.begin(), Symbols.end()), 1); + SymbolRef Symbol = *Symbols.begin(); + Expected<StringRef> SymbolNameOrErr = GOFFObj->getSymbolName(Symbol); + ASSERT_THAT_EXPECTED(SymbolNameOrErr, Succeeded()); + StringRef SymbolName = SymbolNameOrErr.get(); + EXPECT_EQ(SymbolName, "var#c"); + + auto Sections = GOFFObj->sections(); + ASSERT_EQ(std::distance(Sections.begin(), Sections.end()), 1); + SectionRef Section = *Sections.begin(); + Expected<StringRef> SectionContent = Section.getContents(); + ASSERT_THAT_EXPECTED(SectionContent, Succeeded()); + StringRef Contents = SectionContent.get(); + EXPECT_EQ(Contents, "\x12\x34\x56\x78\x9a\xbc\xde\xf0"); +} diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index a7d0b16..2c72a72 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -2347,13 +2347,6 @@ AArch64ExtensionDependenciesBaseArchTestParams {}, {"aes", "sha2", "sha3", "sm4"}}, - // +sve implies +f32mm if the base architecture is v8.6A+ or v9.1A+, but - // not earlier architectures. 
- {AArch64::ARMV8_5A, {"sve"}, {"sve"}, {"f32mm"}}, - {AArch64::ARMV9A, {"sve"}, {"sve"}, {"f32mm"}}, - {AArch64::ARMV8_6A, {"sve"}, {"sve", "f32mm"}, {}}, - {AArch64::ARMV9_1A, {"sve"}, {"sve", "f32mm"}, {}}, - // +fp16 implies +fp16fml for v8.4A+, but not v9.0-A+ {AArch64::ARMV8_3A, {"fp16"}, {"fullfp16"}, {"fp16fml"}}, {AArch64::ARMV9A, {"fp16"}, {"fullfp16"}, {"fp16fml"}}, @@ -2520,10 +2513,10 @@ AArch64ExtensionDependenciesBaseCPUTestParams {}}, {"cortex-a520", {}, - {"v9.2a", "bf16", "crc", "dotprod", "f32mm", "flagm", - "fp-armv8", "fullfp16", "fp16fml", "i8mm", "lse", "mte", - "pauth", "perfmon", "predres", "ras", "rcpc", "rdm", - "sb", "neon", "ssbs", "sve", "sve2-bitperm", "sve2"}, + {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", + "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", + "perfmon", "predres", "ras", "rcpc", "rdm", "sb", + "neon", "ssbs", "sve", "sve2-bitperm", "sve2"}, {}}, // Negative modifiers diff --git a/llvm/utils/TableGen/Common/CMakeLists.txt b/llvm/utils/TableGen/Common/CMakeLists.txt index 0440f02..c31ed5a 100644 --- a/llvm/utils/TableGen/Common/CMakeLists.txt +++ b/llvm/utils/TableGen/Common/CMakeLists.txt @@ -12,10 +12,12 @@ set(LLVM_LINK_COMPONENTS add_llvm_library(LLVMTableGenCommon STATIC OBJECT EXCLUDE_FROM_ALL GlobalISel/CodeExpander.cpp + GlobalISel/CombinerUtils.cpp GlobalISel/CXXPredicates.cpp GlobalISel/GlobalISelMatchTable.cpp GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp GlobalISel/MatchDataInfo.cpp + GlobalISel/PatternParser.cpp GlobalISel/Patterns.cpp AsmWriterInst.cpp diff --git a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp new file mode 100644 index 0000000..37e6306 --- /dev/null +++ b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp @@ -0,0 +1,23 @@ +//===- CombinerUtils.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CombinerUtils.h" +#include "llvm/ADT/StringSet.h" + +namespace llvm { + +StringRef insertStrRef(StringRef S) { + if (S.empty()) + return {}; + + static StringSet<> Pool; + auto [It, Inserted] = Pool.insert(S); + return It->getKey(); +} + +} // namespace llvm diff --git a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h index 8cb2514..82a64c6 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h +++ b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h @@ -65,6 +65,10 @@ inline const DagInit *getDagWithOperatorOfSubClass(const Init &N, return I; return nullptr; } + +/// Copies a StringRef into a static pool to preserve it. +StringRef insertStrRef(StringRef S); + } // namespace llvm #endif diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp new file mode 100644 index 0000000..1d6c4c7 --- /dev/null +++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp @@ -0,0 +1,462 @@ +//===- PatternParser.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Common/GlobalISel/PatternParser.h" +#include "Basic/CodeGenIntrinsics.h" +#include "Common/CodeGenTarget.h" +#include "Common/GlobalISel/CombinerUtils.h" +#include "Common/GlobalISel/Patterns.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/SaveAndRestore.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" + +namespace llvm { +namespace gi { +static constexpr StringLiteral MIFlagsEnumClassName = "MIFlagEnum"; + +namespace { +class PrettyStackTraceParse : public PrettyStackTraceEntry { + const Record &Def; + +public: + PrettyStackTraceParse(const Record &Def) : Def(Def) {} + + void print(raw_ostream &OS) const override { + if (Def.isSubClassOf("GICombineRule")) + OS << "Parsing GICombineRule '" << Def.getName() << '\''; + else if (Def.isSubClassOf(PatFrag::ClassName)) + OS << "Parsing " << PatFrag::ClassName << " '" << Def.getName() << '\''; + else + OS << "Parsing '" << Def.getName() << '\''; + OS << '\n'; + } +}; +} // namespace + +bool PatternParser::parsePatternList( + const DagInit &List, + function_ref<bool(std::unique_ptr<Pattern>)> ParseAction, + StringRef Operator, StringRef AnonPatNamePrefix) { + if (List.getOperatorAsDef(DiagLoc)->getName() != Operator) { + PrintError(DiagLoc, "Expected " + Operator + " operator"); + return false; + } + + if (List.getNumArgs() == 0) { + PrintError(DiagLoc, Operator + " pattern list is empty"); + return false; + } + + // The match section consists of a list of matchers and predicates. Parse each + // one and add the equivalent GIMatchDag nodes, predicates, and edges. + for (unsigned I = 0; I < List.getNumArgs(); ++I) { + Init *Arg = List.getArg(I); + std::string Name = List.getArgName(I) + ? 
List.getArgName(I)->getValue().str() + : ("__" + AnonPatNamePrefix + "_" + Twine(I)).str(); + + if (auto Pat = parseInstructionPattern(*Arg, Name)) { + if (!ParseAction(std::move(Pat))) + return false; + continue; + } + + if (auto Pat = parseWipMatchOpcodeMatcher(*Arg, Name)) { + if (!ParseAction(std::move(Pat))) + return false; + continue; + } + + // Parse arbitrary C++ code + if (const auto *StringI = dyn_cast<StringInit>(Arg)) { + auto CXXPat = std::make_unique<CXXPattern>(*StringI, insertStrRef(Name)); + if (!ParseAction(std::move(CXXPat))) + return false; + continue; + } + + PrintError(DiagLoc, + "Failed to parse pattern: '" + Arg->getAsString() + '\''); + return false; + } + + return true; +} + +static const CodeGenInstruction & +getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) { + StringRef Opc; + if (I->isConvergent) { + Opc = I->hasSideEffects ? "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS" + : "G_INTRINSIC_CONVERGENT"; + } else { + Opc = I->hasSideEffects ? "G_INTRINSIC_W_SIDE_EFFECTS" : "G_INTRINSIC"; + } + + RecordKeeper &RK = I->TheDef->getRecords(); + return CGT.getInstruction(RK.getDef(Opc)); +} + +static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) { + // Intrinsics need to have a static lifetime because the match table keeps + // references to CodeGenIntrinsic objects. 
+ static DenseMap<const Record *, std::unique_ptr<CodeGenIntrinsic>> + AllIntrinsics; + + auto &Ptr = AllIntrinsics[R]; + if (!Ptr) + Ptr = std::make_unique<CodeGenIntrinsic>(R, std::vector<Record *>()); + return Ptr.get(); +} + +std::unique_ptr<Pattern> +PatternParser::parseInstructionPattern(const Init &Arg, StringRef Name) { + const DagInit *DagPat = dyn_cast<DagInit>(&Arg); + if (!DagPat) + return nullptr; + + std::unique_ptr<InstructionPattern> Pat; + if (const DagInit *IP = getDagWithOperatorOfSubClass(Arg, "Instruction")) { + auto &Instr = CGT.getInstruction(IP->getOperatorAsDef(DiagLoc)); + Pat = + std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name)); + } else if (const DagInit *IP = + getDagWithOperatorOfSubClass(Arg, "Intrinsic")) { + Record *TheDef = IP->getOperatorAsDef(DiagLoc); + const CodeGenIntrinsic *Intrin = getCodeGenIntrinsic(TheDef); + const CodeGenInstruction &Instr = getInstrForIntrinsic(CGT, Intrin); + Pat = + std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name)); + cast<CodeGenInstructionPattern>(*Pat).setIntrinsic(Intrin); + } else if (const DagInit *PFP = + getDagWithOperatorOfSubClass(Arg, PatFrag::ClassName)) { + const Record *Def = PFP->getOperatorAsDef(DiagLoc); + const PatFrag *PF = parsePatFrag(Def); + if (!PF) + return nullptr; // Already diagnosed by parsePatFrag + Pat = std::make_unique<PatFragPattern>(*PF, insertStrRef(Name)); + } else if (const DagInit *BP = + getDagWithOperatorOfSubClass(Arg, BuiltinPattern::ClassName)) { + Pat = std::make_unique<BuiltinPattern>(*BP->getOperatorAsDef(DiagLoc), + insertStrRef(Name)); + } else + return nullptr; + + for (unsigned K = 0; K < DagPat->getNumArgs(); ++K) { + Init *Arg = DagPat->getArg(K); + if (auto *DagArg = getDagWithSpecificOperator(*Arg, "MIFlags")) { + if (!parseInstructionPatternMIFlags(*Pat, DagArg)) + return nullptr; + continue; + } + + if (!parseInstructionPatternOperand(*Pat, Arg, DagPat->getArgName(K))) + return nullptr; + } + + if 
(!Pat->checkSemantics(DiagLoc)) + return nullptr; + + return std::move(Pat); +} + +std::unique_ptr<Pattern> +PatternParser::parseWipMatchOpcodeMatcher(const Init &Arg, StringRef Name) { + const DagInit *Matcher = getDagWithSpecificOperator(Arg, "wip_match_opcode"); + if (!Matcher) + return nullptr; + + if (Matcher->getNumArgs() == 0) { + PrintError(DiagLoc, "Empty wip_match_opcode"); + return nullptr; + } + + // Each argument is an opcode that can match. + auto Result = std::make_unique<AnyOpcodePattern>(insertStrRef(Name)); + for (const auto &Arg : Matcher->getArgs()) { + Record *OpcodeDef = getDefOfSubClass(*Arg, "Instruction"); + if (OpcodeDef) { + Result->addOpcode(&CGT.getInstruction(OpcodeDef)); + continue; + } + + PrintError(DiagLoc, "Arguments to wip_match_opcode must be instructions"); + return nullptr; + } + + return std::move(Result); +} + +bool PatternParser::parseInstructionPatternOperand(InstructionPattern &IP, + const Init *OpInit, + const StringInit *OpName) { + const auto ParseErr = [&]() { + PrintError(DiagLoc, + "cannot parse operand '" + OpInit->getAsUnquotedString() + "' "); + if (OpName) + PrintNote(DiagLoc, + "operand name is '" + OpName->getAsUnquotedString() + '\''); + return false; + }; + + // untyped immediate, e.g. 0 + if (const auto *IntImm = dyn_cast<IntInit>(OpInit)) { + std::string Name = OpName ? OpName->getAsUnquotedString() : ""; + IP.addOperand(IntImm->getValue(), insertStrRef(Name), PatternType()); + return true; + } + + // typed immediate, e.g. 
(i32 0) + if (const auto *DagOp = dyn_cast<DagInit>(OpInit)) { + if (DagOp->getNumArgs() != 1) + return ParseErr(); + + const Record *TyDef = DagOp->getOperatorAsDef(DiagLoc); + auto ImmTy = PatternType::get(DiagLoc, TyDef, + "cannot parse immediate '" + + DagOp->getAsUnquotedString() + '\''); + if (!ImmTy) + return false; + + if (!IP.hasAllDefs()) { + PrintError(DiagLoc, "out operand of '" + IP.getInstName() + + "' cannot be an immediate"); + return false; + } + + const auto *Val = dyn_cast<IntInit>(DagOp->getArg(0)); + if (!Val) + return ParseErr(); + + std::string Name = OpName ? OpName->getAsUnquotedString() : ""; + IP.addOperand(Val->getValue(), insertStrRef(Name), *ImmTy); + return true; + } + + // Typed operand e.g. $x/$z in (G_FNEG $x, $z) + if (auto *DefI = dyn_cast<DefInit>(OpInit)) { + if (!OpName) { + PrintError(DiagLoc, "expected an operand name after '" + + OpInit->getAsString() + '\''); + return false; + } + const Record *Def = DefI->getDef(); + auto Ty = PatternType::get(DiagLoc, Def, "cannot parse operand type"); + if (!Ty) + return false; + IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), *Ty); + return true; + } + + // Untyped operand e.g. 
$x/$z in (G_FNEG $x, $z) + if (isa<UnsetInit>(OpInit)) { + assert(OpName && "Unset w/ no OpName?"); + IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), PatternType()); + return true; + } + + return ParseErr(); +} + +bool PatternParser::parseInstructionPatternMIFlags(InstructionPattern &IP, + const DagInit *Op) { + auto *CGIP = dyn_cast<CodeGenInstructionPattern>(&IP); + if (!CGIP) { + PrintError(DiagLoc, + "matching/writing MIFlags is only allowed on CodeGenInstruction " + "patterns"); + return false; + } + + const auto CheckFlagEnum = [&](const Record *R) { + if (!R->isSubClassOf(MIFlagsEnumClassName)) { + PrintError(DiagLoc, "'" + R->getName() + "' is not a subclass of '" + + MIFlagsEnumClassName + "'"); + return false; + } + + return true; + }; + + if (CGIP->getMIFlagsInfo()) { + PrintError(DiagLoc, "MIFlags can only be present once on an instruction"); + return false; + } + + auto &FI = CGIP->getOrCreateMIFlagsInfo(); + for (unsigned K = 0; K < Op->getNumArgs(); ++K) { + const Init *Arg = Op->getArg(K); + + // Match/set a flag: (MIFlags FmNoNans) + if (const auto *Def = dyn_cast<DefInit>(Arg)) { + const Record *R = Def->getDef(); + if (!CheckFlagEnum(R)) + return false; + + FI.addSetFlag(R); + continue; + } + + // Do not match a flag/unset a flag: (MIFlags (not FmNoNans)) + if (const DagInit *NotDag = getDagWithSpecificOperator(*Arg, "not")) { + for (const Init *NotArg : NotDag->getArgs()) { + const DefInit *DefArg = dyn_cast<DefInit>(NotArg); + if (!DefArg) { + PrintError(DiagLoc, "cannot parse '" + NotArg->getAsUnquotedString() + + "': expected a '" + MIFlagsEnumClassName + + "'"); + return false; + } + + const Record *R = DefArg->getDef(); + if (!CheckFlagEnum(R)) + return false; + + FI.addUnsetFlag(R); + continue; + } + + continue; + } + + // Copy flags from a matched instruction: (MIFlags $mi) + if (isa<UnsetInit>(Arg)) { + FI.addCopyFlag(insertStrRef(Op->getArgName(K)->getAsUnquotedString())); + continue; + } + } + + return true; +} + 
+std::unique_ptr<PatFrag> PatternParser::parsePatFragImpl(const Record *Def) { + auto StackTrace = PrettyStackTraceParse(*Def); + if (!Def->isSubClassOf(PatFrag::ClassName)) + return nullptr; + + const DagInit *Ins = Def->getValueAsDag("InOperands"); + if (Ins->getOperatorAsDef(Def->getLoc())->getName() != "ins") { + PrintError(Def, "expected 'ins' operator for " + PatFrag::ClassName + + " in operands list"); + return nullptr; + } + + const DagInit *Outs = Def->getValueAsDag("OutOperands"); + if (Outs->getOperatorAsDef(Def->getLoc())->getName() != "outs") { + PrintError(Def, "expected 'outs' operator for " + PatFrag::ClassName + + " out operands list"); + return nullptr; + } + + auto Result = std::make_unique<PatFrag>(*Def); + if (!parsePatFragParamList(*Outs, [&](StringRef Name, unsigned Kind) { + Result->addOutParam(insertStrRef(Name), (PatFrag::ParamKind)Kind); + return true; + })) + return nullptr; + + if (!parsePatFragParamList(*Ins, [&](StringRef Name, unsigned Kind) { + Result->addInParam(insertStrRef(Name), (PatFrag::ParamKind)Kind); + return true; + })) + return nullptr; + + const ListInit *Alts = Def->getValueAsListInit("Alternatives"); + unsigned AltIdx = 0; + for (const Init *Alt : *Alts) { + const auto *PatDag = dyn_cast<DagInit>(Alt); + if (!PatDag) { + PrintError(Def, "expected dag init for PatFrag pattern alternative"); + return nullptr; + } + + PatFrag::Alternative &A = Result->addAlternative(); + const auto AddPat = [&](std::unique_ptr<Pattern> Pat) { + A.Pats.push_back(std::move(Pat)); + return true; + }; + + SaveAndRestore<ArrayRef<SMLoc>> DiagLocSAR(DiagLoc, Def->getLoc()); + if (!parsePatternList( + *PatDag, AddPat, "pattern", + /*AnonPatPrefix*/ + (Def->getName() + "_alt" + Twine(AltIdx++) + "_pattern").str())) + return nullptr; + } + + if (!Result->buildOperandsTables() || !Result->checkSemantics()) + return nullptr; + + return Result; +} + +bool PatternParser::parsePatFragParamList( + const DagInit &OpsList, + function_ref<bool(StringRef, 
unsigned)> ParseAction) { + for (unsigned K = 0; K < OpsList.getNumArgs(); ++K) { + const StringInit *Name = OpsList.getArgName(K); + const Init *Ty = OpsList.getArg(K); + + if (!Name) { + PrintError(DiagLoc, "all operands must be named'"); + return false; + } + const std::string NameStr = Name->getAsUnquotedString(); + + PatFrag::ParamKind OpKind; + if (isSpecificDef(*Ty, "gi_imm")) + OpKind = PatFrag::PK_Imm; + else if (isSpecificDef(*Ty, "root")) + OpKind = PatFrag::PK_Root; + else if (isa<UnsetInit>(Ty) || + isSpecificDef(*Ty, "gi_mo")) // no type = gi_mo. + OpKind = PatFrag::PK_MachineOperand; + else { + PrintError( + DiagLoc, + '\'' + NameStr + + "' operand type was expected to be 'root', 'gi_imm' or 'gi_mo'"); + return false; + } + + if (!ParseAction(NameStr, (unsigned)OpKind)) + return false; + } + + return true; +} + +const PatFrag *PatternParser::parsePatFrag(const Record *Def) { + // Cache already parsed PatFrags to avoid doing extra work. + static DenseMap<const Record *, std::unique_ptr<PatFrag>> ParsedPatFrags; + + auto It = ParsedPatFrags.find(Def); + if (It != ParsedPatFrags.end()) { + SeenPatFrags.insert(It->second.get()); + return It->second.get(); + } + + std::unique_ptr<PatFrag> NewPatFrag = parsePatFragImpl(Def); + if (!NewPatFrag) { + PrintError(Def, "Could not parse " + PatFrag::ClassName + " '" + + Def->getName() + "'"); + // Put a nullptr in the map so we don't attempt parsing this again. 
+ ParsedPatFrags[Def] = nullptr; + return nullptr; + } + + const auto *Res = NewPatFrag.get(); + ParsedPatFrags[Def] = std::move(NewPatFrag); + SeenPatFrags.insert(Res); + return Res; +} + +} // namespace gi +} // namespace llvm diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h new file mode 100644 index 0000000..cd6f524 --- /dev/null +++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h @@ -0,0 +1,118 @@ +//===- PatternParser.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Contains tools to parse MIR patterns from TableGen DAG elements. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_GLOBALISEL_PATTERNPARSER_H +#define LLVM_UTILS_GLOBALISEL_PATTERNPARSER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/SMLoc.h" +#include <memory> + +namespace llvm { +class CodeGenTarget; +class DagInit; +class Init; +class Record; +class StringRef; +class StringInit; + +namespace gi { +class InstructionPattern; +class Pattern; +class PatFrag; + +/// Helper class to parse MIR Pattern lists. +/// +/// e.g., `(match (G_FADD $x, $y, $z), (G_FNEG $y, $z))` +class PatternParser { + const CodeGenTarget &CGT; + ArrayRef<SMLoc> DiagLoc; + + mutable SmallPtrSet<const PatFrag *, 2> SeenPatFrags; + +public: + PatternParser(const CodeGenTarget &CGT, ArrayRef<SMLoc> DiagLoc) + : CGT(CGT), DiagLoc(DiagLoc) {} + + /// Parses a list of patterns such as: + /// (Operator (Pattern1 ...), (Pattern2 ...)) + /// \param List DagInit of the expected pattern list. 
+ /// \param ParseAction Callback to handle a succesfully parsed pattern. + /// \param Operator The name of the operator, e.g. "match" + /// \param AnonPatNamePrefix Prefix for anonymous pattern names. + /// \return true on success, false on failure. + bool + parsePatternList(const DagInit &List, + function_ref<bool(std::unique_ptr<Pattern>)> ParseAction, + StringRef Operator, StringRef AnonPatNamePrefix); + + /// \returns all PatFrags encountered by this PatternParser. + const auto &getSeenPatFrags() const { return SeenPatFrags; } + +private: + /// Parse any InstructionPattern from a TableGen Init. + /// \param Arg Init to parse. + /// \param PatName Name of the pattern that will be parsed. + /// \return the parsed pattern on success, nullptr on failure. + std::unique_ptr<Pattern> parseInstructionPattern(const Init &Arg, + StringRef PatName); + + /// Parse a WipOpcodeMatcher from a TableGen Init. + /// \param Arg Init to parse. + /// \param PatName Name of the pattern that will be parsed. + /// \return the parsed pattern on success, nullptr on failure. + std::unique_ptr<Pattern> parseWipMatchOpcodeMatcher(const Init &Arg, + StringRef PatName); + + /// Parses an Operand of an InstructionPattern from a TableGen Init. + /// \param IP InstructionPattern for which we're parsing. + /// \param OpInit Init to parse. + /// \param OpName Name of the operand to parse. + /// \return true on success, false on failure. + bool parseInstructionPatternOperand(InstructionPattern &IP, + const Init *OpInit, + const StringInit *OpName); + + /// Parses a MIFlag for an InstructionPattern from a TableGen Init. + /// \param IP InstructionPattern for which we're parsing. + /// \param Op Init to parse. + /// \return true on success, false on failure. + bool parseInstructionPatternMIFlags(InstructionPattern &IP, + const DagInit *Op); + + /// (Uncached) PatFrag parsing implementation. + /// \param Def PatFrag def to parsee. + /// \return the parsed PatFrag on success, nullptr on failure. 
+ std::unique_ptr<PatFrag> parsePatFragImpl(const Record *Def); + + /// Parses the in or out parameter list of a PatFrag. + /// \param OpsList Init to parse. + /// \param ParseAction Callback on successful parse, with the name of + /// the parameter and its \ref PatFrag::ParamKind + /// \return true on success, false on failure. + bool + parsePatFragParamList(const DagInit &OpsList, + function_ref<bool(StringRef, unsigned)> ParseAction); + + /// Cached PatFrag parser. This avoids duplicate work by keeping track of + /// already-parsed PatFrags. + /// \param Def PatFrag def to parsee. + /// \return the parsed PatFrag on success, nullptr on failure. + const PatFrag *parsePatFrag(const Record *Def); +}; + +} // namespace gi +} // namespace llvm + +#endif diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 39b9f8a..1ae6efd 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -36,6 +36,7 @@ #include "Common/GlobalISel/GlobalISelMatchTable.h" #include "Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h" #include "Common/GlobalISel/MatchDataInfo.h" +#include "Common/GlobalISel/PatternParser.h" #include "Common/GlobalISel/Patterns.h" #include "Common/SubtargetFeatureInfo.h" #include "llvm/ADT/APInt.h" @@ -80,7 +81,6 @@ cl::opt<bool> DebugTypeInfer("gicombiner-debug-typeinfer", constexpr StringLiteral CXXApplyPrefix = "GICXXCustomAction_CombineApply"; constexpr StringLiteral CXXPredPrefix = "GICXXPred_MI_Predicate_"; -constexpr StringLiteral MIFlagsEnumClassName = "MIFlagEnum"; //===- CodeExpansions Helpers --------------------------------------------===// @@ -109,17 +109,6 @@ void declareTempRegExpansion(CodeExpansions &CE, unsigned TempRegID, //===- Misc. Helpers -----------------------------------------------------===// -/// Copies a StringRef into a static pool to preserve it. 
-/// Most Pattern classes use StringRef so we need this. -StringRef insertStrRef(StringRef S) { - if (S.empty()) - return {}; - - static StringSet<> Pool; - auto [It, Inserted] = Pool.insert(S); - return It->getKey(); -} - template <typename Container> auto keys(Container &&C) { return map_range(C, [](auto &Entry) -> auto & { return Entry.first; }); } @@ -639,8 +628,9 @@ public: SubtargetFeatureInfoMap &SubtargetFeatures, Record &RuleDef, unsigned ID, std::vector<RuleMatcher> &OutRMs) - : CGT(CGT), SubtargetFeatures(SubtargetFeatures), RuleDef(RuleDef), - RuleID(ID), OutRMs(OutRMs) {} + : Parser(CGT, RuleDef.getLoc()), CGT(CGT), + SubtargetFeatures(SubtargetFeatures), RuleDef(RuleDef), RuleID(ID), + OutRMs(OutRMs) {} /// Parses all fields in the RuleDef record. bool parseAll(); @@ -718,26 +708,6 @@ private: bool buildRuleOperandsTable(); bool parseDefs(const DagInit &Def); - bool - parsePatternList(const DagInit &List, - function_ref<bool(std::unique_ptr<Pattern>)> ParseAction, - StringRef Operator, ArrayRef<SMLoc> DiagLoc, - StringRef AnonPatNamePrefix) const; - - std::unique_ptr<Pattern> parseInstructionPattern(const Init &Arg, - StringRef PatName) const; - std::unique_ptr<Pattern> parseWipMatchOpcodeMatcher(const Init &Arg, - StringRef PatName) const; - bool parseInstructionPatternOperand(InstructionPattern &IP, - const Init *OpInit, - const StringInit *OpName) const; - bool parseInstructionPatternMIFlags(InstructionPattern &IP, - const DagInit *Op) const; - std::unique_ptr<PatFrag> parsePatFragImpl(const Record *Def) const; - bool parsePatFragParamList( - ArrayRef<SMLoc> DiagLoc, const DagInit &OpsList, - function_ref<bool(StringRef, PatFrag::ParamKind)> ParseAction) const; - const PatFrag *parsePatFrag(const Record *Def) const; bool emitMatchPattern(CodeExpansions &CE, const PatternAlternatives &Alts, const InstructionPattern &IP); @@ -781,6 +751,7 @@ private: DenseSet<const Pattern *> &SeenPats, OperandDefLookupFn LookupOperandDef, OperandMapperFnRef 
OperandMapper = [](const auto &O) { return O; }); + PatternParser Parser; const CodeGenTarget &CGT; SubtargetFeatureInfoMap &SubtargetFeatures; Record &RuleDef; @@ -808,9 +779,6 @@ private: SmallVector<MatchDataInfo, 2> MatchDatas; SmallVector<PatternAlternatives, 1> PermutationsToEmit; - - // print()/debug-only members. - mutable SmallPtrSet<const PatFrag *, 2> SeenPatFrags; }; bool CombineRuleBuilder::parseAll() { @@ -819,16 +787,16 @@ bool CombineRuleBuilder::parseAll() { if (!parseDefs(*RuleDef.getValueAsDag("Defs"))) return false; - if (!parsePatternList( + if (!Parser.parsePatternList( *RuleDef.getValueAsDag("Match"), [this](auto Pat) { return addMatchPattern(std::move(Pat)); }, "match", - RuleDef.getLoc(), (RuleDef.getName() + "_match").str())) + (RuleDef.getName() + "_match").str())) return false; - if (!parsePatternList( + if (!Parser.parsePatternList( *RuleDef.getValueAsDag("Apply"), [this](auto Pat) { return addApplyPattern(std::move(Pat)); }, "apply", - RuleDef.getLoc(), (RuleDef.getName() + "_apply").str())) + (RuleDef.getName() + "_apply").str())) return false; if (!buildRuleOperandsTable() || !typecheckPatterns() || !findRoots() || @@ -884,9 +852,10 @@ void CombineRuleBuilder::print(raw_ostream &OS) const { OS << " )\n"; } - if (!SeenPatFrags.empty()) { + const auto &SeenPFs = Parser.getSeenPatFrags(); + if (!SeenPFs.empty()) { OS << " (PatFrags\n"; - for (const auto *PF : SeenPatFrags) { + for (const auto *PF : Parser.getSeenPatFrags()) { PF->print(OS, /*Indent=*/" "); OS << '\n'; } @@ -1500,426 +1469,6 @@ bool CombineRuleBuilder::parseDefs(const DagInit &Def) { return true; } -bool CombineRuleBuilder::parsePatternList( - const DagInit &List, - function_ref<bool(std::unique_ptr<Pattern>)> ParseAction, - StringRef Operator, ArrayRef<SMLoc> DiagLoc, - StringRef AnonPatNamePrefix) const { - if (List.getOperatorAsDef(RuleDef.getLoc())->getName() != Operator) { - ::PrintError(DiagLoc, "Expected " + Operator + " operator"); - return false; - } - - if 
(List.getNumArgs() == 0) { - ::PrintError(DiagLoc, Operator + " pattern list is empty"); - return false; - } - - // The match section consists of a list of matchers and predicates. Parse each - // one and add the equivalent GIMatchDag nodes, predicates, and edges. - for (unsigned I = 0; I < List.getNumArgs(); ++I) { - Init *Arg = List.getArg(I); - std::string Name = List.getArgName(I) - ? List.getArgName(I)->getValue().str() - : ("__" + AnonPatNamePrefix + "_" + Twine(I)).str(); - - if (auto Pat = parseInstructionPattern(*Arg, Name)) { - if (!ParseAction(std::move(Pat))) - return false; - continue; - } - - if (auto Pat = parseWipMatchOpcodeMatcher(*Arg, Name)) { - if (!ParseAction(std::move(Pat))) - return false; - continue; - } - - // Parse arbitrary C++ code - if (const auto *StringI = dyn_cast<StringInit>(Arg)) { - auto CXXPat = std::make_unique<CXXPattern>(*StringI, insertStrRef(Name)); - if (!ParseAction(std::move(CXXPat))) - return false; - continue; - } - - ::PrintError(DiagLoc, - "Failed to parse pattern: '" + Arg->getAsString() + "'"); - return false; - } - - return true; -} - -static const CodeGenInstruction & -getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) { - StringRef Opc; - if (I->isConvergent) { - Opc = I->hasSideEffects ? "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS" - : "G_INTRINSIC_CONVERGENT"; - } else { - Opc = I->hasSideEffects ? "G_INTRINSIC_W_SIDE_EFFECTS" : "G_INTRINSIC"; - } - - RecordKeeper &RK = I->TheDef->getRecords(); - return CGT.getInstruction(RK.getDef(Opc)); -} - -static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) { - // Intrinsics need to have a static lifetime because the match table keeps - // references to CodeGenIntrinsic objects. 
- static DenseMap<const Record *, std::unique_ptr<CodeGenIntrinsic>> - AllIntrinsics; - - auto &Ptr = AllIntrinsics[R]; - if (!Ptr) - Ptr = std::make_unique<CodeGenIntrinsic>(R, std::vector<Record *>()); - return Ptr.get(); -} - -std::unique_ptr<Pattern> -CombineRuleBuilder::parseInstructionPattern(const Init &Arg, - StringRef Name) const { - const DagInit *DagPat = dyn_cast<DagInit>(&Arg); - if (!DagPat) - return nullptr; - - std::unique_ptr<InstructionPattern> Pat; - if (const DagInit *IP = getDagWithOperatorOfSubClass(Arg, "Instruction")) { - auto &Instr = CGT.getInstruction(IP->getOperatorAsDef(RuleDef.getLoc())); - Pat = - std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name)); - } else if (const DagInit *IP = - getDagWithOperatorOfSubClass(Arg, "Intrinsic")) { - Record *TheDef = IP->getOperatorAsDef(RuleDef.getLoc()); - const CodeGenIntrinsic *Intrin = getCodeGenIntrinsic(TheDef); - const CodeGenInstruction &Instr = getInstrForIntrinsic(CGT, Intrin); - Pat = - std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name)); - cast<CodeGenInstructionPattern>(*Pat).setIntrinsic(Intrin); - } else if (const DagInit *PFP = - getDagWithOperatorOfSubClass(Arg, PatFrag::ClassName)) { - const Record *Def = PFP->getOperatorAsDef(RuleDef.getLoc()); - const PatFrag *PF = parsePatFrag(Def); - if (!PF) - return nullptr; // Already diagnosed by parsePatFrag - Pat = std::make_unique<PatFragPattern>(*PF, insertStrRef(Name)); - } else if (const DagInit *BP = - getDagWithOperatorOfSubClass(Arg, BuiltinPattern::ClassName)) { - Pat = std::make_unique<BuiltinPattern>( - *BP->getOperatorAsDef(RuleDef.getLoc()), insertStrRef(Name)); - } else - return nullptr; - - for (unsigned K = 0; K < DagPat->getNumArgs(); ++K) { - Init *Arg = DagPat->getArg(K); - if (auto *DagArg = getDagWithSpecificOperator(*Arg, "MIFlags")) { - if (!parseInstructionPatternMIFlags(*Pat, DagArg)) - return nullptr; - continue; - } - - if (!parseInstructionPatternOperand(*Pat, Arg, 
DagPat->getArgName(K))) - return nullptr; - } - - if (!Pat->checkSemantics(RuleDef.getLoc())) - return nullptr; - - return std::move(Pat); -} - -std::unique_ptr<Pattern> -CombineRuleBuilder::parseWipMatchOpcodeMatcher(const Init &Arg, - StringRef Name) const { - const DagInit *Matcher = getDagWithSpecificOperator(Arg, "wip_match_opcode"); - if (!Matcher) - return nullptr; - - if (Matcher->getNumArgs() == 0) { - PrintError("Empty wip_match_opcode"); - return nullptr; - } - - // Each argument is an opcode that can match. - auto Result = std::make_unique<AnyOpcodePattern>(insertStrRef(Name)); - for (const auto &Arg : Matcher->getArgs()) { - Record *OpcodeDef = getDefOfSubClass(*Arg, "Instruction"); - if (OpcodeDef) { - Result->addOpcode(&CGT.getInstruction(OpcodeDef)); - continue; - } - - PrintError("Arguments to wip_match_opcode must be instructions"); - return nullptr; - } - - return std::move(Result); -} - -bool CombineRuleBuilder::parseInstructionPatternOperand( - InstructionPattern &IP, const Init *OpInit, - const StringInit *OpName) const { - const auto ParseErr = [&]() { - PrintError("cannot parse operand '" + OpInit->getAsUnquotedString() + "' "); - if (OpName) - PrintNote("operand name is '" + OpName->getAsUnquotedString() + "'"); - return false; - }; - - // untyped immediate, e.g. 0 - if (const auto *IntImm = dyn_cast<IntInit>(OpInit)) { - std::string Name = OpName ? OpName->getAsUnquotedString() : ""; - IP.addOperand(IntImm->getValue(), insertStrRef(Name), PatternType()); - return true; - } - - // typed immediate, e.g. 
(i32 0) - if (const auto *DagOp = dyn_cast<DagInit>(OpInit)) { - if (DagOp->getNumArgs() != 1) - return ParseErr(); - - const Record *TyDef = DagOp->getOperatorAsDef(RuleDef.getLoc()); - auto ImmTy = PatternType::get(RuleDef.getLoc(), TyDef, - "cannot parse immediate '" + - DagOp->getAsUnquotedString() + "'"); - if (!ImmTy) - return false; - - if (!IP.hasAllDefs()) { - PrintError("out operand of '" + IP.getInstName() + - "' cannot be an immediate"); - return false; - } - - const auto *Val = dyn_cast<IntInit>(DagOp->getArg(0)); - if (!Val) - return ParseErr(); - - std::string Name = OpName ? OpName->getAsUnquotedString() : ""; - IP.addOperand(Val->getValue(), insertStrRef(Name), *ImmTy); - return true; - } - - // Typed operand e.g. $x/$z in (G_FNEG $x, $z) - if (auto *DefI = dyn_cast<DefInit>(OpInit)) { - if (!OpName) { - PrintError("expected an operand name after '" + OpInit->getAsString() + - "'"); - return false; - } - const Record *Def = DefI->getDef(); - auto Ty = - PatternType::get(RuleDef.getLoc(), Def, "cannot parse operand type"); - if (!Ty) - return false; - IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), *Ty); - return true; - } - - // Untyped operand e.g. 
$x/$z in (G_FNEG $x, $z) - if (isa<UnsetInit>(OpInit)) { - assert(OpName && "Unset w/ no OpName?"); - IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), PatternType()); - return true; - } - - return ParseErr(); -} - -bool CombineRuleBuilder::parseInstructionPatternMIFlags( - InstructionPattern &IP, const DagInit *Op) const { - auto *CGIP = dyn_cast<CodeGenInstructionPattern>(&IP); - if (!CGIP) { - PrintError("matching/writing MIFlags is only allowed on CodeGenInstruction " - "patterns"); - return false; - } - - const auto CheckFlagEnum = [&](const Record *R) { - if (!R->isSubClassOf(MIFlagsEnumClassName)) { - PrintError("'" + R->getName() + "' is not a subclass of '" + - MIFlagsEnumClassName + "'"); - return false; - } - - return true; - }; - - if (CGIP->getMIFlagsInfo()) { - PrintError("MIFlags can only be present once on an instruction"); - return false; - } - - auto &FI = CGIP->getOrCreateMIFlagsInfo(); - for (unsigned K = 0; K < Op->getNumArgs(); ++K) { - const Init *Arg = Op->getArg(K); - - // Match/set a flag: (MIFlags FmNoNans) - if (const auto *Def = dyn_cast<DefInit>(Arg)) { - const Record *R = Def->getDef(); - if (!CheckFlagEnum(R)) - return false; - - FI.addSetFlag(R); - continue; - } - - // Do not match a flag/unset a flag: (MIFlags (not FmNoNans)) - if (const DagInit *NotDag = getDagWithSpecificOperator(*Arg, "not")) { - for (const Init *NotArg : NotDag->getArgs()) { - const DefInit *DefArg = dyn_cast<DefInit>(NotArg); - if (!DefArg) { - PrintError("cannot parse '" + NotArg->getAsUnquotedString() + - "': expected a '" + MIFlagsEnumClassName + "'"); - return false; - } - - const Record *R = DefArg->getDef(); - if (!CheckFlagEnum(R)) - return false; - - FI.addUnsetFlag(R); - continue; - } - - continue; - } - - // Copy flags from a matched instruction: (MIFlags $mi) - if (isa<UnsetInit>(Arg)) { - FI.addCopyFlag(insertStrRef(Op->getArgName(K)->getAsUnquotedString())); - continue; - } - } - - return true; -} - -std::unique_ptr<PatFrag> 
-CombineRuleBuilder::parsePatFragImpl(const Record *Def) const { - auto StackTrace = PrettyStackTraceParse(*Def); - if (!Def->isSubClassOf(PatFrag::ClassName)) - return nullptr; - - const DagInit *Ins = Def->getValueAsDag("InOperands"); - if (Ins->getOperatorAsDef(Def->getLoc())->getName() != "ins") { - ::PrintError(Def, "expected 'ins' operator for " + PatFrag::ClassName + - " in operands list"); - return nullptr; - } - - const DagInit *Outs = Def->getValueAsDag("OutOperands"); - if (Outs->getOperatorAsDef(Def->getLoc())->getName() != "outs") { - ::PrintError(Def, "expected 'outs' operator for " + PatFrag::ClassName + - " out operands list"); - return nullptr; - } - - auto Result = std::make_unique<PatFrag>(*Def); - if (!parsePatFragParamList(Def->getLoc(), *Outs, - [&](StringRef Name, PatFrag::ParamKind Kind) { - Result->addOutParam(insertStrRef(Name), Kind); - return true; - })) - return nullptr; - - if (!parsePatFragParamList(Def->getLoc(), *Ins, - [&](StringRef Name, PatFrag::ParamKind Kind) { - Result->addInParam(insertStrRef(Name), Kind); - return true; - })) - return nullptr; - - const ListInit *Alts = Def->getValueAsListInit("Alternatives"); - unsigned AltIdx = 0; - for (const Init *Alt : *Alts) { - const auto *PatDag = dyn_cast<DagInit>(Alt); - if (!PatDag) { - ::PrintError(Def, "expected dag init for PatFrag pattern alternative"); - return nullptr; - } - - PatFrag::Alternative &A = Result->addAlternative(); - const auto AddPat = [&](std::unique_ptr<Pattern> Pat) { - A.Pats.push_back(std::move(Pat)); - return true; - }; - - if (!parsePatternList( - *PatDag, AddPat, "pattern", Def->getLoc(), - /*AnonPatPrefix*/ - (Def->getName() + "_alt" + Twine(AltIdx++) + "_pattern").str())) - return nullptr; - } - - if (!Result->buildOperandsTables() || !Result->checkSemantics()) - return nullptr; - - return Result; -} - -bool CombineRuleBuilder::parsePatFragParamList( - ArrayRef<SMLoc> DiagLoc, const DagInit &OpsList, - function_ref<bool(StringRef, PatFrag::ParamKind)> 
ParseAction) const { - for (unsigned K = 0; K < OpsList.getNumArgs(); ++K) { - const StringInit *Name = OpsList.getArgName(K); - const Init *Ty = OpsList.getArg(K); - - if (!Name) { - ::PrintError(DiagLoc, "all operands must be named'"); - return false; - } - const std::string NameStr = Name->getAsUnquotedString(); - - PatFrag::ParamKind OpKind; - if (isSpecificDef(*Ty, "gi_imm")) - OpKind = PatFrag::PK_Imm; - else if (isSpecificDef(*Ty, "root")) - OpKind = PatFrag::PK_Root; - else if (isa<UnsetInit>(Ty) || - isSpecificDef(*Ty, "gi_mo")) // no type = gi_mo. - OpKind = PatFrag::PK_MachineOperand; - else { - ::PrintError( - DiagLoc, - "'" + NameStr + - "' operand type was expected to be 'root', 'gi_imm' or 'gi_mo'"); - return false; - } - - if (!ParseAction(NameStr, OpKind)) - return false; - } - - return true; -} - -const PatFrag *CombineRuleBuilder::parsePatFrag(const Record *Def) const { - // Cache already parsed PatFrags to avoid doing extra work. - static DenseMap<const Record *, std::unique_ptr<PatFrag>> ParsedPatFrags; - - auto It = ParsedPatFrags.find(Def); - if (It != ParsedPatFrags.end()) { - SeenPatFrags.insert(It->second.get()); - return It->second.get(); - } - - std::unique_ptr<PatFrag> NewPatFrag = parsePatFragImpl(Def); - if (!NewPatFrag) { - ::PrintError(Def, "Could not parse " + PatFrag::ClassName + " '" + - Def->getName() + "'"); - // Put a nullptr in the map so we don't attempt parsing this again. 
- ParsedPatFrags[Def] = nullptr; - return nullptr; - } - - const auto *Res = NewPatFrag.get(); - ParsedPatFrags[Def] = std::move(NewPatFrag); - SeenPatFrags.insert(Res); - return Res; -} - bool CombineRuleBuilder::emitMatchPattern(CodeExpansions &CE, const PatternAlternatives &Alts, const InstructionPattern &IP) { @@ -2956,8 +2505,8 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules) { const Matcher *B) { auto *L = static_cast<const RuleMatcher *>(A); auto *R = static_cast<const RuleMatcher *>(B); - return std::tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) < - std::tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands()); + return std::make_tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) < + std::make_tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands()); }); for (Matcher *Rule : InputRules) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn index 12d875c..5ba91fc 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn @@ -104,6 +104,7 @@ static_library("MCTargetDesc") { "AMDGPUMCAsmInfo.cpp", "AMDGPUMCCodeEmitter.cpp", "AMDGPUMCExpr.cpp", + "AMDGPUMCKernelDescriptor.cpp", "AMDGPUMCTargetDesc.cpp", "AMDGPUTargetStreamer.cpp", "R600InstPrinter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index c0ea627..daa3278 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -18,9 +18,11 @@ static_library("Common") { "DAGISelMatcher.cpp", "GlobalISel/CXXPredicates.cpp", "GlobalISel/CodeExpander.cpp", + "GlobalISel/CombinerUtils.cpp", "GlobalISel/GlobalISelMatchTable.cpp", "GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp", "GlobalISel/MatchDataInfo.cpp", + 
"GlobalISel/PatternParser.cpp", "GlobalISel/Patterns.cpp", "InfoByHwMode.cpp", "OptEmitter.cpp", diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index f4bac93..28526f1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -611,19 +611,19 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { // Variadic function intrinsics. // -def LLVM_VaStartOp : LLVM_ZeroResultIntrOp<"vastart">, +def LLVM_VaStartOp : LLVM_ZeroResultIntrOp<"vastart", [0]>, Arguments<(ins LLVM_AnyPointer:$arg_list)> { let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))"; let summary = "Initializes `arg_list` for subsequent variadic argument extractions."; } -def LLVM_VaCopyOp : LLVM_ZeroResultIntrOp<"vacopy">, +def LLVM_VaCopyOp : LLVM_ZeroResultIntrOp<"vacopy", [0]>, Arguments<(ins LLVM_AnyPointer:$dest_list, LLVM_AnyPointer:$src_list)> { let assemblyFormat = "$src_list `to` $dest_list attr-dict `:` type(operands)"; let summary = "Copies the current argument position from `src_list` to `dest_list`."; } -def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend">, +def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend", [0]>, Arguments<(ins LLVM_AnyPointer:$arg_list)> { let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))"; let summary = "Destroys `arg_list`, which has been initialized by `intr.vastart` or `intr.vacopy`."; diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h index 6c8a170..f7c1f4d 100644 --- a/mlir/include/mlir/IR/Dialect.h +++ b/mlir/include/mlir/IR/Dialect.h @@ -210,7 +210,7 @@ public: /// registration. The promised interface type can be an interface of any type /// not just a dialect interface, i.e. it may also be an /// AttributeInterface/OpInterface/TypeInterface/etc. 
- template <typename ConcreteT, typename InterfaceT> + template <typename InterfaceT, typename ConcreteT> void declarePromisedInterface() { unresolvedPromisedInterfaces.insert( {TypeID::get<ConcreteT>(), InterfaceT::getInterfaceID()}); @@ -221,7 +221,7 @@ public: // declarePromisedInterfaces<FunctionOpInterface, MyFuncType1, MyFuncType2>() template <typename InterfaceT, typename... ConcreteT> void declarePromisedInterfaces() { - (declarePromisedInterface<ConcreteT, InterfaceT>(), ...); + (declarePromisedInterface<InterfaceT, ConcreteT>(), ...); } /// Checks if the given interface, which is attempting to be used, is a diff --git a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp index 6a59318..042acf6 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp @@ -48,9 +48,9 @@ void arith::ArithDialect::initialize() { #include "mlir/Dialect/Arith/IR/ArithOpsAttributes.cpp.inc" >(); addInterfaces<ArithInlinerInterface>(); - declarePromisedInterface<ArithDialect, ConvertToLLVMPatternInterface>(); - declarePromisedInterface<SelectOp, - bufferization::BufferDeallocationOpInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, ArithDialect>(); + declarePromisedInterface<bufferization::BufferDeallocationOpInterface, + SelectOp>(); declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConstantOp, IndexCastOp, SelectOp>(); declarePromisedInterfaces<ValueBoundsOpInterface, AddIOp, ConstantOp, SubIOp, diff --git a/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp b/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp index ca57171..0bdcf43 100644 --- a/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp +++ b/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp @@ -40,7 +40,7 @@ void complex::ComplexDialect::initialize() { #define GET_ATTRDEF_LIST #include "mlir/Dialect/Complex/IR/ComplexAttributes.cpp.inc" >(); - declarePromisedInterface<ComplexDialect, 
ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, ComplexDialect>(); addInterfaces<ComplexInlinerInterface>(); } diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp index c6b02b9..5d11f8f6 100644 --- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp +++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp @@ -70,11 +70,11 @@ void ControlFlowDialect::initialize() { #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.cpp.inc" >(); addInterfaces<ControlFlowInlinerInterface>(); - declarePromisedInterface<ControlFlowDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, ControlFlowDialect>(); declarePromisedInterfaces<bufferization::BufferizableOpInterface, BranchOp, CondBranchOp>(); - declarePromisedInterface<CondBranchOp, - bufferization::BufferDeallocationOpInterface>(); + declarePromisedInterface<bufferization::BufferDeallocationOpInterface, + CondBranchOp>(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Func/IR/FuncOps.cpp b/mlir/lib/Dialect/Func/IR/FuncOps.cpp index ed2ecfe9..95589e8 100644 --- a/mlir/lib/Dialect/Func/IR/FuncOps.cpp +++ b/mlir/lib/Dialect/Func/IR/FuncOps.cpp @@ -42,8 +42,8 @@ void FuncDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/Func/IR/FuncOps.cpp.inc" >(); - declarePromisedInterface<FuncDialect, DialectInlinerInterface>(); - declarePromisedInterface<FuncDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<DialectInlinerInterface, FuncDialect>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, FuncDialect>(); declarePromisedInterfaces<bufferization::BufferizableOpInterface, CallOp, FuncOp, ReturnOp>(); } diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index a02eca8..f1b9ca5 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ 
b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -216,8 +216,8 @@ void GPUDialect::initialize() { #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc" >(); addInterfaces<GPUInlinerInterface>(); - declarePromisedInterface<TerminatorOp, - bufferization::BufferDeallocationOpInterface>(); + declarePromisedInterface<bufferization::BufferDeallocationOpInterface, + TerminatorOp>(); } static std::string getSparseHandleKeyword(SparseHandleKind kind) { diff --git a/mlir/lib/Dialect/Index/IR/IndexDialect.cpp b/mlir/lib/Dialect/Index/IR/IndexDialect.cpp index d631afa..183d0e3 100644 --- a/mlir/lib/Dialect/Index/IR/IndexDialect.cpp +++ b/mlir/lib/Dialect/Index/IR/IndexDialect.cpp @@ -19,7 +19,7 @@ using namespace mlir::index; void IndexDialect::initialize() { registerAttributes(); registerOperations(); - declarePromisedInterface<IndexDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, IndexDialect>(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 9e84074..94197e4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1044,8 +1044,8 @@ void NVVMDialect::initialize() { // Support unknown operations because not all NVVM operations are // registered. 
allowUnknownOperations(); - declarePromisedInterface<NVVMDialect, ConvertToLLVMPatternInterface>(); - declarePromisedInterface<NVVMTargetAttr, gpu::TargetAttrInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, NVVMDialect>(); + declarePromisedInterface<gpu::TargetAttrInterface, NVVMTargetAttr>(); } LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op, diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp index 0f2e75c..65b770a 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -247,7 +247,7 @@ void ROCDLDialect::initialize() { // Support unknown operations because not all ROCDL operations are registered. allowUnknownOperations(); - declarePromisedInterface<ROCDLTargetAttr, gpu::TargetAttrInterface>(); + declarePromisedInterface<gpu::TargetAttrInterface, ROCDLTargetAttr>(); } LogicalResult ROCDLDialect::verifyOperationAttribute(Operation *op, diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp index a6936fd..9e50c35 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp @@ -123,16 +123,16 @@ void mlir::linalg::LinalgDialect::initialize() { addInterfaces<LinalgInlinerInterface>(); - declarePromisedInterface<GenericOp, mesh::ShardingInterface>(); + declarePromisedInterface<mesh::ShardingInterface, GenericOp>(); declarePromisedInterfaces<mesh::ShardingInterface, #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(); - declarePromisedInterface<CopyOp, SubsetOpInterface>(); - declarePromisedInterface<CopyOp, SubsetInsertionOpInterface>(); - declarePromisedInterface<IndexOp, ValueBoundsOpInterface>(); - declarePromisedInterface<linalg::GenericOp, TilingInterface>(); - declarePromisedInterface<linalg::GenericOp, PartialReductionOpInterface>(); + declarePromisedInterface<SubsetOpInterface, 
CopyOp>(); + declarePromisedInterface<SubsetInsertionOpInterface, CopyOp>(); + declarePromisedInterface<ValueBoundsOpInterface, IndexOp>(); + declarePromisedInterface<TilingInterface, linalg::GenericOp>(); + declarePromisedInterface<PartialReductionOpInterface, linalg::GenericOp>(); declarePromisedInterfaces<TilingInterface, #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" diff --git a/mlir/lib/Dialect/Math/IR/MathDialect.cpp b/mlir/lib/Dialect/Math/IR/MathDialect.cpp index a71b24c..285b5ca 100644 --- a/mlir/lib/Dialect/Math/IR/MathDialect.cpp +++ b/mlir/lib/Dialect/Math/IR/MathDialect.cpp @@ -35,5 +35,5 @@ void mlir::math::MathDialect::initialize() { #include "mlir/Dialect/Math/IR/MathOps.cpp.inc" >(); addInterfaces<MathInlinerInterface>(); - declarePromisedInterface<MathDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, MathDialect>(); } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp index 41082a8..3a8bd12 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp @@ -47,14 +47,14 @@ void mlir::memref::MemRefDialect::initialize() { #include "mlir/Dialect/MemRef/IR/MemRefOps.cpp.inc" >(); addInterfaces<MemRefInlinerInterface>(); - declarePromisedInterface<MemRefDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, MemRefDialect>(); declarePromisedInterfaces<bufferization::AllocationOpInterface, AllocOp, AllocaOp, ReallocOp>(); declarePromisedInterfaces<RuntimeVerifiableOpInterface, CastOp, ExpandShapeOp, LoadOp, ReinterpretCastOp, StoreOp, SubViewOp>(); declarePromisedInterfaces<ValueBoundsOpInterface, AllocOp, AllocaOp, CastOp, DimOp, GetGlobalOp, RankOp, SubViewOp>(); - declarePromisedInterface<MemRefType, DestructurableTypeInterface>(); + declarePromisedInterface<DestructurableTypeInterface, MemRefType>(); } /// Finds the 
unique dealloc operation (if one exists) for `allocValue`. diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index ddb9676..5bca8e8 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -79,7 +79,7 @@ void SCFDialect::initialize() { declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConditionOp, ExecuteRegionOp, ForOp, IfOp, IndexSwitchOp, ForallOp, InParallelOp, WhileOp, YieldOp>(); - declarePromisedInterface<ForOp, ValueBoundsOpInterface>(); + declarePromisedInterface<ValueBoundsOpInterface, ForOp>(); } /// Default callback for IfOp builders. Inserts a yield without arguments. diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp index e914f46..72488d6 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -135,7 +135,7 @@ void SPIRVDialect::initialize() { // Allow unknown operations because SPIR-V is extensible. 
allowUnknownOperations(); - declarePromisedInterface<TargetEnvAttr, gpu::TargetAttrInterface>(); + declarePromisedInterface<gpu::TargetAttrInterface, TargetEnvAttr>(); } std::string SPIRVDialect::getAttributeName(Decoration decoration) { diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp index 4b31567..0020777 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp @@ -62,7 +62,7 @@ void TensorDialect::initialize() { ParallelInsertSliceOp>(); declarePromisedInterfaces<SubsetInsertionOpInterface, InsertSliceOp, ParallelInsertSliceOp>(); - declarePromisedInterface<ExtractSliceOp, SubsetExtractionOpInterface>(); + declarePromisedInterface<SubsetExtractionOpInterface, ExtractSliceOp>(); declarePromisedInterfaces<TilingInterface, PadOp, PackOp, UnPackOp>(); declarePromisedInterfaces<ValueBoundsOpInterface, CastOp, DimOp, EmptyOp, ExtractSliceOp, PadOp, RankOp>(); diff --git a/mlir/lib/Dialect/UB/IR/UBOps.cpp b/mlir/lib/Dialect/UB/IR/UBOps.cpp index 3a2010c..5b2cfe7 100644 --- a/mlir/lib/Dialect/UB/IR/UBOps.cpp +++ b/mlir/lib/Dialect/UB/IR/UBOps.cpp @@ -46,7 +46,7 @@ void UBDialect::initialize() { #include "mlir/Dialect/UB/IR/UBOpsAttributes.cpp.inc" >(); addInterfaces<UBInlinerInterface>(); - declarePromisedInterface<UBDialect, ConvertToLLVMPatternInterface>(); + declarePromisedInterface<ConvertToLLVMPatternInterface, UBDialect>(); } Operation *UBDialect::materializeConstant(OpBuilder &builder, Attribute value, diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 3529682..e566bfa 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -382,8 +382,8 @@ void VectorDialect::initialize() { YieldOp>(); declarePromisedInterfaces<SubsetOpInterface, TransferReadOp, TransferWriteOp>(); - declarePromisedInterface<TransferReadOp, SubsetExtractionOpInterface>(); - 
declarePromisedInterface<TransferWriteOp, SubsetInsertionOpInterface>(); + declarePromisedInterface<SubsetExtractionOpInterface, TransferReadOp>(); + declarePromisedInterface<SubsetInsertionOpInterface, TransferWriteOp>(); } /// Materialize a single constant operation from a given attribute value with diff --git a/mlir/test/Target/LLVMIR/Import/basic.ll b/mlir/test/Target/LLVMIR/Import/basic.ll index a059425..448b0eb 100644 --- a/mlir/test/Target/LLVMIR/Import/basic.ll +++ b/mlir/test/Target/LLVMIR/Import/basic.ll @@ -72,26 +72,26 @@ define i32 @useFreezeOp(i32 %x) { ; Varadic function definition %struct.va_list = type { ptr } -declare void @llvm.va_start(ptr) -declare void @llvm.va_copy(ptr, ptr) -declare void @llvm.va_end(ptr) +declare void @llvm.va_start.p0(ptr) +declare void @llvm.va_copy.p0(ptr, ptr) +declare void @llvm.va_end.p0(ptr) ; CHECK-LABEL: llvm.func @variadic_function define void @variadic_function(i32 %X, ...) { ; CHECK: %[[ALLOCA0:.+]] = llvm.alloca %{{.*}} x !llvm.struct<"struct.va_list", (ptr)> {{.*}} : (i32) -> !llvm.ptr %ap = alloca %struct.va_list ; CHECK: llvm.intr.vastart %[[ALLOCA0]] - call void @llvm.va_start(ptr %ap) + call void @llvm.va_start.p0(ptr %ap) ; CHECK: %[[ALLOCA1:.+]] = llvm.alloca %{{.*}} x !llvm.ptr {{.*}} : (i32) -> !llvm.ptr %aq = alloca ptr ; CHECK: llvm.intr.vacopy %[[ALLOCA0]] to %[[ALLOCA1]] - call void @llvm.va_copy(ptr %aq, ptr %ap) + call void @llvm.va_copy.p0(ptr %aq, ptr %ap) ; CHECK: llvm.intr.vaend %[[ALLOCA1]] - call void @llvm.va_end(ptr %aq) + call void @llvm.va_end.p0(ptr %aq) ; CHECK: llvm.intr.vaend %[[ALLOCA0]] - call void @llvm.va_end(ptr %ap) + call void @llvm.va_end.p0(ptr %ap) ; CHECK: llvm.return ret void } diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 8556183..0cefb4f 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -599,11 +599,11 @@ define void @ushl_sat_test(i32 %0, i32 %1, 
<8 x i32> %2, <8 x i32> %3) { ; CHECK-LABEL: llvm.func @va_intrinsics_test define void @va_intrinsics_test(ptr %0, ptr %1) { ; CHECK: llvm.intr.vastart %{{.*}} - call void @llvm.va_start(ptr %0) + call void @llvm.va_start.p0(ptr %0) ; CHECK: llvm.intr.vacopy %{{.*}} to %{{.*}} - call void @llvm.va_copy(ptr %1, ptr %0) + call void @llvm.va_copy.p0(ptr %1, ptr %0) ; CHECK: llvm.intr.vaend %{{.*}} - call void @llvm.va_end(ptr %0) + call void @llvm.va_end.p0(ptr %0) ret void } @@ -1076,9 +1076,9 @@ declare ptr @llvm.stacksave.p0() declare ptr addrspace(1) @llvm.stacksave.p1() declare void @llvm.stackrestore.p0(ptr) declare void @llvm.stackrestore.p1(ptr addrspace(1)) -declare void @llvm.va_start(ptr) -declare void @llvm.va_copy(ptr, ptr) -declare void @llvm.va_end(ptr) +declare void @llvm.va_start.p0(ptr) +declare void @llvm.va_copy.p0(ptr, ptr) +declare void @llvm.va_end.p0(ptr) declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index c38c7ea..97f3793 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2251,14 +2251,14 @@ llvm.func @vararg_function(%arg0: i32, ...) 
{ %1 = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[ALLOCA0:.+]] = alloca %struct.va_list, align 8 %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - // CHECK: call void @llvm.va_start(ptr %[[ALLOCA0]]) + // CHECK: call void @llvm.va_start.p0(ptr %[[ALLOCA0]]) llvm.intr.vastart %2 : !llvm.ptr // CHECK: %[[ALLOCA1:.+]] = alloca ptr, align 8 %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr - // CHECK: call void @llvm.va_copy(ptr %[[ALLOCA1]], ptr %[[ALLOCA0]]) + // CHECK: call void @llvm.va_copy.p0(ptr %[[ALLOCA1]], ptr %[[ALLOCA0]]) llvm.intr.vacopy %2 to %4 : !llvm.ptr, !llvm.ptr - // CHECK: call void @llvm.va_end(ptr %[[ALLOCA1]]) - // CHECK: call void @llvm.va_end(ptr %[[ALLOCA0]]) + // CHECK: call void @llvm.va_end.p0(ptr %[[ALLOCA1]]) + // CHECK: call void @llvm.va_end.p0(ptr %[[ALLOCA0]]) llvm.intr.vaend %4 : !llvm.ptr llvm.intr.vaend %2 : !llvm.ptr // CHECK: ret void diff --git a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp index 16de34c..58049a9 100644 --- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp +++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp @@ -431,8 +431,8 @@ TEST(InterfaceAttachmentTest, PromisedInterfaces) { attr.hasPromiseOrImplementsInterface<TestExternalAttrInterface>()); // Add a promise `TestExternalAttrInterface`. 
- testDialect->declarePromisedInterface<test::SimpleAAttr, - TestExternalAttrInterface>(); + testDialect->declarePromisedInterface<TestExternalAttrInterface, + test::SimpleAAttr>(); EXPECT_TRUE( attr.hasPromiseOrImplementsInterface<TestExternalAttrInterface>()); diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index a60bdb9..a242049 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -6752,11 +6752,11 @@ void __kmp_register_library_startup(void) { int fd1 = -1; shm_name = __kmp_str_format("/%s", name); int shm_preexist = 0; - fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); + fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600); if ((fd1 == -1) && (errno == EEXIST)) { // file didn't open because it already exists. // try opening existing file - fd1 = shm_open(shm_name, O_RDWR, 0666); + fd1 = shm_open(shm_name, O_RDWR, 0600); if (fd1 == -1) { // file didn't open KMP_WARNING(FunctionError, "Can't open SHM"); __kmp_shm_available = false; @@ -6800,11 +6800,11 @@ void __kmp_register_library_startup(void) { int fd1 = -1; temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name); int tmp_preexist = 0; - fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666); + fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600); if ((fd1 == -1) && (errno == EEXIST)) { // file didn't open because it already exists. 
// try opening existing file - fd1 = open(temp_reg_status_file_name, O_RDWR, 0666); + fd1 = open(temp_reg_status_file_name, O_RDWR, 0600); if (fd1 == -1) { // file didn't open if (fd1 == -1) { KMP_WARNING(FunctionError, "Can't open TEMP"); __kmp_tmp_available = false; @@ -6944,7 +6944,7 @@ void __kmp_unregister_library(void) { int fd1; if (__kmp_shm_available) { shm_name = __kmp_str_format("/%s", name); - fd1 = shm_open(shm_name, O_RDONLY, 0666); + fd1 = shm_open(shm_name, O_RDONLY, 0600); if (fd1 != -1) { // File opened successfully char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); if (data1 != MAP_FAILED) { diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index e0790e0..eb0afbb 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -216,7 +216,6 @@ libc_support_library( ":__support_cpp_limits", ":__support_cpp_type_traits", ":__support_macros_attributes", - ":__support_macros_config", ":__support_macros_sanitizer", ], ) @@ -383,7 +382,6 @@ libc_support_library( ], deps = [ ":__support_macros_attributes", - ":__support_macros_config", ":__support_macros_properties_types", ":llvm_libc_macros_stdfix_macros", ], @@ -663,7 +661,6 @@ libc_support_library( ":__support_cpp_limits", ":__support_cpp_type_traits", ":__support_macros_attributes", - ":__support_macros_config", ], ) @@ -2318,7 +2315,6 @@ libc_support_library( ":__support_cpp_cstddef", ":__support_cpp_type_traits", ":__support_macros_attributes", - ":__support_macros_config", ":__support_macros_optimization", ":__support_macros_properties_architectures", ":__support_macros_properties_cpu_features", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 0e658353..3c3e17bf 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ 
-292,6 +292,10 @@ cc_library( "-ldl", "-lm", ], + "@platforms//os:macos": [ + "-pthread", + "-ldl", + ], "//conditions:default": [ "-pthread", "-ldl", |