104 files changed, 1913 insertions, 1491 deletions
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 4ab4dee..f656687 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -2063,12 +2063,9 @@ bool Compiler<Emitter>::visitCallArgs(ArrayRef<const Expr *> Args,
                                       const FunctionDecl *FuncDecl,
                                       bool Activate) {
   assert(VarScope->getKind() == ScopeKind::Call);
-  bool HasNonNullAttr = false;
   llvm::BitVector NonNullArgs;
-  if (FuncDecl && FuncDecl->hasAttr<NonNullAttr>()) {
-    HasNonNullAttr = true;
+  if (FuncDecl && FuncDecl->hasAttr<NonNullAttr>())
     NonNullArgs = collectNonNullArgs(FuncDecl, Args);
-  }
 
   unsigned ArgIndex = 0;
   for (const Expr *Arg : Args) {
@@ -2094,7 +2091,7 @@ bool Compiler<Emitter>::visitCallArgs(ArrayRef<const Expr *> Args,
         return false;
     }
 
-    if (HasNonNullAttr && NonNullArgs[ArgIndex]) {
+    if (!NonNullArgs.empty() && NonNullArgs[ArgIndex]) {
       PrimType ArgT = classify(Arg).value_or(PT_Ptr);
       if (ArgT == PT_Ptr) {
         if (!this->emitCheckNonNullArg(ArgT, Arg))
@@ -5997,6 +5994,23 @@ bool Compiler<Emitter>::checkLiteralType(const Expr *E) {
   return this->emitCheckLiteralType(E->getType().getTypePtr(), E);
 }
 
+static bool initNeedsOverridenLoc(const CXXCtorInitializer *Init) {
+  const Expr *InitExpr = Init->getInit();
+
+  if (!Init->isWritten() && !Init->isInClassMemberInitializer() &&
+      !isa<CXXConstructExpr>(InitExpr))
+    return true;
+
+  if (const auto *CE = dyn_cast<CXXConstructExpr>(InitExpr)) {
+    const CXXConstructorDecl *Ctor = CE->getConstructor();
+    if (Ctor->isDefaulted() && Ctor->isCopyOrMoveConstructor() &&
+        Ctor->isTrivial())
+      return true;
+  }
+
+  return false;
+}
+
 template <class Emitter>
 bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
   assert(!ReturnType);
@@ -6071,10 +6085,7 @@ bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
       const Record::Field *F = R->getField(Member);
 
       LocOverrideScope<Emitter> LOS(this, SourceInfo{},
-                                    !Init->isWritten() &&
-                                        !Init->isInClassMemberInitializer() &&
-                                        (!isa<CXXConstructExpr>(InitExpr) ||
-                                         Member->isAnonymousStructOrUnion()));
+                                    initNeedsOverridenLoc(Init));
       if (!emitFieldInitializer(F, F->Offset, InitExpr, IsUnion))
         return false;
     } else if (const Type *Base = Init->getBaseClass()) {
@@ -6104,10 +6115,7 @@ bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
         return false;
     } else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) {
       LocOverrideScope<Emitter> LOS(this, SourceInfo{},
-                                    !Init->isWritten() &&
-                                        !Init->isInClassMemberInitializer() &&
-                                        !isa<CXXConstructExpr>(InitExpr));
-
+                                    initNeedsOverridenLoc(Init));
       assert(IFD->getChainingSize() >= 2);
 
       unsigned NestedFieldOffset = 0;
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 7403e90..629c1ff 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -153,28 +153,6 @@ static void dtorArrayDesc(Block *B, std::byte *Ptr, const Descriptor *D) {
   }
 }
 
-static void moveArrayDesc(Block *B, std::byte *Src, std::byte *Dst,
-                          const Descriptor *D) {
-  const unsigned NumElems = D->getNumElems();
-  const unsigned ElemSize =
-      D->ElemDesc->getAllocSize() + sizeof(InlineDescriptor);
-
-  unsigned ElemOffset = 0;
-  for (unsigned I = 0; I < NumElems; ++I, ElemOffset += ElemSize) {
-    auto *SrcPtr = Src + ElemOffset;
-    auto *DstPtr = Dst + ElemOffset;
-
-    auto *SrcDesc = reinterpret_cast<InlineDescriptor *>(SrcPtr);
-    auto *SrcElemLoc = reinterpret_cast<std::byte *>(SrcDesc + 1);
-    auto *DstDesc = reinterpret_cast<InlineDescriptor *>(DstPtr);
-    auto *DstElemLoc = reinterpret_cast<std::byte *>(DstDesc + 1);
-
-    *DstDesc = *SrcDesc;
-    if (auto Fn = D->ElemDesc->MoveFn)
-      Fn(B, SrcElemLoc, DstElemLoc, D->ElemDesc);
-  }
-}
-
 static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
                       bool IsVolatile, bool IsActive, bool IsUnionField,
                       bool InUnion, const Descriptor *D, unsigned FieldOffset) {
@@ -268,45 +246,6 @@ static void dtorRecord(Block *B, std::byte *Ptr, const Descriptor *D) {
     destroyBase(B, Ptr, F.Desc, F.Offset);
 }
 
-static void moveRecord(Block *B, std::byte *Src, std::byte *Dst,
-                       const Descriptor *D) {
-  assert(D);
-  assert(D->ElemRecord);
-
-  // FIXME: Code duplication.
-  for (const auto &F : D->ElemRecord->fields()) {
-    auto FieldOffset = F.Offset;
-    const auto *SrcDesc =
-        reinterpret_cast<const InlineDescriptor *>(Src + FieldOffset) - 1;
-    auto *DestDesc =
-        reinterpret_cast<InlineDescriptor *>(Dst + FieldOffset) - 1;
-    std::memcpy(DestDesc, SrcDesc, sizeof(InlineDescriptor));
-
-    if (auto Fn = F.Desc->MoveFn)
-      Fn(B, Src + FieldOffset, Dst + FieldOffset, F.Desc);
-  }
-
-  for (const auto &Base : D->ElemRecord->bases()) {
-    auto BaseOffset = Base.Offset;
-    const auto *SrcDesc =
-        reinterpret_cast<const InlineDescriptor *>(Src + BaseOffset) - 1;
-    auto *DestDesc = reinterpret_cast<InlineDescriptor *>(Dst + BaseOffset) - 1;
-    std::memcpy(DestDesc, SrcDesc, sizeof(InlineDescriptor));
-
-    if (auto Fn = Base.Desc->MoveFn)
-      Fn(B, Src + BaseOffset, Dst + BaseOffset, Base.Desc);
-  }
-
-  for (const auto &VBase : D->ElemRecord->virtual_bases()) {
-    auto VBaseOffset = VBase.Offset;
-    const auto *SrcDesc =
-        reinterpret_cast<const InlineDescriptor *>(Src + VBaseOffset) - 1;
-    auto *DestDesc =
-        reinterpret_cast<InlineDescriptor *>(Dst + VBaseOffset) - 1;
-    std::memcpy(DestDesc, SrcDesc, sizeof(InlineDescriptor));
-  }
-}
-
 static BlockCtorFn getCtorPrim(PrimType Type) {
   // Floating types are special. They are primitives, but need their
   // constructor called.
@@ -337,18 +276,6 @@ static BlockDtorFn getDtorPrim(PrimType Type) {
   COMPOSITE_TYPE_SWITCH(Type, return dtorTy<T>, return nullptr);
 }
 
-static BlockMoveFn getMovePrim(PrimType Type) {
-  if (Type == PT_Float)
-    return moveTy<PrimConv<PT_Float>::T>;
-  if (Type == PT_IntAP)
-    return moveTy<PrimConv<PT_IntAP>::T>;
-  if (Type == PT_IntAPS)
-    return moveTy<PrimConv<PT_IntAPS>::T>;
-  if (Type == PT_MemberPtr)
-    return moveTy<PrimConv<PT_MemberPtr>::T>;
-  COMPOSITE_TYPE_SWITCH(Type, return moveTy<T>, return nullptr);
-}
-
 static BlockCtorFn getCtorArrayPrim(PrimType Type) {
   TYPE_SWITCH(Type, return ctorArrayTy<T>);
   llvm_unreachable("unknown Expr");
@@ -359,11 +286,6 @@ static BlockDtorFn getDtorArrayPrim(PrimType Type) {
   llvm_unreachable("unknown Expr");
 }
 
-static BlockMoveFn getMoveArrayPrim(PrimType Type) {
-  TYPE_SWITCH(Type, return moveArrayTy<T>);
-  llvm_unreachable("unknown Expr");
-}
-
 /// Primitives.
 Descriptor::Descriptor(const DeclTy &D, const Type *SourceTy, PrimType Type,
                        MetadataSize MD, bool IsConst, bool IsTemporary,
@@ -372,7 +294,7 @@ Descriptor::Descriptor(const DeclTy &D, const Type *SourceTy, PrimType Type,
       MDSize(MD.value_or(0)), AllocSize(align(Size + MDSize)), PrimT(Type),
       IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
       IsVolatile(IsVolatile), CtorFn(getCtorPrim(Type)),
-      DtorFn(getDtorPrim(Type)), MoveFn(getMovePrim(Type)) {
+      DtorFn(getDtorPrim(Type)) {
   assert(AllocSize >= Size);
   assert(Source && "Missing source");
 }
@@ -386,7 +308,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
       AllocSize(align(MDSize) + align(Size) + sizeof(InitMapPtr)), PrimT(Type),
       IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
-      DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
+      DtorFn(getDtorArrayPrim(Type)) {
   assert(Source && "Missing source");
   assert(NumElems <= (MaxArrayElemBytes / ElemSize));
 }
@@ -399,7 +321,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
       AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
-      DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
+      DtorFn(getDtorArrayPrim(Type)) {
   assert(Source && "Missing source");
 }
 
@@ -414,7 +336,7 @@ Descriptor::Descriptor(const DeclTy &D, const Type *SourceTy,
       AllocSize(std::max<size_t>(alignof(void *), Size) + MDSize),
       ElemDesc(Elem), IsConst(IsConst), IsMutable(IsMutable),
       IsTemporary(IsTemporary), IsArray(true), CtorFn(ctorArrayDesc),
-      DtorFn(dtorArrayDesc), MoveFn(moveArrayDesc) {
+      DtorFn(dtorArrayDesc) {
   assert(Source && "Missing source");
 }
 
@@ -425,7 +347,7 @@ Descriptor::Descriptor(const DeclTy &D, const Descriptor *Elem, MetadataSize MD,
       Size(UnknownSizeMark), MDSize(MD.value_or(0)),
       AllocSize(MDSize + alignof(void *)), ElemDesc(Elem), IsConst(true),
       IsMutable(false), IsTemporary(IsTemporary), IsArray(true),
-      CtorFn(ctorArrayDesc), DtorFn(dtorArrayDesc), MoveFn(moveArrayDesc) {
+      CtorFn(ctorArrayDesc), DtorFn(dtorArrayDesc) {
   assert(Source && "Missing source");
 }
 
@@ -437,7 +359,7 @@ Descriptor::Descriptor(const DeclTy &D, const Record *R, MetadataSize MD,
       Size(ElemSize), MDSize(MD.value_or(0)), AllocSize(Size + MDSize),
       ElemRecord(R), IsConst(IsConst), IsMutable(IsMutable),
       IsTemporary(IsTemporary), IsVolatile(IsVolatile), CtorFn(ctorRecord),
-      DtorFn(dtorRecord), MoveFn(moveRecord) {
+      DtorFn(dtorRecord) {
   assert(Source && "Missing source");
 }
 
diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h
index 4c925f6..cd34e11 100644
--- a/clang/lib/AST/ByteCode/Descriptor.h
+++ b/clang/lib/AST/ByteCode/Descriptor.h
@@ -41,14 +41,6 @@ using BlockCtorFn = void (*)(Block *Storage, std::byte *FieldPtr, bool IsConst,
 using BlockDtorFn = void (*)(Block *Storage, std::byte *FieldPtr,
                              const Descriptor *FieldDesc);
 
-/// Invoked when a block with pointers referencing it goes out of scope. Such
-/// blocks are persisted: the move function copies all inline descriptors and
-/// non-trivial fields, as existing pointers might need to reference those
-/// descriptors. Data is not copied since it cannot be legally read.
-using BlockMoveFn = void (*)(Block *Storage, std::byte *SrcFieldPtr,
-                             std::byte *DstFieldPtr,
-                             const Descriptor *FieldDesc);
-
 enum class GlobalInitState {
   Initialized,
   NoInitializer,
@@ -181,7 +173,6 @@ public:
   /// Storage management methods.
   const BlockCtorFn CtorFn = nullptr;
   const BlockDtorFn DtorFn = nullptr;
-  const BlockMoveFn MoveFn = nullptr;
 
   /// Allocates a descriptor for a primitive.
   Descriptor(const DeclTy &D, const Type *SourceTy, PrimType Type,
diff --git a/clang/lib/AST/ByteCode/DynamicAllocator.cpp b/clang/lib/AST/ByteCode/DynamicAllocator.cpp
index 9b8b664..f38d585 100644
--- a/clang/lib/AST/ByteCode/DynamicAllocator.cpp
+++ b/clang/lib/AST/ByteCode/DynamicAllocator.cpp
@@ -13,25 +13,6 @@
 using namespace clang;
 using namespace clang::interp;
 
-// FIXME: There is a peculiar problem with the way we track pointers
-// to blocks and the way we allocate dynamic memory.
-//
-// When we have code like this:
-// while (true) {
-//   char *buffer = new char[1024];
-//   delete[] buffer;
-// }
-//
-// We have a local variable 'buffer' pointing to the heap allocated memory.
-// When deallocating the memory via delete[], that local variable still
-// points to the memory, which means we will create a DeadBlock for it and move
-// it over to that block, essentially duplicating the allocation. Moving
-// the data is also slow.
-//
-// However, when we actually try to access the allocation after it has been
-// freed, we need the block to still exist (alive or dead) so we can tell
-// that it's a dynamic allocation.
-
 DynamicAllocator::~DynamicAllocator() { cleanup(); }
 
 void DynamicAllocator::cleanup() {
@@ -42,8 +23,11 @@ void DynamicAllocator::cleanup() {
   for (auto &Iter : AllocationSites) {
     auto &AllocSite = Iter.second;
     for (auto &Alloc : AllocSite.Allocations) {
-      Block *B = reinterpret_cast<Block *>(Alloc.Memory.get());
+      Block *B = Alloc.block();
+      assert(!B->IsDead);
+      assert(B->isInitialized());
       B->invokeDtor();
+
       if (B->hasPointers()) {
         while (B->Pointers) {
           Pointer *Next = B->Pointers->asBlockPointer().Next;
@@ -89,6 +73,12 @@ Block *DynamicAllocator::allocate(const Descriptor *D, unsigned EvalID,
   assert(D);
   assert(D->asExpr());
 
+  // Garbage collection. Remove all dead allocations that don't have pointers to
+  // them anymore.
+  llvm::erase_if(DeadAllocations, [](Allocation &Alloc) -> bool {
+    return !Alloc.block()->hasPointers();
+  });
+
   auto Memory =
       std::make_unique<std::byte[]>(sizeof(Block) + D->getAllocSize());
   auto *B = new (Memory.get()) Block(EvalID, D, /*isStatic=*/false);
@@ -128,23 +118,39 @@ bool DynamicAllocator::deallocate(const Expr *Source,
     return false;
 
   auto &Site = It->second;
-  assert(Site.size() > 0);
+  assert(!Site.empty());
 
   // Find the Block to delete.
   auto AllocIt = llvm::find_if(Site.Allocations, [&](const Allocation &A) {
-    const Block *B = reinterpret_cast<const Block *>(A.Memory.get());
-    return BlockToDelete == B;
+    return BlockToDelete == A.block();
   });
 
   assert(AllocIt != Site.Allocations.end());
 
-  Block *B = reinterpret_cast<Block *>(AllocIt->Memory.get());
+  Block *B = AllocIt->block();
+  assert(B->isInitialized());
+  assert(!B->IsDead);
   B->invokeDtor();
 
-  S.deallocate(B);
-  Site.Allocations.erase(AllocIt);
+  // Almost all our dynamic allocations have a pointer pointing to them
+  // when we deallocate them, since otherwise we can't call delete() at all.
+  // This means that we would usually need to create DeadBlocks for all of them.
+  // To work around that, we instead mark them as dead without moving the data
+  // over to a DeadBlock and simply keep the block in a separate DeadAllocations
+  // list.
+  if (B->hasPointers()) {
+    B->IsDead = true;
+    DeadAllocations.push_back(std::move(*AllocIt));
+    Site.Allocations.erase(AllocIt);
+
+    if (Site.size() == 0)
+      AllocationSites.erase(It);
+    return true;
+  }
 
-  if (Site.size() == 0)
+  // Get rid of the allocation altogether.
+  Site.Allocations.erase(AllocIt);
+  if (Site.empty())
     AllocationSites.erase(It);
 
   return true;
diff --git a/clang/lib/AST/ByteCode/DynamicAllocator.h b/clang/lib/AST/ByteCode/DynamicAllocator.h
index cff09bf..31d0e58 100644
--- a/clang/lib/AST/ByteCode/DynamicAllocator.h
+++ b/clang/lib/AST/ByteCode/DynamicAllocator.h
@@ -43,6 +43,7 @@ private:
     std::unique_ptr<std::byte[]> Memory;
     Allocation(std::unique_ptr<std::byte[]> Memory)
         : Memory(std::move(Memory)) {}
+    Block *block() const { return reinterpret_cast<Block *>(Memory.get()); }
   };
 
   struct AllocationSite {
@@ -55,6 +56,7 @@ private:
     }
 
     size_t size() const { return Allocations.size(); }
+    bool empty() const { return Allocations.empty(); }
   };
 
 public:
@@ -65,8 +67,6 @@ public:
 
   void cleanup();
 
-  unsigned getNumAllocations() const { return AllocationSites.size(); }
-
   /// Allocate ONE element of the given descriptor.
   Block *allocate(const Descriptor *D, unsigned EvalID, Form AllocForm);
   /// Allocate \p NumElements primitive elements of the given type.
@@ -96,8 +96,13 @@ public:
     return llvm::make_range(AllocationSites.begin(), AllocationSites.end());
   }
 
+  bool hasAllocations() const { return !AllocationSites.empty(); }
+
 private:
   llvm::DenseMap<const Expr *, AllocationSite> AllocationSites;
+  // Allocations that have already been deallocated but had pointers
+  // to them.
+  llvm::SmallVector<Allocation> DeadAllocations;
 
   using PoolAllocTy = llvm::BumpPtrAllocator;
   PoolAllocTy DescAllocator;
diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp
index 976b7c0..9ed61c7 100644
--- a/clang/lib/AST/ByteCode/EvalEmitter.cpp
+++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp
@@ -292,7 +292,7 @@ bool EvalEmitter::emitGetLocal(uint32_t I, const SourceInfo &Info) {
 
   Block *B = getLocal(I);
 
-  if (!CheckLocalLoad(S, OpPC, Pointer(B)))
+  if (!CheckLocalLoad(S, OpPC, B))
     return false;
 
   S.Stk.push<T>(*reinterpret_cast<T *>(B->data()));
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index eb4e480..b5c044c 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -211,25 +211,26 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC,
   S.Note(VD->getLocation(), diag::note_declared_at);
 }
 
-static bool CheckTemporary(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+static bool CheckTemporary(InterpState &S, CodePtr OpPC, const Block *B,
                            AccessKinds AK) {
-  if (auto ID = Ptr.getDeclID()) {
-    if (!Ptr.isStaticTemporary())
+  if (B->getDeclID()) {
+    if (!(B->isStatic() && B->isTemporary()))
       return true;
 
     const auto *MTE = dyn_cast_if_present<MaterializeTemporaryExpr>(
-        Ptr.getDeclDesc()->asExpr());
+        B->getDescriptor()->asExpr());
     if (!MTE)
       return true;
 
     // FIXME(perf): Since we do this check on every Load from a static
     // temporary, it might make sense to cache the value of the
     // isUsableInConstantExpressions call.
-    if (!MTE->isUsableInConstantExpressions(S.getASTContext()) &&
-        Ptr.block()->getEvalID() != S.Ctx.getEvalID()) {
+    if (B->getEvalID() != S.Ctx.getEvalID() &&
+        !MTE->isUsableInConstantExpressions(S.getASTContext())) {
       const SourceInfo &E = S.Current->getSource(OpPC);
       S.FFDiag(E, diag::note_constexpr_access_static_temporary, 1) << AK;
-      S.Note(Ptr.getDeclLoc(), diag::note_constexpr_temporary_here);
+      S.Note(B->getDescriptor()->getLocation(),
+             diag::note_constexpr_temporary_here);
       return false;
     }
   }
@@ -517,7 +518,7 @@ bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
 bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                 AccessKinds AK) {
-  if (!Ptr.isOnePastEnd())
+  if (!Ptr.isOnePastEnd() && !Ptr.isZeroSizeArray())
     return true;
   if (S.getLangOpts().CPlusPlus) {
     const SourceInfo &Loc = S.Current->getSource(OpPC);
@@ -658,17 +659,19 @@ static bool CheckVolatile(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
   return false;
 }
 
-bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
-                      AccessKinds AK) {
+bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+                           AccessKinds AK) {
   assert(Ptr.isLive());
+  assert(!Ptr.isInitialized());
+  return DiagnoseUninitialized(S, OpPC, Ptr.isExtern(), Ptr.getDeclDesc(), AK);
+}
 
-  if (Ptr.isInitialized())
-    return true;
-
-  if (Ptr.isExtern() && S.checkingPotentialConstantExpression())
+bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, bool Extern,
+                           const Descriptor *Desc, AccessKinds AK) {
+  if (Extern && S.checkingPotentialConstantExpression())
     return false;
 
-  if (const auto *VD = Ptr.getDeclDesc()->asVarDecl();
+  if (const auto *VD = Desc->asVarDecl();
       VD && (VD->isConstexpr() || VD->hasGlobalStorage())) {
 
     if (VD == S.EvaluatingDecl &&
@@ -703,9 +706,9 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
   return false;
 }
 
-static bool CheckLifetime(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+static bool CheckLifetime(InterpState &S, CodePtr OpPC, Lifetime LT,
                           AccessKinds AK) {
-  if (Ptr.getLifetime() == Lifetime::Started)
+  if (LT == Lifetime::Started)
     return true;
 
   if (!S.checkingPotentialConstantExpression()) {
@@ -715,11 +718,11 @@ static bool CheckLifetime(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
   return false;
 }
 
-static bool CheckWeak(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
-  if (!Ptr.isWeak())
+static bool CheckWeak(InterpState &S, CodePtr OpPC, const Block *B) {
+  if (!B->isWeak())
     return true;
 
-  const auto *VD = Ptr.getDeclDesc()->asVarDecl();
+  const auto *VD = B->getDescriptor()->asVarDecl();
   assert(VD);
   S.FFDiag(S.Current->getLocation(OpPC), diag::note_constexpr_var_init_weak)
       << VD;
@@ -732,32 +735,56 @@ static bool CheckWeak(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
 // ones removed that are impossible on primitive global values.
 // For example, since those can't be members of structs, they also can't
 // be mutable.
-bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
-  if (!CheckExtern(S, OpPC, Ptr))
+bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Block *B) {
+  const auto &Desc =
+      *reinterpret_cast<const GlobalInlineDescriptor *>(B->rawData());
+  if (!CheckExtern(S, OpPC, Pointer(const_cast<Block *>(B))))
     return false;
-  if (!CheckConstant(S, OpPC, Ptr))
+  if (!CheckConstant(S, OpPC, B->getDescriptor()))
     return false;
-  if (!CheckDummy(S, OpPC, Ptr, AK_Read))
+  if (!CheckDummy(S, OpPC, B, AK_Read))
     return false;
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
+  if (Desc.InitState != GlobalInitState::Initialized)
+    return DiagnoseUninitialized(S, OpPC, B->isExtern(), B->getDescriptor(),
+                                 AK_Read);
+  if (!CheckTemporary(S, OpPC, B, AK_Read))
     return false;
-  if (!CheckTemporary(S, OpPC, Ptr, AK_Read))
+  if (!CheckWeak(S, OpPC, B))
     return false;
-  if (!CheckWeak(S, OpPC, Ptr))
-    return false;
-  if (!CheckVolatile(S, OpPC, Ptr, AK_Read))
+  if (B->getDescriptor()->IsVolatile) {
+    if (!S.getLangOpts().CPlusPlus)
+      return Invalid(S, OpPC);
+
+    const ValueDecl *D = B->getDescriptor()->asValueDecl();
+    S.FFDiag(S.Current->getLocation(OpPC),
+             diag::note_constexpr_access_volatile_obj, 1)
+        << AK_Read << 1 << D;
+    S.Note(D->getLocation(), diag::note_constexpr_volatile_here) << 1;
     return false;
+  }
   return true;
 }
 
 // Similarly, for local loads.
-bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
-  if (!CheckLifetime(S, OpPC, Ptr, AK_Read))
-    return false;
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
-    return false;
-  if (!CheckVolatile(S, OpPC, Ptr, AK_Read))
+bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Block *B) {
+  assert(!B->isExtern());
+  const auto &Desc = *reinterpret_cast<const InlineDescriptor *>(B->rawData());
+  if (!CheckLifetime(S, OpPC, Desc.LifeState, AK_Read))
+    return false;
+  if (!Desc.IsInitialized)
+    return DiagnoseUninitialized(S, OpPC, /*Extern=*/false, B->getDescriptor(),
+                                 AK_Read);
+  if (B->getDescriptor()->IsVolatile) {
+    if (!S.getLangOpts().CPlusPlus)
+      return Invalid(S, OpPC);
+
+    const ValueDecl *D = B->getDescriptor()->asValueDecl();
+    S.FFDiag(S.Current->getLocation(OpPC),
+             diag::note_constexpr_access_volatile_obj, 1)
+        << AK_Read << 1 << D;
+    S.Note(D->getLocation(), diag::note_constexpr_volatile_here) << 1;
     return false;
+  }
   return true;
 }
 
@@ -769,19 +796,19 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
     return false;
   if (!CheckConstant(S, OpPC, Ptr))
     return false;
-  if (!CheckDummy(S, OpPC, Ptr, AK))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK))
     return false;
   if (!CheckRange(S, OpPC, Ptr, AK))
     return false;
   if (!CheckActive(S, OpPC, Ptr, AK))
     return false;
-  if (!CheckLifetime(S, OpPC, Ptr, AK))
+  if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK))
     return false;
-  if (!CheckInitialized(S, OpPC, Ptr, AK))
+  if (!Ptr.isInitialized())
+    return DiagnoseUninitialized(S, OpPC, Ptr, AK);
+  if (Ptr.isBlockPointer() && !CheckTemporary(S, OpPC, Ptr.block(), AK))
     return false;
-  if (!CheckTemporary(S, OpPC, Ptr, AK))
-    return false;
-  if (!CheckWeak(S, OpPC, Ptr))
+  if (Ptr.isBlockPointer() && !CheckWeak(S, OpPC, Ptr.block()))
     return false;
   if (!CheckMutable(S, OpPC, Ptr))
     return false;
@@ -798,21 +825,19 @@ bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
   if (!CheckConstant(S, OpPC, Ptr))
     return false;
 
-  if (!CheckDummy(S, OpPC, Ptr, AK_Read))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Read))
     return false;
   if (!CheckExtern(S, OpPC, Ptr))
     return false;
-  if (!CheckRange(S, OpPC, Ptr, AK_Read))
-    return false;
   if (!CheckActive(S, OpPC, Ptr, AK_Read))
     return false;
-  if (!CheckLifetime(S, OpPC, Ptr, AK_Read))
-    return false;
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
+  if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK_Read))
     return false;
-  if (!CheckTemporary(S, OpPC, Ptr, AK_Read))
+  if (!Ptr.isInitialized())
+    return DiagnoseUninitialized(S, OpPC, Ptr, AK_Read);
+  if (Ptr.isBlockPointer() && !CheckTemporary(S, OpPC, Ptr.block(), AK_Read))
     return false;
-  if (!CheckWeak(S, OpPC, Ptr))
+  if (Ptr.isBlockPointer() && !CheckWeak(S, OpPC, Ptr.block()))
     return false;
   if (!CheckMutable(S, OpPC, Ptr))
     return false;
@@ -822,9 +847,9 @@ bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
 bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
   if (!CheckLive(S, OpPC, Ptr, AK_Assign))
     return false;
-  if (!CheckDummy(S, OpPC, Ptr, AK_Assign))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Assign))
     return false;
-  if (!CheckLifetime(S, OpPC, Ptr, AK_Assign))
+  if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK_Assign))
     return false;
   if (!CheckExtern(S, OpPC, Ptr))
     return false;
@@ -1098,12 +1123,11 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) {
   return diagnoseUnknownDecl(S, OpPC, D);
 }
 
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
-                AccessKinds AK) {
-  if (!Ptr.isDummy())
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK) {
+  const Descriptor *Desc = B->getDescriptor();
+  if (!Desc->isDummy())
     return true;
 
-  const Descriptor *Desc = Ptr.getDeclDesc();
   const ValueDecl *D = Desc->asValueDecl();
   if (!D)
     return false;
@@ -1426,7 +1450,7 @@ static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func,
 bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
   if (!CheckLive(S, OpPC, Ptr, AK_Destroy))
     return false;
-  if (!CheckTemporary(S, OpPC, Ptr, AK_Destroy))
+  if (!CheckTemporary(S, OpPC, Ptr.block(), AK_Destroy))
     return false;
   if (!CheckRange(S, OpPC, Ptr, AK_Destroy))
     return false;
@@ -1620,8 +1644,17 @@ bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
 
   const auto *StaticDecl = cast<CXXRecordDecl>(Func->getParentDecl());
   const auto *InitialFunction = cast<CXXMethodDecl>(Callee);
-  const CXXMethodDecl *Overrider = S.getContext().getOverridingFunction(
-      DynamicDecl, StaticDecl, InitialFunction);
+  const CXXMethodDecl *Overrider;
+
+  if (StaticDecl != DynamicDecl) {
+    if (!DynamicDecl->isDerivedFrom(StaticDecl))
+      return false;
+    Overrider = S.getContext().getOverridingFunction(DynamicDecl, StaticDecl,
+                                                     InitialFunction);
+
+  } else {
+    Overrider = InitialFunction;
+  }
 
   if (Overrider != InitialFunction) {
     // DR1872: An instantiated virtual constexpr function can't be called in a
@@ -1749,7 +1782,7 @@ static void startLifetimeRecurse(const Pointer &Ptr) {
 
 bool StartLifetime(InterpState &S, CodePtr OpPC) {
   const auto &Ptr = S.Stk.peek<Pointer>();
-  if (!CheckDummy(S, OpPC, Ptr, AK_Destroy))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy))
     return false;
   startLifetimeRecurse(Ptr.narrow());
   return true;
@@ -1780,7 +1813,7 @@ static void endLifetimeRecurse(const Pointer &Ptr) {
 /// Ends the lifetime of the peek'd pointer.
 bool EndLifetime(InterpState &S, CodePtr OpPC) {
   const auto &Ptr = S.Stk.peek<Pointer>();
-  if (!CheckDummy(S, OpPC, Ptr, AK_Destroy))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy))
     return false;
   endLifetimeRecurse(Ptr.narrow());
   return true;
@@ -1789,7 +1822,7 @@ bool EndLifetime(InterpState &S, CodePtr OpPC) {
 /// Ends the lifetime of the pop'd pointer.
 bool EndLifetimePop(InterpState &S, CodePtr OpPC) {
   const auto &Ptr = S.Stk.pop<Pointer>();
-  if (!CheckDummy(S, OpPC, Ptr, AK_Destroy))
+  if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy))
     return false;
   endLifetimeRecurse(Ptr.narrow());
   return true;
@@ -1804,16 +1837,16 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E,
 
   // Similar to CheckStore(), but with the additional CheckTemporary() call and
   // the AccessKinds are different.
-  if (!CheckTemporary(S, OpPC, Ptr, AK_Construct))
+  if (!CheckTemporary(S, OpPC, Ptr.block(), AK_Construct))
     return false;
   if (!CheckLive(S, OpPC, Ptr, AK_Construct))
     return false;
-  if (!CheckDummy(S, OpPC, Ptr, AK_Construct))
+  if (!CheckDummy(S, OpPC, Ptr.block(), AK_Construct))
     return false;
 
   // CheckLifetime for this and all base pointers.
   for (Pointer P = Ptr;;) {
-    if (!CheckLifetime(S, OpPC, P, AK_Construct))
+    if (!CheckLifetime(S, OpPC, P.getLifetime(), AK_Construct))
       return false;
 
     if (P.isRoot())
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 8a28106..0d3f492 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -51,8 +51,7 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                AccessKinds AK);
 
 /// Checks if a pointer is a dummy pointer.
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
-                AccessKinds AK);
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK);
 
 /// Checks if a pointer is null.
 bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
@@ -89,11 +88,14 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                AccessKinds AK = AK_Read);
 bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
 
-bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
-                      AccessKinds AK);
+bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+                           AccessKinds AK);
+bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, bool Extern,
+                           const Descriptor *Desc, AccessKinds AK);
+
 /// Checks a direct load of a primitive value from a global or local variable.
-bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
-bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
+bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Block *B);
+bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Block *B);
 
 /// Checks if a value can be stored in a block.
 bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
@@ -1351,10 +1353,10 @@ inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool GetLocal(InterpState &S, CodePtr OpPC, uint32_t I) {
-  const Pointer &Ptr = S.Current->getLocalPointer(I);
-  if (!CheckLocalLoad(S, OpPC, Ptr))
+  const Block *B = S.Current->getLocalBlock(I);
+  if (!CheckLocalLoad(S, OpPC, B))
     return false;
-  S.Stk.push<T>(Ptr.deref<T>());
+  S.Stk.push<T>(B->deref<T>());
   return true;
 }
 
@@ -1465,22 +1467,26 @@ bool SetThisField(InterpState &S, CodePtr OpPC, uint32_t I) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool GetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
-  const Pointer &Ptr = S.P.getPtrGlobal(I);
+  const Block *B = S.P.getGlobal(I);
 
-  if (!CheckGlobalLoad(S, OpPC, Ptr))
+  if (!CheckGlobalLoad(S, OpPC, B))
     return false;
 
-  S.Stk.push<T>(Ptr.deref<T>());
+  S.Stk.push<T>(B->deref<T>());
   return true;
 }
 
 /// Same as GetGlobal, but without the checks.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool GetGlobalUnchecked(InterpState &S, CodePtr OpPC, uint32_t I) {
-  const Pointer &Ptr = S.P.getPtrGlobal(I);
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
-    return false;
-  S.Stk.push<T>(Ptr.deref<T>());
+  const Block *B = S.P.getGlobal(I);
+  const auto &Desc =
+      *reinterpret_cast<const GlobalInlineDescriptor *>(B->rawData());
+  if (Desc.InitState != GlobalInitState::Initialized)
+    return DiagnoseUninitialized(S, OpPC, B->isExtern(), B->getDescriptor(),
+                                 AK_Read);
+
+  S.Stk.push<T>(B->deref<T>());
   return true;
 }
 
@@ -2351,8 +2357,8 @@ static inline bool IncDecPtrHelper(InterpState &S, CodePtr OpPC,
 static inline bool IncPtr(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Increment))
-    return false;
+  if (!Ptr.isInitialized())
+    return DiagnoseUninitialized(S, OpPC, Ptr, AK_Increment);
 
   return IncDecPtrHelper<ArithOp::Add>(S, OpPC, Ptr);
 }
@@ -2360,8 +2366,8 @@ static inline bool IncPtr(InterpState &S, CodePtr OpPC) {
 static inline bool DecPtr(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement))
-    return false;
+  if (!Ptr.isInitialized())
+    return DiagnoseUninitialized(S, OpPC, Ptr, AK_Decrement);
 
   return IncDecPtrHelper<ArithOp::Sub>(S, OpPC, Ptr);
 }
@@ -3195,6 +3201,9 @@ inline bool GetMemberPtr(InterpState &S, CodePtr OpPC, const ValueDecl *D) {
 inline bool GetMemberPtrBase(InterpState &S, CodePtr OpPC) {
   const auto &MP = S.Stk.pop<MemberPointer>();
 
+  if (!MP.isBaseCastPossible())
+    return false;
+
   S.Stk.push<Pointer>(MP.getBase());
   return true;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp
index 963b54e..8b7f6a7 100644
--- a/clang/lib/AST/ByteCode/InterpBlock.cpp
+++ b/clang/lib/AST/ByteCode/InterpBlock.cpp
@@ -64,7 +64,7 @@ void Block::removePointer(Pointer *P) {
 }
 
 void Block::cleanup() {
-  if (Pointers == nullptr && IsDead)
+  if (Pointers == nullptr && !IsDynamic && IsDead)
     (reinterpret_cast<DeadBlock *>(this + 1) - 1)->free();
 }
 
@@ -133,8 +133,7 @@ DeadBlock::DeadBlock(DeadBlock *&Root, Block *Blk)
 }
 
 void DeadBlock::free() {
-  if (B.IsInitialized)
-    B.invokeDtor();
+  assert(!B.isInitialized());
 
   if (Prev)
     Prev->Next = Next;
diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h
index 5162223..07194d6 100644
--- a/clang/lib/AST/ByteCode/InterpBlock.h
+++ b/clang/lib/AST/ByteCode/InterpBlock.h
@@ -103,6 +103,10 @@ public:
     return reinterpret_cast<const std::byte *>(this) + sizeof(Block);
   }
 
+  template <typename T> T deref() const {
+    return *reinterpret_cast<const T *>(data());
+  }
+
   /// Invokes the constructor.
   void invokeCtor() {
     assert(!IsInitialized);
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index f908d02..c835bd4 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -276,7 +276,7 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC,
   if (!CheckLive(S, OpPC, StrPtr, AK_Read))
     return false;
 
-  if (!CheckDummy(S, OpPC, StrPtr, AK_Read))
+  if (!CheckDummy(S, OpPC, StrPtr.block(), AK_Read))
     return false;
 
   assert(StrPtr.getFieldDesc()->isPrimitiveArray());
@@ -2232,7 +2232,7 @@ static bool interp__builtin_is_within_lifetime(InterpState &S, CodePtr OpPC,
       return false;
     if (!CheckMutable(S, OpPC, Ptr))
       return false;
-    if (!CheckDummy(S, OpPC, Ptr, AK_Read))
+    if (!CheckDummy(S, OpPC, Ptr.block(), AK_Read))
       return false;
   }
 
diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp
index 14f99c7..f2eac86 100644
--- a/clang/lib/AST/ByteCode/InterpFrame.cpp
+++ b/clang/lib/AST/ByteCode/InterpFrame.cpp
@@ -133,6 +133,11 @@ static bool shouldSkipInBacktrace(const Function *F) {
       MD && MD->getParent()->isAnonymousStructOrUnion())
     return true;
 
+  if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(FD);
+      Ctor && Ctor->isDefaulted() && Ctor->isTrivial() &&
+      Ctor->isCopyOrMoveConstructor() && Ctor->inits().empty())
+    return true;
+
   return false;
 }
 
@@ -226,6 +231,10 @@ Pointer InterpFrame::getLocalPointer(unsigned Offset) const {
   return Pointer(localBlock(Offset));
 }
 
+Block *InterpFrame::getLocalBlock(unsigned Offset) const {
+  return localBlock(Offset);
+}
+
 Pointer InterpFrame::getParamPointer(unsigned Off) {
   // Return the block if it was created previously.
   if (auto Pt = Params.find(Off); Pt != Params.end())
diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h
index cfebe93..4be5391 100644
--- a/clang/lib/AST/ByteCode/InterpFrame.h
+++ b/clang/lib/AST/ByteCode/InterpFrame.h
@@ -86,6 +86,7 @@ public:
 
   /// Returns a pointer to a local variables.
   Pointer getLocalPointer(unsigned Offset) const;
+  Block *getLocalBlock(unsigned Offset) const;
 
   /// Returns the value of an argument.
   template <typename T> const T &getParam(unsigned Offset) const {
diff --git a/clang/lib/AST/ByteCode/InterpState.cpp b/clang/lib/AST/ByteCode/InterpState.cpp
index a06b125..32f940c 100644
--- a/clang/lib/AST/ByteCode/InterpState.cpp
+++ b/clang/lib/AST/ByteCode/InterpState.cpp
@@ -76,9 +76,7 @@ bool InterpState::reportOverflow(const Expr *E, const llvm::APSInt &Value) {
 
 void InterpState::deallocate(Block *B) {
   assert(B);
-  const Descriptor *Desc = B->getDescriptor();
-  assert(Desc);
-
+  assert(!B->isDynamic());
   // The block might have a pointer saved in a field in its data
   // that points to the block itself. We call the dtor first,
   // which will destroy all the data but leave InlineDescriptors
@@ -87,6 +85,7 @@ void InterpState::deallocate(Block *B) {
   if (B->IsInitialized)
     B->invokeDtor();
 
+  assert(!B->isInitialized());
   if (B->hasPointers()) {
     size_t Size = B->getSize();
     // Allocate a new block, transferring over pointers.
@@ -95,24 +94,20 @@ void InterpState::deallocate(Block *B) {
     auto *D = new (Memory) DeadBlock(DeadBlocks, B);
     // Since the block doesn't hold any actual data anymore, we can just
     // memcpy() everything over.
-    std::memcpy(D->rawData(), B->rawData(), Desc->getAllocSize());
-    D->B.IsInitialized = B->IsInitialized;
-
-    // We moved the contents over to the DeadBlock.
-    B->IsInitialized = false;
+    std::memcpy(D->rawData(), B->rawData(), Size);
+    D->B.IsInitialized = false;
   }
 }
 
 bool InterpState::maybeDiagnoseDanglingAllocations() {
-  bool NoAllocationsLeft = (Alloc.getNumAllocations() == 0);
+  bool NoAllocationsLeft = !Alloc.hasAllocations();
 
   if (!checkingPotentialConstantExpression()) {
-    for (const auto &It : Alloc.allocation_sites()) {
-      assert(It.second.size() > 0);
+    for (const auto &[Source, Site] : Alloc.allocation_sites()) {
+      assert(!Site.empty());
 
-      const Expr *Source = It.first;
       CCEDiag(Source->getExprLoc(), diag::note_constexpr_memory_leak)
-          << (It.second.size() - 1) << Source->getSourceRange();
+          << (Site.size() - 1) << Source->getSourceRange();
     }
   }
   // Keep evaluating before C++20, since the CXXNewExpr wasn't valid there
diff --git a/clang/lib/AST/ByteCode/MemberPointer.h b/clang/lib/AST/ByteCode/MemberPointer.h
index b17ce25..8dd75ca 100644
--- a/clang/lib/AST/ByteCode/MemberPointer.h
+++ b/clang/lib/AST/ByteCode/MemberPointer.h
@@ -51,6 +51,12 @@ public:
 
   FunctionPointer toFunctionPointer(const Context &Ctx) const;
 
+  bool isBaseCastPossible() const {
+    if (PtrOffset < 0)
+      return true;
+    return static_cast<uint64_t>(PtrOffset) <= Base.getByteOffset();
+  }
+
   Pointer getBase() const {
     if (PtrOffset < 0)
       return Base.atField(-PtrOffset);
diff --git a/clang/lib/AST/CommentParser.cpp b/clang/lib/AST/CommentParser.cpp
index e61846d..2e5821a 100644
--- a/clang/lib/AST/CommentParser.cpp
+++ b/clang/lib/AST/CommentParser.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/CommentParser.h"
+#include "clang/AST/Comment.h"
 #include "clang/AST/CommentCommandTraits.h"
 #include "clang/AST/CommentSema.h"
 #include "clang/Basic/CharInfo.h"
@@ -569,6 +570,8 @@ BlockCommandComment *Parser::parseBlockCommand() {
 
 InlineCommandComment *Parser::parseInlineCommand() {
   assert(Tok.is(tok::backslash_command) || Tok.is(tok::at_command));
+  CommandMarkerKind CMK =
+      Tok.is(tok::backslash_command) ? CMK_Backslash : CMK_At;
   const CommandInfo *Info = Traits.getCommandInfo(Tok.getCommandID());
 
   const Token CommandTok = Tok;
@@ -580,7 +583,7 @@ InlineCommandComment *Parser::parseInlineCommand() {
 
   InlineCommandComment *IC = S.actOnInlineCommand(
       CommandTok.getLocation(), CommandTok.getEndLocation(),
-      CommandTok.getCommandID(), Args);
+      CommandTok.getCommandID(), CMK, Args);
 
   if (Args.size() < Info->NumArgs) {
     Diag(CommandTok.getEndLocation().getLocWithOffset(1),
diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp
index 88520d7..c02983b 100644
--- a/clang/lib/AST/CommentSema.cpp
+++ b/clang/lib/AST/CommentSema.cpp
@@ -363,12 +363,13 @@ void Sema::actOnTParamCommandFinish(TParamCommandComment *Command,
 InlineCommandComment *
 Sema::actOnInlineCommand(SourceLocation CommandLocBegin,
                          SourceLocation CommandLocEnd, unsigned CommandID,
+                         CommandMarkerKind CommandMarker,
                          ArrayRef<Comment::Argument> Args) {
   StringRef CommandName = Traits.getCommandInfo(CommandID)->Name;
 
-  return new (Allocator)
-      InlineCommandComment(CommandLocBegin, CommandLocEnd, CommandID,
-                           getInlineCommandRenderKind(CommandName), Args);
+  return new (Allocator) InlineCommandComment(
+      CommandLocBegin, CommandLocEnd, CommandID,
+      getInlineCommandRenderKind(CommandName), CommandMarker, Args);
 }
 
 InlineContentComment *Sema::actOnUnknownCommand(SourceLocation LocBegin,
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 34af9cc..d9c6632 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -9860,11 +9860,15 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
     if (Value.isInt()) {
       unsigned Size = Info.Ctx.getTypeSize(E->getType());
       uint64_t N = Value.getInt().extOrTrunc(Size).getZExtValue();
-      Result.Base = (Expr*)nullptr;
-      Result.InvalidBase = false;
-      Result.Offset = CharUnits::fromQuantity(N);
-      Result.Designator.setInvalid();
-      Result.IsNullPtr = false;
+      if (N == Info.Ctx.getTargetNullPointerValue(E->getType())) {
+        Result.setNull(Info.Ctx, E->getType());
+      } else {
+        Result.Base = (Expr *)nullptr;
+        Result.InvalidBase = false;
+        Result.Offset = CharUnits::fromQuantity(N);
+        Result.Designator.setInvalid();
+        Result.IsNullPtr = false;
+      }
       return true;
     } else {
       // In rare instances, the value isn't an lvalue.
@@ -11624,7 +11628,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
   case Builtin::BI__builtin_elementwise_add_sat:
-  case Builtin::BI__builtin_elementwise_sub_sat: {
+  case Builtin::BI__builtin_elementwise_sub_sat:
+  case clang::X86::BI__builtin_ia32_pmulhuw128:
+  case clang::X86::BI__builtin_ia32_pmulhuw256:
+  case clang::X86::BI__builtin_ia32_pmulhuw512:
+  case clang::X86::BI__builtin_ia32_pmulhw128:
+  case clang::X86::BI__builtin_ia32_pmulhw256:
+  case clang::X86::BI__builtin_ia32_pmulhw512: {
     APValue SourceLHS, SourceRHS;
     if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
         !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
@@ -11649,6 +11659,18 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
             APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
                    DestEltTy->isUnsignedIntegerOrEnumerationType())));
         break;
+      case clang::X86::BI__builtin_ia32_pmulhuw128:
+      case clang::X86::BI__builtin_ia32_pmulhuw256:
+      case clang::X86::BI__builtin_ia32_pmulhuw512:
+        ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhu(LHS, RHS),
+                                                /*isUnsigned=*/true)));
+        break;
+      case clang::X86::BI__builtin_ia32_pmulhw128:
+      case clang::X86::BI__builtin_ia32_pmulhw256:
+      case clang::X86::BI__builtin_ia32_pmulhw512:
+        ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
+                                                /*isUnsigned=*/false)));
+        break;
       }
     }
 
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp
index f7a98bd..fe20004 100644
--- a/clang/lib/AST/OpenACCClause.cpp
+++ b/clang/lib/AST/OpenACCClause.cpp
@@ -329,10 +329,11 @@ OpenACCPrivateClause::Create(const ASTContext &C, SourceLocation BeginLoc,
 
 OpenACCFirstPrivateClause *OpenACCFirstPrivateClause::Create(
     const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
-    ArrayRef<Expr *> VarList, ArrayRef<VarDecl *> InitRecipes,
+    ArrayRef<Expr *> VarList, ArrayRef<OpenACCFirstPrivateRecipe> InitRecipes,
     SourceLocation EndLoc) {
-  void *Mem =
-      C.Allocate(OpenACCFirstPrivateClause::totalSizeToAlloc<Expr *, VarDecl *>(
+  void *Mem = C.Allocate(
+      OpenACCFirstPrivateClause::totalSizeToAlloc<Expr *,
+                                                  OpenACCFirstPrivateRecipe>(
           VarList.size(), InitRecipes.size()));
   return new (Mem) OpenACCFirstPrivateClause(BeginLoc, LParenLoc, VarList,
                                              InitRecipes, EndLoc);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 4c36f24..0297f9c 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2645,8 +2645,10 @@ void OpenACCClauseProfiler::VisitFirstPrivateClause(
     const OpenACCFirstPrivateClause &Clause) {
   VisitClauseWithVarList(Clause);
 
-  for (auto *VD : Clause.getInitRecipes())
-    Profiler.VisitDecl(VD);
+  for (auto &Recipe : Clause.getInitRecipes()) {
+    Profiler.VisitDecl(Recipe.RecipeDecl);
+    Profiler.VisitDecl(Recipe.InitFromTemporary);
+  }
 }
 
 void OpenACCClauseProfiler::VisitAttachClause(
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 141edc8..03d7413 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -5246,6 +5246,15 @@ bool Type::isHLSLResourceRecord() const {
   return HLSLAttributedResourceType::findHandleTypeOnResource(this) != nullptr;
 }
 
+bool Type::isHLSLResourceRecordArray() const {
+  const Type *Ty = getUnqualifiedDesugaredType();
+  if (!Ty->isArrayType())
+    return false;
+  while (isa<ConstantArrayType>(Ty))
+    Ty = Ty->getArrayElementTypeNoTypeQual();
+  return Ty->isHLSLResourceRecord();
+}
+
 bool Type::isHLSLIntangibleType() const {
   const Type *Ty = getUnqualifiedDesugaredType();
 
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 40dff7e..f4ead3d 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -1986,6 +1986,14 @@ public:
     const auto *FD = dyn_cast<FunctionDecl>(CE->getDirectCallee());
     if (!FD)
       return false;
+
+    bool IsGlobalAndNotInAnyNamespace =
+        FD->isGlobal() && !FD->getEnclosingNamespaceContext()->isNamespace();
+
+    // A libc function must either be in the std:: namespace or a global
+    // function that is not in any namespace:
+    if (!FD->isInStdNamespace() && !IsGlobalAndNotInAnyNamespace)
+      return false;
     auto isSingleStringLiteralArg = false;
     if (CE->getNumArgs() == 1) {
       isSingleStringLiteralArg =
diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
index be21ce9..b8663eb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
@@ -16,6 +16,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenCleanup.h"
 #include "CIRGenFunction.h"
 
 #include "clang/CIR/MissingFeatures.h"
@@ -33,6 +34,52 @@ using namespace clang::CIRGen;
 
 void EHScopeStack::Cleanup::anchor() {}
 
+/// Push an entry of the given size onto this protected-scope stack.
+char *EHScopeStack::allocate(size_t size) {
+  size = llvm::alignTo(size, ScopeStackAlignment);
+  if (!startOfBuffer) {
+    unsigned capacity = llvm::PowerOf2Ceil(std::max(size, 1024ul));
+    startOfBuffer = std::make_unique<char[]>(capacity);
+    startOfData = endOfBuffer = startOfBuffer.get() + capacity;
+  } else if (static_cast<size_t>(startOfData - startOfBuffer.get()) < size) {
+    unsigned currentCapacity = endOfBuffer - startOfBuffer.get();
+    unsigned usedCapacity =
+        currentCapacity - (startOfData - startOfBuffer.get());
+    unsigned requiredCapacity = usedCapacity + size;
+    // We know from the 'else if' condition that requiredCapacity is greater
+    // than currentCapacity.
+    unsigned newCapacity = llvm::PowerOf2Ceil(requiredCapacity);
+
+    std::unique_ptr<char[]> newStartOfBuffer =
+        std::make_unique<char[]>(newCapacity);
+    char *newEndOfBuffer = newStartOfBuffer.get() + newCapacity;
+    char *newStartOfData = newEndOfBuffer - usedCapacity;
+    memcpy(newStartOfData, startOfData, usedCapacity);
+    startOfBuffer.swap(newStartOfBuffer);
+    endOfBuffer = newEndOfBuffer;
+    startOfData = newStartOfData;
+  }
+
+  assert(startOfBuffer.get() + size <= startOfData);
+  startOfData -= size;
+  return startOfData;
+}
+
+void EHScopeStack::deallocate(size_t size) {
+  startOfData += llvm::alignTo(size, ScopeStackAlignment);
+}
+
+void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) {
+  char *buffer = allocate(size);
+
+  // When the full implementation is upstreamed, this will allocate
+  // extra memory for and construct a wrapper object that is used to
+  // manage the cleanup generation.
+  assert(!cir::MissingFeatures::ehCleanupScope());
+
+  return buffer;
+}
+
 static mlir::Block *getCurCleanupBlock(CIRGenFunction &cgf) {
   mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder());
   mlir::Block *cleanup =
@@ -44,26 +91,34 @@ static mlir::Block *getCurCleanupBlock(CIRGenFunction &cgf) {
 /// current insertion point is threaded through the cleanup, as are
 /// any branch fixups on the cleanup.
 void CIRGenFunction::popCleanupBlock() {
-  assert(!ehStack.cleanupStack.empty() && "cleanup stack is empty!");
+  assert(!ehStack.empty() && "cleanup stack is empty!");
+
+  // The memory for the cleanup continues to be owned by the EHScopeStack
+  // allocator, so we just destroy the object rather than attempting to
+  // free it.
+  EHScopeStack::Cleanup &cleanup = *ehStack.begin();
+
+  // The eventual implementation here will use the EHCleanupScope helper class.
+  assert(!cir::MissingFeatures::ehCleanupScope());
+
   mlir::OpBuilder::InsertionGuard guard(builder);
-  std::unique_ptr<EHScopeStack::Cleanup> cleanup =
-      ehStack.cleanupStack.pop_back_val();
 
   assert(!cir::MissingFeatures::ehCleanupFlags());
   mlir::Block *cleanupEntry = getCurCleanupBlock(*this);
   builder.setInsertionPointToEnd(cleanupEntry);
-  cleanup->emit(*this);
+  cleanup.emit(*this);
+
+  ehStack.deallocate(cleanup.getSize());
 }
 
 /// Pops cleanup blocks until the given savepoint is reached.
-void CIRGenFunction::popCleanupBlocks(size_t oldCleanupStackDepth) {
+void CIRGenFunction::popCleanupBlocks(
+    EHScopeStack::stable_iterator oldCleanupStackDepth) {
   assert(!cir::MissingFeatures::ehstackBranches());
 
-  assert(ehStack.getStackDepth() >= oldCleanupStackDepth);
-
   // Pop cleanup blocks until we reach the base stack depth for the
   // current scope.
-  while (ehStack.getStackDepth() > oldCleanupStackDepth) {
+  while (ehStack.stable_begin() != oldCleanupStackDepth) {
     popCleanupBlock();
   }
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
new file mode 100644
index 0000000..7361c8c
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// These classes support the generation of CIR for cleanups, initially based
+// on LLVM IR cleanup handling, but ought to change as CIR evolves.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H
+#define CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H
+
+#include "EHScopeStack.h"
+
+namespace clang::CIRGen {
+
+/// A non-stable pointer into the scope stack.
+class EHScopeStack::iterator {
+  char *ptr = nullptr;
+
+  friend class EHScopeStack;
+  explicit iterator(char *ptr) : ptr(ptr) {}
+
+public:
+  iterator() = default;
+
+  EHScopeStack::Cleanup *get() const {
+    return reinterpret_cast<EHScopeStack::Cleanup *>(ptr);
+  }
+
+  EHScopeStack::Cleanup &operator*() const { return *get(); }
+};
+
+inline EHScopeStack::iterator EHScopeStack::begin() const {
+  return iterator(startOfData);
+}
+
+} // namespace clang::CIRGen
+#endif // CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 78d375c..715d101 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -667,6 +667,12 @@ struct DestroyObject final : EHScopeStack::Cleanup {
   void emit(CIRGenFunction &cgf) override {
     cgf.emitDestroy(addr, type, destroyer);
   }
+
+  // This is a placeholder until EHCleanupScope is implemented.
+  size_t getSize() const override {
+    assert(!cir::MissingFeatures::ehCleanupScope());
+    return sizeof(DestroyObject);
+  }
 };
 } // namespace
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 3aa170e..cba06a1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -116,6 +116,15 @@ public:
 
   mlir::Value emitPromotedComplexOperand(const Expr *e, QualType promotionTy);
 
+  LValue emitCompoundAssignLValue(
+      const CompoundAssignOperator *e,
+      mlir::Value (ComplexExprEmitter::*func)(const BinOpInfo &),
+      RValue &value);
+
+  mlir::Value emitCompoundAssign(
+      const CompoundAssignOperator *e,
+      mlir::Value (ComplexExprEmitter::*func)(const BinOpInfo &));
+
   mlir::Value emitBinAdd(const BinOpInfo &op);
   mlir::Value emitBinSub(const BinOpInfo &op);
   mlir::Value emitBinMul(const BinOpInfo &op);
@@ -153,6 +162,19 @@ public:
   HANDLEBINOP(Sub)
   HANDLEBINOP(Mul)
 #undef HANDLEBINOP
+
+  // Compound assignments.
+  mlir::Value VisitBinAddAssign(const CompoundAssignOperator *e) {
+    return emitCompoundAssign(e, &ComplexExprEmitter::emitBinAdd);
+  }
+
+  mlir::Value VisitBinSubAssign(const CompoundAssignOperator *e) {
+    return emitCompoundAssign(e, &ComplexExprEmitter::emitBinSub);
+  }
+
+  mlir::Value VisitBinMulAssign(const CompoundAssignOperator *e) {
+    return emitCompoundAssign(e, &ComplexExprEmitter::emitBinMul);
+  }
 };
 } // namespace
 
@@ -166,6 +188,12 @@ static const ComplexType *getComplexType(QualType type) {
 }
 #endif // NDEBUG
 
+static mlir::Value createComplexFromReal(CIRGenBuilderTy &builder,
+                                         mlir::Location loc, mlir::Value real) {
+  mlir::Value imag = builder.getNullValue(real.getType(), loc);
+  return builder.createComplexCreate(loc, real, imag);
+}
+
 LValue ComplexExprEmitter::emitBinAssignLValue(const BinaryOperator *e,
                                                mlir::Value &value) {
   assert(cgf.getContext().hasSameUnqualifiedType(e->getLHS()->getType(),
@@ -602,7 +630,7 @@ mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e,
 
   mlir::Value result = Visit(const_cast<Expr *>(e));
   if (!promotionTy.isNull())
-    cgf.cgm.errorNYI("emitPromoted emitPromotedValue");
+    return cgf.emitPromotedValue(result, promotionTy);
 
   return result;
 }
@@ -630,6 +658,104 @@ ComplexExprEmitter::emitBinOps(const BinaryOperator *e, QualType promotionTy) {
   return binOpInfo;
 }
 
+LValue ComplexExprEmitter::emitCompoundAssignLValue(
+    const CompoundAssignOperator *e,
+    mlir::Value (ComplexExprEmitter::*func)(const BinOpInfo &), RValue &value) {
+  QualType lhsTy = e->getLHS()->getType();
+  QualType rhsTy = e->getRHS()->getType();
+  SourceLocation exprLoc = e->getExprLoc();
+  mlir::Location loc = cgf.getLoc(exprLoc);
+
+  if (lhsTy->getAs<AtomicType>()) {
+    cgf.cgm.errorNYI("emitCompoundAssignLValue AtmoicType");
+    return {};
+  }
+
+  BinOpInfo opInfo{loc};
+  opInfo.fpFeatures = e->getFPFeaturesInEffect(cgf.getLangOpts());
+
+  assert(!cir::MissingFeatures::cgFPOptionsRAII());
+
+  // Load the RHS and LHS operands.
+  // __block variables need to have the rhs evaluated first, plus this should
+  // improve codegen a little.
+  QualType promotionTypeCR = getPromotionType(e->getComputationResultType());
+  opInfo.ty = promotionTypeCR.isNull() ? e->getComputationResultType()
+                                       : promotionTypeCR;
+
+  QualType complexElementTy =
+      opInfo.ty->castAs<ComplexType>()->getElementType();
+  QualType promotionTypeRHS = getPromotionType(rhsTy);
+
+  // The RHS should have been converted to the computation type.
+  if (e->getRHS()->getType()->isRealFloatingType()) {
+    if (!promotionTypeRHS.isNull()) {
+      opInfo.rhs = createComplexFromReal(
+          cgf.getBuilder(), loc,
+          cgf.emitPromotedScalarExpr(e->getRHS(), promotionTypeRHS));
+    } else {
+      assert(cgf.getContext().hasSameUnqualifiedType(complexElementTy, rhsTy));
+      opInfo.rhs = createComplexFromReal(cgf.getBuilder(), loc,
+                                         cgf.emitScalarExpr(e->getRHS()));
+    }
+  } else {
+    if (!promotionTypeRHS.isNull()) {
+      opInfo.rhs = cgf.emitPromotedComplexExpr(e->getRHS(), promotionTypeRHS);
+    } else {
+      assert(cgf.getContext().hasSameUnqualifiedType(opInfo.ty, rhsTy));
+      opInfo.rhs = Visit(e->getRHS());
+    }
+  }
+
+  LValue lhs = cgf.emitLValue(e->getLHS());
+
+  // Load from the l-value and convert it.
+  QualType promotionTypeLHS = getPromotionType(e->getComputationLHSType());
+  if (lhsTy->isAnyComplexType()) {
+    mlir::Value lhsValue = emitLoadOfLValue(lhs, exprLoc);
+    QualType destTy = promotionTypeLHS.isNull() ? opInfo.ty : promotionTypeLHS;
+    opInfo.lhs = emitComplexToComplexCast(lhsValue, lhsTy, destTy, exprLoc);
+  } else {
+    cgf.cgm.errorNYI("emitCompoundAssignLValue emitLoadOfScalar");
+    return {};
+  }
+
+  // Expand the binary operator.
+  mlir::Value result = (this->*func)(opInfo);
+
+  // Truncate the result and store it into the LHS lvalue.
+  if (lhsTy->isAnyComplexType()) {
+    mlir::Value resultValue =
+        emitComplexToComplexCast(result, opInfo.ty, lhsTy, exprLoc);
+    emitStoreOfComplex(loc, resultValue, lhs, /*isInit*/ false);
+    value = RValue::getComplex(resultValue);
+  } else {
+    mlir::Value resultValue =
+        cgf.emitComplexToScalarConversion(result, opInfo.ty, lhsTy, exprLoc);
+    cgf.emitStoreOfScalar(resultValue, lhs, /*isInit*/ false);
+    value = RValue::get(resultValue);
+  }
+
+  return lhs;
+}
+
+mlir::Value ComplexExprEmitter::emitCompoundAssign(
+    const CompoundAssignOperator *e,
+    mlir::Value (ComplexExprEmitter::*func)(const BinOpInfo &)) {
+  RValue val;
+  LValue lv = emitCompoundAssignLValue(e, func, val);
+
+  // The result of an assignment in C is the assigned r-value.
+  if (!cgf.getLangOpts().CPlusPlus)
+    return val.getComplexValue();
+
+  // If the lvalue is non-volatile, return the computed value of the assignment.
+  if (!lv.isVolatileQualified())
+    return val.getComplexValue();
+
+  return emitLoadOfLValue(lv, e->getExprLoc());
+}
+
 mlir::Value ComplexExprEmitter::emitBinAdd(const BinOpInfo &op) {
   assert(!cir::MissingFeatures::fastMathFlags());
   assert(!cir::MissingFeatures::cgFPOptionsRAII());
@@ -654,7 +780,7 @@ getComplexRangeAttr(LangOptions::ComplexRangeKind range) {
   case LangOptions::CX_Basic:
     return cir::ComplexRangeKind::Basic;
   case LangOptions::CX_None:
-    // The default value for ComplexRangeKind is Full is no option is selected
+    // The default value for ComplexRangeKind is Full if no option is selected
     return cir::ComplexRangeKind::Full;
   }
 }
@@ -685,6 +811,31 @@ mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
   return ComplexExprEmitter(*this).Visit(const_cast<Expr *>(e));
 }
 
+using CompoundFunc =
+    mlir::Value (ComplexExprEmitter::*)(const ComplexExprEmitter::BinOpInfo &);
+
+static CompoundFunc getComplexOp(BinaryOperatorKind op) {
+  switch (op) {
+  case BO_MulAssign:
+    return &ComplexExprEmitter::emitBinMul;
+  case BO_DivAssign:
+    llvm_unreachable("getComplexOp: BO_DivAssign");
+  case BO_SubAssign:
+    return &ComplexExprEmitter::emitBinSub;
+  case BO_AddAssign:
+    return &ComplexExprEmitter::emitBinAdd;
+  default:
+    llvm_unreachable("unexpected complex compound assignment");
+  }
+}
+
+LValue CIRGenFunction::emitComplexCompoundAssignmentLValue(
+    const CompoundAssignOperator *e) {
+  CompoundFunc op = getComplexOp(e->getOpcode());
+  RValue val;
+  return ComplexExprEmitter(*this).emitCompoundAssignLValue(e, op, val);
+}
+
 mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e,
                                                      LValue lv,
                                                      cir::UnaryOpKind op,
@@ -729,3 +880,11 @@ mlir::Value CIRGenFunction::emitPromotedComplexExpr(const Expr *e,
                                                     QualType promotionType) {
   return ComplexExprEmitter(*this).emitPromoted(e, promotionType);
 }
+
+mlir::Value CIRGenFunction::emitPromotedValue(mlir::Value result,
+                                              QualType promotionType) {
+  assert(!mlir::cast<cir::ComplexType>(result.getType()).isIntegerComplex() &&
+         "integral complex will never be promoted");
+  return builder.createCast(cir::CastKind::float_complex, result,
+                            convertType(promotionType));
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 32c1c1a..3e06513 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1955,6 +1955,29 @@ mlir::Value CIRGenFunction::emitScalarConversion(mlir::Value src,
       .emitScalarConversion(src, srcTy, dstTy, loc);
 }
 
+mlir::Value CIRGenFunction::emitComplexToScalarConversion(mlir::Value src,
+                                                          QualType srcTy,
+                                                          QualType dstTy,
+                                                          SourceLocation loc) {
+  assert(srcTy->isAnyComplexType() && hasScalarEvaluationKind(dstTy) &&
+         "Invalid complex -> scalar conversion");
+
+  QualType complexElemTy = srcTy->castAs<ComplexType>()->getElementType();
+  if (dstTy->isBooleanType()) {
+    auto kind = complexElemTy->isFloatingType()
+                    ? cir::CastKind::float_complex_to_bool
+                    : cir::CastKind::int_complex_to_bool;
+    return builder.createCast(getLoc(loc), kind, src, convertType(dstTy));
+  }
+
+  auto kind = complexElemTy->isFloatingType()
+                  ? cir::CastKind::float_complex_to_real
+                  : cir::CastKind::int_complex_to_real;
+  mlir::Value real =
+      builder.createCast(getLoc(loc), kind, src, convertType(complexElemTy));
+  return emitScalarConversion(real, complexElemTy, dstTy, loc);
+}
+
 mlir::Value ScalarExprEmitter::VisitUnaryLNot(const UnaryOperator *e) {
   // Perform vector logical not on comparison with zero vector.
   if (e->getType()->isVectorType() &&
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index eb05c93..dedd01c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -28,8 +28,6 @@ CIRGenFunction::CIRGenFunction(CIRGenModule &cgm, CIRGenBuilderTy &builder,
                                bool suppressNewContext)
     : CIRGenTypeCache(cgm), cgm{cgm}, builder(builder) {
   ehStack.setCGF(this);
-  currentCleanupStackDepth = 0;
-  assert(ehStack.getStackDepth() == 0);
 }
 
 CIRGenFunction::~CIRGenFunction() {}
@@ -409,6 +407,8 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType,
   const auto *fd = dyn_cast_or_null<FunctionDecl>(d);
   curFuncDecl = d->getNonClosureContext();
 
+  prologueCleanupDepth = ehStack.stable_begin();
+
   mlir::Block *entryBB = &fn.getBlocks().front();
   builder.setInsertionPointToStart(entryBB);
 
@@ -475,11 +475,11 @@ void CIRGenFunction::finishFunction(SourceLocation endLoc) {
   // important to do this before we enter the return block or return
   // edges will be *really* confused.
   // TODO(cir): Use prologueCleanupDepth here.
-  bool hasCleanups = ehStack.getStackDepth() != currentCleanupStackDepth;
+  bool hasCleanups = ehStack.stable_begin() != prologueCleanupDepth;
   if (hasCleanups) {
     assert(!cir::MissingFeatures::generateDebugInfo());
     // FIXME(cir): should we clearInsertionPoint? breaks many testcases
-    popCleanupBlocks(currentCleanupStackDepth);
+    popCleanupBlocks(prologueCleanupDepth);
   }
 }
 
@@ -785,9 +785,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
     }
     if (!ty->isAnyComplexType())
       return emitCompoundAssignmentLValue(cast<CompoundAssignOperator>(e));
-    cgm.errorNYI(e->getSourceRange(),
-                 "CompoundAssignOperator with ComplexType");
-    return LValue();
+
+    return emitComplexCompoundAssignmentLValue(cast<CompoundAssignOperator>(e));
   }
   case Expr::CallExprClass:
   case Expr::CXXMemberCallExprClass:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 3d92545..bdbc77c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -601,9 +601,13 @@ public:
                      FunctionArgList args, clang::SourceLocation loc,
                      clang::SourceLocation startLoc);
 
+  /// The cleanup depth enclosing all the cleanups associated with the
+  /// parameters.
+  EHScopeStack::stable_iterator prologueCleanupDepth;
+
   /// Takes the old cleanup stack size and emits the cleanup blocks
   /// that have been added.
-  void popCleanupBlocks(size_t oldCleanupStackDepth);
+  void popCleanupBlocks(EHScopeStack::stable_iterator oldCleanupStackDepth);
   void popCleanupBlock();
 
   /// Push a cleanup to be run at the end of the current full-expression.  Safe
@@ -622,7 +626,7 @@ public:
   /// Enters a new scope for capturing cleanups, all of which
   /// will be executed once the scope is exited.
   class RunCleanupsScope {
-    size_t cleanupStackDepth, oldCleanupStackDepth;
+    EHScopeStack::stable_iterator cleanupStackDepth, oldCleanupStackDepth;
 
   protected:
     bool performCleanup;
@@ -638,7 +642,7 @@ public:
     /// Enter a new cleanup scope.
     explicit RunCleanupsScope(CIRGenFunction &cgf)
         : performCleanup(true), cgf(cgf) {
-      cleanupStackDepth = cgf.ehStack.getStackDepth();
+      cleanupStackDepth = cgf.ehStack.stable_begin();
       oldCleanupStackDepth = cgf.currentCleanupStackDepth;
       cgf.currentCleanupStackDepth = cleanupStackDepth;
     }
@@ -663,7 +667,7 @@ public:
   };
 
   // Cleanup stack depth of the RunCleanupsScope that was pushed most recently.
-  size_t currentCleanupStackDepth;
+  EHScopeStack::stable_iterator currentCleanupStackDepth = ehStack.stable_end();
 
 public:
   /// Represents a scope, including function bodies, compound statements, and
@@ -944,6 +948,11 @@ public:
   /// sanitizer is enabled, a runtime check is also emitted.
   mlir::Value emitCheckedArgForAssume(const Expr *e);
 
+  /// Emit a conversion from the specified complex type to the specified
+  /// destination type, where the destination type is an LLVM scalar type.
+  mlir::Value emitComplexToScalarConversion(mlir::Value src, QualType srcTy,
+                                            QualType dstTy, SourceLocation loc);
+
   LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e);
   LValue emitCompoundLiteralLValue(const CompoundLiteralExpr *e);
 
@@ -1047,6 +1056,8 @@ public:
 
   mlir::Value emitPromotedScalarExpr(const Expr *e, QualType promotionType);
 
+  mlir::Value emitPromotedValue(mlir::Value result, QualType promotionType);
+
   /// Emit the computation of the specified expression of scalar type.
   mlir::Value emitScalarExpr(const clang::Expr *e);
 
@@ -1076,6 +1087,7 @@ public:
                                        cir::UnaryOpKind op, bool isPre);
 
   LValue emitComplexAssignmentLValue(const BinaryOperator *e);
+  LValue emitComplexCompoundAssignmentLValue(const CompoundAssignOperator *e);
 
   void emitCompoundStmt(const clang::CompoundStmt &s);
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index b143682..ff6d293 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1307,7 +1307,8 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) {
   }
 
   case Decl::Var:
-  case Decl::Decomposition: {
+  case Decl::Decomposition:
+  case Decl::VarTemplateSpecialization: {
     auto *vd = cast<VarDecl>(decl);
     if (isa<DecompositionDecl>(decl)) {
       errorNYI(decl->getSourceRange(), "global variable decompositions");
@@ -1342,6 +1343,8 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) {
   case Decl::StaticAssert:
   case Decl::TypeAliasTemplate:
   case Decl::UsingShadow:
+  case Decl::VarTemplate:
+  case Decl::VarTemplatePartialSpecialization:
     break;
 
   case Decl::CXXConstructor:
@@ -1362,6 +1365,21 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) {
     assert(!cir::MissingFeatures::generateDebugInfo());
     assert(!cir::MissingFeatures::cxxRecordStaticMembers());
     break;
+
+  case Decl::FileScopeAsm:
+    // File-scope asm is ignored during device-side CUDA compilation.
+    if (langOpts.CUDA && langOpts.CUDAIsDevice)
+      break;
+    // File-scope asm is ignored during device-side OpenMP compilation.
+    if (langOpts.OpenMPIsTargetDevice)
+      break;
+    // File-scope asm is ignored during device-side SYCL compilation.
+    if (langOpts.SYCLIsDevice)
+      break;
+    auto *file_asm = cast<FileScopeAsmDecl>(decl);
+    std::string line = file_asm->getAsmString();
+    globalScopeAsm.push_back(builder.getStringAttr(line));
+    break;
   }
 }
 
@@ -1975,6 +1993,9 @@ void CIRGenModule::release() {
   emitDeferred();
   applyReplacements();
 
+  theModule->setAttr(cir::CIRDialect::getModuleLevelAsmAttrName(),
+                     builder.getArrayAttr(globalScopeAsm));
+
   // There's a lot of code that is not implemented yet.
   assert(!cir::MissingFeatures::cgmRelease());
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 5d07d38..163a0fc 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -90,6 +90,8 @@ private:
   /// for FunctionDecls's.
   CIRGenFunction *curCGF = nullptr;
 
+  llvm::SmallVector<mlir::Attribute> globalScopeAsm;
+
 public:
   mlir::ModuleOp getModule() const { return theModule; }
   CIRGenBuilderTy &getBuilder() { return builder; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index 50642e7..332babd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -412,7 +412,7 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) {
   auto *retBlock = curLexScope->getOrCreateRetBlock(*this, loc);
   // This should emit a branch through the cleanup block if one exists.
   builder.create<cir::BrOp>(loc, retBlock);
-  if (ehStack.getStackDepth() != currentCleanupStackDepth)
+  if (ehStack.stable_begin() != currentCleanupStackDepth)
     cgm.errorNYI(s.getSourceRange(), "return with cleanup stack");
   builder.createBlock(builder.getBlock()->getParent());
 
diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h
index 22750ac..47478f6 100644
--- a/clang/lib/CIR/CodeGen/EHScopeStack.h
+++ b/clang/lib/CIR/CodeGen/EHScopeStack.h
@@ -42,7 +42,47 @@ enum CleanupKind : unsigned {
 /// A stack of scopes which respond to exceptions, including cleanups
 /// and catch blocks.
 class EHScopeStack {
+  friend class CIRGenFunction;
+
 public:
+  // TODO(ogcg): Switch to alignof(uint64_t) instead of 8
+  enum { ScopeStackAlignment = 8 };
+
+  /// A saved depth on the scope stack.  This is necessary because
+  /// pushing scopes onto the stack invalidates iterators.
+  class stable_iterator {
+    friend class EHScopeStack;
+
+    /// Offset from startOfData to endOfBuffer.
+    ptrdiff_t size = -1;
+
+    explicit stable_iterator(ptrdiff_t size) : size(size) {}
+
+  public:
+    static stable_iterator invalid() { return stable_iterator(-1); }
+    stable_iterator() = default;
+
+    bool isValid() const { return size >= 0; }
+
+    /// Returns true if this scope encloses I.
+    /// Returns false if I is invalid.
+    /// This scope must be valid.
+    bool encloses(stable_iterator other) const { return size <= other.size; }
+
+    /// Returns true if this scope strictly encloses I: that is,
+    /// if it encloses I and is not I.
+    /// Returns false is I is invalid.
+    /// This scope must be valid.
+    bool strictlyEncloses(stable_iterator I) const { return size < I.size; }
+
+    friend bool operator==(stable_iterator A, stable_iterator B) {
+      return A.size == B.size;
+    }
+    friend bool operator!=(stable_iterator A, stable_iterator B) {
+      return A.size != B.size;
+    }
+  };
+
   /// Information for lazily generating a cleanup.  Subclasses must be
   /// POD-like: cleanups will not be destructed, and they will be
   /// allocated on the cleanup stack and freely copied and moved
@@ -68,30 +108,75 @@ public:
     ///
     // \param flags cleanup kind.
     virtual void emit(CIRGenFunction &cgf) = 0;
-  };
 
-  // Classic codegen has a finely tuned custom allocator and a complex stack
-  // management scheme. We'll probably eventually want to find a way to share
-  // that implementation. For now, we will use a very simplified implementation
-  // to get cleanups working.
-  llvm::SmallVector<std::unique_ptr<Cleanup>, 8> cleanupStack;
+    // This is a placeholder until EHScope is implemented.
+    virtual size_t getSize() const = 0;
+  };
 
 private:
+  // The implementation for this class is in CIRGenCleanup.h and
+  // CIRGenCleanup.cpp; the definition is here because it's used as a
+  // member of CIRGenFunction.
+
+  /// The start of the scope-stack buffer, i.e. the allocated pointer
+  /// for the buffer.  All of these pointers are either simultaneously
+  /// null or simultaneously valid.
+  std::unique_ptr<char[]> startOfBuffer;
+
+  /// The end of the buffer.
+  char *endOfBuffer = nullptr;
+
+  /// The first valid entry in the buffer.
+  char *startOfData = nullptr;
+
   /// The CGF this Stack belong to
   CIRGenFunction *cgf = nullptr;
 
+  // This class uses a custom allocator for maximum efficiency because cleanups
+  // are allocated and freed very frequently. It's basically a bump pointer
+  // allocator, but we can't use LLVM's BumpPtrAllocator because we use offsets
+  // into the buffer as stable iterators.
+  char *allocate(size_t size);
+  void deallocate(size_t size);
+
+  void *pushCleanup(CleanupKind kind, size_t dataSize);
+
 public:
   EHScopeStack() = default;
   ~EHScopeStack() = default;
 
   /// Push a lazily-created cleanup on the stack.
   template <class T, class... As> void pushCleanup(CleanupKind kind, As... a) {
-    cleanupStack.push_back(std::make_unique<T>(a...));
+    static_assert(alignof(T) <= ScopeStackAlignment,
+                  "Cleanup's alignment is too large.");
+    void *buffer = pushCleanup(kind, sizeof(T));
+    [[maybe_unused]] Cleanup *obj = new (buffer) T(a...);
   }
 
   void setCGF(CIRGenFunction *inCGF) { cgf = inCGF; }
 
-  size_t getStackDepth() const { return cleanupStack.size(); }
+  /// Pops a cleanup scope off the stack.  This is private to CIRGenCleanup.cpp.
+  void popCleanup();
+
+  /// Determines whether the exception-scopes stack is empty.
+  bool empty() const { return startOfData == endOfBuffer; }
+
+  /// An unstable reference to a scope-stack depth.  Invalidated by
+  /// pushes but not pops.
+  class iterator;
+
+  /// Returns an iterator pointing to the innermost EH scope.
+  iterator begin() const;
+
+  /// Create a stable reference to the top of the EH stack.  The
+  /// returned reference is valid until that scope is popped off the
+  /// stack.
+  stable_iterator stable_begin() const {
+    return stable_iterator(endOfBuffer - startOfData);
+  }
+
+  /// Create a stable reference to the bottom of the EH stack.
+  static stable_iterator stable_end() { return stable_iterator(0); }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index d3fcac1..53ab04e 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1444,6 +1444,27 @@ cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
 }
 
 //===----------------------------------------------------------------------===//
+// VTableAddrPointOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+cir::VTableAddrPointOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  StringRef name = getName();
+
+  // Verify that the result type underlying pointer type matches the type of
+  // the referenced cir.global or cir.func op.
+  auto op = symbolTable.lookupNearestSymbolFrom<GlobalOp>(*this, getNameAttr());
+  if (!op)
+    return emitOpError("'")
+           << name << "' does not reference a valid cir.global";
+  std::optional<mlir::Attribute> init = op.getInitialValue();
+  if (!init)
+    return success();
+  assert(!cir::MissingFeatures::vtableInitializer());
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
 // FuncOp
 //===----------------------------------------------------------------------===//
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 7e1c9fb..43a1b51 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2143,6 +2143,11 @@ void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) {
           module->getAttr(cir::CIRDialect::getTripleAttrName()))
     module->setAttr(mlir::LLVM::LLVMDialect::getTargetTripleAttrName(),
                     tripleAttr);
+
+  if (mlir::Attribute asmAttr =
+          module->getAttr(cir::CIRDialect::getModuleLevelAsmAttrName()))
+    module->setAttr(mlir::LLVM::LLVMDialect::getModuleLevelAsmAttrName(),
+                    asmAttr);
 }
 
 void ConvertCIRToLLVMPass::runOnOperation() {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a648bde..071667a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5985,8 +5985,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
 
     // Create a temporary array to hold the sizes of local pointer arguments
     // for the block. \p First is the position of the first size argument.
-    auto CreateArrayForSizeVar = [=](unsigned First)
-        -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
+    auto CreateArrayForSizeVar =
+        [=](unsigned First) -> std::pair<llvm::Value *, llvm::Value *> {
       llvm::APInt ArraySize(32, NumArgs - First);
       QualType SizeArrayTy = getContext().getConstantArrayType(
           getContext().getSizeType(), ArraySize, nullptr,
@@ -5999,9 +5999,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       // actually the Alloca ascasted to the default AS, hence the
       // stripPointerCasts()
       llvm::Value *Alloca = TmpPtr->stripPointerCasts();
-      llvm::Value *TmpSize = EmitLifetimeStart(
-          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), Alloca);
       llvm::Value *ElemPtr;
+      EmitLifetimeStart(Alloca);
       // Each of the following arguments specifies the size of the corresponding
       // argument passed to the enqueued block.
       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
@@ -6018,7 +6017,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       }
       // Return the Alloca itself rather than a potential ascast as this is only
       // used by the paired EmitLifetimeEnd.
-      return {ElemPtr, TmpSize, Alloca};
+      return {ElemPtr, Alloca};
     };
 
     // Could have events and/or varargs.
@@ -6030,7 +6029,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       llvm::Value *Kernel =
           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
-      auto [ElemPtr, TmpSize, TmpPtr] = CreateArrayForSizeVar(4);
+      auto [ElemPtr, TmpPtr] = CreateArrayForSizeVar(4);
 
       // Create a vector of the arguments, as well as a constant value to
       // express to the runtime the number of variadic arguments.
@@ -6045,8 +6044,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
       auto Call = RValue::get(
           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
-      if (TmpSize)
-        EmitLifetimeEnd(TmpSize, TmpPtr);
+      EmitLifetimeEnd(TmpPtr);
       return Call;
     }
     // Any calls now have event arguments passed.
@@ -6111,15 +6109,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       ArgTys.push_back(Int32Ty);
       Name = "__enqueue_kernel_events_varargs";
 
-      auto [ElemPtr, TmpSize, TmpPtr] = CreateArrayForSizeVar(7);
+      auto [ElemPtr, TmpPtr] = CreateArrayForSizeVar(7);
       Args.push_back(ElemPtr);
       ArgTys.push_back(ElemPtr->getType());
 
       llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
       auto Call = RValue::get(
           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
-      if (TmpSize)
-        EmitLifetimeEnd(TmpSize, TmpPtr);
+      EmitLifetimeEnd(TmpPtr);
       return Call;
     }
     llvm_unreachable("Unexpected enqueue_kernel signature");
diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h
index 96fe046..2dd320d 100644
--- a/clang/lib/CodeGen/CGCXXABI.h
+++ b/clang/lib/CodeGen/CGCXXABI.h
@@ -294,14 +294,22 @@ public:
                                              Address Value,
                                              QualType SrcRecordTy) = 0;
 
+  struct ExactDynamicCastInfo {
+    bool RequiresCastToPrimaryBase;
+    CharUnits Offset;
+  };
+
+  virtual std::optional<ExactDynamicCastInfo>
+  getExactDynamicCastInfo(QualType SrcRecordTy, QualType DestTy,
+                          QualType DestRecordTy) = 0;
+
   /// Emit a dynamic_cast from SrcRecordTy to DestRecordTy. The cast fails if
   /// the dynamic type of Value is not exactly DestRecordTy.
-  virtual llvm::Value *emitExactDynamicCast(CodeGenFunction &CGF, Address Value,
-                                            QualType SrcRecordTy,
-                                            QualType DestTy,
-                                            QualType DestRecordTy,
-                                            llvm::BasicBlock *CastSuccess,
-                                            llvm::BasicBlock *CastFail) = 0;
+  virtual llvm::Value *emitExactDynamicCast(
+      CodeGenFunction &CGF, Address Value, QualType SrcRecordTy,
+      QualType DestTy, QualType DestRecordTy,
+      const ExactDynamicCastInfo &CastInfo, llvm::BasicBlock *CastSuccess,
+      llvm::BasicBlock *CastFail) = 0;
 
   virtual bool EmitBadCastCall(CodeGenFunction &CGF) = 0;
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d9bd443..6e0c2c1 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4319,10 +4319,7 @@ static void emitWriteback(CodeGenFunction &CGF,
 
   if (writeback.WritebackExpr) {
     CGF.EmitIgnoredExpr(writeback.WritebackExpr);
-
-    if (writeback.LifetimeSz)
-      CGF.EmitLifetimeEnd(writeback.LifetimeSz,
-                          writeback.Temporary.getBasePointer());
+    CGF.EmitLifetimeEnd(writeback.Temporary.getBasePointer());
     return;
   }
 
@@ -5282,7 +5279,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // If the call returns a temporary with struct return, create a temporary
   // alloca to hold the result, unless one is given to us.
   Address SRetPtr = Address::invalid();
-  llvm::Value *UnusedReturnSizePtr = nullptr;
+  bool NeedSRetLifetimeEnd = false;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
     // For virtual function pointer thunks and musttail calls, we must always
     // forward an incoming SRet pointer to the callee, because a local alloca
@@ -5296,11 +5293,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       SRetPtr = ReturnValue.getAddress();
     } else {
       SRetPtr = CreateMemTempWithoutCast(RetTy, "tmp");
-      if (HaveInsertPoint() && ReturnValue.isUnused()) {
-        llvm::TypeSize size =
-            CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(RetTy));
-        UnusedReturnSizePtr = EmitLifetimeStart(size, SRetPtr.getBasePointer());
-      }
+      if (HaveInsertPoint() && ReturnValue.isUnused())
+        NeedSRetLifetimeEnd = EmitLifetimeStart(SRetPtr.getBasePointer());
     }
     if (IRFunctionArgs.hasSRetArg()) {
       // A mismatch between the allocated return value's AS and the target's
@@ -5484,15 +5478,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         Val = Builder.CreateFreeze(Val);
       IRCallArgs[FirstIRArg] = Val;
 
-      // Emit lifetime markers for the temporary alloca.
-      llvm::TypeSize ByvalTempElementSize =
-          CGM.getDataLayout().getTypeAllocSize(AI.getElementType());
-      llvm::Value *LifetimeSize =
-          EmitLifetimeStart(ByvalTempElementSize, AI.getPointer());
-
-      // Add cleanup code to emit the end lifetime marker after the call.
-      if (LifetimeSize) // In case we disabled lifetime markers.
-        CallLifetimeEndAfterCall.emplace_back(AI, LifetimeSize);
+      // Emit lifetime markers for the temporary alloca and add cleanup code to
+      // emit the end lifetime marker after the call.
+      if (EmitLifetimeStart(AI.getPointer()))
+        CallLifetimeEndAfterCall.emplace_back(AI);
 
       // Generate the copy.
       I->copyInto(*this, AI);
@@ -5653,9 +5642,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       auto unpaddedCoercionType = ArgInfo.getUnpaddedCoerceAndExpandType();
       auto *unpaddedStruct = dyn_cast<llvm::StructType>(unpaddedCoercionType);
 
-      llvm::Value *tempSize = nullptr;
       Address addr = Address::invalid();
       RawAddress AllocaAddr = RawAddress::invalid();
+      bool NeedLifetimeEnd = false;
       if (I->isAggregate()) {
         addr = I->hasLValue() ? I->getKnownLValue().getAddress()
                               : I->getKnownRValue().getAggregateAddress();
@@ -5665,7 +5654,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         assert(RV.isScalar()); // complex should always just be direct
 
         llvm::Type *scalarType = RV.getScalarVal()->getType();
-        auto scalarSize = CGM.getDataLayout().getTypeAllocSize(scalarType);
         auto scalarAlign = CGM.getDataLayout().getPrefTypeAlign(scalarType);
 
         // Materialize to a temporary.
@@ -5674,7 +5662,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                     layout->getAlignment(), scalarAlign)),
                                 "tmp",
                                 /*ArraySize=*/nullptr, &AllocaAddr);
-        tempSize = EmitLifetimeStart(scalarSize, AllocaAddr.getPointer());
+        NeedLifetimeEnd = EmitLifetimeStart(AllocaAddr.getPointer());
 
         Builder.CreateStore(RV.getScalarVal(), addr);
       }
@@ -5699,10 +5687,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       }
       assert(IRArgPos == FirstIRArg + NumIRArgs);
 
-      if (tempSize) {
-        EmitLifetimeEnd(tempSize, AllocaAddr.getPointer());
-      }
-
+      if (NeedLifetimeEnd)
+        EmitLifetimeEnd(AllocaAddr.getPointer());
       break;
     }
 
@@ -5871,9 +5857,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // can't depend on being inside of an ExprWithCleanups, so we need to manually
   // pop this cleanup later on. Being eager about this is OK, since this
   // temporary is 'invisible' outside of the callee.
-  if (UnusedReturnSizePtr)
-    pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetPtr,
-                                         UnusedReturnSizePtr);
+  if (NeedSRetLifetimeEnd)
+    pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetPtr);
 
   llvm::BasicBlock *InvokeDest = CannotThrow ? nullptr : getInvokeDest();
 
@@ -6007,7 +5992,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // insertion point; this allows the rest of IRGen to discard
   // unreachable code.
   if (CI->doesNotReturn()) {
-    if (UnusedReturnSizePtr)
+    if (NeedSRetLifetimeEnd)
       PopCleanupBlock();
 
     // Strip away the noreturn attribute to better diagnose unreachable UB.
@@ -6122,7 +6107,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       case ABIArgInfo::InAlloca:
       case ABIArgInfo::Indirect: {
         RValue ret = convertTempToRValue(SRetPtr, RetTy, SourceLocation());
-        if (UnusedReturnSizePtr)
+        if (NeedSRetLifetimeEnd)
           PopCleanupBlock();
         return ret;
       }
diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h
index 0b4e3f9..3157b7f 100644
--- a/clang/lib/CodeGen/CGCall.h
+++ b/clang/lib/CodeGen/CGCall.h
@@ -289,9 +289,6 @@ public:
     /// An Expression (optional) that performs the writeback with any required
     /// casting.
     const Expr *WritebackExpr;
-
-    // Size for optional lifetime end on the temporary.
-    llvm::Value *LifetimeSz;
   };
 
   struct CallArgCleanup {
@@ -321,9 +318,8 @@ public:
   }
 
   void addWriteback(LValue srcLV, Address temporary, llvm::Value *toUse,
-                    const Expr *writebackExpr = nullptr,
-                    llvm::Value *lifetimeSz = nullptr) {
-    Writeback writeback = {srcLV, temporary, toUse, writebackExpr, lifetimeSz};
+                    const Expr *writebackExpr = nullptr) {
+    Writeback writeback = {srcLV, temporary, toUse, writebackExpr};
     Writebacks.push_back(writeback);
   }
 
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index ff2dada..0cade0d 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1351,30 +1351,27 @@ void CodeGenFunction::EmitAutoVarDecl(const VarDecl &D) {
 }
 
 /// Emit a lifetime.begin marker if some criteria are satisfied.
-/// \return a pointer to the temporary size Value if a marker was emitted, null
-/// otherwise
-llvm::Value *CodeGenFunction::EmitLifetimeStart(llvm::TypeSize Size,
-                                                llvm::Value *Addr) {
+/// \return whether the marker was emitted.
+bool CodeGenFunction::EmitLifetimeStart(llvm::Value *Addr) {
   if (!ShouldEmitLifetimeMarkers)
-    return nullptr;
+    return false;
 
   assert(Addr->getType()->getPointerAddressSpace() ==
              CGM.getDataLayout().getAllocaAddrSpace() &&
          "Pointer should be in alloca address space");
-  llvm::Value *SizeV = llvm::ConstantInt::get(
-      Int64Ty, Size.isScalable() ? -1 : Size.getFixedValue());
-  llvm::CallInst *C =
-      Builder.CreateCall(CGM.getLLVMLifetimeStartFn(), {SizeV, Addr});
+  llvm::CallInst *C = Builder.CreateCall(CGM.getLLVMLifetimeStartFn(), {Addr});
   C->setDoesNotThrow();
-  return SizeV;
+  return true;
 }
 
-void CodeGenFunction::EmitLifetimeEnd(llvm::Value *Size, llvm::Value *Addr) {
+void CodeGenFunction::EmitLifetimeEnd(llvm::Value *Addr) {
+  if (!ShouldEmitLifetimeMarkers)
+    return;
+
   assert(Addr->getType()->getPointerAddressSpace() ==
              CGM.getDataLayout().getAllocaAddrSpace() &&
          "Pointer should be in alloca address space");
-  llvm::CallInst *C =
-      Builder.CreateCall(CGM.getLLVMLifetimeEndFn(), {Size, Addr});
+  llvm::CallInst *C = Builder.CreateCall(CGM.getLLVMLifetimeEndFn(), {Addr});
   C->setDoesNotThrow();
 }
 
@@ -1632,9 +1629,8 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
         // is rare.
         if (!Bypasses.IsBypassed(&D) &&
             !(!getLangOpts().CPlusPlus && hasLabelBeenSeenInCurrentScope())) {
-          llvm::TypeSize Size = CGM.getDataLayout().getTypeAllocSize(allocaTy);
-          emission.SizeForLifetimeMarkers =
-              EmitLifetimeStart(Size, AllocaAddr.getPointer());
+          emission.UseLifetimeMarkers =
+              EmitLifetimeStart(AllocaAddr.getPointer());
         }
       } else {
         assert(!emission.useLifetimeMarkers());
@@ -1727,9 +1723,8 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
 
   // Make sure we call @llvm.lifetime.end.
   if (emission.useLifetimeMarkers())
-    EHStack.pushCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker,
-                                         emission.getOriginalAllocatedAddress(),
-                                         emission.getSizeForLifetimeMarkers());
+    EHStack.pushCleanup<CallLifetimeEnd>(
+        NormalEHLifetimeMarker, emission.getOriginalAllocatedAddress());
 
   // Analogous to lifetime markers, we use a 'cleanup' to emit fake.use
   // calls for local variables. We are exempting volatile variables and
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5a3d4e4..f1affef 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -588,11 +588,9 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) {
   } else {
     switch (M->getStorageDuration()) {
     case SD_Automatic:
-      if (auto *Size = EmitLifetimeStart(
-              CGM.getDataLayout().getTypeAllocSize(Alloca.getElementType()),
-              Alloca.getPointer())) {
+      if (EmitLifetimeStart(Alloca.getPointer())) {
         pushCleanupAfterFullExpr<CallLifetimeEnd>(NormalEHLifetimeMarker,
-                                                  Alloca, Size);
+                                                  Alloca);
       }
       break;
 
@@ -623,11 +621,8 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) {
             Block, llvm::BasicBlock::iterator(Block->back())));
       }
 
-      if (auto *Size = EmitLifetimeStart(
-              CGM.getDataLayout().getTypeAllocSize(Alloca.getElementType()),
-              Alloca.getPointer())) {
-        pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, Alloca,
-                                             Size);
+      if (EmitLifetimeStart(Alloca.getPointer())) {
+        pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, Alloca);
       }
 
       if (OldConditional) {
@@ -3789,33 +3784,50 @@ void CodeGenFunction::EmitCheck(
   Branch->setMetadata(llvm::LLVMContext::MD_prof, Node);
   EmitBlock(Handlers);
 
+  // Clear arguments for the MinimalRuntime handler.
+  if (CGM.getCodeGenOpts().SanitizeMinimalRuntime) {
+    switch (CheckHandler) {
+    case SanitizerHandler::TypeMismatch:
+      // Pass value pointer only. It adds minimal overhead.
+      StaticArgs = {};
+      assert(DynamicArgs.size() == 1);
+      break;
+    default:
+      // No arguments for other checks.
+      StaticArgs = {};
+      DynamicArgs = {};
+      break;
+    }
+  }
+
   // Handler functions take an i8* pointing to the (handler-specific) static
   // information block, followed by a sequence of intptr_t arguments
   // representing operand values.
   SmallVector<llvm::Value *, 4> Args;
   SmallVector<llvm::Type *, 4> ArgTypes;
-  if (!CGM.getCodeGenOpts().SanitizeMinimalRuntime) {
-    Args.reserve(DynamicArgs.size() + 1);
-    ArgTypes.reserve(DynamicArgs.size() + 1);
-
-    // Emit handler arguments and create handler function type.
-    if (!StaticArgs.empty()) {
-      llvm::Constant *Info = llvm::ConstantStruct::getAnon(StaticArgs);
-      auto *InfoPtr = new llvm::GlobalVariable(
-          CGM.getModule(), Info->getType(), false,
-          llvm::GlobalVariable::PrivateLinkage, Info, "", nullptr,
-          llvm::GlobalVariable::NotThreadLocal,
-          CGM.getDataLayout().getDefaultGlobalsAddressSpace());
-      InfoPtr->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-      CGM.getSanitizerMetadata()->disableSanitizerForGlobal(InfoPtr);
-      Args.push_back(InfoPtr);
-      ArgTypes.push_back(Args.back()->getType());
-    }
 
-    for (llvm::Value *DynamicArg : DynamicArgs) {
-      Args.push_back(EmitCheckValue(DynamicArg));
-      ArgTypes.push_back(IntPtrTy);
-    }
+  Args.reserve(DynamicArgs.size() + 1);
+  ArgTypes.reserve(DynamicArgs.size() + 1);
+
+  // Emit handler arguments and create handler function type.
+  if (!StaticArgs.empty()) {
+    llvm::Constant *Info = llvm::ConstantStruct::getAnon(StaticArgs);
+    auto *InfoPtr = new llvm::GlobalVariable(
+        CGM.getModule(), Info->getType(),
+        // Non-constant global is used in a handler to deduplicate reports.
+        // TODO: change deduplication logic and make it constant.
+        /*isConstant=*/false, llvm::GlobalVariable::PrivateLinkage, Info, "",
+        nullptr, llvm::GlobalVariable::NotThreadLocal,
+        CGM.getDataLayout().getDefaultGlobalsAddressSpace());
+    InfoPtr->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+    CGM.getSanitizerMetadata()->disableSanitizerForGlobal(InfoPtr);
+    Args.push_back(InfoPtr);
+    ArgTypes.push_back(Args.back()->getType());
+  }
+
+  for (llvm::Value *DynamicArg : DynamicArgs) {
+    Args.push_back(EmitCheckValue(DynamicArg));
+    ArgTypes.push_back(IntPtrTy);
   }
 
   llvm::FunctionType *FnType =
@@ -5767,13 +5779,10 @@ LValue CodeGenFunction::EmitHLSLOutArgExpr(const HLSLOutArgExpr *E,
   llvm::Value *Addr = TempLV.getAddress().getBasePointer();
   llvm::Type *ElTy = ConvertTypeForMem(TempLV.getType());
 
-  llvm::TypeSize Sz = CGM.getDataLayout().getTypeAllocSize(ElTy);
-
-  llvm::Value *LifetimeSize = EmitLifetimeStart(Sz, Addr);
+  EmitLifetimeStart(Addr);
 
   Address TmpAddr(Addr, ElTy, TempLV.getAlignment());
-  Args.addWriteback(BaseLV, TmpAddr, nullptr, E->getWritebackCast(),
-                    LifetimeSize);
+  Args.addWriteback(BaseLV, TmpAddr, nullptr, E->getWritebackCast());
   Args.add(RValue::get(TmpAddr, *this), Ty);
   return TempLV;
 }
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index cad6731..e2f11b86 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -300,16 +300,12 @@ void AggExprEmitter::withReturnValueSlot(
   Address RetAddr = Address::invalid();
 
   EHScopeStack::stable_iterator LifetimeEndBlock;
-  llvm::Value *LifetimeSizePtr = nullptr;
   llvm::IntrinsicInst *LifetimeStartInst = nullptr;
   if (!UseTemp) {
     RetAddr = Dest.getAddress();
   } else {
     RetAddr = CGF.CreateMemTempWithoutCast(RetTy, "tmp");
-    llvm::TypeSize Size =
-        CGF.CGM.getDataLayout().getTypeAllocSize(CGF.ConvertTypeForMem(RetTy));
-    LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAddr.getBasePointer());
-    if (LifetimeSizePtr) {
+    if (CGF.EmitLifetimeStart(RetAddr.getBasePointer())) {
       LifetimeStartInst =
           cast<llvm::IntrinsicInst>(std::prev(Builder.GetInsertPoint()));
       assert(LifetimeStartInst->getIntrinsicID() ==
@@ -317,7 +313,7 @@ void AggExprEmitter::withReturnValueSlot(
              "Last insertion wasn't a lifetime.start?");
 
       CGF.pushFullExprCleanup<CodeGenFunction::CallLifetimeEnd>(
-          NormalEHLifetimeMarker, RetAddr, LifetimeSizePtr);
+          NormalEHLifetimeMarker, RetAddr);
       LifetimeEndBlock = CGF.EHStack.stable_begin();
     }
   }
@@ -338,7 +334,7 @@ void AggExprEmitter::withReturnValueSlot(
     // Since we're not guaranteed to be in an ExprWithCleanups, clean up
     // eagerly.
     CGF.DeactivateCleanupBlock(LifetimeEndBlock, LifetimeStartInst);
-    CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAddr.getBasePointer());
+    CGF.EmitLifetimeEnd(RetAddr.getBasePointer());
   }
 }
 
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index fef1baf..49d5d8a 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2295,6 +2295,18 @@ llvm::Value *CodeGenFunction::EmitDynamicCast(Address ThisAddr,
                  CGM.getCXXABI().shouldEmitExactDynamicCast(DestRecordTy) &&
                  !getLangOpts().PointerAuthCalls;
 
+  std::optional<CGCXXABI::ExactDynamicCastInfo> ExactCastInfo;
+  if (IsExact) {
+    ExactCastInfo = CGM.getCXXABI().getExactDynamicCastInfo(SrcRecordTy, DestTy,
+                                                            DestRecordTy);
+    if (!ExactCastInfo) {
+      llvm::Value *NullValue = EmitDynamicCastToNull(*this, DestTy);
+      if (!Builder.GetInsertBlock())
+        EmitBlock(createBasicBlock("dynamic_cast.unreachable"));
+      return NullValue;
+    }
+  }
+
   // C++ [expr.dynamic.cast]p4:
   //   If the value of v is a null pointer value in the pointer case, the result
   //   is the null pointer value of type T.
@@ -2322,7 +2334,8 @@ llvm::Value *CodeGenFunction::EmitDynamicCast(Address ThisAddr,
     // If the destination type is effectively final, this pointer points to the
     // right type if and only if its vptr has the right value.
     Value = CGM.getCXXABI().emitExactDynamicCast(
-        *this, ThisAddr, SrcRecordTy, DestTy, DestRecordTy, CastEnd, CastNull);
+        *this, ThisAddr, SrcRecordTy, DestTy, DestRecordTy, *ExactCastInfo,
+        CastEnd, CastNull);
   } else {
     assert(DestRecordTy->isRecordType() &&
            "destination type must be a record type!");
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 44931d0..7eed4ee 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -4183,9 +4183,8 @@ Value *ScalarExprEmitter::EmitOverflowCheckedBinOp(const BinOpInfo &Ops) {
   return phi;
 }
 
-/// Emit pointer + index arithmetic.
-static Value *emitPointerArithmetic(CodeGenFunction &CGF,
-                                    const BinOpInfo &op,
+/// This function is used for BO_Add/BO_Sub/BO_AddAssign/BO_SubAssign.
+static Value *emitPointerArithmetic(CodeGenFunction &CGF, const BinOpInfo &op,
                                     bool isSubtraction) {
   // Must have binary (not unary) expr here.  Unary pointer
   // increment/decrement doesn't use this path.
@@ -4202,11 +4201,19 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF,
     std::swap(pointerOperand, indexOperand);
   }
 
+  return CGF.EmitPointerArithmetic(expr, pointerOperand, pointer, indexOperand,
+                                   index, isSubtraction);
+}
+
+/// Emit pointer + index arithmetic.
+llvm::Value *CodeGenFunction::EmitPointerArithmetic(
+    const BinaryOperator *BO, Expr *pointerOperand, llvm::Value *pointer,
+    Expr *indexOperand, llvm::Value *index, bool isSubtraction) {
   bool isSigned = indexOperand->getType()->isSignedIntegerOrEnumerationType();
 
   unsigned width = cast<llvm::IntegerType>(index->getType())->getBitWidth();
-  auto &DL = CGF.CGM.getDataLayout();
-  auto PtrTy = cast<llvm::PointerType>(pointer->getType());
+  auto &DL = CGM.getDataLayout();
+  auto *PtrTy = cast<llvm::PointerType>(pointer->getType());
 
   // Some versions of glibc and gcc use idioms (particularly in their malloc
   // routines) that add a pointer-sized integer (known to be a pointer value)
@@ -4227,79 +4234,77 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF,
   //
   // Note that we do not suppress the pointer overflow check in this case.
   if (BinaryOperator::isNullPointerArithmeticExtension(
-          CGF.getContext(), op.Opcode, expr->getLHS(), expr->getRHS())) {
-    Value *Ptr = CGF.Builder.CreateIntToPtr(index, pointer->getType());
-    if (CGF.getLangOpts().PointerOverflowDefined ||
-        !CGF.SanOpts.has(SanitizerKind::PointerOverflow) ||
-        NullPointerIsDefined(CGF.Builder.GetInsertBlock()->getParent(),
+          getContext(), BO->getOpcode(), pointerOperand, indexOperand)) {
+    llvm::Value *Ptr = Builder.CreateIntToPtr(index, pointer->getType());
+    if (getLangOpts().PointerOverflowDefined ||
+        !SanOpts.has(SanitizerKind::PointerOverflow) ||
+        NullPointerIsDefined(Builder.GetInsertBlock()->getParent(),
                              PtrTy->getPointerAddressSpace()))
       return Ptr;
     // The inbounds GEP of null is valid iff the index is zero.
     auto CheckOrdinal = SanitizerKind::SO_PointerOverflow;
     auto CheckHandler = SanitizerHandler::PointerOverflow;
-    SanitizerDebugLocation SanScope(&CGF, {CheckOrdinal}, CheckHandler);
-    Value *IsZeroIndex = CGF.Builder.CreateIsNull(index);
-    llvm::Constant *StaticArgs[] = {
-        CGF.EmitCheckSourceLocation(op.E->getExprLoc())};
+    SanitizerDebugLocation SanScope(this, {CheckOrdinal}, CheckHandler);
+    llvm::Value *IsZeroIndex = Builder.CreateIsNull(index);
+    llvm::Constant *StaticArgs[] = {EmitCheckSourceLocation(BO->getExprLoc())};
     llvm::Type *IntPtrTy = DL.getIntPtrType(PtrTy);
-    Value *IntPtr = llvm::Constant::getNullValue(IntPtrTy);
-    Value *ComputedGEP = CGF.Builder.CreateZExtOrTrunc(index, IntPtrTy);
-    Value *DynamicArgs[] = {IntPtr, ComputedGEP};
-    CGF.EmitCheck({{IsZeroIndex, CheckOrdinal}}, CheckHandler, StaticArgs,
-                  DynamicArgs);
+    llvm::Value *IntPtr = llvm::Constant::getNullValue(IntPtrTy);
+    llvm::Value *ComputedGEP = Builder.CreateZExtOrTrunc(index, IntPtrTy);
+    llvm::Value *DynamicArgs[] = {IntPtr, ComputedGEP};
+    EmitCheck({{IsZeroIndex, CheckOrdinal}}, CheckHandler, StaticArgs,
+              DynamicArgs);
     return Ptr;
   }
 
   if (width != DL.getIndexTypeSizeInBits(PtrTy)) {
     // Zero-extend or sign-extend the pointer value according to
     // whether the index is signed or not.
-    index = CGF.Builder.CreateIntCast(index, DL.getIndexType(PtrTy), isSigned,
-                                      "idx.ext");
+    index = Builder.CreateIntCast(index, DL.getIndexType(PtrTy), isSigned,
+                                  "idx.ext");
   }
 
   // If this is subtraction, negate the index.
   if (isSubtraction)
-    index = CGF.Builder.CreateNeg(index, "idx.neg");
+    index = Builder.CreateNeg(index, "idx.neg");
 
-  if (CGF.SanOpts.has(SanitizerKind::ArrayBounds))
-    CGF.EmitBoundsCheck(op.E, pointerOperand, index, indexOperand->getType(),
-                        /*Accessed*/ false);
+  if (SanOpts.has(SanitizerKind::ArrayBounds))
+    EmitBoundsCheck(BO, pointerOperand, index, indexOperand->getType(),
+                    /*Accessed*/ false);
 
-  const PointerType *pointerType
-    = pointerOperand->getType()->getAs<PointerType>();
+  const PointerType *pointerType =
+      pointerOperand->getType()->getAs<PointerType>();
   if (!pointerType) {
     QualType objectType = pointerOperand->getType()
-                                        ->castAs<ObjCObjectPointerType>()
-                                        ->getPointeeType();
-    llvm::Value *objectSize
-      = CGF.CGM.getSize(CGF.getContext().getTypeSizeInChars(objectType));
+                              ->castAs<ObjCObjectPointerType>()
+                              ->getPointeeType();
+    llvm::Value *objectSize =
+        CGM.getSize(getContext().getTypeSizeInChars(objectType));
 
-    index = CGF.Builder.CreateMul(index, objectSize);
+    index = Builder.CreateMul(index, objectSize);
 
-    Value *result =
-        CGF.Builder.CreateGEP(CGF.Int8Ty, pointer, index, "add.ptr");
-    return CGF.Builder.CreateBitCast(result, pointer->getType());
+    llvm::Value *result = Builder.CreateGEP(Int8Ty, pointer, index, "add.ptr");
+    return Builder.CreateBitCast(result, pointer->getType());
   }
 
   QualType elementType = pointerType->getPointeeType();
-  if (const VariableArrayType *vla
-        = CGF.getContext().getAsVariableArrayType(elementType)) {
+  if (const VariableArrayType *vla =
+          getContext().getAsVariableArrayType(elementType)) {
     // The element count here is the total number of non-VLA elements.
-    llvm::Value *numElements = CGF.getVLASize(vla).NumElts;
+    llvm::Value *numElements = getVLASize(vla).NumElts;
 
     // Effectively, the multiply by the VLA size is part of the GEP.
     // GEP indexes are signed, and scaling an index isn't permitted to
     // signed-overflow, so we use the same semantics for our explicit
     // multiply.  We suppress this if overflow is not undefined behavior.
-    llvm::Type *elemTy = CGF.ConvertTypeForMem(vla->getElementType());
-    if (CGF.getLangOpts().PointerOverflowDefined) {
-      index = CGF.Builder.CreateMul(index, numElements, "vla.index");
-      pointer = CGF.Builder.CreateGEP(elemTy, pointer, index, "add.ptr");
+    llvm::Type *elemTy = ConvertTypeForMem(vla->getElementType());
+    if (getLangOpts().PointerOverflowDefined) {
+      index = Builder.CreateMul(index, numElements, "vla.index");
+      pointer = Builder.CreateGEP(elemTy, pointer, index, "add.ptr");
     } else {
-      index = CGF.Builder.CreateNSWMul(index, numElements, "vla.index");
-      pointer = CGF.EmitCheckedInBoundsGEP(
-          elemTy, pointer, index, isSigned, isSubtraction, op.E->getExprLoc(),
-          "add.ptr");
+      index = Builder.CreateNSWMul(index, numElements, "vla.index");
+      pointer =
+          EmitCheckedInBoundsGEP(elemTy, pointer, index, isSigned,
+                                 isSubtraction, BO->getExprLoc(), "add.ptr");
     }
     return pointer;
   }
@@ -4309,16 +4314,15 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF,
   // future proof.
   llvm::Type *elemTy;
   if (elementType->isVoidType() || elementType->isFunctionType())
-    elemTy = CGF.Int8Ty;
+    elemTy = Int8Ty;
   else
-    elemTy = CGF.ConvertTypeForMem(elementType);
+    elemTy = ConvertTypeForMem(elementType);
 
-  if (CGF.getLangOpts().PointerOverflowDefined)
-    return CGF.Builder.CreateGEP(elemTy, pointer, index, "add.ptr");
+  if (getLangOpts().PointerOverflowDefined)
+    return Builder.CreateGEP(elemTy, pointer, index, "add.ptr");
 
-  return CGF.EmitCheckedInBoundsGEP(
-      elemTy, pointer, index, isSigned, isSubtraction, op.E->getExprLoc(),
-      "add.ptr");
+  return EmitCheckedInBoundsGEP(elemTy, pointer, index, isSigned, isSubtraction,
+                                BO->getExprLoc(), "add.ptr");
 }
 
 // Construct an fmuladd intrinsic to represent a fused mul-add of MulOp and
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index f64ac20..918cb3e 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -103,13 +103,6 @@ llvm::Triple::ArchType CGHLSLRuntime::getArch() {
   return CGM.getTarget().getTriple().getArch();
 }
 
-// Returns true if the type is an HLSL resource class or an array of them
-static bool isResourceRecordTypeOrArrayOf(const clang::Type *Ty) {
-  while (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty))
-    Ty = CAT->getArrayElementTypeNoTypeQual();
-  return Ty->isHLSLResourceRecord();
-}
-
 // Emits constant global variables for buffer constants declarations
 // and creates metadata linking the constant globals with the buffer global.
 void CGHLSLRuntime::emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl,
@@ -146,7 +139,7 @@ void CGHLSLRuntime::emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl,
     if (VDTy.getAddressSpace() != LangAS::hlsl_constant) {
       if (VD->getStorageClass() == SC_Static ||
           VDTy.getAddressSpace() == LangAS::hlsl_groupshared ||
-          isResourceRecordTypeOrArrayOf(VDTy.getTypePtr())) {
+          VDTy->isHLSLResourceRecord() || VDTy->isHLSLResourceRecordArray()) {
         // Emit static and groupshared variables and resource classes inside
         // cbuffer as regular globals
         CGM.EmitGlobal(VD);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 6c32c98..bf16d72 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -701,14 +701,12 @@ public:
     bool isRedundantBeforeReturn() override { return true; }
 
     llvm::Value *Addr;
-    llvm::Value *Size;
 
   public:
-    CallLifetimeEnd(RawAddress addr, llvm::Value *size)
-        : Addr(addr.getPointer()), Size(size) {}
+    CallLifetimeEnd(RawAddress addr) : Addr(addr.getPointer()) {}
 
     void Emit(CodeGenFunction &CGF, Flags flags) override {
-      CGF.EmitLifetimeEnd(Size, Addr);
+      CGF.EmitLifetimeEnd(Addr);
     }
   };
 
@@ -3233,8 +3231,8 @@ public:
   void EmitSehTryScopeBegin();
   void EmitSehTryScopeEnd();
 
-  llvm::Value *EmitLifetimeStart(llvm::TypeSize Size, llvm::Value *Addr);
-  void EmitLifetimeEnd(llvm::Value *Size, llvm::Value *Addr);
+  bool EmitLifetimeStart(llvm::Value *Addr);
+  void EmitLifetimeEnd(llvm::Value *Addr);
 
   llvm::Value *EmitCXXNewExpr(const CXXNewExpr *E);
   void EmitCXXDeleteExpr(const CXXDeleteExpr *E);
@@ -3417,8 +3415,8 @@ public:
     /// initializer.
     bool IsConstantAggregate;
 
-    /// Non-null if we should use lifetime annotations.
-    llvm::Value *SizeForLifetimeMarkers;
+    /// True if lifetime markers should be used.
+    bool UseLifetimeMarkers;
 
     /// Address with original alloca instruction. Invalid if the variable was
     /// emitted as a global constant.
@@ -3432,20 +3430,14 @@ public:
     AutoVarEmission(const VarDecl &variable)
         : Variable(&variable), Addr(Address::invalid()), NRVOFlag(nullptr),
           IsEscapingByRef(false), IsConstantAggregate(false),
-          SizeForLifetimeMarkers(nullptr), AllocaAddr(RawAddress::invalid()) {}
+          UseLifetimeMarkers(false), AllocaAddr(RawAddress::invalid()) {}
 
     bool wasEmittedAsGlobal() const { return !Addr.isValid(); }
 
   public:
     static AutoVarEmission invalid() { return AutoVarEmission(Invalid()); }
 
-    bool useLifetimeMarkers() const {
-      return SizeForLifetimeMarkers != nullptr;
-    }
-    llvm::Value *getSizeForLifetimeMarkers() const {
-      assert(useLifetimeMarkers());
-      return SizeForLifetimeMarkers;
-    }
+    bool useLifetimeMarkers() const { return UseLifetimeMarkers; }
 
     /// Returns the raw, allocated address, which is not necessarily
     /// the address of the object itself. It is casted to default
@@ -5220,6 +5212,12 @@ public:
   /// operation is a subtraction.
   enum { NotSubtraction = false, IsSubtraction = true };
 
+  /// Emit pointer + index arithmetic.
+  llvm::Value *EmitPointerArithmetic(const BinaryOperator *BO,
+                                     Expr *pointerOperand, llvm::Value *pointer,
+                                     Expr *indexOperand, llvm::Value *index,
+                                     bool isSubtraction);
+
   /// Same as IRBuilder::CreateInBoundsGEP, but additionally emits a check to
   /// detect undefined behavior when the pointer overflow sanitizer is enabled.
   /// \p SignedIndices indicates whether any of the GEP indices are signed.
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 38aaceb..05fb137 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -2269,6 +2269,11 @@ struct CounterCoverageMappingBuilder
     // Track LHS True/False Decision.
     const auto DecisionLHS = MCDCBuilder.pop();
 
+    if (auto Gap =
+            findGapAreaBetween(getEnd(E->getLHS()), getStart(E->getRHS()))) {
+      fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), getRegionCounter(E));
+    }
+
     // Counter tracks the right hand side of a logical and operator.
     extendRegion(E->getRHS());
     propagateCounts(getRegionCounter(E), E->getRHS());
@@ -2330,6 +2335,11 @@ struct CounterCoverageMappingBuilder
     // Track LHS True/False Decision.
     const auto DecisionLHS = MCDCBuilder.pop();
 
+    if (auto Gap =
+            findGapAreaBetween(getEnd(E->getLHS()), getStart(E->getRHS()))) {
+      fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), getRegionCounter(E));
+    }
+
     // Counter tracks the right hand side of a logical or operator.
     extendRegion(E->getRHS());
     propagateCounts(getRegionCounter(E), E->getRHS());
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index aae1481..5ffc1ed 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -226,6 +226,10 @@ public:
     return hasUniqueVTablePointer(DestRecordTy);
   }
 
+  std::optional<ExactDynamicCastInfo>
+  getExactDynamicCastInfo(QualType SrcRecordTy, QualType DestTy,
+                          QualType DestRecordTy) override;
+
   llvm::Value *emitDynamicCastCall(CodeGenFunction &CGF, Address Value,
                                    QualType SrcRecordTy, QualType DestTy,
                                    QualType DestRecordTy,
@@ -234,6 +238,7 @@ public:
   llvm::Value *emitExactDynamicCast(CodeGenFunction &CGF, Address ThisAddr,
                                     QualType SrcRecordTy, QualType DestTy,
                                     QualType DestRecordTy,
+                                    const ExactDynamicCastInfo &CastInfo,
                                     llvm::BasicBlock *CastSuccess,
                                     llvm::BasicBlock *CastFail) override;
 
@@ -1681,10 +1686,11 @@ llvm::Value *ItaniumCXXABI::emitDynamicCastCall(
   return Value;
 }
 
-llvm::Value *ItaniumCXXABI::emitExactDynamicCast(
-    CodeGenFunction &CGF, Address ThisAddr, QualType SrcRecordTy,
-    QualType DestTy, QualType DestRecordTy, llvm::BasicBlock *CastSuccess,
-    llvm::BasicBlock *CastFail) {
+std::optional<CGCXXABI::ExactDynamicCastInfo>
+ItaniumCXXABI::getExactDynamicCastInfo(QualType SrcRecordTy, QualType DestTy,
+                                       QualType DestRecordTy) {
+  assert(shouldEmitExactDynamicCast(DestRecordTy));
+
   ASTContext &Context = getContext();
 
   // Find all the inheritance paths.
@@ -1722,41 +1728,56 @@ llvm::Value *ItaniumCXXABI::emitExactDynamicCast(
     if (!Offset)
       Offset = PathOffset;
     else if (Offset != PathOffset) {
-      // Base appears in at least two different places. Find the most-derived
-      // object and see if it's a DestDecl. Note that the most-derived object
-      // must be at least as aligned as this base class subobject, and must
-      // have a vptr at offset 0.
-      ThisAddr = Address(emitDynamicCastToVoid(CGF, ThisAddr, SrcRecordTy),
-                         CGF.VoidPtrTy, ThisAddr.getAlignment());
-      SrcDecl = DestDecl;
-      Offset = CharUnits::Zero();
-      break;
+      // Base appears in at least two different places.
+      return ExactDynamicCastInfo{/*RequiresCastToPrimaryBase=*/true,
+                                  CharUnits::Zero()};
     }
   }
+  if (!Offset)
+    return std::nullopt;
+  return ExactDynamicCastInfo{/*RequiresCastToPrimaryBase=*/false, *Offset};
+}
 
-  if (!Offset) {
-    // If there are no public inheritance paths, the cast always fails.
-    CGF.EmitBranch(CastFail);
-    return llvm::PoisonValue::get(CGF.VoidPtrTy);
-  }
+llvm::Value *ItaniumCXXABI::emitExactDynamicCast(
+    CodeGenFunction &CGF, Address ThisAddr, QualType SrcRecordTy,
+    QualType DestTy, QualType DestRecordTy,
+    const ExactDynamicCastInfo &ExactCastInfo, llvm::BasicBlock *CastSuccess,
+    llvm::BasicBlock *CastFail) {
+  const CXXRecordDecl *SrcDecl = SrcRecordTy->getAsCXXRecordDecl();
+  const CXXRecordDecl *DestDecl = DestRecordTy->getAsCXXRecordDecl();
+
+  llvm::Value *VTable = nullptr;
+  if (ExactCastInfo.RequiresCastToPrimaryBase) {
+    // Base appears in at least two different places. Find the most-derived
+    // object and see if it's a DestDecl. Note that the most-derived object
+    // must be at least as aligned as this base class subobject, and must
+    // have a vptr at offset 0.
+    llvm::Value *PrimaryBase =
+        emitDynamicCastToVoid(CGF, ThisAddr, SrcRecordTy);
+    ThisAddr = Address(PrimaryBase, CGF.VoidPtrTy, ThisAddr.getAlignment());
+    SrcDecl = DestDecl;
+    Address VTablePtrPtr = ThisAddr.withElementType(CGF.VoidPtrPtrTy);
+    VTable = CGF.Builder.CreateLoad(VTablePtrPtr, "vtable");
+  } else
+    VTable = CGF.GetVTablePtr(ThisAddr, CGF.UnqualPtrTy, SrcDecl);
 
   // Compare the vptr against the expected vptr for the destination type at
-  // this offset. Note that we do not know what type ThisAddr points to in
-  // the case where the derived class multiply inherits from the base class
-  // so we can't use GetVTablePtr, so we load the vptr directly instead.
-  llvm::Instruction *VPtr = CGF.Builder.CreateLoad(
-      ThisAddr.withElementType(CGF.VoidPtrPtrTy), "vtable");
-  CGM.DecorateInstructionWithTBAA(
-      VPtr, CGM.getTBAAVTablePtrAccessInfo(CGF.VoidPtrPtrTy));
-  llvm::Value *Success = CGF.Builder.CreateICmpEQ(
-      VPtr, getVTableAddressPoint(BaseSubobject(SrcDecl, *Offset), DestDecl));
-  llvm::Value *Result = ThisAddr.emitRawPointer(CGF);
-  if (!Offset->isZero())
-    Result = CGF.Builder.CreateInBoundsGEP(
-        CGF.CharTy, Result,
-        {llvm::ConstantInt::get(CGF.PtrDiffTy, -Offset->getQuantity())});
+  // this offset.
+  llvm::Constant *ExpectedVTable = getVTableAddressPoint(
+      BaseSubobject(SrcDecl, ExactCastInfo.Offset), DestDecl);
+  llvm::Value *Success = CGF.Builder.CreateICmpEQ(VTable, ExpectedVTable);
+  llvm::Value *AdjustedThisPtr = ThisAddr.emitRawPointer(CGF);
+
+  if (!ExactCastInfo.Offset.isZero()) {
+    CharUnits::QuantityType Offset = ExactCastInfo.Offset.getQuantity();
+    llvm::Constant *OffsetConstant =
+        llvm::ConstantInt::get(CGF.PtrDiffTy, -Offset);
+    AdjustedThisPtr = CGF.Builder.CreateInBoundsGEP(CGF.CharTy, AdjustedThisPtr,
+                                                    OffsetConstant);
+  }
+
   CGF.Builder.CreateCondBr(Success, CastSuccess, CastFail);
-  return Result;
+  return AdjustedThisPtr;
 }
 
 llvm::Value *ItaniumCXXABI::emitDynamicCastToVoid(CodeGenFunction &CGF,
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 700ffa4..e8d2451 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -158,9 +158,15 @@ public:
     // TODO: Add support for exact dynamic_casts.
     return false;
   }
+  std::optional<ExactDynamicCastInfo>
+  getExactDynamicCastInfo(QualType SrcRecordTy, QualType DestTy,
+                          QualType DestRecordTy) override {
+    llvm_unreachable("unsupported");
+  }
   llvm::Value *emitExactDynamicCast(CodeGenFunction &CGF, Address Value,
                                     QualType SrcRecordTy, QualType DestTy,
                                     QualType DestRecordTy,
+                                    const ExactDynamicCastInfo &CastInfo,
                                     llvm::BasicBlock *CastSuccess,
                                     llvm::BasicBlock *CastFail) override {
     llvm_unreachable("unsupported");
diff --git a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp
index 33a8d8f..1a1889a 100644
--- a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp
@@ -246,35 +246,26 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
     llvm::FunctionType *LLVMFuncTy =
         cast<llvm::FunctionType>(ConvertType(QualType(FuncTy, 0)));
 
+    bool VarArg = LLVMFuncTy->isVarArg();
     unsigned NParams = LLVMFuncTy->getNumParams();
     std::vector<Value *> Args;
-    Args.reserve(NParams + 3);
+    Args.reserve(NParams + 3 + VarArg);
     // The only real argument is the FuncRef
     Args.push_back(FuncRef);
 
     // Add the type information
-    auto addType = [this, &Args](llvm::Type *T) {
-      if (T->isVoidTy()) {
-        // Do nothing
-      } else if (T->isFloatingPointTy()) {
-        Args.push_back(ConstantFP::get(T, 0));
-      } else if (T->isIntegerTy()) {
-        Args.push_back(ConstantInt::get(T, 0));
-      } else if (T->isPointerTy()) {
-        Args.push_back(ConstantPointerNull::get(llvm::PointerType::get(
-            getLLVMContext(), T->getPointerAddressSpace())));
-      } else {
-        // TODO: Handle reference types. For now, we reject them in Sema.
-        llvm_unreachable("Unhandled type");
-      }
-    };
-
-    addType(LLVMFuncTy->getReturnType());
+    llvm::Type *RetType = LLVMFuncTy->getReturnType();
+    if (!RetType->isVoidTy()) {
+      Args.push_back(PoisonValue::get(RetType));
+    }
     // The token type indicates the boundary between return types and param
     // types.
     Args.push_back(PoisonValue::get(llvm::Type::getTokenTy(getLLVMContext())));
     for (unsigned i = 0; i < NParams; i++) {
-      addType(LLVMFuncTy->getParamType(i));
+      Args.push_back(PoisonValue::get(LLVMFuncTy->getParamType(i)));
+    }
+    if (VarArg) {
+      Args.push_back(PoisonValue::get(Builder.getPtrTy()));
     }
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_test_func);
     return Builder.CreateCall(Callee, Args);
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index e23d19d..b508709 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1051,18 +1051,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
                              /*NegAcc*/ true);
-  case X86::BI__builtin_ia32_vfmaddph:
-  case X86::BI__builtin_ia32_vfmaddps:
-  case X86::BI__builtin_ia32_vfmaddpd:
-  case X86::BI__builtin_ia32_vfmaddph256:
-  case X86::BI__builtin_ia32_vfmaddps256:
-  case X86::BI__builtin_ia32_vfmaddpd256:
   case X86::BI__builtin_ia32_vfmaddph512_mask:
   case X86::BI__builtin_ia32_vfmaddph512_maskz:
   case X86::BI__builtin_ia32_vfmaddph512_mask3:
-  case X86::BI__builtin_ia32_vfmaddbf16128:
-  case X86::BI__builtin_ia32_vfmaddbf16256:
-  case X86::BI__builtin_ia32_vfmaddbf16512:
   case X86::BI__builtin_ia32_vfmaddps512_mask:
   case X86::BI__builtin_ia32_vfmaddps512_maskz:
   case X86::BI__builtin_ia32_vfmaddps512_mask3:
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 586f287..8c0bba9 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -1012,6 +1012,7 @@ inferOffloadToolchains(Compilation &C, Action::OffloadKind Kind) {
     Arg *A = new Arg(Opt, C.getArgs().getArgString(Index), Index,
                      C.getArgs().MakeArgString(Triple.split("-").first),
                      C.getArgs().MakeArgString("--offload-arch=" + Arch));
+    A->claim();
     C.getArgs().append(A);
     C.getArgs().AddSynthesizedArg(A);
     Triples.insert(Triple);
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 25c6b5a..7667dbd 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -855,17 +855,30 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
 
 void ToolChain::addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args,
                                              ArgStringList &CmdArgs) const {
-  // Default to the <driver-path>/../lib directory. This works fine on the
-  // platforms that we have tested so far. We will probably have to re-fine
-  // this in the future. In particular, on some platforms, we may need to use
-  // lib64 instead of lib.
+  auto AddLibSearchPathIfExists = [&](const Twine &Path) {
+    // Linker may emit warnings about non-existing directories
+    if (!llvm::sys::fs::is_directory(Path))
+      return;
+
+    if (getTriple().isKnownWindowsMSVCEnvironment())
+      CmdArgs.push_back(Args.MakeArgString("-libpath:" + Path));
+    else
+      CmdArgs.push_back(Args.MakeArgString("-L" + Path));
+  };
+
+  // Search for flang_rt.* at the same location as clang_rt.* with
+  // LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=0. On most platforms, flang_rt is
+  // located at the path returned by getRuntimePath() which is already added to
+  // the library search path. This exception is for Apple-Darwin.
+  AddLibSearchPathIfExists(getCompilerRTPath());
+
+  // Fall back to the non-resource directory <driver-path>/../lib. We will
+  // probably have to refine this in the future. In particular, on some
+  // platforms, we may need to use lib64 instead of lib.
   SmallString<256> DefaultLibPath =
       llvm::sys::path::parent_path(getDriver().Dir);
   llvm::sys::path::append(DefaultLibPath, "lib");
-  if (getTriple().isKnownWindowsMSVCEnvironment())
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath));
-  else
-    CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
+  AddLibSearchPathIfExists(DefaultLibPath);
 }
 
 void ToolChain::addFlangRTLibPath(const ArgList &Args,
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index d64290f..9f99edad 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1485,6 +1485,14 @@ static bool compileModuleAndReadASTImpl(CompilerInstance &ImportingInstance,
     return false;
   }
 
+  // The module is built successfully, we can update its timestamp now.
+  if (ImportingInstance.getPreprocessor()
+          .getHeaderSearchInfo()
+          .getHeaderSearchOpts()
+          .ModulesValidateOncePerBuildSession) {
+    ImportingInstance.getModuleCache().updateModuleTimestamp(ModuleFileName);
+  }
+
   return readASTAfterCompileModule(ImportingInstance, ImportLoc, ModuleNameLoc,
                                    Module, ModuleFileName,
                                    /*OutOfDate=*/nullptr, /*Missing=*/nullptr);
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 9f77e62..ccc3154 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3936,47 +3936,18 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fsanitize_ignorelist_EQ, F);
 
   switch (Opts.getClangABICompat()) {
-  case LangOptions::ClangABI::Ver3_8:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "3.8");
+#define ABI_VER_MAJOR_MINOR(Major, Minor)                                      \
+  case LangOptions::ClangABI::Ver##Major##_##Minor:                            \
+    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, #Major "." #Minor);        \
     break;
-  case LangOptions::ClangABI::Ver4:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "4.0");
+#define ABI_VER_MAJOR(Major)                                                   \
+  case LangOptions::ClangABI::Ver##Major:                                      \
+    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, #Major ".0");              \
     break;
-  case LangOptions::ClangABI::Ver6:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "6.0");
-    break;
-  case LangOptions::ClangABI::Ver7:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "7.0");
-    break;
-  case LangOptions::ClangABI::Ver9:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "9.0");
-    break;
-  case LangOptions::ClangABI::Ver11:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "11.0");
-    break;
-  case LangOptions::ClangABI::Ver12:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "12.0");
-    break;
-  case LangOptions::ClangABI::Ver14:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "14.0");
-    break;
-  case LangOptions::ClangABI::Ver15:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "15.0");
-    break;
-  case LangOptions::ClangABI::Ver17:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "17.0");
-    break;
-  case LangOptions::ClangABI::Ver18:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "18.0");
-    break;
-  case LangOptions::ClangABI::Ver19:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "19.0");
-    break;
-  case LangOptions::ClangABI::Ver20:
-    GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "20.0");
-    break;
-  case LangOptions::ClangABI::Latest:
+#define ABI_VER_LATEST(Latest)                                                 \
+  case LangOptions::ClangABI::Latest:                                          \
     break;
+#include "clang/Basic/ABIVersions.def"
   }
 
   if (Opts.getSignReturnAddressScope() ==
@@ -4482,32 +4453,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
                    !VerParts.second.getAsInteger(10, Minor)
              : VerParts.first.size() == Ver.size() || VerParts.second == "0")) {
       // Got a valid version number.
-      if (Major == 3 && Minor <= 8)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver3_8);
-      else if (Major <= 4)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver4);
-      else if (Major <= 6)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver6);
-      else if (Major <= 7)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver7);
-      else if (Major <= 9)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver9);
-      else if (Major <= 11)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver11);
-      else if (Major <= 12)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver12);
-      else if (Major <= 14)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver14);
-      else if (Major <= 15)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver15);
-      else if (Major <= 17)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver17);
-      else if (Major <= 18)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver18);
-      else if (Major <= 19)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver19);
-      else if (Major <= 20)
-        Opts.setClangABICompat(LangOptions::ClangABI::Ver20);
+#define ABI_VER_MAJOR_MINOR(Major_, Minor_)                                    \
+  if (std::tie(Major, Minor) <= std::tuple(Major_, Minor_))                    \
+    Opts.setClangABICompat(LangOptions::ClangABI::Ver##Major_##_##Minor_);     \
+  else
+#define ABI_VER_MAJOR(Major_)                                                  \
+  if (Major <= Major_)                                                         \
+    Opts.setClangABICompat(LangOptions::ClangABI::Ver##Major_);                \
+  else
+#define ABI_VER_LATEST(Latest)                                                 \
+  { /* Equivalent to latest version - do nothing */                            \
+  }
+#include "clang/Basic/ABIVersions.def"
     } else if (Ver != "latest") {
       Diags.Report(diag::err_drv_invalid_value)
           << A->getAsString(Args) << A->getValue();
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 75290d2..95e9bd7a 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -441,8 +441,8 @@ _mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 _mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
-  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
-                                                (__v32bf)__C);
+  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
+                                             (__v32bf)__C);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
@@ -469,8 +469,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 _mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
-  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
-                                                -(__v32bf)__C);
+  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
+                                             -(__v32bf)__C);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
@@ -497,8 +497,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 _mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
-  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
-                                                (__v32bf)__C);
+  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
+                                             (__v32bf)__C);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
@@ -527,8 +527,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 _mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
-  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
-                                                -(__v32bf)__C);
+  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
+                                             -(__v32bf)__C);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 66797ae..0c7f381 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -852,8 +852,8 @@ _mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) {
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
-  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
-                                                (__v16bf)__C);
+  return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B,
+                                             (__v16bf)__C);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -880,8 +880,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh(
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
-  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
-                                                -(__v16bf)__C);
+  return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B,
+                                             -(__v16bf)__C);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -908,8 +908,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh(
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
-  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
-                                                (__v16bf)__C);
+  return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B,
+                                             (__v16bf)__C);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh(
@@ -938,8 +938,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh(
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
-  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
-                                                -(__v16bf)__C);
+  return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B,
+                                             -(__v16bf)__C);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh(
@@ -969,8 +969,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh(
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A,
                                                                __m128bh __B,
                                                                __m128bh __C) {
-  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
-                                                (__v8bf)__C);
+  return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B,
+                                             (__v8bf)__C);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
@@ -997,8 +997,8 @@ _mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A,
                                                                __m128bh __B,
                                                                __m128bh __C) {
-  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
-                                                -(__v8bf)__C);
+  return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B,
+                                             -(__v8bf)__C);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
@@ -1025,8 +1025,8 @@ _mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A,
                                                                 __m128bh __B,
                                                                 __m128bh __C) {
-  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
-                                                (__v8bf)__C);
+  return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B,
+                                             (__v8bf)__C);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
@@ -1053,8 +1053,8 @@ _mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A,
                                                                 __m128bh __B,
                                                                 __m128bh __C) {
-  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
-                                                -(__v8bf)__C);
+  return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B,
+                                             -(__v8bf)__C);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index dc9fc07..55e7102 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -31,6 +31,14 @@
                  __min_vector_width__(128)))
 #endif
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
@@ -460,7 +468,7 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_and_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a & (__v4du)__b);
@@ -478,7 +486,7 @@ _mm256_and_si256(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_andnot_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)(~(__v4du)__a & (__v4du)__b);
@@ -1721,10 +1729,10 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
 }
 
 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
@@ -1740,7 +1748,7 @@ _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
@@ -1759,7 +1767,7 @@ _mm256_mulhi_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mullo_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hu)__a * (__v16hu)__b);
@@ -1822,7 +1830,7 @@ _mm256_mul_epu32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_or_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a | (__v4du)__b);
@@ -2974,7 +2982,7 @@ _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_xor_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a ^ (__v4du)__b);
@@ -5289,5 +5297,7 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 
 #undef __DEFAULT_FN_ATTRS256
 #undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
 
 #endif /* __AVX2INTRIN_H */
diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h
index 3c446b3..9a1ff8f3 100644
--- a/clang/lib/Headers/avx512bitalgintrin.h
+++ b/clang/lib/Headers/avx512bitalgintrin.h
@@ -20,7 +20,13 @@
                  __target__("avx512bitalg,evex512"),                           \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_popcnt_epi16(__m512i __A)
 {
   return (__m512i)__builtin_elementwise_popcount((__v32hu)__A);
@@ -42,7 +48,7 @@ _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
               __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_popcnt_epi8(__m512i __A)
 {
   return (__m512i)__builtin_elementwise_popcount((__v64qu)__A);
@@ -80,7 +86,7 @@ _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
               __B);
 }
 
-
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index c854720..233d4a6 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -25,6 +25,14 @@ typedef unsigned long long __mmask64;
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("avx512bw,no-evex512")))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 static __inline __mmask32 __DEFAULT_FN_ATTRS
 _knot_mask32(__mmask32 __M)
 {
@@ -438,7 +446,7 @@ _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
                                              (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v32hu) __A * (__v32hu) __B);
 }
@@ -1082,7 +1090,7 @@ _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
                                          (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mulhi_epi16(__m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B);
@@ -1105,10 +1113,10 @@ _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
                                           (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mulhi_epu16(__m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B);
+  return (__m512i)__builtin_ia32_pmulhuw512((__v32hu) __A, (__v32hu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -2010,5 +2018,7 @@ _mm512_sad_epu8 (__m512i __A, __m512i __B)
 
 #undef __DEFAULT_FN_ATTRS512
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index 88b48e3..62325b9 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -20,6 +20,14 @@
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("avx512dq,no-evex512")))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 static __inline __mmask8 __DEFAULT_FN_ATTRS
 _knot_mask8(__mmask8 __M)
 {
@@ -167,7 +175,7 @@ _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
                                              (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_xor_pd(__m512d __A, __m512d __B) {
   return (__m512d)((__v8du)__A ^ (__v8du)__B);
 }
@@ -186,7 +194,7 @@ _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
                                               (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_xor_ps (__m512 __A, __m512 __B) {
   return (__m512)((__v16su)__A ^ (__v16su)__B);
 }
@@ -205,7 +213,7 @@ _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_or_pd(__m512d __A, __m512d __B) {
   return (__m512d)((__v8du)__A | (__v8du)__B);
 }
@@ -224,7 +232,7 @@ _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
                                               (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_or_ps(__m512 __A, __m512 __B) {
   return (__m512)((__v16su)__A | (__v16su)__B);
 }
@@ -243,7 +251,7 @@ _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_and_pd(__m512d __A, __m512d __B) {
   return (__m512d)((__v8du)__A & (__v8du)__B);
 }
@@ -262,7 +270,7 @@ _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
                                               (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_and_ps(__m512 __A, __m512 __B) {
   return (__m512)((__v16su)__A & (__v16su)__B);
 }
@@ -281,7 +289,7 @@ _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_andnot_pd(__m512d __A, __m512d __B) {
   return (__m512d)(~(__v8du)__A & (__v8du)__B);
 }
@@ -300,7 +308,7 @@ _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
                                               (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_andnot_ps(__m512 __A, __m512 __B) {
   return (__m512)(~(__v16su)__A & (__v16su)__B);
 }
@@ -1375,5 +1383,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
 
 #undef __DEFAULT_FN_ATTRS512
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 45e7eeb..95b80cc 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -277,20 +277,20 @@ _mm512_setzero_pd(void) {
   return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_ps(float __w)
 {
   return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                  __w, __w, __w, __w, __w, __w, __w, __w  };
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_pd(double __w)
 {
   return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_epi8(char __w)
 {
   return __extension__ (__m512i)(__v64qi){
@@ -304,7 +304,7 @@ _mm512_set1_epi8(char __w)
     __w, __w, __w, __w, __w, __w, __w, __w  };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_epi16(short __w)
 {
   return __extension__ (__m512i)(__v32hi){
@@ -314,7 +314,7 @@ _mm512_set1_epi16(short __w)
     __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_epi32(int __s)
 {
   return __extension__ (__m512i)(__v16si){
@@ -330,7 +330,7 @@ _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
                                              (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_set1_epi64(long long __d)
 {
   return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
@@ -645,7 +645,7 @@ _mm512_zextsi256_si512(__m256i __a)
 }
 
 /* Bitwise operators */
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_and_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a & (__v16su)__b);
@@ -666,7 +666,7 @@ _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
                                          __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_and_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a & (__v8du)__b);
@@ -687,13 +687,13 @@ _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
                                          __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_andnot_si512 (__m512i __A, __m512i __B)
 {
   return (__m512i)(~(__v8du)__A & (__v8du)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i)(~(__v16su)__A & (__v16su)__B);
@@ -714,7 +714,7 @@ _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
                                            __U, __A, __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_andnot_epi64(__m512i __A, __m512i __B)
 {
   return (__m512i)(~(__v8du)__A & (__v8du)__B);
@@ -735,7 +735,7 @@ _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
                                            __U, __A, __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_or_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a | (__v16su)__b);
@@ -755,7 +755,7 @@ _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
   return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_or_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a | (__v8du)__b);
@@ -775,7 +775,7 @@ _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
   return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_xor_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a ^ (__v16su)__b);
@@ -795,7 +795,7 @@ _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
   return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_xor_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a ^ (__v8du)__b);
@@ -815,19 +815,19 @@ _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
   return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_and_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a & (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_or_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a | (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_xor_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a ^ (__v8du)__b);
@@ -835,45 +835,38 @@ _mm512_xor_si512(__m512i __a, __m512i __b)
 
 /* Arithmetic */
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_add_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_add_pd(__m512d __a, __m512d __b) {
   return (__m512d)((__v8df)__a + (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_add_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_add_ps(__m512 __a, __m512 __b) {
   return (__m512)((__v16sf)__a + (__v16sf)__b);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mul_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mul_pd(__m512d __a, __m512d __b) {
   return (__m512d)((__v8df)__a * (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mul_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mul_ps(__m512 __a, __m512 __b) {
   return (__m512)((__v16sf)__a * (__v16sf)__b);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_sub_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_sub_pd(__m512d __a, __m512d __b) {
   return (__m512d)((__v8df)__a - (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_sub_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_sub_ps(__m512 __a, __m512 __b) {
   return (__m512)((__v16sf)__a - (__v16sf)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi64 (__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_add_epi64(__m512i __A, __m512i __B) {
   return (__m512i) ((__v8du) __A + (__v8du) __B);
 }
 
@@ -2315,9 +2308,8 @@ _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
                                             (__v2df)_mm_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_div_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d
+    __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_div_pd(__m512d __a, __m512d __b) {
   return (__m512d)((__v8df)__a/(__v8df)__b);
 }
 
@@ -2335,9 +2327,8 @@ _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
                                               (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_div_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_div_ps(__m512 __a, __m512 __b) {
   return (__m512)((__v16sf)__a/(__v16sf)__b);
 }
 
@@ -4123,9 +4114,8 @@ _mm512_cvtss_f32(__m512 __a)
 
 /* Unpack and Interleave */
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpackhi_pd(__m512d __a, __m512d __b) {
   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
 }
@@ -4146,9 +4136,8 @@ _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
                                            (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_pd(__m512d __a, __m512d __b)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpacklo_pd(__m512d __a, __m512d __b) {
   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
 }
@@ -4169,9 +4158,8 @@ _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
                                            (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpackhi_ps(__m512 __a, __m512 __b) {
   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                          2,    18,    3,    19,
                                          2+4,  18+4,  3+4,  19+4,
@@ -4195,9 +4183,8 @@ _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
                                           (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_ps(__m512 __a, __m512 __b)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpacklo_ps(__m512 __a, __m512 __b) {
   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                          0,    16,    1,    17,
                                          0+4,  16+4,  1+4,  17+4,
@@ -5303,7 +5290,7 @@ _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
           (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_movedup_pd (__m512d __A)
 {
   return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
@@ -8665,7 +8652,7 @@ _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
                                        _mm512_setzero_si512());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_movehdup_ps (__m512 __A)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
@@ -8688,7 +8675,7 @@ _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_moveldup_ps (__m512 __A)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
@@ -9337,19 +9324,23 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
  * This takes log2(n) steps where n is the number of elements in the vector.
  */
 
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_reduce_add_epi64(__m512i __W) {
   return __builtin_reduce_add((__v8di)__W);
 }
 
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_reduce_mul_epi64(__m512i __W) {
   return __builtin_reduce_mul((__v8di)__W);
 }
 
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_reduce_and_epi64(__m512i __W) {
   return __builtin_reduce_and((__v8di)__W);
 }
 
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_reduce_or_epi64(__m512i __W) {
   return __builtin_reduce_or((__v8di)__W);
 }
 
@@ -9400,22 +9391,22 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_add_epi32(__m512i __W) {
   return __builtin_reduce_add((__v16si)__W);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_mul_epi32(__m512i __W) {
   return __builtin_reduce_mul((__v16si)__W);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_and_epi32(__m512i __W) {
   return __builtin_reduce_and((__v16si)__W);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_or_epi32(__m512i __W) {
   return __builtin_reduce_or((__v16si)__W);
 }
@@ -9466,22 +9457,22 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
 }
 
-static __inline__ long long __DEFAULT_FN_ATTRS512
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_max_epi64(__m512i __V) {
   return __builtin_reduce_max((__v8di)__V);
 }
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_max_epu64(__m512i __V) {
   return __builtin_reduce_max((__v8du)__V);
 }
 
-static __inline__ long long __DEFAULT_FN_ATTRS512
+static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_min_epi64(__m512i __V) {
   return __builtin_reduce_min((__v8di)__V);
 }
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_min_epu64(__m512i __V) {
   return __builtin_reduce_min((__v8du)__V);
 }
@@ -9509,22 +9500,22 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
   return __builtin_reduce_min((__v8du)__V);
 }
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_max_epi32(__m512i __V) {
   return __builtin_reduce_max((__v16si)__V);
 }
 
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_max_epu32(__m512i __V) {
   return __builtin_reduce_max((__v16su)__V);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS512
+static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_min_epi32(__m512i __V) {
   return __builtin_reduce_min((__v16si)__V);
 }
 
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_reduce_min_epu32(__m512i __V) {
   return __builtin_reduce_min((__v16su)__V);
 }
diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h
index 1b01fe0..739e78a 100644
--- a/clang/lib/Headers/avx512vlbitalgintrin.h
+++ b/clang/lib/Headers/avx512vlbitalgintrin.h
@@ -24,7 +24,15 @@
                  __target__("avx512vl,avx512bitalg,no-evex512"),               \
                  __min_vector_width__(256)))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_popcnt_epi16(__m256i __A)
 {
   return (__m256i)__builtin_elementwise_popcount((__v16hu)__A);
@@ -46,7 +54,7 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
               __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_popcnt_epi16(__m128i __A)
 {
   return (__m128i)__builtin_elementwise_popcount((__v8hu)__A);
@@ -68,7 +76,7 @@ _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
               __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_popcnt_epi8(__m256i __A)
 {
   return (__m256i)__builtin_elementwise_popcount((__v32qu)__A);
@@ -90,7 +98,7 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
               __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_popcnt_epi8(__m128i __A)
 {
   return (__m128i)__builtin_elementwise_popcount((__v16qu)__A);
@@ -147,5 +155,7 @@ _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index a12acb7..1f8cca7 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -1419,8 +1419,8 @@ _mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
-                                          (__v8hf)__C);
+  return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B,
+                                            (__v8hf)__C);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
@@ -1429,7 +1429,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
                                                                   __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
       (__v8hf)__A);
 }
 
@@ -1437,7 +1437,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
       (__v8hf)__C);
 }
 
@@ -1445,15 +1445,15 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
       (__v8hf)_mm_setzero_ph());
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
-                                          -(__v8hf)__C);
+  return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B,
+                                            -(__v8hf)__C);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
@@ -1476,7 +1476,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
       (__v8hf)__C);
 }
 
@@ -1484,7 +1484,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
       (__v8hf)_mm_setzero_ph());
 }
 
@@ -1492,22 +1492,22 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+      __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
       (__v8hf)_mm_setzero_ph());
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
-                                             (__v16hf)__C);
+  return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B,
+                                            (__v16hf)__C);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
       (__v16hf)__A);
 }
 
@@ -1515,7 +1515,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
       (__v16hf)__C);
 }
 
@@ -1523,22 +1523,22 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
       (__v16hf)_mm256_setzero_ph());
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
-                                             -(__v16hf)__C);
+  return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B,
+                                            -(__v16hf)__C);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
       (__v16hf)__A);
 }
 
@@ -1546,7 +1546,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
       (__v16hf)_mm256_setzero_ph());
 }
 
@@ -1554,7 +1554,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
       (__v16hf)__C);
 }
 
@@ -1562,7 +1562,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
       (__v16hf)_mm256_setzero_ph());
 }
 
@@ -1570,7 +1570,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
       (__v16hf)_mm256_setzero_ph());
 }
 
@@ -1684,7 +1684,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
       (__v8hf)__C);
 }
 
@@ -1692,7 +1692,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
       (__v16hf)__C);
 }
 
@@ -1715,45 +1715,45 @@ _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
-                                          (__v8hf)__C);
+  return (__m128h)__builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B,
+                                            (__v8hf)__C);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
       (__v8hf)__A);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
                                                                  __m256h __B,
                                                                  __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
-                                             (__v16hf)__C);
+  return (__m256h)__builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B,
+                                            (__v16hf)__C);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
       (__v16hf)__A);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
-                                          -(__v8hf)__C);
+  return (__m128h)__builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B,
+                                            -(__v8hf)__C);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
       (__v8hf)__A);
 }
 
@@ -1761,22 +1761,22 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   return (__m128h)__builtin_ia32_selectph_128(
       (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+      __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
       (__v8hf)__C);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
                                                                  __m256h __B,
                                                                  __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
-                                             -(__v16hf)__C);
+  return (__m256h)__builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B,
+                                            -(__v16hf)__C);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
       (__v16hf)__A);
 }
 
@@ -1784,7 +1784,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
   return (__m256h)__builtin_ia32_selectph_256(
       (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+      __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
       (__v16hf)__C);
 }
 
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 2a5f7b4..cbad39a 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -899,321 +899,289 @@ _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __A);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C),
+      (__v2df)__A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __C);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C),
+      (__v2df)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C),
+      (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __A);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)__A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __C);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, (__v2df)__C),
+      (__v2df)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, (__v2df)__C),
+      (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __A);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)__A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __C);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __A);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)__A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __C);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __A);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __C);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __A);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __C);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __A);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __C);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __A);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __C);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
@@ -1420,41 +1388,37 @@ _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __C);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __C);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __C);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __C);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
@@ -1500,121 +1464,109 @@ _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __A);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, (__v2df)__C),
+      (__v2df)__A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __A);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, (__v4df)__C),
+      (__v4df)__A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __A);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __A);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __A);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, -(__v2df)__C),
+      (__v2df)__A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __C);
+  return (__m128d)__builtin_ia32_selectpd_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, -(__v2df)__C),
+      (__v2df)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __A);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, -(__v4df)__C),
+      (__v4df)__A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __C);
+  return (__m256d)__builtin_ia32_selectpd_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, -(__v4df)__C),
+      (__v4df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __A);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __C);
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __A);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __C);
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vpopcntdqintrin.h b/clang/lib/Headers/avx512vpopcntdqintrin.h
index e24c2c5..79fc6e1 100644
--- a/clang/lib/Headers/avx512vpopcntdqintrin.h
+++ b/clang/lib/Headers/avx512vpopcntdqintrin.h
@@ -60,5 +60,6 @@ _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512vpopcntdqvlintrin.h b/clang/lib/Headers/avx512vpopcntdqvlintrin.h
index b6c819b..d14cb1e 100644
--- a/clang/lib/Headers/avx512vpopcntdqvlintrin.h
+++ b/clang/lib/Headers/avx512vpopcntdqvlintrin.h
@@ -99,5 +99,7 @@ _mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 8e497a9..5a6d48b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -87,9 +87,8 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x double] containing the sums of both
 ///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_add_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_add_pd(__m256d __a, __m256d __b) {
   return (__m256d)((__v4df)__a+(__v4df)__b);
 }
 
@@ -105,9 +104,8 @@ _mm256_add_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x float] containing the sums of both
 ///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_add_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
+                                                                  __m256 __b) {
   return (__m256)((__v8sf)__a+(__v8sf)__b);
 }
 
@@ -123,9 +121,8 @@ _mm256_add_ps(__m256 __a, __m256 __b)
 ///    A 256-bit vector of [4 x double] containing the subtrahend.
 /// \returns A 256-bit vector of [4 x double] containing the differences between
 ///    both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sub_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_sub_pd(__m256d __a, __m256d __b) {
   return (__m256d)((__v4df)__a-(__v4df)__b);
 }
 
@@ -141,9 +138,8 @@ _mm256_sub_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing the subtrahend.
 /// \returns A 256-bit vector of [8 x float] containing the differences between
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sub_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
+                                                                  __m256 __b) {
   return (__m256)((__v8sf)__a-(__v8sf)__b);
 }
 
@@ -197,9 +193,8 @@ _mm256_addsub_ps(__m256 __a, __m256 __b)
 ///    A 256-bit vector of [4 x double] containing the divisor.
 /// \returns A 256-bit vector of [4 x double] containing the quotients of both
 ///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_div_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_div_pd(__m256d __a, __m256d __b) {
   return (__m256d)((__v4df)__a/(__v4df)__b);
 }
 
@@ -215,9 +210,8 @@ _mm256_div_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing the divisor.
 /// \returns A 256-bit vector of [8 x float] containing the quotients of both
 ///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_div_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
+                                                                  __m256 __b) {
   return (__m256)((__v8sf)__a/(__v8sf)__b);
 }
 
@@ -317,9 +311,8 @@ _mm256_min_ps(__m256 __a, __m256 __b)
 ///    A 256-bit vector of [4 x double] containing one of the operands.
 /// \returns A 256-bit vector of [4 x double] containing the products of both
 ///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_mul_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_mul_pd(__m256d __a, __m256d __b) {
   return (__m256d)((__v4df)__a * (__v4df)__b);
 }
 
@@ -335,9 +328,8 @@ _mm256_mul_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing one of the operands.
 /// \returns A 256-bit vector of [8 x float] containing the products of both
 ///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_mul_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
+                                                                  __m256 __b) {
   return (__m256)((__v8sf)__a * (__v8sf)__b);
 }
 
@@ -555,7 +547,7 @@ _mm256_rcp_ps(__m256 __a)
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
 ///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_and_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4du)__a & (__v4du)__b);
@@ -573,7 +565,7 @@ _mm256_and_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
 ///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_and_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8su)__a & (__v8su)__b);
@@ -594,7 +586,7 @@ _mm256_and_ps(__m256 __a, __m256 __b)
 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
 ///    values of the second operand and the one's complement of the first
 ///    operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_andnot_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)(~(__v4du)__a & (__v4du)__b);
@@ -615,7 +607,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b)
 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
 ///    values of the second operand and the one's complement of the first
 ///    operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_andnot_ps(__m256 __a, __m256 __b)
 {
   return (__m256)(~(__v8su)__a & (__v8su)__b);
@@ -633,7 +625,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b)
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
 ///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_or_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4du)__a | (__v4du)__b);
@@ -651,7 +643,7 @@ _mm256_or_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
 ///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_or_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8su)__a | (__v8su)__b);
@@ -669,7 +661,7 @@ _mm256_or_ps(__m256 __a, __m256 __b)
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
 ///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_xor_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4du)__a ^ (__v4du)__b);
@@ -687,7 +679,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b)
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
 ///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_xor_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8su)__a ^ (__v8su)__b);
@@ -2392,7 +2384,7 @@ _mm256_cvtss_f32(__m256 __a)
 ///    return value.
 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
 ///    values.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_movehdup_ps(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
@@ -2417,7 +2409,7 @@ _mm256_movehdup_ps(__m256 __a)
 ///    return value.
 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
 ///    values.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_moveldup_ps(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
@@ -2439,7 +2431,7 @@ _mm256_moveldup_ps(__m256 __a)
 ///    the return value.
 /// \returns A 256-bit vector of [4 x double] containing the moved and
 ///    duplicated values.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_movedup_pd(__m256d __a)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
@@ -2462,9 +2454,8 @@ _mm256_movedup_pd(__m256d __a)
 ///    Bits [127:64] are written to bits [127:64] of the return value. \n
 ///    Bits [255:192] are written to bits [255:192] of the return value. \n
 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpackhi_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
 }
 
@@ -2484,9 +2475,8 @@ _mm256_unpackhi_pd(__m256d __a, __m256d __b)
 ///    Bits [63:0] are written to bits [127:64] of the return value. \n
 ///    Bits [191:128] are written to bits [255:192] of the return value. \n
 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpacklo_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
 }
 
@@ -2511,9 +2501,8 @@ _mm256_unpacklo_pd(__m256d __a, __m256d __b)
 ///    Bits [223:192] are written to bits [191:160] of the return value. \n
 ///    Bits [255:224] are written to bits [255:224] of the return value.
 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpackhi_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
 }
 
@@ -2538,9 +2527,8 @@ _mm256_unpackhi_ps(__m256 __a, __m256 __b)
 ///    Bits [159:128] are written to bits [191:160] of the return value. \n
 ///    Bits [191:160] are written to bits [255:224] of the return value.
 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpacklo_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
 }
 
@@ -3777,7 +3765,7 @@ _mm256_set_ps(float __a, float __b, float __c, float __d,
 /// \param __i7
 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
 {
@@ -3825,7 +3813,7 @@ _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
 /// \param __w00
 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
@@ -3908,7 +3896,7 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
 /// \param __b00
 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
                 char __b23, char __b22, char __b21, char __b20,
@@ -3943,7 +3931,7 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
 /// \param __d
 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
   return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
@@ -4044,7 +4032,7 @@ _mm256_setr_ps(float __a, float __b, float __c, float __d,
 /// \param __i7
 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                   int __i4, int __i5, int __i6, int __i7)
 {
@@ -4092,7 +4080,7 @@ _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
 /// \param __w00
 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
        short __w11, short __w10, short __w09, short __w08,
        short __w07, short __w06, short __w05, short __w04,
@@ -4177,7 +4165,7 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
 /// \param __b00
 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                  char __b27, char __b26, char __b25, char __b24,
                  char __b23, char __b22, char __b21, char __b20,
@@ -4210,7 +4198,7 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
 /// \param __d
 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
 /// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
   return _mm256_set_epi64x(__d, __c, __b, __a);
@@ -4267,7 +4255,7 @@ _mm256_set1_ps(float __w)
 ///    A 32-bit integral value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 256-bit integer vector of [8 x i32].
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_epi32(int __i)
 {
   return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
@@ -4285,7 +4273,7 @@ _mm256_set1_epi32(int __i)
 ///    A 16-bit integral value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 256-bit integer vector of [16 x i16].
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_epi16(short __w)
 {
   return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
@@ -4303,7 +4291,7 @@ _mm256_set1_epi16(short __w)
 ///    An 8-bit integral value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 256-bit integer vector of [32 x i8].
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_epi8(char __b)
 {
   return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
@@ -4324,7 +4312,7 @@ _mm256_set1_epi8(char __b)
 ///    A 64-bit integral value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 256-bit integer vector of [4 x i64].
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_epi64x(long long __q)
 {
   return _mm256_set_epi64x(__q, __q, __q, __q);
@@ -4379,7 +4367,7 @@ _mm256_setzero_si256(void) {
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd_ps(__m256d __a)
 {
   return (__m256)__a;
@@ -4396,7 +4384,7 @@ _mm256_castpd_ps(__m256d __a)
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd_si256(__m256d __a)
 {
   return (__m256i)__a;
@@ -4413,7 +4401,7 @@ _mm256_castpd_si256(__m256d __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps_pd(__m256 __a)
 {
   return (__m256d)__a;
@@ -4430,7 +4418,7 @@ _mm256_castps_pd(__m256 __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps_si256(__m256 __a)
 {
   return (__m256i)__a;
@@ -4447,7 +4435,7 @@ _mm256_castps_si256(__m256 __a)
 ///    A 256-bit integer vector.
 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_ps(__m256i __a)
 {
   return (__m256)__a;
@@ -4464,7 +4452,7 @@ _mm256_castsi256_ps(__m256i __a)
 ///    A 256-bit integer vector.
 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_pd(__m256i __a)
 {
   return (__m256d)__a;
@@ -4481,7 +4469,7 @@ _mm256_castsi256_pd(__m256i __a)
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 128-bit floating-point vector of [2 x double] containing the
 ///    lower 128 bits of the parameter.
-static __inline __m128d __DEFAULT_FN_ATTRS
+static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd256_pd128(__m256d __a)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
@@ -4498,7 +4486,7 @@ _mm256_castpd256_pd128(__m256d __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 128-bit floating-point vector of [4 x float] containing the
 ///    lower 128 bits of the parameter.
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps256_ps128(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
@@ -4514,7 +4502,7 @@ _mm256_castps256_ps128(__m256 __a)
 ///    A 256-bit integer vector.
 /// \returns A 128-bit integer vector containing the lower 128 bits of the
 ///    parameter.
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_si128(__m256i __a)
 {
   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 52addb7..ce8c79e 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -345,10 +345,15 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
 // In some configurations, __cpuidex is defined as a builtin (primarily
 // -fms-extensions) which will conflict with the __cpuidex definition below.
 #if !(__has_builtin(__cpuidex))
+// In some cases, offloading will set the host as the aux triple and define the
+// builtin. Given __has_builtin does not detect builtins on aux triples, we need
+// to explicitly check for some offloading cases.
+#ifndef __NVPTX__
 static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) {
   __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2],
                 __cpu_info[3]);
 }
 #endif
+#endif
 
 #endif /* __CPUID_H */
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 78e8a42..60d2000 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2127,8 +2127,9 @@ _mm_add_epi32(__m128i __a, __m128i __b) {
 /// \param __b
 ///    A 64-bit integer.
 /// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
-  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_si64(__m64 __a,
+                                                                  __m64 __b) {
+  return (__m64)(((__v1du)__a)[0] + ((__v1du)__b)[0]);
 }
 
 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@@ -2393,8 +2394,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
 ///    A 128-bit signed [8 x i16] vector.
 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
 ///    each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mulhi_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
 }
 
@@ -2412,9 +2413,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
 ///    A 128-bit unsigned [8 x i16] vector.
 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
 ///    of each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mulhi_epu16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hu)__a, (__v8hu)__b);
 }
 
 /// Multiplies the corresponding elements of two signed [8 x i16]
@@ -2431,8 +2432,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
 ///    A 128-bit signed [8 x i16] vector.
 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
 ///    each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mullo_epi16(__m128i __a, __m128i __b) {
   return (__m128i)((__v8hu)__a * (__v8hu)__b);
 }
 
@@ -2557,8 +2558,9 @@ _mm_sub_epi32(__m128i __a, __m128i __b) {
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
-  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_si64(__m64 __a,
+                                                                  __m64 __b) {
+  return (__m64)(((__v1du)__a)[0] - ((__v1du)__b)[0]);
 }
 
 /// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -2676,8 +2678,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
 ///    A 128-bit integer vector containing one of the source operands.
 /// \returns A 128-bit integer vector containing the bitwise AND of the values
 ///    in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
-                                                           __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_and_si128(__m128i __a, __m128i __b) {
   return (__m128i)((__v2du)__a & (__v2du)__b);
 }
 
@@ -2695,8 +2697,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
 ///    A 128-bit vector containing the right source operand.
 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
 ///    complement of the first operand and the values in the second operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
-                                                              __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_andnot_si128(__m128i __a, __m128i __b) {
   return (__m128i)(~(__v2du)__a & (__v2du)__b);
 }
 /// Performs a bitwise OR of two 128-bit integer vectors.
@@ -2711,8 +2713,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
 ///    A 128-bit integer vector containing one of the source operands.
 /// \returns A 128-bit integer vector containing the bitwise OR of the values
 ///    in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
-                                                          __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_or_si128(__m128i __a, __m128i __b) {
   return (__m128i)((__v2du)__a | (__v2du)__b);
 }
 
@@ -2728,8 +2730,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
 ///    A 128-bit integer vector containing one of the source operands.
 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
 ///    values in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
-                                                           __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_xor_si128(__m128i __a, __m128i __b) {
   return (__m128i)((__v2du)__a ^ (__v2du)__b);
 }
 
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index 694801b..69977fb 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -23,13 +23,15 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -47,13 +49,15 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
+                                           -(__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
+                                            -(__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -71,13 +75,15 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -95,13 +101,15 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
+                                           -(__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
+                                            -(__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -143,49 +151,57 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
+                                           (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
+                                            (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
+                                           -(__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
+                                            -(__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
+                                           (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
+                                            (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
+                                           -(__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
+                                            -(__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index 22d1a78..24584a9 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -35,7 +35,8 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
 }
 
 /// Computes a multiply-add of 128-bit vectors of [2 x double].
@@ -55,7 +56,8 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
 }
 
 /// Computes a scalar multiply-add of the single-precision values in the
@@ -133,7 +135,8 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
+                                           -(__v4sf)__C);
 }
 
 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
@@ -153,7 +156,8 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
+                                            -(__v2df)__C);
 }
 
 /// Computes a scalar multiply-subtract of the single-precision values in
@@ -231,7 +235,8 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
 }
 
 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
@@ -251,7 +256,8 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
 }
 
 /// Computes a scalar negated multiply-add of the single-precision values in
@@ -329,7 +335,8 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+  return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
+                                           -(__v4sf)__C);
 }
 
 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
@@ -349,7 +356,8 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+  return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
+                                            -(__v2df)__C);
 }
 
 /// Computes a scalar negated multiply-subtract of the single-precision
@@ -531,7 +539,8 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
+                                           (__v8sf)__C);
 }
 
 /// Computes a multiply-add of 256-bit vectors of [4 x double].
@@ -551,7 +560,8 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
+                                            (__v4df)__C);
 }
 
 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
@@ -571,7 +581,8 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
+                                           -(__v8sf)__C);
 }
 
 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
@@ -591,7 +602,8 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
+                                            -(__v4df)__C);
 }
 
 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
@@ -611,7 +623,8 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
+                                           (__v8sf)__C);
 }
 
 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
@@ -631,7 +644,8 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
+                                            (__v4df)__C);
 }
 
 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
@@ -651,7 +665,8 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+  return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
+                                           -(__v8sf)__C);
 }
 
 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
@@ -671,7 +686,8 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+  return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
+                                            -(__v4df)__C);
 }
 
 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index dc0fa5c..3961b79 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -57,6 +57,9 @@ typedef char __v16qi __attribute__((__vector_size__(16)));
 
 #define __trunc64(x)                                                           \
   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x)                                                           \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, 2, 3)
 #define __anyext128(x)                                                         \
   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                     1, -1, -1)
@@ -85,7 +88,7 @@ _mm_empty(void) {
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtsi32_si64(int __i)
 {
     return __extension__ (__m64)(__v2si){__i, 0};
@@ -102,7 +105,7 @@ _mm_cvtsi32_si64(int __i)
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS_SSE2
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtsi64_si32(__m64 __m)
 {
     return ((__v2si)__m)[0];
@@ -118,10 +121,10 @@ _mm_cvtsi64_si32(__m64 __m)
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtsi64_m64(long long __i)
 {
-    return (__m64)__i;
+    return __extension__ (__m64)(__v1di){__i};
 }
 
 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
@@ -134,10 +137,10 @@ _mm_cvtsi64_m64(long long __i)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtm64_si64(__m64 __m)
 {
-    return (long long)__m;
+    return ((__v1di)__m)[0];
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -379,7 +382,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
@@ -400,7 +403,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
@@ -421,7 +424,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
@@ -536,7 +539,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
@@ -557,7 +560,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
@@ -578,7 +581,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
@@ -723,11 +726,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
-                                              (__v8hi)__anyext128(__m2)));
+    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1),
+                                              (__v8hi)__zext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -745,7 +748,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
@@ -1134,7 +1137,7 @@ _mm_srli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
@@ -1155,7 +1158,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
     return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
@@ -1173,7 +1176,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
@@ -1191,7 +1194,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
@@ -1213,7 +1216,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
@@ -1235,7 +1238,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
@@ -1257,7 +1260,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
@@ -1279,7 +1282,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
   /* This function always performs a signed comparison, but __v8qi is a char
@@ -1303,7 +1306,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
@@ -1325,7 +1328,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)((__v2si)__m1 > (__v2si)__m2);
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6a64369..7bf6b84 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -24,6 +24,7 @@ typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
 
 /* Unsigned types */
 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
 
 /* This header should only be included in a hosted environment as it depends on
  * a standard library to provide allocation routines. */
@@ -2447,11 +2448,11 @@ _mm_movemask_pi8(__m64 __a)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
-  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
-                                             (__v8hi)__anyext128(__b)));
+  return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a),
+                                             (__v8hu)__zext128(__b)));
 }
 
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
diff --git a/clang/lib/Interpreter/RemoteJITUtils.cpp b/clang/lib/Interpreter/RemoteJITUtils.cpp
index c0e663b..c100f46 100644
--- a/clang/lib/Interpreter/RemoteJITUtils.cpp
+++ b/clang/lib/Interpreter/RemoteJITUtils.cpp
@@ -33,6 +33,10 @@
 using namespace llvm;
 using namespace llvm::orc;
 
+#if LLVM_ON_UNIX
+static std::vector<pid_t> LaunchedExecutorPID;
+#endif
+
 Expected<uint64_t> getSlabAllocSize(StringRef SizeString) {
   SizeString = SizeString.trim();
 
@@ -89,9 +93,14 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC,
       SlabSize, SREPC, SAs);
 }
 
+// Launches an out-of-process executor for remote JIT. The calling program can
+// provide a CustomizeFork callback, which allows it to run custom code in the
+// child process before exec. This enables sending custom setup or code to be
+// executed in the child (out-of-process) executor.
 Expected<std::unique_ptr<SimpleRemoteEPC>>
 launchExecutor(StringRef ExecutablePath, bool UseSharedMemory,
-               llvm::StringRef SlabAllocateSizeString) {
+               llvm::StringRef SlabAllocateSizeString,
+               std::function<void()> CustomizeFork) {
 #ifndef LLVM_ON_UNIX
   // FIXME: Add support for Windows.
   return make_error<StringError>("-" + ExecutablePath +
@@ -134,6 +143,9 @@ launchExecutor(StringRef ExecutablePath, bool UseSharedMemory,
     close(ToExecutor[WriteEnd]);
     close(FromExecutor[ReadEnd]);
 
+    if (CustomizeFork)
+      CustomizeFork();
+
     // Execute the child process.
     std::unique_ptr<char[]> ExecutorPath, FDSpecifier;
     {
@@ -158,6 +170,8 @@ launchExecutor(StringRef ExecutablePath, bool UseSharedMemory,
   }
   // else we're the parent...
 
+  LaunchedExecutorPID.push_back(ChildPID);
+
   // Close the child ends of the pipes
   close(ToExecutor[ReadEnd]);
   close(FromExecutor[WriteEnd]);
@@ -265,3 +279,18 @@ connectTCPSocket(StringRef NetworkAddress, bool UseSharedMemory,
       std::move(S), *SockFD, *SockFD);
 #endif
 }
+
+#if LLVM_ON_UNIX
+
+pid_t getLastLaunchedExecutorPID() {
+  if (!LaunchedExecutorPID.size())
+    return -1;
+  return LaunchedExecutorPID.back();
+}
+
+pid_t getNthLaunchedExecutorPID(int n) {
+  if (n - 1 < 0 || n - 1 >= static_cast<int>(LaunchedExecutorPID.size()))
+    return -1;
+  return LaunchedExecutorPID.at(n - 1);
+}
+#endif
+\ No newline at end of file
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 0edfd60..9c55541 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -3497,6 +3497,19 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name,
   }
 
   auto CreateAllocationFunctionDecl = [&](Attr *ExtraAttr) {
+    // The MSVC STL has explicit cdecl on its (host-side) allocation function
+    // specializations for the allocation, so in order to prevent a CC clash
+    // we use the host's CC, if available, or CC_C as a fallback, for the
+    // host-side implicit decls, knowing these do not get emitted when compiling
+    // for device.
+    if (getLangOpts().CUDAIsDevice && ExtraAttr &&
+        isa<CUDAHostAttr>(ExtraAttr) &&
+        Context.getTargetInfo().getTriple().isSPIRV()) {
+      if (auto *ATI = Context.getAuxTargetInfo())
+        EPI.ExtInfo = EPI.ExtInfo.withCallingConv(ATI->getDefaultCallingConv());
+      else
+        EPI.ExtInfo = EPI.ExtInfo.withCallingConv(CallingConv::CC_C);
+    }
     QualType FnType = Context.getFunctionType(Return, Params, EPI);
     FunctionDecl *Alloc = FunctionDecl::Create(
         Context, GlobalCtx, SourceLocation(), SourceLocation(), Name, FnType,
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 8536e04..17f17f8 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -337,16 +337,9 @@ static bool isZeroSizedArray(const ConstantArrayType *CAT) {
   return CAT != nullptr;
 }
 
-// Returns true if the record type is an HLSL resource class or an array of
-// resource classes
-static bool isResourceRecordTypeOrArrayOf(const Type *Ty) {
-  while (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty))
-    Ty = CAT->getArrayElementTypeNoTypeQual();
-  return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr;
-}
-
 static bool isResourceRecordTypeOrArrayOf(VarDecl *VD) {
-  return isResourceRecordTypeOrArrayOf(VD->getType().getTypePtr());
+  const Type *Ty = VD->getType().getTypePtr();
+  return Ty->isHLSLResourceRecord() || Ty->isHLSLResourceRecordArray();
 }
 
 // Returns true if the type is a leaf element type that is not valid to be
@@ -355,7 +348,7 @@ static bool isResourceRecordTypeOrArrayOf(VarDecl *VD) {
 // type or if it is a record type that needs to be inspected further.
 static bool isInvalidConstantBufferLeafElementType(const Type *Ty) {
   Ty = Ty->getUnqualifiedDesugaredType();
-  if (isResourceRecordTypeOrArrayOf(Ty))
+  if (Ty->isHLSLResourceRecord() || Ty->isHLSLResourceRecordArray())
     return true;
   if (Ty->isRecordType())
     return Ty->getAsCXXRecordDecl()->isEmpty();
@@ -3597,7 +3590,7 @@ void SemaHLSL::deduceAddressSpace(VarDecl *Decl) {
     return;
 
   // Resource handles.
-  if (isResourceRecordTypeOrArrayOf(Type->getUnqualifiedDesugaredType()))
+  if (Type->isHLSLResourceRecord() || Type->isHLSLResourceRecordArray())
     return;
 
   // Only static globals belong to the Private address space.
@@ -3637,10 +3630,7 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
     if (VD->getType()->isHLSLIntangibleType())
       collectResourceBindingsOnVarDecl(VD);
 
-    const Type *VarType = VD->getType().getTypePtr();
-    while (VarType->isArrayType())
-      VarType = VarType->getArrayElementTypeNoTypeQual();
-    if (VarType->isHLSLResourceRecord() ||
+    if (isResourceRecordTypeOrArrayOf(VD) ||
         VD->hasAttr<HLSLVkConstantIdAttr>()) {
       // Make the variable for resources static. The global externally visible
       // storage is accessed through the handle, which is a member. The variable
diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp
index 0f39a98..bde00bd 100644
--- a/clang/lib/Sema/SemaObjC.cpp
+++ b/clang/lib/Sema/SemaObjC.cpp
@@ -691,7 +691,7 @@ static QualType applyObjCTypeArgs(Sema &S, SourceLocation loc, QualType type,
   if (!anyPackExpansions && finalTypeArgs.size() != numTypeParams) {
     S.Diag(loc, diag::err_objc_type_args_wrong_arity)
         << (typeArgs.size() < typeParams->size()) << objcClass->getDeclName()
-        << (unsigned)finalTypeArgs.size() << (unsigned)numTypeParams;
+        << (unsigned)finalTypeArgs.size() << numTypeParams;
     S.Diag(objcClass->getLocation(), diag::note_previous_decl) << objcClass;
 
     if (failOnError)
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 62fe3d1..4d58b4a 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -646,8 +646,17 @@ ExprResult CheckVarType(SemaOpenACC &S, OpenACCClauseKind CK, Expr *VarExpr,
   if (auto *RefTy = InnerTy->getAs<ReferenceType>())
     InnerTy = RefTy->getPointeeType();
 
-  if (auto *ArrTy = InnerTy->getAsArrayTypeUnsafe())
+  if (auto *ArrTy = InnerTy->getAsArrayTypeUnsafe()) {
+    // Non constant arrays decay to 'pointer', so warn and return that we're
+    // successful.
+    if (!ArrTy->isConstantArrayType()) {
+      S.Diag(InnerLoc, clang::diag::warn_acc_var_referenced_non_const_array)
+          << InnerTy << CK;
+      return VarExpr;
+    }
+
     return CheckVarType(S, CK, VarExpr, InnerLoc, ArrTy->getElementType());
+  }
 
   auto *RD = InnerTy->getAsCXXRecordDecl();
 
@@ -2575,8 +2584,8 @@ SemaOpenACC::ActOnOpenACCAsteriskSizeExpr(SourceLocation AsteriskLoc) {
   return BuildOpenACCAsteriskSizeExpr(AsteriskLoc);
 }
 
-VarDecl *SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK,
-                                       const Expr *VarExpr) {
+std::pair<VarDecl *, VarDecl *>
+SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK, const Expr *VarExpr) {
   // Strip off any array subscripts/array section exprs to get to the type of
   // the variable.
   while (isa_and_present<ArraySectionExpr, ArraySubscriptExpr>(VarExpr)) {
@@ -2590,7 +2599,7 @@ VarDecl *SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK,
   // fill in with nullptr.  We'll count on TreeTransform to make this if
   // necessary.
   if (!VarExpr || VarExpr->getType()->isDependentType())
-    return nullptr;
+    return {nullptr, nullptr};
 
   QualType VarTy =
       VarExpr->getType().getNonReferenceType().getUnqualifiedType();
@@ -2602,6 +2611,7 @@ VarDecl *SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK,
       getASTContext().getTrivialTypeSourceInfo(VarTy), SC_Auto);
 
   ExprResult Init;
+  VarDecl *Temporary = nullptr;
 
   if (CK == OpenACCClauseKind::Private) {
     // Trap errors so we don't get weird ones here. If we can't init, we'll just
@@ -2626,5 +2636,5 @@ VarDecl *SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK,
     Recipe->setInitStyle(VarDecl::CallInit);
   }
 
-  return Recipe;
+  return {Recipe, Temporary};
 }
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index 88d217f..e8a18243 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -800,7 +800,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitPrivateClause(
   // Assemble the recipes list.
   for (const Expr *VarExpr : Clause.getVarList())
     InitRecipes.push_back(
-        SemaRef.CreateInitRecipe(OpenACCClauseKind::Private, VarExpr));
+        SemaRef.CreateInitRecipe(OpenACCClauseKind::Private, VarExpr).first);
 
   return OpenACCPrivateClause::Create(
       Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(),
@@ -813,7 +813,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitFirstPrivateClause(
   // really isn't anything to do here. GCC does some duplicate-finding, though
   // it isn't apparent in the standard where this is justified.
 
-  llvm::SmallVector<VarDecl *> InitRecipes;
+  llvm::SmallVector<OpenACCFirstPrivateRecipe> InitRecipes;
 
   // Assemble the recipes list.
   for (const Expr *VarExpr : Clause.getVarList())
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index b6b8932..2d8fdb5 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -367,7 +367,7 @@ bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II,
 
   // The code is missing a 'template' keyword prior to the dependent template
   // name.
-  NestedNameSpecifier *Qualifier = (NestedNameSpecifier *)SS->getScopeRep();
+  NestedNameSpecifier *Qualifier = SS->getScopeRep();
   SuggestedTemplate = TemplateTy::make(Context.getDependentTemplateName(
       {Qualifier, &II, /*HasTemplateKeyword=*/false}));
   Diag(IILoc, diag::err_template_kw_missing)
diff --git a/clang/lib/Sema/SemaWasm.cpp b/clang/lib/Sema/SemaWasm.cpp
index 8998492..e773113 100644
--- a/clang/lib/Sema/SemaWasm.cpp
+++ b/clang/lib/Sema/SemaWasm.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/AddressSpaces.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Attr.h"
 #include "clang/Sema/Sema.h"
 
@@ -227,7 +228,8 @@ bool SemaWasm::BuiltinWasmTableCopy(CallExpr *TheCall) {
   return false;
 }
 
-bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(CallExpr *TheCall) {
+bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(const TargetInfo &TI,
+                                                       CallExpr *TheCall) {
   if (SemaRef.checkArgCount(TheCall, 1))
     return true;
 
@@ -250,27 +252,31 @@ bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(CallExpr *TheCall) {
            << ArgType << FuncPtrArg->getSourceRange();
   }
 
-  // Check that the function pointer doesn't use reference types
-  if (FuncTy->getReturnType().isWebAssemblyReferenceType()) {
-    return Diag(
-               FuncPtrArg->getBeginLoc(),
-               diag::err_wasm_builtin_test_fp_sig_cannot_include_reference_type)
-           << 0 << FuncTy->getReturnType() << FuncPtrArg->getSourceRange();
-  }
-  auto NParams = FuncTy->getNumParams();
-  for (unsigned I = 0; I < NParams; I++) {
-    if (FuncTy->getParamType(I).isWebAssemblyReferenceType()) {
+  if (TI.getABI() == "experimental-mv") {
+    auto isStructOrUnion = [](QualType T) {
+      return T->isUnionType() || T->isStructureType();
+    };
+    if (isStructOrUnion(FuncTy->getReturnType())) {
       return Diag(
                  FuncPtrArg->getBeginLoc(),
                  diag::
-                     err_wasm_builtin_test_fp_sig_cannot_include_reference_type)
-             << 1 << FuncPtrArg->getSourceRange();
+                     err_wasm_builtin_test_fp_sig_cannot_include_struct_or_union)
+             << 0 << FuncTy->getReturnType() << FuncPtrArg->getSourceRange();
+    }
+    auto NParams = FuncTy->getNumParams();
+    for (unsigned I = 0; I < NParams; I++) {
+      if (isStructOrUnion(FuncTy->getParamType(I))) {
+        return Diag(
+                   FuncPtrArg->getBeginLoc(),
+                   diag::
+                       err_wasm_builtin_test_fp_sig_cannot_include_struct_or_union)
+               << 1 << FuncPtrArg->getSourceRange();
+      }
     }
   }
 
   // Set return type to int (the result of the test)
   TheCall->setType(getASTContext().IntTy);
-
   return false;
 }
 
@@ -297,7 +303,7 @@ bool SemaWasm::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
   case WebAssembly::BI__builtin_wasm_table_copy:
     return BuiltinWasmTableCopy(TheCall);
   case WebAssembly::BI__builtin_wasm_test_function_pointer_signature:
-    return BuiltinWasmTestFunctionPointerSignature(TheCall);
+    return BuiltinWasmTestFunctionPointerSignature(TI, TheCall);
   }
 
   return false;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 6ce5535..0030946 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -11901,8 +11901,11 @@ void OpenACCClauseTransform<Derived>::VisitPrivateClause(
       if (InitRecipe)
         InitRecipes.push_back(InitRecipe);
       else
-        InitRecipes.push_back(Self.getSema().OpenACC().CreateInitRecipe(
-            OpenACCClauseKind::Private, VarRef.get()));
+        InitRecipes.push_back(
+            Self.getSema()
+                .OpenACC()
+                .CreateInitRecipe(OpenACCClauseKind::Private, VarRef.get())
+                .first);
     }
   }
   ParsedClause.setVarListDetails(InstantiatedVarList,
@@ -11942,7 +11945,7 @@ template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitFirstPrivateClause(
     const OpenACCFirstPrivateClause &C) {
   llvm::SmallVector<Expr *> InstantiatedVarList;
-  llvm::SmallVector<VarDecl *> InitRecipes;
+  llvm::SmallVector<OpenACCFirstPrivateRecipe> InitRecipes;
 
   for (const auto [RefExpr, InitRecipe] :
        llvm::zip(C.getVarList(), C.getInitRecipes())) {
@@ -11953,7 +11956,7 @@ void OpenACCClauseTransform<Derived>::VisitFirstPrivateClause(
 
       // We only have to create a new one if it is dependent, and Sema won't
       // make one of these unless the type is non-dependent.
-      if (InitRecipe)
+      if (InitRecipe.RecipeDecl)
         InitRecipes.push_back(InitRecipe);
       else
         InitRecipes.push_back(Self.getSema().OpenACC().CreateInitRecipe(
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 1402f40..ed0ec9e 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12877,9 +12877,12 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   case OpenACCClauseKind::FirstPrivate: {
     SourceLocation LParenLoc = readSourceLocation();
     llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
-    llvm::SmallVector<VarDecl *> RecipeList;
-    for (unsigned I = 0; I < VarList.size(); ++I)
-      RecipeList.push_back(readDeclAs<VarDecl>());
+    llvm::SmallVector<OpenACCFirstPrivateRecipe> RecipeList;
+    for (unsigned I = 0; I < VarList.size(); ++I) {
+      VarDecl *Recipe = readDeclAs<VarDecl>();
+      VarDecl *RecipeTemp = readDeclAs<VarDecl>();
+      RecipeList.push_back({Recipe, RecipeTemp});
+    }
 
     return OpenACCFirstPrivateClause::Create(getContext(), BeginLoc, LParenLoc,
                                              VarList, RecipeList, EndLoc);
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index c038d4d..c072acd 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5461,11 +5461,6 @@ ASTWriter::WriteAST(llvm::PointerUnion<Sema *, Preprocessor *> Subject,
 
   WritingAST = false;
 
-  if (WritingModule && PPRef.getHeaderSearchInfo()
-                           .getHeaderSearchOpts()
-                           .ModulesValidateOncePerBuildSession)
-    ModCache.updateModuleTimestamp(OutputFile);
-
   if (ShouldCacheASTInMemory) {
     // Construct MemoryBuffer and update buffer manager.
     ModCache.getInMemoryModuleCache().addBuiltPCM(
@@ -8762,8 +8757,10 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeSourceLocation(FPC->getLParenLoc());
     writeOpenACCVarList(FPC);
 
-    for (VarDecl *VD : FPC->getInitRecipes())
-      AddDeclRef(VD);
+    for (const OpenACCFirstPrivateRecipe &R : FPC->getInitRecipes()) {
+      AddDeclRef(R.RecipeDecl);
+      AddDeclRef(R.InitFromTemporary);
+    }
     return;
   }
   case OpenACCClauseKind::Attach: {
diff --git a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
index 3b3def7..e64153d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
@@ -183,7 +183,8 @@ public:
       llvm::errs() << "NewAllocator\n";
   }
 
-  void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const {
+  void checkBind(SVal Loc, SVal Val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const {
     if (isCallbackEnabled(C, "Bind"))
       llvm::errs() << "Bind\n";
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
index 837cbbc..921114a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
@@ -29,7 +29,8 @@ class BoolAssignmentChecker : public Checker<check::Bind> {
                   bool IsTainted = false) const;
 
 public:
-  void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal Loc, SVal Val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
 };
 } // end anonymous namespace
 
@@ -55,6 +56,7 @@ static bool isBooleanType(QualType Ty) {
 }
 
 void BoolAssignmentChecker::checkBind(SVal Loc, SVal Val, const Stmt *S,
+                                      bool AtDeclInit,
                                       CheckerContext &C) const {
 
   // We are only interested in stores into Booleans.
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index fd0a398..0e5fc0a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -97,10 +97,6 @@ public:
   CheckerFrontendWithBugType UninitializedRead{
       "Accessing unitialized/garbage values"};
 
-  // FIXME: This bug type should be removed because it is only emitted in a
-  // situation that is practically impossible.
-  const BugType AdditionOverflow{&OutOfBounds, "API"};
-
   StringRef getDebugTag() const override { return "MallocChecker"; }
 
   static void *getTag() { static int tag; return &tag; }
@@ -330,7 +326,6 @@ public:
                           const Stmt *S, StringRef WarningMsg) const;
   void emitNotCStringBug(CheckerContext &C, ProgramStateRef State,
                          const Stmt *S, StringRef WarningMsg) const;
-  void emitAdditionOverflowBug(CheckerContext &C, ProgramStateRef State) const;
   void emitUninitializedReadBug(CheckerContext &C, ProgramStateRef State,
                                 const Expr *E, const MemRegion *R,
                                 StringRef Msg) const;
@@ -843,22 +838,6 @@ void CStringChecker::emitNotCStringBug(CheckerContext &C, ProgramStateRef State,
   }
 }
 
-void CStringChecker::emitAdditionOverflowBug(CheckerContext &C,
-                                             ProgramStateRef State) const {
-  if (ExplodedNode *N = C.generateErrorNode(State)) {
-    // This isn't a great error message, but this should never occur in real
-    // code anyway -- you'd have to create a buffer longer than a size_t can
-    // represent, which is sort of a contradiction.
-    const char *WarningMsg =
-        "This expression will create a string whose length is too big to "
-        "be represented as a size_t";
-
-    auto Report = std::make_unique<PathSensitiveBugReport>(AdditionOverflow,
-                                                           WarningMsg, N);
-    C.emitReport(std::move(Report));
-  }
-}
-
 ProgramStateRef CStringChecker::checkAdditionOverflow(CheckerContext &C,
                                                      ProgramStateRef state,
                                                      NonLoc left,
@@ -896,19 +875,22 @@ ProgramStateRef CStringChecker::checkAdditionOverflow(CheckerContext &C,
     SVal willOverflow = svalBuilder.evalBinOpNN(state, BO_GT, left,
                                                 *maxMinusRightNL, cmpTy);
 
-    ProgramStateRef stateOverflow, stateOkay;
-    std::tie(stateOverflow, stateOkay) =
-      state->assume(willOverflow.castAs<DefinedOrUnknownSVal>());
+    auto [StateOverflow, StateOkay] =
+        state->assume(willOverflow.castAs<DefinedOrUnknownSVal>());
 
-    if (stateOverflow && !stateOkay) {
-      // We have an overflow. Emit a bug report.
-      emitAdditionOverflowBug(C, stateOverflow);
+    if (StateOverflow && !StateOkay) {
+      // On this path the analyzer is convinced that the addition of these two
+      // values would overflow `size_t` which must be caused by the inaccuracy
+      // of our modeling because this method is called in situations where the
+      // summands are size/length values which are much less than SIZE_MAX. To
+      // avoid false positives let's just sink this invalid path.
+      C.addSink(StateOverflow);
       return nullptr;
     }
 
     // From now on, assume an overflow didn't occur.
-    assert(stateOkay);
-    state = stateOkay;
+    assert(StateOkay);
+    state = StateOkay;
   }
 
   return state;
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
index 350db4b..392c7ee 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
@@ -175,9 +175,12 @@ public:
   /// \param Loc The value of the location (pointer).
   /// \param Val The value which will be stored at the location Loc.
   /// \param S   The bind is performed while processing the statement S.
+  /// \param AtDeclInit Whether the bind is performed during declaration
+  ///                   initialization.
   ///
   /// check::Bind
-  void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &) const {}
+  void checkBind(SVal Loc, SVal Val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &) const {}
 
   /// Called after a CFG edge is taken within a function.
   ///
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index 152129e..395d724 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -48,7 +48,8 @@ class DereferenceChecker
 public:
   void checkLocation(SVal location, bool isLoad, const Stmt* S,
                      CheckerContext &C) const;
-  void checkBind(SVal L, SVal V, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
 
   static void AddDerefSource(raw_ostream &os,
                              SmallVectorImpl<SourceRange> &Ranges,
@@ -309,7 +310,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
 }
 
 void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
-                                   CheckerContext &C) const {
+                                   bool AtDeclInit, CheckerContext &C) const {
   // If we're binding to a reference, check if the value is known to be null.
   if (V.isUndef())
     return;
diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
index 7ad54c0..7eb9a1d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
@@ -150,7 +150,8 @@ public:
   IteratorModeling() = default;
 
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal Loc, SVal Val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
   void checkPostStmt(const UnaryOperator *UO, CheckerContext &C) const;
   void checkPostStmt(const BinaryOperator *BO, CheckerContext &C) const;
   void checkPostStmt(const MaterializeTemporaryExpr *MTE,
@@ -234,7 +235,7 @@ void IteratorModeling::checkPostCall(const CallEvent &Call,
 }
 
 void IteratorModeling::checkBind(SVal Loc, SVal Val, const Stmt *S,
-                                 CheckerContext &C) const {
+                                 bool AtDeclInit, CheckerContext &C) const {
   auto State = C.getState();
   const auto *Pos = getIteratorPosition(State, Val);
   if (Pos) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index 369d619..efb9809 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -3156,7 +3156,7 @@ void MallocChecker::checkPreCall(const CallEvent &Call,
   for (unsigned I = 0, E = Call.getNumArgs(); I != E; ++I) {
     SVal ArgSVal = Call.getArgSVal(I);
     if (isa<Loc>(ArgSVal)) {
-      SymbolRef Sym = ArgSVal.getAsSymbol();
+      SymbolRef Sym = ArgSVal.getAsSymbol(/*IncludeBaseRegions=*/true);
       if (!Sym)
         continue;
       if (checkUseAfterFree(Sym, C, Call.getArgExpr(I)))
diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
index 9744d1a..eeb6b72 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
@@ -97,7 +97,8 @@ public:
   // libraries.
   bool NoDiagnoseCallsToSystemHeaders = false;
 
-  void checkBind(SVal L, SVal V, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
   void checkPostStmt(const ExplicitCastExpr *CE, CheckerContext &C) const;
   void checkPreStmt(const ReturnStmt *S, CheckerContext &C) const;
   void checkPostObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
@@ -1250,7 +1251,7 @@ static bool isARCNilInitializedLocal(CheckerContext &C, const Stmt *S) {
 /// Propagate the nullability information through binds and warn when nullable
 /// pointer or null symbol is assigned to a pointer with a nonnull type.
 void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
-                                   CheckerContext &C) const {
+                                   bool AtDeclInit, CheckerContext &C) const {
   const TypedValueRegion *TVR =
       dyn_cast_or_null<TypedValueRegion>(L.getAsRegion());
   if (!TVR)
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
index ace3426..e40b4f8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
@@ -73,7 +73,8 @@ public:
   void checkPreStmt(const ReturnStmt *S, CheckerContext &C) const;
   void checkLocation(SVal location, bool isLoad, const Stmt *S,
                      CheckerContext &C) const;
-  void checkBind(SVal loc, SVal val, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal loc, SVal val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
 
   void checkPreCall(const CallEvent &CE, CheckerContext &C) const;
   void checkPostCall(const CallEvent &CE, CheckerContext &C) const;
@@ -311,9 +312,8 @@ void ObjCSelfInitChecker::checkLocation(SVal location, bool isLoad,
                 C);
 }
 
-
 void ObjCSelfInitChecker::checkBind(SVal loc, SVal val, const Stmt *S,
-                                    CheckerContext &C) const {
+                                    bool AtDeclInit, CheckerContext &C) const {
   // Allow assignment of anything to self. Self is a local variable in the
   // initializer, so it is legal to assign anything to it, like results of
   // static functions/method calls. After self is assigned something we cannot
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
index 62bc321..1762505 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
@@ -840,20 +840,27 @@ ProgramStateRef RetainCountChecker::updateSymbol(ProgramStateRef state,
 const RefCountBug &
 RetainCountChecker::errorKindToBugKind(RefVal::Kind ErrorKind,
                                        SymbolRef Sym) const {
+  const RefCountFrontend &FE = getPreferredFrontend();
+
   switch (ErrorKind) {
     case RefVal::ErrorUseAfterRelease:
-      return *UseAfterRelease;
+      return FE.UseAfterRelease;
     case RefVal::ErrorReleaseNotOwned:
-      return *ReleaseNotOwned;
+      return FE.ReleaseNotOwned;
     case RefVal::ErrorDeallocNotOwned:
       if (Sym->getType()->getPointeeCXXRecordDecl())
-        return *FreeNotOwned;
-      return *DeallocNotOwned;
+        return FE.FreeNotOwned;
+      return FE.DeallocNotOwned;
     default:
       llvm_unreachable("Unhandled error.");
   }
 }
 
+bool RetainCountChecker::isReleaseUnownedError(RefVal::Kind ErrorKind) const {
+  return ErrorKind == RefVal::ErrorReleaseNotOwned ||
+         ErrorKind == RefVal::ErrorDeallocNotOwned;
+}
+
 void RetainCountChecker::processNonLeakError(ProgramStateRef St,
                                              SourceRange ErrorRange,
                                              RefVal::Kind ErrorKind,
@@ -874,8 +881,8 @@ void RetainCountChecker::processNonLeakError(ProgramStateRef St,
     return;
 
   auto report = std::make_unique<RefCountReport>(
-      errorKindToBugKind(ErrorKind, Sym),
-      C.getASTContext().getLangOpts(), N, Sym);
+      errorKindToBugKind(ErrorKind, Sym), C.getASTContext().getLangOpts(), N,
+      Sym, /*isLeak=*/false, isReleaseUnownedError(ErrorKind));
   report->addRange(ErrorRange);
   C.emitReport(std::move(report));
 }
@@ -1090,8 +1097,8 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S,
         ExplodedNode *N = C.addTransition(state, Pred);
         if (N) {
           const LangOptions &LOpts = C.getASTContext().getLangOpts();
-          auto R =
-              std::make_unique<RefLeakReport>(*LeakAtReturn, LOpts, N, Sym, C);
+          auto R = std::make_unique<RefLeakReport>(
+              getPreferredFrontend().LeakAtReturn, LOpts, N, Sym, C);
           C.emitReport(std::move(R));
         }
         return N;
@@ -1113,7 +1120,8 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S,
         ExplodedNode *N = C.addTransition(state, Pred);
         if (N) {
           auto R = std::make_unique<RefCountReport>(
-              *ReturnNotOwnedForOwned, C.getASTContext().getLangOpts(), N, Sym);
+              getPreferredFrontend().ReturnNotOwnedForOwned,
+              C.getASTContext().getLangOpts(), N, Sym);
           C.emitReport(std::move(R));
         }
         return N;
@@ -1128,7 +1136,7 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S,
 //===----------------------------------------------------------------------===//
 
 void RetainCountChecker::checkBind(SVal loc, SVal val, const Stmt *S,
-                                   CheckerContext &C) const {
+                                   bool AtDeclInit, CheckerContext &C) const {
   ProgramStateRef state = C.getState();
   const MemRegion *MR = loc.getAsRegion();
 
@@ -1261,8 +1269,8 @@ ProgramStateRef RetainCountChecker::handleAutoreleaseCounts(
     os << "has a +" << V.getCount() << " retain count";
 
     const LangOptions &LOpts = Ctx.getASTContext().getLangOpts();
-    auto R = std::make_unique<RefCountReport>(*OverAutorelease, LOpts, N, Sym,
-                                              os.str());
+    auto R = std::make_unique<RefCountReport>(
+        getPreferredFrontend().OverAutorelease, LOpts, N, Sym, os.str());
     Ctx.emitReport(std::move(R));
   }
 
@@ -1307,8 +1315,10 @@ RetainCountChecker::processLeaks(ProgramStateRef state,
   const LangOptions &LOpts = Ctx.getASTContext().getLangOpts();
 
   if (N) {
+    const RefCountFrontend &FE = getPreferredFrontend();
+    const RefCountBug &BT = Pred ? FE.LeakWithinFunction : FE.LeakAtReturn;
+
     for (SymbolRef L : Leaked) {
-      const RefCountBug &BT = Pred ? *LeakWithinFunction : *LeakAtReturn;
       Ctx.emitReport(std::make_unique<RefLeakReport>(BT, LOpts, N, L, Ctx));
     }
   }
@@ -1463,44 +1473,31 @@ std::unique_ptr<SimpleProgramPointTag> RetainCountChecker::DeallocSentTag;
 std::unique_ptr<SimpleProgramPointTag> RetainCountChecker::CastFailTag;
 
 void ento::registerRetainCountBase(CheckerManager &Mgr) {
-  auto *Chk = Mgr.registerChecker<RetainCountChecker>();
+  auto *Chk = Mgr.getChecker<RetainCountChecker>();
   Chk->DeallocSentTag = std::make_unique<SimpleProgramPointTag>(
       "RetainCountChecker", "DeallocSent");
   Chk->CastFailTag = std::make_unique<SimpleProgramPointTag>(
       "RetainCountChecker", "DynamicCastFail");
 }
 
-bool ento::shouldRegisterRetainCountBase(const CheckerManager &mgr) {
+bool ento::shouldRegisterRetainCountBase(const CheckerManager &) {
   return true;
 }
+
 void ento::registerRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
-  Chk->TrackObjCAndCFObjects = true;
+  Chk->RetainCount.enable(Mgr);
   Chk->TrackNSCFStartParam = Mgr.getAnalyzerOptions().getCheckerBooleanOption(
       Mgr.getCurrentCheckerName(), "TrackNSCFStartParam");
-
-#define INIT_BUGTYPE(KIND)                                                     \
-  Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(),       \
-                                            RefCountBug::KIND);
-  // TODO: Ideally, we should have a checker for each of these bug types.
-  INIT_BUGTYPE(UseAfterRelease)
-  INIT_BUGTYPE(ReleaseNotOwned)
-  INIT_BUGTYPE(DeallocNotOwned)
-  INIT_BUGTYPE(FreeNotOwned)
-  INIT_BUGTYPE(OverAutorelease)
-  INIT_BUGTYPE(ReturnNotOwnedForOwned)
-  INIT_BUGTYPE(LeakWithinFunction)
-  INIT_BUGTYPE(LeakAtReturn)
-#undef INIT_BUGTYPE
 }
 
-bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterRetainCountChecker(const CheckerManager &) {
   return true;
 }
 
 void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
-  Chk->TrackOSObjects = true;
+  Chk->OSObjectRetainCount.enable(Mgr);
 
   // FIXME: We want bug reports to always have the same checker name associated
   // with them, yet here, if RetainCountChecker is disabled but
@@ -1511,21 +1508,8 @@ void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) {
   // diagnostics, and **hidden checker options** with the fine-tuning of
   // modeling. Following this logic, OSObjectRetainCountChecker should be the
   // latter, but we can't just remove it for backward compatibility reasons.
-#define LAZY_INIT_BUGTYPE(KIND)                                                \
-  if (!Chk->KIND)                                                              \
-    Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(),     \
-                                              RefCountBug::KIND);
-  LAZY_INIT_BUGTYPE(UseAfterRelease)
-  LAZY_INIT_BUGTYPE(ReleaseNotOwned)
-  LAZY_INIT_BUGTYPE(DeallocNotOwned)
-  LAZY_INIT_BUGTYPE(FreeNotOwned)
-  LAZY_INIT_BUGTYPE(OverAutorelease)
-  LAZY_INIT_BUGTYPE(ReturnNotOwnedForOwned)
-  LAZY_INIT_BUGTYPE(LeakWithinFunction)
-  LAZY_INIT_BUGTYPE(LeakAtReturn)
-#undef LAZY_INIT_BUGTYPE
 }
 
-bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &) {
   return true;
 }
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
index 0e81143..dc8bad6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
@@ -235,51 +235,32 @@ public:
 };
 
 class RetainCountChecker
-  : public Checker< check::Bind,
-                    check::DeadSymbols,
-                    check::BeginFunction,
-                    check::EndFunction,
-                    check::PostStmt<BlockExpr>,
-                    check::PostStmt<CastExpr>,
-                    check::PostStmt<ObjCArrayLiteral>,
-                    check::PostStmt<ObjCDictionaryLiteral>,
-                    check::PostStmt<ObjCBoxedExpr>,
-                    check::PostStmt<ObjCIvarRefExpr>,
-                    check::PostCall,
-                    check::RegionChanges,
-                    eval::Assume,
-                    eval::Call > {
+    : public CheckerFamily<
+          check::Bind, check::DeadSymbols, check::BeginFunction,
+          check::EndFunction, check::PostStmt<BlockExpr>,
+          check::PostStmt<CastExpr>, check::PostStmt<ObjCArrayLiteral>,
+          check::PostStmt<ObjCDictionaryLiteral>,
+          check::PostStmt<ObjCBoxedExpr>, check::PostStmt<ObjCIvarRefExpr>,
+          check::PostCall, check::RegionChanges, eval::Assume, eval::Call> {
 
 public:
-  std::unique_ptr<RefCountBug> UseAfterRelease;
-  std::unique_ptr<RefCountBug> ReleaseNotOwned;
-  std::unique_ptr<RefCountBug> DeallocNotOwned;
-  std::unique_ptr<RefCountBug> FreeNotOwned;
-  std::unique_ptr<RefCountBug> OverAutorelease;
-  std::unique_ptr<RefCountBug> ReturnNotOwnedForOwned;
-  std::unique_ptr<RefCountBug> LeakWithinFunction;
-  std::unique_ptr<RefCountBug> LeakAtReturn;
+  RefCountFrontend RetainCount;
+  RefCountFrontend OSObjectRetainCount;
 
   mutable std::unique_ptr<RetainSummaryManager> Summaries;
 
   static std::unique_ptr<SimpleProgramPointTag> DeallocSentTag;
   static std::unique_ptr<SimpleProgramPointTag> CastFailTag;
 
-  /// Track Objective-C and CoreFoundation objects.
-  bool TrackObjCAndCFObjects = false;
-
-  /// Track sublcasses of OSObject.
-  bool TrackOSObjects = false;
-
   /// Track initial parameters (for the entry point) for NS/CF objects.
   bool TrackNSCFStartParam = false;
 
-  RetainCountChecker() {};
+  StringRef getDebugTag() const override { return "RetainCountChecker"; }
 
   RetainSummaryManager &getSummaryManager(ASTContext &Ctx) const {
     if (!Summaries)
-      Summaries.reset(
-          new RetainSummaryManager(Ctx, TrackObjCAndCFObjects, TrackOSObjects));
+      Summaries = std::make_unique<RetainSummaryManager>(
+          Ctx, RetainCount.isEnabled(), OSObjectRetainCount.isEnabled());
     return *Summaries;
   }
 
@@ -287,10 +268,20 @@ public:
     return getSummaryManager(C.getASTContext());
   }
 
+  const RefCountFrontend &getPreferredFrontend() const {
+    // FIXME: The two frontends of this checker family are in an unusual
+    // relationship: if they are both enabled, then all bug reports are
+    // reported by RetainCount (i.e. `osx.cocoa.RetainCount`), even the bugs
+    // that "belong to" OSObjectRetainCount (i.e. `osx.OSObjectRetainCount`).
+    // This is counter-intuitive and should be fixed to avoid confusion.
+    return RetainCount.isEnabled() ? RetainCount : OSObjectRetainCount;
+  }
+
   void printState(raw_ostream &Out, ProgramStateRef State,
                   const char *NL, const char *Sep) const override;
 
-  void checkBind(SVal loc, SVal val, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal loc, SVal val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
   void checkPostStmt(const BlockExpr *BE, CheckerContext &C) const;
   void checkPostStmt(const CastExpr *CE, CheckerContext &C) const;
 
@@ -337,6 +328,8 @@ public:
   const RefCountBug &errorKindToBugKind(RefVal::Kind ErrorKind,
                                         SymbolRef Sym) const;
 
+  bool isReleaseUnownedError(RefVal::Kind ErrorKind) const;
+
   void processNonLeakError(ProgramStateRef St, SourceRange ErrorRange,
                            RefVal::Kind ErrorKind, SymbolRef Sym,
                            CheckerContext &C) const;
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
index c9f5dc9..cad2c72 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
@@ -21,57 +21,6 @@ using namespace clang;
 using namespace ento;
 using namespace retaincountchecker;
 
-StringRef RefCountBug::bugTypeToName(RefCountBug::RefCountBugKind BT) {
-  switch (BT) {
-  case UseAfterRelease:
-    return "Use-after-release";
-  case ReleaseNotOwned:
-    return "Bad release";
-  case DeallocNotOwned:
-    return "-dealloc sent to non-exclusively owned object";
-  case FreeNotOwned:
-    return "freeing non-exclusively owned object";
-  case OverAutorelease:
-    return "Object autoreleased too many times";
-  case ReturnNotOwnedForOwned:
-    return "Method should return an owned object";
-  case LeakWithinFunction:
-    return "Leak";
-  case LeakAtReturn:
-    return "Leak of returned object";
-  }
-  llvm_unreachable("Unknown RefCountBugKind");
-}
-
-StringRef RefCountBug::getDescription() const {
-  switch (BT) {
-  case UseAfterRelease:
-    return "Reference-counted object is used after it is released";
-  case ReleaseNotOwned:
-    return "Incorrect decrement of the reference count of an object that is "
-           "not owned at this point by the caller";
-  case DeallocNotOwned:
-    return "-dealloc sent to object that may be referenced elsewhere";
-  case FreeNotOwned:
-    return  "'free' called on an object that may be referenced elsewhere";
-  case OverAutorelease:
-    return "Object autoreleased too many times";
-  case ReturnNotOwnedForOwned:
-    return "Object with a +0 retain count returned to caller where a +1 "
-           "(owning) retain count is expected";
-  case LeakWithinFunction:
-  case LeakAtReturn:
-    return "";
-  }
-  llvm_unreachable("Unknown RefCountBugKind");
-}
-
-RefCountBug::RefCountBug(CheckerNameRef Checker, RefCountBugKind BT)
-    : BugType(Checker, bugTypeToName(BT), categories::MemoryRefCount,
-              /*SuppressOnSink=*/BT == LeakWithinFunction ||
-                  BT == LeakAtReturn),
-      BT(BT) {}
-
 static bool isNumericLiteralExpression(const Expr *E) {
   // FIXME: This set of cases was copied from SemaExprObjC.
   return isa<IntegerLiteral, CharacterLiteral, FloatingLiteral,
@@ -312,9 +261,11 @@ namespace retaincountchecker {
 class RefCountReportVisitor : public BugReporterVisitor {
 protected:
   SymbolRef Sym;
+  bool IsReleaseUnowned;
 
 public:
-  RefCountReportVisitor(SymbolRef sym) : Sym(sym) {}
+  RefCountReportVisitor(SymbolRef S, bool IRU)
+      : Sym(S), IsReleaseUnowned(IRU) {}
 
   void Profile(llvm::FoldingSetNodeID &ID) const override {
     static int x = 0;
@@ -334,7 +285,8 @@ public:
 class RefLeakReportVisitor : public RefCountReportVisitor {
 public:
   RefLeakReportVisitor(SymbolRef Sym, const MemRegion *LastBinding)
-      : RefCountReportVisitor(Sym), LastBinding(LastBinding) {}
+      : RefCountReportVisitor(Sym, /*IsReleaseUnowned=*/false),
+        LastBinding(LastBinding) {}
 
   PathDiagnosticPieceRef getEndPath(BugReporterContext &BRC,
                                     const ExplodedNode *N,
@@ -452,12 +404,6 @@ annotateStartParameter(const ExplodedNode *N, SymbolRef Sym,
 PathDiagnosticPieceRef
 RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC,
                                  PathSensitiveBugReport &BR) {
-
-  const auto &BT = static_cast<const RefCountBug&>(BR.getBugType());
-
-  bool IsFreeUnowned = BT.getBugType() == RefCountBug::FreeNotOwned ||
-                       BT.getBugType() == RefCountBug::DeallocNotOwned;
-
   const SourceManager &SM = BRC.getSourceManager();
   CallEventManager &CEMgr = BRC.getStateManager().getCallEventManager();
   if (auto CE = N->getLocationAs<CallExitBegin>())
@@ -490,7 +436,7 @@ RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC,
   std::string sbuf;
   llvm::raw_string_ostream os(sbuf);
 
-  if (PrevT && IsFreeUnowned && CurrV.isNotOwned() && PrevT->isOwned()) {
+  if (PrevT && IsReleaseUnowned && CurrV.isNotOwned() && PrevT->isOwned()) {
     os << "Object is now not exclusively owned";
     auto Pos = PathDiagnosticLocation::create(N->getLocation(), SM);
     return std::make_shared<PathDiagnosticEventPiece>(Pos, sbuf);
@@ -815,10 +761,8 @@ RefLeakReportVisitor::getEndPath(BugReporterContext &BRC,
         if (K == ObjKind::ObjC || K == ObjKind::CF) {
           os << "whose name ('" << *FD
              << "') does not contain 'Copy' or 'Create'.  This violates the "
-                "naming"
-                " convention rules given in the Memory Management Guide for "
-                "Core"
-                " Foundation";
+                "naming convention rules given in the Memory Management Guide "
+                "for Core Foundation";
         } else if (RV->getObjKind() == ObjKind::OS) {
           std::string FuncName = FD->getNameAsString();
           os << "whose name ('" << FuncName << "') starts with '"
@@ -836,19 +780,20 @@ RefLeakReportVisitor::getEndPath(BugReporterContext &BRC,
 }
 
 RefCountReport::RefCountReport(const RefCountBug &D, const LangOptions &LOpts,
-                               ExplodedNode *n, SymbolRef sym, bool isLeak)
-    : PathSensitiveBugReport(D, D.getDescription(), n), Sym(sym),
+                               ExplodedNode *n, SymbolRef sym, bool isLeak,
+                               bool IsReleaseUnowned)
+    : PathSensitiveBugReport(D, D.getReportMessage(), n), Sym(sym),
       isLeak(isLeak) {
   if (!isLeak)
-    addVisitor<RefCountReportVisitor>(sym);
+    addVisitor<RefCountReportVisitor>(sym, IsReleaseUnowned);
 }
 
 RefCountReport::RefCountReport(const RefCountBug &D, const LangOptions &LOpts,
                                ExplodedNode *n, SymbolRef sym,
                                StringRef endText)
-    : PathSensitiveBugReport(D, D.getDescription(), endText, n) {
+    : PathSensitiveBugReport(D, D.getReportMessage(), endText, n) {
 
-  addVisitor<RefCountReportVisitor>(sym);
+  addVisitor<RefCountReportVisitor>(sym, /*IsReleaseUnowned=*/false);
 }
 
 void RefLeakReport::deriveParamLocation(CheckerContext &Ctx) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
index d059008..6ceb86f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
@@ -25,25 +25,44 @@ namespace ento {
 namespace retaincountchecker {
 
 class RefCountBug : public BugType {
+  StringRef ReportMessage;
+
 public:
-  enum RefCountBugKind {
-    UseAfterRelease,
-    ReleaseNotOwned,
-    DeallocNotOwned,
-    FreeNotOwned,
-    OverAutorelease,
-    ReturnNotOwnedForOwned,
-    LeakWithinFunction,
-    LeakAtReturn,
-  };
-  RefCountBug(CheckerNameRef Checker, RefCountBugKind BT);
-  StringRef getDescription() const;
-
-  RefCountBugKind getBugType() const { return BT; }
-
-private:
-  RefCountBugKind BT;
-  static StringRef bugTypeToName(RefCountBugKind BT);
+  RefCountBug(const CheckerFrontend *CF, StringRef Desc, StringRef ReportMsg,
+              bool SuppressOnSink = false)
+      : BugType(CF, Desc, categories::MemoryRefCount, SuppressOnSink),
+        ReportMessage(ReportMsg) {}
+  StringRef getReportMessage() const { return ReportMessage; }
+};
+
+class RefCountFrontend : public CheckerFrontend {
+public:
+  const RefCountBug UseAfterRelease{
+      this, "Use-after-release",
+      "Reference-counted object is used after it is released"};
+  const RefCountBug ReleaseNotOwned{
+      this, "Bad release",
+      "Incorrect decrement of the reference count of an object that is not "
+      "owned at this point by the caller"};
+  const RefCountBug DeallocNotOwned{
+      this, "-dealloc sent to non-exclusively owned object",
+      "-dealloc sent to object that may be referenced elsewhere"};
+  const RefCountBug FreeNotOwned{
+      this, "freeing non-exclusively owned object",
+      "'free' called on an object that may be referenced elsewhere"};
+  const RefCountBug OverAutorelease{this, "Object autoreleased too many times",
+                                    "Object autoreleased too many times"};
+  const RefCountBug ReturnNotOwnedForOwned{
+      this, "Method should return an owned object",
+      "Object with a +0 retain count returned to caller where a +1 (owning) "
+      "retain count is expected"};
+  // For these two bug types the report message will be generated dynamically
+  // by `RefLeakReport::createDescription` so the empty string taken from the
+  // BugType will be ignored (overwritten).
+  const RefCountBug LeakWithinFunction{this, "Leak", /*ReportMsg=*/"",
+                                       /*SuppressOnSink=*/true};
+  const RefCountBug LeakAtReturn{this, "Leak of returned object",
+                                 /*ReportMsg=*/"", /*SuppressOnSink=*/true};
 };
 
 class RefCountReport : public PathSensitiveBugReport {
@@ -53,8 +72,8 @@ protected:
 
 public:
   RefCountReport(const RefCountBug &D, const LangOptions &LOpts,
-              ExplodedNode *n, SymbolRef sym,
-              bool isLeak=false);
+                 ExplodedNode *n, SymbolRef sym, bool isLeak = false,
+                 bool IsReleaseUnowned = false);
 
   RefCountReport(const RefCountBug &D, const LangOptions &LOpts,
               ExplodedNode *n, SymbolRef sym,
diff --git a/clang/lib/StaticAnalyzer/Checkers/StoreToImmutableChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StoreToImmutableChecker.cpp
index afad419..2bb3917 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StoreToImmutableChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StoreToImmutableChecker.cpp
@@ -26,53 +26,11 @@ class StoreToImmutableChecker : public Checker<check::Bind> {
   const BugType BT{this, "Write to immutable memory", "CERT Environment (ENV)"};
 
 public:
-  void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal Loc, SVal Val, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
 };
 } // end anonymous namespace
 
-static bool isInitializationContext(const Stmt *S, CheckerContext &C) {
-  // Check if this is a DeclStmt (variable declaration)
-  if (isa<DeclStmt>(S))
-    return true;
-
-  // This part is specific for initialization of const lambdas pre-C++17.
-  // Lets look at the AST of the statement:
-  // ```
-  // const auto lambda = [](){};
-  // ```
-  //
-  // The relevant part of the AST for this case prior to C++17 is:
-  // ...
-  // `-DeclStmt
-  //   `-VarDecl
-  //     `-ExprWithCleanups
-  //       `-CXXConstructExpr
-  // ...
-  // In C++17 and later, the AST is different:
-  // ...
-  // `-DeclStmt
-  //   `-VarDecl
-  //     `-ImplicitCastExpr
-  //       `-LambdaExpr
-  //         |-CXXRecordDecl
-  //         `-CXXConstructExpr
-  // ...
-  // And even beside this, the statement `S` that is given to the checkBind
-  // callback is the VarDecl in C++17 and later, and the CXXConstructExpr in
-  // C++14 and before. So in order to support the C++14 we need the following
-  // ugly hack to detect whether this construction is used to initialize a
-  // variable.
-  //
-  // FIXME: This should be eliminated by improving the API of checkBind to
-  // ensure that it consistently passes the `VarDecl` (instead of the
-  // `CXXConstructExpr`) when the constructor call denotes the initialization
-  // of a variable with a lambda, or maybe less preferably, try the more
-  // invasive approach of passing the information forward to the checkers
-  // whether the current bind is an initialization or an assignment.
-  const auto *ConstructExp = dyn_cast<CXXConstructExpr>(S);
-  return ConstructExp && ConstructExp->isElidable();
-}
-
 static bool isEffectivelyConstRegion(const MemRegion *MR, CheckerContext &C) {
   if (isa<GlobalImmutableSpaceRegion>(MR))
     return true;
@@ -128,6 +86,7 @@ getInnermostEnclosingConstDeclRegion(const MemRegion *MR, CheckerContext &C) {
 }
 
 void StoreToImmutableChecker::checkBind(SVal Loc, SVal Val, const Stmt *S,
+                                        bool AtDeclInit,
                                         CheckerContext &C) const {
   // We are only interested in stores to memory regions
   const MemRegion *MR = Loc.getAsRegion();
@@ -136,9 +95,7 @@ void StoreToImmutableChecker::checkBind(SVal Loc, SVal Val, const Stmt *S,
 
   // Skip variable declarations and initializations - we only want to catch
   // actual writes
-  // FIXME: If the API of checkBind would allow to distinguish between
-  // initialization and assignment, we could use that instead.
-  if (isInitializationContext(S, C))
+  if (AtDeclInit)
     return;
 
   // Check if the region is in the global immutable space
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
index e98de33..7f8923c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
@@ -26,13 +26,13 @@ class UndefinedAssignmentChecker
   const BugType BT{this, "Assigned value is uninitialized"};
 
 public:
-  void checkBind(SVal location, SVal val, const Stmt *S,
+  void checkBind(SVal location, SVal val, const Stmt *S, bool AtDeclInit,
                  CheckerContext &C) const;
 };
 }
 
 void UndefinedAssignmentChecker::checkBind(SVal location, SVal val,
-                                           const Stmt *StoreE,
+                                           const Stmt *StoreE, bool AtDeclInit,
                                            CheckerContext &C) const {
   if (!val.isUndef())
     return;
diff --git a/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
index cb73ac6..116dd93 100644
--- a/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
@@ -62,7 +62,8 @@ public:
 
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkBind(SVal L, SVal V, const Stmt *S, CheckerContext &C) const;
+  void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
+                 CheckerContext &C) const;
   void checkPreStmt(const ReturnStmt *RS, CheckerContext &C) const;
 };
 
@@ -188,7 +189,7 @@ void VforkChecker::checkPreCall(const CallEvent &Call,
 }
 
 // Prohibit writes in child process (except for vfork's lhs).
-void VforkChecker::checkBind(SVal L, SVal V, const Stmt *S,
+void VforkChecker::checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
                              CheckerContext &C) const {
   ProgramStateRef State = C.getState();
   if (!isChildProcess(State))
diff --git a/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp b/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp
index 0fe677e..44c6f9f 100644
--- a/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp
@@ -376,11 +376,13 @@ namespace {
     const Stmt *S;
     ExprEngine &Eng;
     const ProgramPoint &PP;
+    bool AtDeclInit;
 
-    CheckBindContext(const CheckersTy &checkers,
-                     SVal loc, SVal val, const Stmt *s, ExprEngine &eng,
+    CheckBindContext(const CheckersTy &checkers, SVal loc, SVal val,
+                     const Stmt *s, bool AtDeclInit, ExprEngine &eng,
                      const ProgramPoint &pp)
-        : Checkers(checkers), Loc(loc), Val(val), S(s), Eng(eng), PP(pp) {}
+        : Checkers(checkers), Loc(loc), Val(val), S(s), Eng(eng), PP(pp),
+          AtDeclInit(AtDeclInit) {}
 
     CheckersTy::const_iterator checkers_begin() { return Checkers.begin(); }
     CheckersTy::const_iterator checkers_end() { return Checkers.end(); }
@@ -391,7 +393,7 @@ namespace {
       const ProgramPoint &L = PP.withTag(checkFn.Checker);
       CheckerContext C(Bldr, Eng, Pred, L);
 
-      checkFn(Loc, Val, S, C);
+      checkFn(Loc, Val, S, AtDeclInit, C);
     }
   };
 
@@ -408,10 +410,10 @@ namespace {
 /// Run checkers for binding of a value to a location.
 void CheckerManager::runCheckersForBind(ExplodedNodeSet &Dst,
                                         const ExplodedNodeSet &Src,
-                                        SVal location, SVal val,
-                                        const Stmt *S, ExprEngine &Eng,
+                                        SVal location, SVal val, const Stmt *S,
+                                        bool AtDeclInit, ExprEngine &Eng,
                                         const ProgramPoint &PP) {
-  CheckBindContext C(BindCheckers, location, val, S, Eng, PP);
+  CheckBindContext C(BindCheckers, location, val, S, AtDeclInit, Eng, PP);
   llvm::TimeTraceScope TimeScope{
       "CheckerManager::runCheckersForBind",
       [&val]() { return getTimeTraceBindMetadata(val); }};
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index d874844..c853c00 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -3714,9 +3714,8 @@ ExprEngine::notifyCheckersOfPointerEscape(ProgramStateRef State,
 /// evalBind - Handle the semantics of binding a value to a specific location.
 ///  This method is used by evalStore and (soon) VisitDeclStmt, and others.
 void ExprEngine::evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE,
-                          ExplodedNode *Pred,
-                          SVal location, SVal Val,
-                          bool atDeclInit, const ProgramPoint *PP) {
+                          ExplodedNode *Pred, SVal location, SVal Val,
+                          bool AtDeclInit, const ProgramPoint *PP) {
   const LocationContext *LC = Pred->getLocationContext();
   PostStmt PS(StoreE, LC);
   if (!PP)
@@ -3725,7 +3724,7 @@ void ExprEngine::evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE,
   // Do a previsit of the bind.
   ExplodedNodeSet CheckedSet;
   getCheckerManager().runCheckersForBind(CheckedSet, Pred, location, Val,
-                                         StoreE, *this, *PP);
+                                         StoreE, AtDeclInit, *this, *PP);
 
   StmtNodeBuilder Bldr(CheckedSet, Dst, *currBldrCtx);
 
@@ -3748,8 +3747,8 @@ void ExprEngine::evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE,
     // When binding the value, pass on the hint that this is a initialization.
     // For initializations, we do not need to inform clients of region
     // changes.
-    state = state->bindLoc(location.castAs<Loc>(),
-                           Val, LC, /* notifyChanges = */ !atDeclInit);
+    state = state->bindLoc(location.castAs<Loc>(), Val, LC,
+                           /* notifyChanges = */ !AtDeclInit);
 
     const MemRegion *LocReg = nullptr;
     if (std::optional<loc::MemRegionVal> LocRegVal =
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index fe70558..c0b28d2 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -85,7 +85,7 @@ void ExprEngine::performTrivialCopy(NodeBuilder &Bldr, ExplodedNode *Pred,
   evalLocation(Tmp, CallExpr, VExpr, Pred, Pred->getState(), V,
                /*isLoad=*/true);
   for (ExplodedNode *N : Tmp)
-    evalBind(Dst, CallExpr, N, ThisVal, V, true);
+    evalBind(Dst, CallExpr, N, ThisVal, V, !AlwaysReturnsLValue);
 
   PostStmt PS(CallExpr, LCtx);
   for (ExplodedNode *N : Dst) {