9 files changed, 143 insertions, 210 deletions
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 21af3d6..8904396 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1638,6 +1638,36 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func,
   return true;
 }
 
+static bool GetDynamicDecl(InterpState &S, CodePtr OpPC, Pointer TypePtr,
+                           const CXXRecordDecl *&DynamicDecl) {
+  while (TypePtr.isBaseClass())
+    TypePtr = TypePtr.getBase();
+
+  QualType DynamicType = TypePtr.getType();
+  if (TypePtr.isStatic() || TypePtr.isConst()) {
+    const VarDecl *VD = TypePtr.getDeclDesc()->asVarDecl();
+    if (!VD->isConstexpr()) {
+      const Expr *E = S.Current->getExpr(OpPC);
+      APValue V = TypePtr.toAPValue(S.getASTContext());
+      QualType TT = S.getASTContext().getLValueReferenceType(DynamicType);
+      S.FFDiag(E, diag::note_constexpr_polymorphic_unknown_dynamic_type)
+          << AccessKinds::AK_MemberCall << V.getAsString(S.getASTContext(), TT);
+      return false;
+    }
+  }
+
+  if (DynamicType->isPointerType() || DynamicType->isReferenceType()) {
+    DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
+  } else if (DynamicType->isArrayType()) {
+    const Type *ElemType = DynamicType->getPointeeOrArrayElementType();
+    assert(ElemType);
+    DynamicDecl = ElemType->getAsCXXRecordDecl();
+  } else {
+    DynamicDecl = DynamicType->getAsCXXRecordDecl();
+  }
+  return true;
+}
+
 bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
               uint32_t VarArgSize) {
   assert(Func->hasThisPointer());
@@ -1662,17 +1692,8 @@ bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
   }
 
   const CXXRecordDecl *DynamicDecl = nullptr;
-  {
-    Pointer TypePtr = ThisPtr;
-    while (TypePtr.isBaseClass())
-      TypePtr = TypePtr.getBase();
-
-    QualType DynamicType = TypePtr.getType();
-    if (DynamicType->isPointerType() || DynamicType->isReferenceType())
-      DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
-    else
-      DynamicDecl = DynamicType->getAsCXXRecordDecl();
-  }
+  if (!GetDynamicDecl(S, OpPC, ThisPtr, DynamicDecl))
+    return false;
   assert(DynamicDecl);
 
   const auto *StaticDecl = cast<CXXRecordDecl>(Func->getParentDecl());
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 68ebfdf..a3c4ba5 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -736,25 +736,6 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-/// rotateleft(value, amount)
-static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC,
-                                   const InterpFrame *Frame,
-                                   const CallExpr *Call, bool Right) {
-  APSInt Amount = popToAPSInt(S, Call->getArg(1));
-  APSInt Value = popToAPSInt(S, Call->getArg(0));
-
-  APSInt Result;
-  if (Right)
-    Result = APSInt(Value.rotr(Amount.urem(Value.getBitWidth())),
-                    /*IsUnsigned=*/true);
-  else // Left.
-    Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())),
-                    /*IsUnsigned=*/true);
-
-  pushInteger(S, Result, Call->getType());
-  return true;
-}
-
 static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC,
                                 const InterpFrame *Frame,
                                 const CallExpr *Call) {
@@ -3160,7 +3141,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case Builtin::BI_rotl:
   case Builtin::BI_lrotl:
   case Builtin::BI_rotl64:
-    return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/false);
+    return interp__builtin_elementwise_int_binop(
+        S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt {
+          return Value.rotl(Amount);
+        });
 
   case Builtin::BI__builtin_rotateright8:
   case Builtin::BI__builtin_rotateright16:
@@ -3171,7 +3155,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case Builtin::BI_rotr:
   case Builtin::BI_lrotr:
   case Builtin::BI_rotr64:
-    return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/true);
+    return interp__builtin_elementwise_int_binop(
+        S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt {
+          return Value.rotr(Amount);
+        });
 
   case Builtin::BI__builtin_ffs:
   case Builtin::BI__builtin_ffsl:
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
index bbc45e5..24a5fc2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
@@ -221,10 +221,9 @@ mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca(
   return initialAlloca;
 }
 
-mlir::Value
-OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue,
-                                           mlir::Value bound,
-                                           mlir::Location loc, bool inverse) {
+std::pair<mlir::Value, mlir::Value> OpenACCRecipeBuilderBase::createBoundsLoop(
+    mlir::Value subscriptedValue, mlir::Value subscriptedValue2,
+    mlir::Value bound, mlir::Location loc, bool inverse) {
   mlir::Operation *bodyInsertLoc;
 
   mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy);
@@ -249,7 +248,6 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue,
 
     return cir::PtrStrideOp::create(builder, loc, eltLoad.getType(), eltLoad,
                                     idxLoad);
-        
   };
 
   auto forStmtBuilder = [&]() {
@@ -303,6 +301,8 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue,
 
           if (subscriptedValue)
             subscriptedValue = doSubscriptOp(subscriptedValue, load);
+          if (subscriptedValue2)
+            subscriptedValue2 = doSubscriptOp(subscriptedValue2, load);
           bodyInsertLoc = builder.createYield(loc);
         },
         /*stepBuilder=*/
@@ -325,7 +325,7 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue,
   // Leave the insertion point to be inside the body, so we can loop over
   // these things.
   builder.setInsertionPoint(bodyInsertLoc);
-  return subscriptedValue;
+  return {subscriptedValue, subscriptedValue2};
 }
 
 mlir::acc::ReductionOperator
@@ -434,7 +434,7 @@ void OpenACCRecipeBuilderBase::createInitRecipe(
     mlir::Location loc, mlir::Location locEnd, SourceRange exprRange,
     mlir::Value mainOp, mlir::Region &recipeInitRegion, size_t numBounds,
     llvm::ArrayRef<QualType> boundTypes, const VarDecl *allocaDecl,
-    QualType origType) {
+    QualType origType, bool emitInitExpr) {
   assert(allocaDecl && "Required recipe variable not set?");
   CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, allocaDecl};
 
@@ -464,14 +464,15 @@ void OpenACCRecipeBuilderBase::createInitRecipe(
     // initialize this variable correctly.
     CIRGenFunction::AutoVarEmission tempDeclEmission =
         cgf.emitAutoVarAlloca(*allocaDecl, builder.saveInsertionPoint());
-    cgf.emitAutoVarInit(tempDeclEmission);
+    if (emitInitExpr)
+      cgf.emitAutoVarInit(tempDeclEmission);
   } else {
     mlir::Value alloca = makeBoundsAlloca(
         block, exprRange, loc, allocaDecl->getName(), numBounds, boundTypes);
 
     // If the initializer is trivial, there is nothing to do here, so save
     // ourselves some effort.
-    if (allocaDecl->getInit() &&
+    if (emitInitExpr && allocaDecl->getInit() &&
         (!cgf.isTrivialInitializer(allocaDecl->getInit()) ||
          cgf.getContext().getLangOpts().getTrivialAutoVarInit() !=
              LangOptions::TrivialAutoVarInitKind::Uninitialized))
@@ -484,35 +485,42 @@ void OpenACCRecipeBuilderBase::createInitRecipe(
 
 void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy(
     mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
-    CIRGenFunction::AutoVarEmission tempDeclEmission,
-    mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe,
-    const VarDecl *temporary) {
-  mlir::Block *block =
-      createRecipeBlock(recipe.getCopyRegion(), mainOp.getType(), loc,
-                        /*numBounds=*/0, /*isInit=*/false);
-  builder.setInsertionPointToEnd(&recipe.getCopyRegion().back());
+    const VarDecl *allocaDecl, const VarDecl *temporary,
+    mlir::Region &copyRegion, size_t numBounds) {
+  mlir::Block *block = createRecipeBlock(copyRegion, mainOp.getType(), loc,
+                                         numBounds, /*isInit=*/false);
+  builder.setInsertionPointToEnd(&copyRegion.back());
   CIRGenFunction::LexicalScope ls(cgf, loc, block);
 
-  mlir::BlockArgument fromArg = block->getArgument(0);
-  mlir::BlockArgument toArg = block->getArgument(1);
+  mlir::Value fromArg = block->getArgument(0);
+  mlir::Value toArg = block->getArgument(1);
 
-  mlir::Type elementTy =
-      mlir::cast<cir::PointerType>(mainOp.getType()).getPointee();
+  llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
+      block->getArguments().drop_front(2);
 
-  // Set the address of the emission to be the argument, so that we initialize
-  // that instead of the variable in the other block.
-  tempDeclEmission.setAllocatedAddress(
-      Address{toArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)});
+  for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
+    std::tie(fromArg, toArg) =
+        createBoundsLoop(fromArg, toArg, boundArg, loc, /*inverse=*/false);
+
+  // Set up the 'to' address.
+  mlir::Type elementTy =
+      mlir::cast<cir::PointerType>(toArg.getType()).getPointee();
+  CIRGenFunction::AutoVarEmission tempDeclEmission(*allocaDecl);
   tempDeclEmission.emittedAsOffload = true;
+  tempDeclEmission.setAllocatedAddress(
+      Address{toArg, elementTy, cgf.getContext().getDeclAlign(allocaDecl)});
 
+  // Set up the 'from' address from the temporary.
   CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, temporary};
   cgf.setAddrOfLocalVar(
       temporary,
-      Address{fromArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)});
-
+      Address{fromArg, elementTy, cgf.getContext().getDeclAlign(allocaDecl)});
   cgf.emitAutoVarInit(tempDeclEmission);
+
+  builder.setInsertionPointToEnd(&copyRegion.back());
   mlir::acc::YieldOp::create(builder, locEnd);
 }
+
 // This function generates the 'combiner' section for a reduction recipe. Note
 // that this function is not 'insertion point' clean, in that it alters the
 // insertion point to be inside of the 'combiner' section of the recipe, but
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
index 21707ad..a5da744 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
@@ -49,14 +49,16 @@ protected:
   // Creates a loop through an 'acc.bounds', leaving the 'insertion' point to be
   // the inside of the loop body. Traverses LB->UB UNLESS `inverse` is set.
   // Returns the 'subscriptedValue' changed with the new bounds subscript.
+  std::pair<mlir::Value, mlir::Value>
+  createBoundsLoop(mlir::Value subscriptedValue, mlir::Value subscriptedValue2,
+                   mlir::Value bound, mlir::Location loc, bool inverse);
+
   mlir::Value createBoundsLoop(mlir::Value subscriptedValue, mlir::Value bound,
-                               mlir::Location loc, bool inverse);
+                               mlir::Location loc, bool inverse) {
+    return createBoundsLoop(subscriptedValue, {}, bound, loc, inverse).first;
+  }
+
   mlir::acc::ReductionOperator convertReductionOp(OpenACCReductionOperator op);
-  void createFirstprivateRecipeCopy(
-      mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
-      CIRGenFunction::AutoVarEmission tempDeclEmission,
-      mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe,
-      const VarDecl *temporary);
 
   // This function generates the 'combiner' section for a reduction recipe. Note
   // that this function is not 'insertion point' clean, in that it alters the
@@ -66,11 +68,19 @@ protected:
                                      mlir::Value mainOp,
                                      mlir::acc::ReductionRecipeOp recipe,
                                      size_t numBounds);
+
   void createInitRecipe(mlir::Location loc, mlir::Location locEnd,
                         SourceRange exprRange, mlir::Value mainOp,
                         mlir::Region &recipeInitRegion, size_t numBounds,
                         llvm::ArrayRef<QualType> boundTypes,
-                        const VarDecl *allocaDecl, QualType origType);
+                        const VarDecl *allocaDecl, QualType origType,
+                        bool emitInitExpr);
+
+  void createFirstprivateRecipeCopy(mlir::Location loc, mlir::Location locEnd,
+                                    mlir::Value mainOp,
+                                    const VarDecl *allocaDecl,
+                                    const VarDecl *temporary,
+                                    mlir::Region &copyRegion, size_t numBounds);
 
   void createRecipeDestroySection(mlir::Location loc, mlir::Location locEnd,
                                   mlir::Value mainOp, CharUnits alignment,
@@ -150,63 +160,6 @@ class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase {
     return recipeName;
   }
 
-  // Create the 'init' section of the recipe, including the 'copy' section for
-  // 'firstprivate'.  Note that this function is not 'insertion point' clean, in
-  // that it alters the insertion point to be inside of the 'destroy' section of
-  // the recipe, but doesn't restore it aftewards.
-  void createRecipeInitCopy(mlir::Location loc, mlir::Location locEnd,
-                            SourceRange exprRange, mlir::Value mainOp,
-                            RecipeTy recipe, const VarDecl *varRecipe,
-                            const VarDecl *temporary) {
-    // TODO: OpenACC: when we get the 'pointer' variants for
-    // firstprivate/reduction, this probably should be removed/split into
-    // functions for the BuilderBase.
-    assert(varRecipe && "Required recipe variable not set?");
-
-    CIRGenFunction::AutoVarEmission tempDeclEmission{
-        CIRGenFunction::AutoVarEmission::invalid()};
-    CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, varRecipe};
-
-    // Do the 'init' section of the recipe IR, which does an alloca, then the
-    // initialization (except for firstprivate).
-    mlir::Block *block =
-        createRecipeBlock(recipe.getInitRegion(), mainOp.getType(), loc,
-                          /*numBounds=*/0, /*isInit=*/true);
-    builder.setInsertionPointToEnd(&recipe.getInitRegion().back());
-    CIRGenFunction::LexicalScope ls(cgf, loc, block);
-
-    tempDeclEmission =
-        cgf.emitAutoVarAlloca(*varRecipe, builder.saveInsertionPoint());
-
-    // 'firstprivate' doesn't do its initialization in the 'init' section,
-    // instead it does it in the 'copy' section.  SO, only do 'init' here for
-    // reduction.
-    if constexpr (std::is_same_v<RecipeTy, mlir::acc::ReductionRecipeOp>) {
-      // Unlike Private, the recipe here is always required as it has to do
-      // init, not just 'default' init.
-      if (!varRecipe->getInit())
-        cgf.cgm.errorNYI(exprRange, "reduction init recipe");
-      cgf.emitAutoVarInit(tempDeclEmission);
-    }
-
-    mlir::acc::YieldOp::create(builder, locEnd);
-
-    if constexpr (std::is_same_v<RecipeTy, mlir::acc::FirstprivateRecipeOp>) {
-      if (!varRecipe->getInit()) {
-        // If we don't have any initialization recipe, we failed during Sema to
-        // initialize this correctly. If we disable the
-        // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll
-        // emit an error to tell us.  However, emitting those errors during
-        // production is a violation of the standard, so we cannot do them.
-        cgf.cgm.errorNYI(
-            exprRange, "firstprivate copy-init recipe not properly generated");
-      }
-
-      createFirstprivateRecipeCopy(loc, locEnd, mainOp, tempDeclEmission,
-                                   recipe, varRecipe, temporary);
-    }
-  }
-
 public:
   OpenACCRecipeBuilder(CIRGen::CIRGenFunction &cgf,
                        CIRGen::CIRGenBuilderTy &builder)
@@ -221,19 +174,6 @@ public:
                BuiltinType::ArraySection) &&
            "array section shouldn't make it to recipe creation");
 
-    // TODO: OpenACC: This is a bit of a hackery to get this to not change for
-    // the non-private recipes. This will be removed soon, when we get this
-    // 'right' for firstprivate and reduction.
-    if constexpr (std::is_same_v<RecipeTy, mlir::acc::FirstprivateRecipeOp>) {
-      if (numBounds) {
-        cgf.cgm.errorNYI(varRef->getSourceRange(),
-                         "firstprivate-init with bounds");
-      }
-      boundTypes = {};
-      numBounds = 0;
-      origType = baseType;
-    }
-
     mlir::ModuleOp mod = builder.getBlock()
                              ->getParent()
                              ->template getParentOfType<mlir::ModuleOp>();
@@ -262,21 +202,20 @@ public:
     if constexpr (std::is_same_v<RecipeTy, mlir::acc::PrivateRecipeOp>) {
       createInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp,
                        recipe.getInitRegion(), numBounds, boundTypes, varRecipe,
-                       origType);
+                       origType, /*emitInitExpr=*/true);
     } else if constexpr (std::is_same_v<RecipeTy,
                                         mlir::acc::ReductionRecipeOp>) {
       createInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp,
                        recipe.getInitRegion(), numBounds, boundTypes, varRecipe,
-                       origType);
+                       origType, /*emitInitExpr=*/true);
       createReductionRecipeCombiner(loc, locEnd, mainOp, recipe, numBounds);
     } else {
       static_assert(std::is_same_v<RecipeTy, mlir::acc::FirstprivateRecipeOp>);
-      // TODO: OpenACC: we probably want this to call createInitRecipe as well,
-      // but do so in a way that omits the 'initialization', so that we can do
-      // it separately, since it belongs in the 'copy' region. It also might
-      // need a way of getting the tempDeclEmission out of it for that purpose.
-      createRecipeInitCopy(loc, locEnd, varRef->getSourceRange(), mainOp,
-                           recipe, varRecipe, temporary);
+      createInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp,
+                       recipe.getInitRegion(), numBounds, boundTypes, varRecipe,
+                       origType, /*emitInitExpr=*/false);
+      createFirstprivateRecipeCopy(loc, locEnd, mainOp, varRecipe, temporary,
+                                   recipe.getCopyRegion(), numBounds);
     }
 
     if (origType.isDestructedType())
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 754f43a..965741f 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7330,9 +7330,8 @@ _mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
   __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi8 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtepi32_epi8(__m128i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
       2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
@@ -7360,9 +7359,8 @@ _mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
   __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi8 (__m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtepi32_epi8(__m256i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v8si)__A, __v8qi),
       (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
@@ -7370,8 +7368,7 @@ _mm256_cvtepi32_epi8 (__m256i __A)
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
+_mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
   return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
               (__v16qi) __O, __M);
 }
@@ -7390,9 +7387,8 @@ _mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
   __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi16 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtepi32_epi16(__m128i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
       2, 3, 4, 5, 6, 7);
@@ -7419,9 +7415,8 @@ _mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
   __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi16 (__m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtepi32_epi16(__m256i __A) {
   return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
 }
 
@@ -7446,9 +7441,8 @@ _mm256_mask_cvtepi32_storeu_epi16 (void *  __P, __mmask8 __M, __m256i __A)
   __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi8 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtepi64_epi8(__m128i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
@@ -7475,9 +7469,8 @@ _mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
   __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi8 (__m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtepi64_epi8(__m256i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
       2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
@@ -7504,9 +7497,8 @@ _mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
   __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi32 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtepi64_epi32(__m128i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3);
 }
@@ -7532,23 +7524,20 @@ _mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
   __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi32 (__m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtepi64_epi32(__m256i __A) {
   return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
                                              (__v4si)_mm256_cvtepi64_epi32(__A),
                                              (__v4si)__O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
                                              (__v4si)_mm256_cvtepi64_epi32(__A),
                                              (__v4si)_mm_setzero_si128());
@@ -7560,9 +7549,8 @@ _mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
   __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi16 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtepi64_epi16(__m128i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3,
       3, 3, 3, 3);
@@ -7590,9 +7578,8 @@ _mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
   __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi16 (__m256i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtepi64_epi16(__m256i __A) {
   return (__m128i)__builtin_shufflevector(
       __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
       2, 3, 4, 5, 6, 7);
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 11d2d5c..999e302c 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1049,6 +1049,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate(
   case NormalizedConstraint::ConstraintKind::Compound:
     return Evaluate(static_cast<const CompoundConstraint &>(Constraint), MLTAL);
   }
+  llvm_unreachable("Unknown ConstraintKind enum");
 }
 
 static bool CheckConstraintSatisfaction(
@@ -2141,6 +2142,7 @@ bool SubstituteParameterMappings::substitute(NormalizedConstraint &N) {
     return substitute(Compound.getRHS());
   }
   }
+  llvm_unreachable("Unknown ConstraintKind enum");
 }
 
 } // namespace
@@ -2561,7 +2563,6 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) {
   };
 
   switch (NC.getKind()) {
-
   case NormalizedConstraint::ConstraintKind::Atomic:
     return {{find(&static_cast<const AtomicConstraint &>(NC))}};
 
@@ -2601,6 +2602,7 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) {
     return Res;
   }
   }
+  llvm_unreachable("Unknown ConstraintKind enum");
 }
 
 void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 0069b08..6eaf7b9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11041,17 +11041,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
             << CUDA().getConfigureFuncName();
       Context.setcudaConfigureCallDecl(NewFD);
     }
-
-    // Variadic functions, other than a *declaration* of printf, are not allowed
-    // in device-side CUDA code, unless someone passed
-    // -fcuda-allow-variadic-functions.
-    if (!getLangOpts().CUDAAllowVariadicFunctions && NewFD->isVariadic() &&
-        (NewFD->hasAttr<CUDADeviceAttr>() ||
-         NewFD->hasAttr<CUDAGlobalAttr>()) &&
-        !(II && II->isStr("printf") && NewFD->isExternC() &&
-          !D.isFunctionDefinition())) {
-      Diag(NewFD->getLocation(), diag::err_variadic_device_fn);
-    }
   }
 
   MarkUnusedFileScopedDecl(NewFD);
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 7ad7049..8471f02 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2724,16 +2724,6 @@ Expr *GenerateReductionInitRecipeExpr(ASTContext &Context,
   return InitExpr;
 }
 
-const Expr *StripOffBounds(const Expr *VarExpr) {
-  while (isa_and_present<ArraySectionExpr, ArraySubscriptExpr>(VarExpr)) {
-    if (const auto *AS = dyn_cast<ArraySectionExpr>(VarExpr))
-      VarExpr = AS->getBase()->IgnoreParenImpCasts();
-    else if (const auto *Sub = dyn_cast<ArraySubscriptExpr>(VarExpr))
-      VarExpr = Sub->getBase()->IgnoreParenImpCasts();
-  }
-  return VarExpr;
-}
-
 VarDecl *CreateAllocaDecl(ASTContext &Ctx, DeclContext *DC,
                           SourceLocation BeginLoc, IdentifierInfo *VarName,
                           QualType VarTy) {
@@ -2794,17 +2784,18 @@ OpenACCPrivateRecipe SemaOpenACC::CreatePrivateInitRecipe(const Expr *VarExpr) {
 
 OpenACCFirstPrivateRecipe
 SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) {
-  // TODO: OpenACC: This shouldn't be necessary, see PrivateInitRecipe
-  VarExpr = StripOffBounds(VarExpr);
-
+  // We don't strip bounds here, so that we are doing our recipe init at the
+  // 'lowest' possible level.  Codegen is going to have to do its own 'looping'.
   if (!VarExpr || VarExpr->getType()->isDependentType())
     return OpenACCFirstPrivateRecipe::Empty();
 
   QualType VarTy =
       VarExpr->getType().getNonReferenceType().getUnqualifiedType();
 
-  // TODO: OpenACC: for arrays/bounds versions, we're going to have to do a
-  // different initializer, but for now we can go ahead with this.
+  // Array sections are special, and we have to treat them that way.
+  if (const auto *ASE =
+          dyn_cast<ArraySectionExpr>(VarExpr->IgnoreParenImpCasts()))
+    VarTy = ArraySectionExpr::getBaseOriginalType(ASE);
 
   VarDecl *AllocaDecl = CreateAllocaDecl(
       getASTContext(), SemaRef.getCurContext(), VarExpr->getBeginLoc(),
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index b870114..5657dfe 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -4413,14 +4413,23 @@ CompareImplicitConversionSequences(Sema &S, SourceLocation Loc,
     Result = CompareStandardConversionSequences(S, Loc,
                                                 ICS1.Standard, ICS2.Standard);
   else if (ICS1.isUserDefined()) {
+    // With lazy template loading, it is possible to find non-canonical
+    // FunctionDecls, depending on when redecl chains are completed. Make sure
+    // to compare the canonical decls of conversion functions. This avoids
+    // ambiguity problems for templated conversion operators.
+    const FunctionDecl *ConvFunc1 = ICS1.UserDefined.ConversionFunction;
+    if (ConvFunc1)
+      ConvFunc1 = ConvFunc1->getCanonicalDecl();
+    const FunctionDecl *ConvFunc2 = ICS2.UserDefined.ConversionFunction;
+    if (ConvFunc2)
+      ConvFunc2 = ConvFunc2->getCanonicalDecl();
     // User-defined conversion sequence U1 is a better conversion
     // sequence than another user-defined conversion sequence U2 if
     // they contain the same user-defined conversion function or
     // constructor and if the second standard conversion sequence of
     // U1 is better than the second standard conversion sequence of
     // U2 (C++ 13.3.3.2p3).
-    if (ICS1.UserDefined.ConversionFunction ==
-          ICS2.UserDefined.ConversionFunction)
+    if (ConvFunc1 == ConvFunc2)
       Result = CompareStandardConversionSequences(S, Loc,
                                                   ICS1.UserDefined.After,
                                                   ICS2.UserDefined.After);