Diffstat (limited to 'clang/lib/CIR/CodeGen')
41 files changed, 7047 insertions, 641 deletions
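To orient the reader before the per-file hunks: the largest additions below are the dynamic-memory-order handling in CIRGenAtomic.cpp and the greatly expanded builtin coverage in CIRGenBuiltin.cpp. A minimal C sketch of the kind of source that exercises the new emitAtomicExprWithDynamicMemOrder path (illustrative only; not taken from this patch or its tests):

/* A memory order that is not a compile-time constant forces the switch-based
 * lowering added in CIRGenAtomic.cpp below: the runtime value is mapped onto
 * hard-coded memory orders, with unsupported values falling back to relaxed. */
#include <stdatomic.h>

int load_with_runtime_order(_Atomic int *p, memory_order order) {
  return atomic_load_explicit(p, order); /* `order` is only known at runtime */
}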
diff --git a/clang/lib/CIR/CodeGen/Address.h b/clang/lib/CIR/CodeGen/Address.h index a67cbad..a425eeb 100644 --- a/clang/lib/CIR/CodeGen/Address.h +++ b/clang/lib/CIR/CodeGen/Address.h @@ -16,9 +16,11 @@ #include "mlir/IR/Value.h" #include "clang/AST/CharUnits.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/Support/Casting.h" namespace clang::CIRGen { @@ -75,6 +77,12 @@ public: return Address(newPtr, getElementType(), getAlignment()); } + /// Return address with different alignment, but same pointer and element + /// type. + Address withAlignment(clang::CharUnits newAlignment) const { + return Address(getPointer(), getElementType(), newAlignment); + } + /// Return address with different element type, a bitcast pointer, and /// the same alignment. Address withElementType(CIRGenBuilderTy &builder, mlir::Type ElemTy) const; @@ -114,6 +122,11 @@ public: return elementType; } + cir::TargetAddressSpaceAttr getAddressSpace() const { + auto ptrTy = mlir::dyn_cast<cir::PointerType>(getType()); + return ptrTy.getAddrSpace(); + } + clang::CharUnits getAlignment() const { return alignment; } /// Get the operation which defines this address. diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp index 7db6e28..4c94db5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp @@ -27,6 +27,7 @@ class AtomicInfo { CharUnits atomicAlign; CharUnits valueAlign; TypeEvaluationKind evaluationKind = cir::TEK_Scalar; + bool useLibCall = true; LValue lvalue; mlir::Location loc; @@ -62,8 +63,8 @@ public: assert(!cir::MissingFeatures::atomicInfo()); cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue"); } - - assert(!cir::MissingFeatures::atomicUseLibCall()); + useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic( + atomicSizeInBits, ctx.toBits(lvalue.getAlignment())); } QualType getValueType() const { return valueTy; } @@ -75,6 +76,8 @@ public: assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer()); return nullptr; } + bool shouldUseLibCall() const { return useLibCall; } + const LValue &getAtomicLValue() const { return lvalue; } Address getAtomicAddress() const { mlir::Type elemTy; if (lvalue.isSimple()) { @@ -96,6 +99,8 @@ public: bool emitMemSetZeroIfNecessary() const; + mlir::Value getScalarRValValueOrNull(RValue rvalue) const; + /// Cast the given pointer to an integer pointer suitable for atomic /// operations on the source. Address castToAtomicIntPointer(Address addr) const; @@ -105,6 +110,9 @@ public: /// copy the value across. Address convertToAtomicIntPointer(Address addr) const; + /// Converts a rvalue to integer value. + mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const; + /// Copy an atomic r-value into atomic-layout memory. void emitCopyIntoMemory(RValue rvalue) const; @@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const { return tempAlloca; } +mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const { + if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple())) + return rvalue.getValue(); + return nullptr; +} + Address AtomicInfo::castToAtomicIntPointer(Address addr) const { auto intTy = mlir::dyn_cast<cir::IntType>(addr.getElementType()); // Don't bother with int casts if the integer size is the same. 
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
     return false;
 
   cgf.cgm.errorNYI(loc,
-                   "AtomicInfo::emitMemSetZeroIfNecaessary: emit memset zero");
+                   "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
   return false;
 }
 
+/// Return true if \param valueTy is a type that should be casted to integer
+/// around the atomic memory operation. If \param cmpxchg is true, then the
+/// cast of a floating point type is made as that instruction can not have
+/// floating point operands. TODO: Allow compare-and-exchange and FP - see
+/// comment in CIRGenAtomicExpandPass.cpp.
+static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
+  if (cir::isAnyFloatingPointType(valueTy))
+    return isa<cir::FP80Type>(valueTy) || cmpxchg;
+  return !isa<cir::IntType>(valueTy) && !isa<cir::PointerType>(valueTy);
+}
+
+mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
+  // If we've got a scalar value of the right size, try to avoid going
+  // through memory. Floats get casted if needed by AtomicExpandPass.
+  if (mlir::Value value = getScalarRValValueOrNull(rvalue)) {
+    if (!shouldCastToInt(value.getType(), cmpxchg))
+      return cgf.emitToMemory(value, valueTy);
+
+    cgf.cgm.errorNYI(
+        loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int");
+    return nullptr;
+  }
+
+  cgf.cgm.errorNYI(
+      loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int");
+  return nullptr;
+}
+
 /// Copy an r-value into memory as part of storing to an atomic type.
 /// This needs to create a bit-pattern suitable for atomic operations.
 void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
@@ -244,6 +286,31 @@ void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
   }
 }
 
+static void emitMemOrderDefaultCaseLabel(CIRGenBuilderTy &builder,
+                                         mlir::Location loc) {
+  mlir::ArrayAttr ordersAttr = builder.getArrayAttr({});
+  mlir::OpBuilder::InsertPoint insertPoint;
+  cir::CaseOp::create(builder, loc, ordersAttr, cir::CaseOpKind::Default,
+                      insertPoint);
+  builder.restoreInsertionPoint(insertPoint);
+}
+
+// Create a "case" operation with the given list of orders as its values. Also
+// create the region that will hold the body of the switch-case label.
+static void emitMemOrderCaseLabel(CIRGenBuilderTy &builder, mlir::Location loc, + mlir::Type orderType, + llvm::ArrayRef<cir::MemOrder> orders) { + llvm::SmallVector<mlir::Attribute, 2> orderAttrs; + for (cir::MemOrder order : orders) + orderAttrs.push_back(cir::IntAttr::get(orderType, static_cast<int>(order))); + mlir::ArrayAttr ordersAttr = builder.getArrayAttr(orderAttrs); + + mlir::OpBuilder::InsertPoint insertPoint; + cir::CaseOp::create(builder, loc, ordersAttr, cir::CaseOpKind::Anyof, + insertPoint); + builder.restoreInsertionPoint(insertPoint); +} + static void emitAtomicCmpXchg(CIRGenFunction &cgf, AtomicExpr *e, bool isWeak, Address dest, Address ptr, Address val1, Address val2, uint64_t size, @@ -577,6 +644,9 @@ static void emitAtomicOp(CIRGenFunction &cgf, AtomicExpr *expr, Address dest, case AtomicExpr::AO__scoped_atomic_nand_fetch: case AtomicExpr::AO__scoped_atomic_fetch_nand: + + case AtomicExpr::AO__scoped_atomic_uinc_wrap: + case AtomicExpr::AO__scoped_atomic_udec_wrap: cgf.cgm.errorNYI(expr->getSourceRange(), "emitAtomicOp: expr op NYI"); return; } @@ -615,6 +685,70 @@ static bool isMemOrderValid(uint64_t order, bool isStore, bool isLoad) { return true; } +static void emitAtomicExprWithDynamicMemOrder( + CIRGenFunction &cgf, mlir::Value order, AtomicExpr *e, Address dest, + Address ptr, Address val1, Address val2, Expr *isWeakExpr, + Expr *orderFailExpr, uint64_t size, bool isStore, bool isLoad) { + // The memory order is not known at compile-time. The atomic operations + // can't handle runtime memory orders; the memory order must be hard coded. + // Generate a "switch" statement that converts a runtime value into a + // compile-time value. + CIRGenBuilderTy &builder = cgf.getBuilder(); + cir::SwitchOp::create( + builder, order.getLoc(), order, + [&](mlir::OpBuilder &, mlir::Location loc, mlir::OperationState &) { + mlir::Block *switchBlock = builder.getBlock(); + + auto emitMemOrderCase = [&](llvm::ArrayRef<cir::MemOrder> caseOrders, + cir::MemOrder actualOrder) { + if (caseOrders.empty()) + emitMemOrderDefaultCaseLabel(builder, loc); + else + emitMemOrderCaseLabel(builder, loc, order.getType(), caseOrders); + emitAtomicOp(cgf, e, dest, ptr, val1, val2, isWeakExpr, orderFailExpr, + size, actualOrder); + builder.createBreak(loc); + builder.setInsertionPointToEnd(switchBlock); + }; + + // default: + // Use memory_order_relaxed for relaxed operations and for any memory + // order value that is not supported. There is no good way to report + // an unsupported memory order at runtime, hence the fallback to + // memory_order_relaxed. + emitMemOrderCase(/*caseOrders=*/{}, cir::MemOrder::Relaxed); + + if (!isStore) { + // case consume: + // case acquire: + // memory_order_consume is not implemented; it is always treated + // like memory_order_acquire. These memory orders are not valid for + // write-only operations. + emitMemOrderCase({cir::MemOrder::Consume, cir::MemOrder::Acquire}, + cir::MemOrder::Acquire); + } + + if (!isLoad) { + // case release: + // memory_order_release is not valid for read-only operations. + emitMemOrderCase({cir::MemOrder::Release}, cir::MemOrder::Release); + } + + if (!isLoad && !isStore) { + // case acq_rel: + // memory_order_acq_rel is only valid for read-write operations. 
+ emitMemOrderCase({cir::MemOrder::AcquireRelease}, + cir::MemOrder::AcquireRelease); + } + + // case seq_cst: + emitMemOrderCase({cir::MemOrder::SequentiallyConsistent}, + cir::MemOrder::SequentiallyConsistent); + + builder.createYield(loc); + }); +} + RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { QualType atomicTy = e->getPtr()->getType()->getPointeeType(); QualType memTy = atomicTy; @@ -802,9 +936,9 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { emitAtomicOp(*this, e, dest, ptr, val1, val2, isWeakExpr, orderFailExpr, size, static_cast<cir::MemOrder>(ord)); } else { - assert(!cir::MissingFeatures::atomicExpr()); - cgm.errorNYI(e->getSourceRange(), "emitAtomicExpr: dynamic memory order"); - return RValue::get(nullptr); + emitAtomicExprWithDynamicMemOrder(*this, order, e, dest, ptr, val1, val2, + isWeakExpr, orderFailExpr, size, isStore, + isLoad); } if (resultTy->isVoidType()) @@ -815,6 +949,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { e->getExprLoc()); } +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) { + bool isVolatile = dest.isVolatileQualified(); + auto order = cir::MemOrder::SequentiallyConsistent; + if (!dest.getType()->isAtomicType()) { + assert(!cir::MissingFeatures::atomicMicrosoftVolatile()); + } + return emitAtomicStore(rvalue, dest, order, isVolatile, isInit); +} + +/// Emit a store to an l-value of atomic type. +/// +/// Note that the r-value is expected to be an r-value of the atomic type; this +/// means that for aggregate r-values, it should include storage for any padding +/// that was necessary. +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store. + Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. 
+ if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index e5066fa..bf13eee 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -189,6 +189,11 @@ public: return getType<cir::RecordType>(nameAttr, kind); } + cir::DataMemberAttr getDataMemberAttr(cir::DataMemberType ty, + unsigned memberIndex) { + return cir::DataMemberAttr::get(ty, memberIndex); + } + // Return true if the value is a null constant such as null pointer, (+0.0) // for floating-point or zero initializer bool isNullValue(mlir::Attribute attr) const { @@ -212,6 +217,16 @@ public: &ignored); return fv.bitwiseIsEqual(fpVal); } + if (const auto recordVal = mlir::dyn_cast<cir::ConstRecordAttr>(attr)) { + for (const auto elt : recordVal.getMembers()) { + // FIXME(cir): the record's ID should not be considered a member. + if (mlir::isa<mlir::StringAttr>(elt)) + continue; + if (!isNullValue(elt)) + return false; + } + return true; + } if (const auto arrayVal = mlir::dyn_cast<cir::ConstArrayAttr>(attr)) { if (mlir::isa<mlir::StringAttr>(arrayVal.getElts())) @@ -315,8 +330,10 @@ public: return getConstantInt(loc, getUInt32Ty(), c); } cir::ConstantOp getSInt64(uint64_t c, mlir::Location loc) { - cir::IntType sInt64Ty = getSInt64Ty(); - return cir::ConstantOp::create(*this, loc, cir::IntAttr::get(sInt64Ty, c)); + return getConstantInt(loc, getSInt64Ty(), c); + } + cir::ConstantOp getUInt64(uint64_t c, mlir::Location loc) { + return getConstantInt(loc, getUInt64Ty(), c); } mlir::Value createNeg(mlir::Value value) { @@ -332,6 +349,11 @@ public: llvm_unreachable("negation for the given type is NYI"); } + cir::IsFPClassOp createIsFPClass(mlir::Location loc, mlir::Value src, + cir::FPClassTest flags) { + return cir::IsFPClassOp::create(*this, loc, src, flags); + } + // TODO: split this to createFPExt/createFPTrunc when we have dedicated cast // operations. 
mlir::Value createFloatingCast(mlir::Value v, mlir::Type destType) { @@ -403,6 +425,19 @@ public: return Address(baseAddr, destType, addr.getAlignment()); } + Address createDerivedClassAddr(mlir::Location loc, Address addr, + mlir::Type destType, unsigned offset, + bool assumeNotNull) { + if (destType == addr.getElementType()) + return addr; + + cir::PointerType ptrTy = getPointerTo(destType); + auto derivedAddr = + cir::DerivedClassAddrOp::create(*this, loc, ptrTy, addr.getPointer(), + mlir::APInt(64, offset), assumeNotNull); + return Address(derivedAddr, destType, addr.getAlignment()); + } + mlir::Value createVTTAddrPoint(mlir::Location loc, mlir::Type retTy, mlir::Value addr, uint64_t offset) { return cir::VTTAddrPointOp::create(*this, loc, retTy, @@ -572,6 +607,35 @@ public: info.isSigned, isLvalueVolatile, addr.getAlignment().getAsAlign().value()); } + + cir::VecShuffleOp + createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2, + llvm::ArrayRef<mlir::Attribute> maskAttrs) { + auto vecType = mlir::cast<cir::VectorType>(vec1.getType()); + auto resultTy = cir::VectorType::get(getContext(), vecType.getElementType(), + maskAttrs.size()); + return cir::VecShuffleOp::create(*this, loc, resultTy, vec1, vec2, + getArrayAttr(maskAttrs)); + } + + cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1, + mlir::Value vec2, + llvm::ArrayRef<int64_t> mask) { + auto maskAttrs = llvm::to_vector_of<mlir::Attribute>( + llvm::map_range(mask, [&](int32_t idx) { + return cir::IntAttr::get(getSInt32Ty(), idx); + })); + return createVecShuffle(loc, vec1, vec2, maskAttrs); + } + + cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1, + llvm::ArrayRef<int64_t> mask) { + /// Create a unary shuffle. The second vector operand of the IR instruction + /// is poison. + cir::ConstantOp poison = + getConstant(loc, cir::PoisonAttr::get(vec1.getType())); + return createVecShuffle(loc, vec1, poison, mask); + } }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index e35100f..16c006d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -12,16 +12,18 @@ //===----------------------------------------------------------------------===// #include "CIRGenCall.h" -#include "CIRGenConstantEmitter.h" #include "CIRGenFunction.h" #include "CIRGenModule.h" #include "CIRGenValue.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Value.h" #include "mlir/Support/LLVM.h" +#include "clang/AST/DeclBase.h" #include "clang/AST/Expr.h" #include "clang/AST/GlobalDecl.h" #include "clang/Basic/Builtins.h" +#include "clang/Basic/OperatorKinds.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" #include "llvm/Support/ErrorHandling.h" @@ -58,6 +60,72 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, return RValue::get(result); } +static mlir::Value makeAtomicFenceValue(CIRGenFunction &cgf, + const CallExpr *expr, + cir::SyncScopeKind syncScope) { + CIRGenBuilderTy &builder = cgf.getBuilder(); + mlir::Value orderingVal = cgf.emitScalarExpr(expr->getArg(0)); + + auto constOrdering = orderingVal.getDefiningOp<cir::ConstantOp>(); + + if (!constOrdering) { + // TODO(cir): Emit code to switch on `orderingVal`, + // and creating the fence op for valid values. 
+ cgf.cgm.errorNYI("Variable atomic fence ordering"); + return {}; + } + + auto constOrderingAttr = constOrdering.getValueAttr<cir::IntAttr>(); + assert(constOrderingAttr && "Expected integer constant for ordering"); + + auto ordering = static_cast<cir::MemOrder>(constOrderingAttr.getUInt()); + + cir::AtomicFence::create( + builder, cgf.getLoc(expr->getSourceRange()), ordering, + cir::SyncScopeKindAttr::get(&cgf.getMLIRContext(), syncScope)); + + return {}; +} + +namespace { +struct WidthAndSignedness { + unsigned width; + bool isSigned; +}; +} // namespace + +static WidthAndSignedness +getIntegerWidthAndSignedness(const clang::ASTContext &astContext, + const clang::QualType type) { + assert(type->isIntegerType() && "Given type is not an integer."); + unsigned width = type->isBooleanType() ? 1 + : type->isBitIntType() ? astContext.getIntWidth(type) + : astContext.getTypeInfo(type).Width; + bool isSigned = type->isSignedIntegerType(); + return {width, isSigned}; +} + +// Given one or more integer types, this function produces an integer type that +// encompasses them: any value in one of the given types could be expressed in +// the encompassing type. +static struct WidthAndSignedness +EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> types) { + assert(types.size() > 0 && "Empty list of types."); + + // If any of the given types is signed, we must return a signed type. + bool isSigned = llvm::any_of(types, [](const auto &t) { return t.isSigned; }); + + // The encompassing type must have a width greater than or equal to the width + // of the specified types. Additionally, if the encompassing type is signed, + // its width must be strictly greater than the width of any unsigned types + // given. + unsigned width = 0; + for (const auto &type : types) + width = std::max(width, type.width + (isSigned && !type.isSigned)); + + return {width, isSigned}; +} + RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { mlir::Value input = emitScalarExpr(e->getArg(0)); mlir::Value amount = emitScalarExpr(e->getArg(1)); @@ -93,6 +161,83 @@ static RValue emitUnaryFPBuiltin(CIRGenFunction &cgf, const CallExpr &e) { return RValue::get(call->getResult(0)); } +static RValue errorBuiltinNYI(CIRGenFunction &cgf, const CallExpr *e, + unsigned builtinID) { + + if (cgf.getContext().BuiltinInfo.isLibFunction(builtinID)) { + cgf.cgm.errorNYI( + e->getSourceRange(), + std::string("unimplemented X86 library function builtin call: ") + + cgf.getContext().BuiltinInfo.getName(builtinID)); + } else { + cgf.cgm.errorNYI(e->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + cgf.getContext().BuiltinInfo.getName(builtinID)); + } + + return cgf.getUndefRValue(e->getType()); +} + +static RValue emitBuiltinAlloca(CIRGenFunction &cgf, const CallExpr *e, + unsigned builtinID) { + assert(builtinID == Builtin::BI__builtin_alloca || + builtinID == Builtin::BI__builtin_alloca_uninitialized || + builtinID == Builtin::BIalloca || builtinID == Builtin::BI_alloca); + + // Get alloca size input + mlir::Value size = cgf.emitScalarExpr(e->getArg(0)); + + // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__. + const TargetInfo &ti = cgf.getContext().getTargetInfo(); + const CharUnits suitableAlignmentInBytes = + cgf.getContext().toCharUnitsFromBits(ti.getSuitableAlign()); + + // Emit the alloca op with type `u8 *` to match the semantics of + // `llvm.alloca`. 
We later bitcast the type to `void *` to match the + // semantics of C/C++ + // FIXME(cir): It may make sense to allow AllocaOp of type `u8` to return a + // pointer of type `void *`. This will require a change to the allocaOp + // verifier. + CIRGenBuilderTy &builder = cgf.getBuilder(); + mlir::Value allocaAddr = builder.createAlloca( + cgf.getLoc(e->getSourceRange()), builder.getUInt8PtrTy(), + builder.getUInt8Ty(), "bi_alloca", suitableAlignmentInBytes, size); + + // Initialize the allocated buffer if required. + if (builtinID != Builtin::BI__builtin_alloca_uninitialized) { + // Initialize the alloca with the given size and alignment according to + // the lang opts. Only the trivial non-initialization is supported for + // now. + + switch (cgf.getLangOpts().getTrivialAutoVarInit()) { + case LangOptions::TrivialAutoVarInitKind::Uninitialized: + // Nothing to initialize. + break; + case LangOptions::TrivialAutoVarInitKind::Zero: + case LangOptions::TrivialAutoVarInitKind::Pattern: + cgf.cgm.errorNYI("trivial auto var init"); + break; + } + } + + // An alloca will always return a pointer to the alloca (stack) address + // space. This address space need not be the same as the AST / Language + // default (e.g. in C / C++ auto vars are in the generic address space). At + // the AST level this is handled within CreateTempAlloca et al., but for the + // builtin / dynamic alloca we have to handle it here. + + if (!cir::isMatchingAddressSpace( + cgf.getCIRAllocaAddressSpace(), + e->getType()->getPointeeType().getAddressSpace())) { + cgf.cgm.errorNYI(e->getSourceRange(), + "Non-default address space for alloca"); + } + + // Bitcast the alloca to the expected type. + return RValue::get(builder.createBitcast( + allocaAddr, builder.getVoidPtrTy(cgf.getCIRAllocaAddressSpace()))); +} + RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, const CallExpr *e, ReturnValueSlot returnValue) { @@ -148,58 +293,12 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_va_end: emitVAEnd(emitVAListRef(e->getArg(0)).getPointer()); return {}; - - case Builtin::BIalloca: - case Builtin::BI_alloca: - case Builtin::BI__builtin_alloca_uninitialized: - case Builtin::BI__builtin_alloca: { - // Get alloca size input - mlir::Value size = emitScalarExpr(e->getArg(0)); - - // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__. - const TargetInfo &ti = getContext().getTargetInfo(); - const CharUnits suitableAlignmentInBytes = - getContext().toCharUnitsFromBits(ti.getSuitableAlign()); - - // Emit the alloca op with type `u8 *` to match the semantics of - // `llvm.alloca`. We later bitcast the type to `void *` to match the - // semantics of C/C++ - // FIXME(cir): It may make sense to allow AllocaOp of type `u8` to return a - // pointer of type `void *`. This will require a change to the allocaOp - // verifier. - mlir::Value allocaAddr = builder.createAlloca( - getLoc(e->getSourceRange()), builder.getUInt8PtrTy(), - builder.getUInt8Ty(), "bi_alloca", suitableAlignmentInBytes, size); - - // Initialize the allocated buffer if required. - if (builtinID != Builtin::BI__builtin_alloca_uninitialized) { - // Initialize the alloca with the given size and alignment according to - // the lang opts. Only the trivial non-initialization is supported for - // now. - - switch (getLangOpts().getTrivialAutoVarInit()) { - case LangOptions::TrivialAutoVarInitKind::Uninitialized: - // Nothing to initialize. 
- break; - case LangOptions::TrivialAutoVarInitKind::Zero: - case LangOptions::TrivialAutoVarInitKind::Pattern: - cgm.errorNYI("trivial auto var init"); - break; - } - } - - // An alloca will always return a pointer to the alloca (stack) address - // space. This address space need not be the same as the AST / Language - // default (e.g. in C / C++ auto vars are in the generic address space). At - // the AST level this is handled within CreateTempAlloca et al., but for the - // builtin / dynamic alloca we have to handle it here. - assert(!cir::MissingFeatures::addressSpace()); - - // Bitcast the alloca to the expected type. - return RValue::get( - builder.createBitcast(allocaAddr, builder.getVoidPtrTy())); + case Builtin::BI__builtin_va_copy: { + mlir::Value dstPtr = emitVAListRef(e->getArg(0)).getPointer(); + mlir::Value srcPtr = emitVAListRef(e->getArg(1)).getPointer(); + cir::VACopyOp::create(builder, dstPtr.getLoc(), dstPtr, srcPtr); + return {}; } - case Builtin::BIcos: case Builtin::BIcosf: case Builtin::BIcosl: @@ -211,6 +310,39 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::fastMathFlags()); return emitUnaryMaybeConstrainedFPBuiltin<cir::CosOp>(*this, *e); + case Builtin::BIceil: + case Builtin::BIceilf: + case Builtin::BIceill: + case Builtin::BI__builtin_ceil: + case Builtin::BI__builtin_ceilf: + case Builtin::BI__builtin_ceilf16: + case Builtin::BI__builtin_ceill: + case Builtin::BI__builtin_ceilf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin<cir::CeilOp>(*this, *e); + + case Builtin::BIexp: + case Builtin::BIexpf: + case Builtin::BIexpl: + case Builtin::BI__builtin_exp: + case Builtin::BI__builtin_expf: + case Builtin::BI__builtin_expf16: + case Builtin::BI__builtin_expl: + case Builtin::BI__builtin_expf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin<cir::ExpOp>(*this, *e); + + case Builtin::BIexp2: + case Builtin::BIexp2f: + case Builtin::BIexp2l: + case Builtin::BI__builtin_exp2: + case Builtin::BI__builtin_exp2f: + case Builtin::BI__builtin_exp2f16: + case Builtin::BI__builtin_exp2l: + case Builtin::BI__builtin_exp2f128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin<cir::Exp2Op>(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: @@ -221,6 +353,16 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_fabsf128: return emitUnaryMaybeConstrainedFPBuiltin<cir::FAbsOp>(*this, *e); + case Builtin::BIfloor: + case Builtin::BIfloorf: + case Builtin::BIfloorl: + case Builtin::BI__builtin_floor: + case Builtin::BI__builtin_floorf: + case Builtin::BI__builtin_floorf16: + case Builtin::BI__builtin_floorl: + case Builtin::BI__builtin_floorf128: + return emitUnaryMaybeConstrainedFPBuiltin<cir::FloorOp>(*this, *e); + case Builtin::BI__assume: case Builtin::BI__builtin_assume: { if (e->getArg(0)->HasSideEffects(getContext())) @@ -398,36 +540,6 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_rotateright64: return emitRotate(e, /*isRotateLeft=*/false); - case Builtin::BI__builtin_return_address: - case Builtin::BI__builtin_frame_address: { - mlir::Location loc = getLoc(e->getExprLoc()); - llvm::APSInt level = e->getArg(0)->EvaluateKnownConstInt(getContext()); - if (builtinID == Builtin::BI__builtin_return_address) { - return 
RValue::get(cir::ReturnAddrOp::create( - builder, loc, - builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); - } - return RValue::get(cir::FrameAddrOp::create( - builder, loc, - builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); - } - - case Builtin::BI__builtin_trap: - emitTrap(loc, /*createNewBlock=*/true); - return RValue::get(nullptr); - - case Builtin::BI__builtin_unreachable: - emitUnreachable(e->getExprLoc(), /*createNewBlock=*/true); - return RValue::get(nullptr); - - case Builtin::BI__builtin_elementwise_acos: - return emitUnaryFPBuiltin<cir::ACosOp>(*this, *e); - case Builtin::BI__builtin_elementwise_asin: - return emitUnaryFPBuiltin<cir::ASinOp>(*this, *e); - case Builtin::BI__builtin_elementwise_atan: - return emitUnaryFPBuiltin<cir::ATanOp>(*this, *e); - case Builtin::BI__builtin_elementwise_cos: - return emitUnaryFPBuiltin<cir::CosOp>(*this, *e); case Builtin::BI__builtin_coro_id: case Builtin::BI__builtin_coro_promise: case Builtin::BI__builtin_coro_resume: @@ -443,9 +555,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return getUndefRValue(e->getType()); case Builtin::BI__builtin_coro_frame: { - cgm.errorNYI(e->getSourceRange(), "BI__builtin_coro_frame NYI"); - assert(!cir::MissingFeatures::coroutineFrame()); - return getUndefRValue(e->getType()); + return emitCoroutineFrame(); } case Builtin::BI__builtin_coro_free: case Builtin::BI__builtin_coro_size: { @@ -459,6 +569,58 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return emitCall(e->getCallee()->getType(), CIRGenCallee::forDirect(fnOp), e, returnValue); } + + case Builtin::BI__builtin_constant_p: { + mlir::Type resultType = convertType(e->getType()); + + const Expr *arg = e->getArg(0); + QualType argType = arg->getType(); + // FIXME: The allowance for Obj-C pointers and block pointers is historical + // and likely a mistake. + if (!argType->isIntegralOrEnumerationType() && !argType->isFloatingType() && + !argType->isObjCObjectPointerType() && !argType->isBlockPointerType()) { + // Per the GCC documentation, only numeric constants are recognized after + // inlining. + return RValue::get( + builder.getConstInt(getLoc(e->getSourceRange()), + mlir::cast<cir::IntType>(resultType), 0)); + } + + if (arg->HasSideEffects(getContext())) { + // The argument is unevaluated, so be conservative if it might have + // side-effects. + return RValue::get( + builder.getConstInt(getLoc(e->getSourceRange()), + mlir::cast<cir::IntType>(resultType), 0)); + } + + mlir::Value argValue = emitScalarExpr(arg); + if (argType->isObjCObjectPointerType()) { + cgm.errorNYI(e->getSourceRange(), + "__builtin_constant_p: Obj-C object pointer"); + return {}; + } + argValue = builder.createBitcast(argValue, convertType(argType)); + + mlir::Value result = cir::IsConstantOp::create( + builder, getLoc(e->getSourceRange()), argValue); + // IsConstantOp returns a bool, but __builtin_constant_p returns an int. + result = builder.createBoolToInt(result, resultType); + return RValue::get(result); + } + case Builtin::BI__builtin_dynamic_object_size: + case Builtin::BI__builtin_object_size: { + unsigned type = + e->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue(); + auto resType = mlir::cast<cir::IntType>(convertType(e->getType())); + + // We pass this builtin onto the optimizer so that it can figure out the + // object size in more complex cases. 
+    bool isDynamic = builtinID == Builtin::BI__builtin_dynamic_object_size;
+    return RValue::get(emitBuiltinObjectSize(e->getArg(0), type, resType,
+                                             /*EmittedE=*/nullptr, isDynamic));
+  }
+
   case Builtin::BI__builtin_prefetch: {
     auto evaluateOperandAsInt = [&](const Expr *arg) {
       Expr::EvalResult res;
@@ -480,6 +642,696 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     cir::PrefetchOp::create(builder, loc, address, locality, isWrite);
     return RValue::get(nullptr);
   }
+  case Builtin::BI__builtin_readcyclecounter:
+  case Builtin::BI__builtin_readsteadycounter:
+  case Builtin::BI__builtin___clear_cache:
+    return errorBuiltinNYI(*this, e, builtinID);
+  case Builtin::BI__builtin_trap:
+    emitTrap(loc, /*createNewBlock=*/true);
+    return RValue::getIgnored();
+  case Builtin::BI__builtin_verbose_trap:
+  case Builtin::BI__debugbreak:
+    return errorBuiltinNYI(*this, e, builtinID);
+  case Builtin::BI__builtin_unreachable:
+    emitUnreachable(e->getExprLoc(), /*createNewBlock=*/true);
+    return RValue::getIgnored();
+  case Builtin::BI__builtin_powi:
+  case Builtin::BI__builtin_powif:
+  case Builtin::BI__builtin_powil:
+  case Builtin::BI__builtin_frexpl:
+  case Builtin::BI__builtin_frexp:
+  case Builtin::BI__builtin_frexpf:
+  case Builtin::BI__builtin_frexpf128:
+  case Builtin::BI__builtin_frexpf16:
+  case Builtin::BImodf:
+  case Builtin::BImodff:
+  case Builtin::BImodfl:
+  case Builtin::BI__builtin_modf:
+  case Builtin::BI__builtin_modff:
+  case Builtin::BI__builtin_modfl:
+  case Builtin::BI__builtin_isgreater:
+  case Builtin::BI__builtin_isgreaterequal:
+  case Builtin::BI__builtin_isless:
+  case Builtin::BI__builtin_islessequal:
+  case Builtin::BI__builtin_islessgreater:
+  case Builtin::BI__builtin_isunordered:
+    return errorBuiltinNYI(*this, e, builtinID);
+  // From https://clang.llvm.org/docs/LanguageExtensions.html#builtin-isfpclass
+  //
+  // The `__builtin_isfpclass()` builtin is a generalization of functions
+  // isnan, isinf, isfinite and some others defined by the C standard. It tests
+  // if the floating-point value, specified by the first argument, falls into
+  // any of data classes, specified by the second argument.
+ case Builtin::BI__builtin_isnan: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + assert(!cir::MissingFeatures::fpConstraints()); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Nan), + convertType(e->getType()))); + } + + case Builtin::BI__builtin_issignaling: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::SignalingNaN), + convertType(e->getType()))); + } + + case Builtin::BI__builtin_isinf: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + assert(!cir::MissingFeatures::fpConstraints()); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Infinity), + convertType(e->getType()))); + } + case Builtin::BIfinite: + case Builtin::BI__finite: + case Builtin::BIfinitef: + case Builtin::BI__finitef: + case Builtin::BIfinitel: + case Builtin::BI__finitel: + case Builtin::BI__builtin_isfinite: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + assert(!cir::MissingFeatures::fpConstraints()); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Finite), + convertType(e->getType()))); + } + + case Builtin::BI__builtin_isnormal: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Normal), + convertType(e->getType()))); + } + + case Builtin::BI__builtin_issubnormal: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Subnormal), + convertType(e->getType()))); + } + + case Builtin::BI__builtin_iszero: { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + mlir::Location loc = getLoc(e->getBeginLoc()); + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest::Zero), + convertType(e->getType()))); + } + case Builtin::BI__builtin_isfpclass: { + Expr::EvalResult result; + if (!e->getArg(1)->EvaluateAsInt(result, cgm.getASTContext())) + break; + + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + mlir::Value v = emitScalarExpr(e->getArg(0)); + uint64_t test = result.Val.getInt().getLimitedValue(); + mlir::Location loc = getLoc(e->getBeginLoc()); + // + return RValue::get(builder.createBoolToInt( + builder.createIsFPClass(loc, v, cir::FPClassTest(test)), + convertType(e->getType()))); + } + case Builtin::BI__builtin_nondeterministic_value: + case Builtin::BI__builtin_elementwise_abs: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__builtin_elementwise_acos: + return emitUnaryFPBuiltin<cir::ACosOp>(*this, *e); + case Builtin::BI__builtin_elementwise_asin: + return emitUnaryFPBuiltin<cir::ASinOp>(*this, *e); + case Builtin::BI__builtin_elementwise_atan: + return 
emitUnaryFPBuiltin<cir::ATanOp>(*this, *e); + case Builtin::BI__builtin_elementwise_atan2: + case Builtin::BI__builtin_elementwise_ceil: + case Builtin::BI__builtin_elementwise_exp: + case Builtin::BI__builtin_elementwise_exp2: + case Builtin::BI__builtin_elementwise_exp10: + case Builtin::BI__builtin_elementwise_ldexp: + case Builtin::BI__builtin_elementwise_log: + case Builtin::BI__builtin_elementwise_log2: + case Builtin::BI__builtin_elementwise_log10: + case Builtin::BI__builtin_elementwise_pow: + case Builtin::BI__builtin_elementwise_bitreverse: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__builtin_elementwise_cos: + return emitUnaryFPBuiltin<cir::CosOp>(*this, *e); + case Builtin::BI__builtin_elementwise_cosh: + case Builtin::BI__builtin_elementwise_floor: + case Builtin::BI__builtin_elementwise_popcount: + case Builtin::BI__builtin_elementwise_roundeven: + case Builtin::BI__builtin_elementwise_round: + case Builtin::BI__builtin_elementwise_rint: + case Builtin::BI__builtin_elementwise_nearbyint: + case Builtin::BI__builtin_elementwise_sin: + case Builtin::BI__builtin_elementwise_sinh: + case Builtin::BI__builtin_elementwise_tan: + case Builtin::BI__builtin_elementwise_tanh: + case Builtin::BI__builtin_elementwise_trunc: + case Builtin::BI__builtin_elementwise_canonicalize: + case Builtin::BI__builtin_elementwise_copysign: + case Builtin::BI__builtin_elementwise_fma: + case Builtin::BI__builtin_elementwise_fshl: + case Builtin::BI__builtin_elementwise_fshr: + case Builtin::BI__builtin_elementwise_add_sat: + case Builtin::BI__builtin_elementwise_sub_sat: + case Builtin::BI__builtin_elementwise_max: + case Builtin::BI__builtin_elementwise_min: + case Builtin::BI__builtin_elementwise_maxnum: + case Builtin::BI__builtin_elementwise_minnum: + case Builtin::BI__builtin_elementwise_maximum: + case Builtin::BI__builtin_elementwise_minimum: + case Builtin::BI__builtin_elementwise_maximumnum: + case Builtin::BI__builtin_elementwise_minimumnum: + case Builtin::BI__builtin_reduce_max: + case Builtin::BI__builtin_reduce_min: + case Builtin::BI__builtin_reduce_add: + case Builtin::BI__builtin_reduce_mul: + case Builtin::BI__builtin_reduce_xor: + case Builtin::BI__builtin_reduce_or: + case Builtin::BI__builtin_reduce_and: + case Builtin::BI__builtin_reduce_maximum: + case Builtin::BI__builtin_reduce_minimum: + case Builtin::BI__builtin_matrix_transpose: + case Builtin::BI__builtin_matrix_column_major_load: + case Builtin::BI__builtin_matrix_column_major_store: + case Builtin::BI__builtin_masked_load: + case Builtin::BI__builtin_masked_expand_load: + case Builtin::BI__builtin_masked_gather: + case Builtin::BI__builtin_masked_store: + case Builtin::BI__builtin_masked_compress_store: + case Builtin::BI__builtin_masked_scatter: + case Builtin::BI__builtin_isinf_sign: + case Builtin::BI__builtin_flt_rounds: + case Builtin::BI__builtin_set_flt_rounds: + case Builtin::BI__builtin_fpclassify: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BIalloca: + case Builtin::BI_alloca: + case Builtin::BI__builtin_alloca_uninitialized: + case Builtin::BI__builtin_alloca: + return emitBuiltinAlloca(*this, e, builtinID); + case Builtin::BI__builtin_alloca_with_align_uninitialized: + case Builtin::BI__builtin_alloca_with_align: + case Builtin::BI__builtin_infer_alloc_token: + case Builtin::BIbzero: + case Builtin::BI__builtin_bzero: + case Builtin::BIbcopy: + case Builtin::BI__builtin_bcopy: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BImemcpy: + case 
Builtin::BI__builtin_memcpy: + case Builtin::BImempcpy: + case Builtin::BI__builtin_mempcpy: + case Builtin::BI__builtin_memcpy_inline: + case Builtin::BI__builtin_char_memchr: + case Builtin::BI__builtin___memcpy_chk: + case Builtin::BI__builtin_objc_memmove_collectable: + case Builtin::BI__builtin___memmove_chk: + case Builtin::BI__builtin_trivially_relocate: + case Builtin::BImemmove: + case Builtin::BI__builtin_memmove: + case Builtin::BImemset: + case Builtin::BI__builtin_memset: + case Builtin::BI__builtin_memset_inline: + case Builtin::BI__builtin___memset_chk: + case Builtin::BI__builtin_wmemchr: + case Builtin::BI__builtin_wmemcmp: + break; // Handled as library calls below. + case Builtin::BI__builtin_dwarf_cfa: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__builtin_return_address: + case Builtin::BI_ReturnAddress: + case Builtin::BI__builtin_frame_address: { + mlir::Location loc = getLoc(e->getExprLoc()); + llvm::APSInt level = e->getArg(0)->EvaluateKnownConstInt(getContext()); + if (builtinID == Builtin::BI__builtin_return_address) { + return RValue::get(cir::ReturnAddrOp::create( + builder, loc, + builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); + } + return RValue::get(cir::FrameAddrOp::create( + builder, loc, + builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); + } + case Builtin::BI__builtin_extract_return_addr: + case Builtin::BI__builtin_frob_return_addr: + case Builtin::BI__builtin_dwarf_sp_column: + case Builtin::BI__builtin_init_dwarf_reg_size_table: + case Builtin::BI__builtin_eh_return: + case Builtin::BI__builtin_unwind_init: + case Builtin::BI__builtin_extend_pointer: + case Builtin::BI__builtin_setjmp: + case Builtin::BI__builtin_longjmp: + case Builtin::BI__builtin_launder: + case Builtin::BI__sync_fetch_and_add: + case Builtin::BI__sync_fetch_and_sub: + case Builtin::BI__sync_fetch_and_or: + case Builtin::BI__sync_fetch_and_and: + case Builtin::BI__sync_fetch_and_xor: + case Builtin::BI__sync_fetch_and_nand: + case Builtin::BI__sync_add_and_fetch: + case Builtin::BI__sync_sub_and_fetch: + case Builtin::BI__sync_and_and_fetch: + case Builtin::BI__sync_or_and_fetch: + case Builtin::BI__sync_xor_and_fetch: + case Builtin::BI__sync_nand_and_fetch: + case Builtin::BI__sync_val_compare_and_swap: + case Builtin::BI__sync_bool_compare_and_swap: + case Builtin::BI__sync_lock_test_and_set: + case Builtin::BI__sync_lock_release: + case Builtin::BI__sync_swap: + case Builtin::BI__sync_fetch_and_add_1: + case Builtin::BI__sync_fetch_and_add_2: + case Builtin::BI__sync_fetch_and_add_4: + case Builtin::BI__sync_fetch_and_add_8: + case Builtin::BI__sync_fetch_and_add_16: + case Builtin::BI__sync_fetch_and_sub_1: + case Builtin::BI__sync_fetch_and_sub_2: + case Builtin::BI__sync_fetch_and_sub_4: + case Builtin::BI__sync_fetch_and_sub_8: + case Builtin::BI__sync_fetch_and_sub_16: + case Builtin::BI__sync_fetch_and_or_1: + case Builtin::BI__sync_fetch_and_or_2: + case Builtin::BI__sync_fetch_and_or_4: + case Builtin::BI__sync_fetch_and_or_8: + case Builtin::BI__sync_fetch_and_or_16: + case Builtin::BI__sync_fetch_and_and_1: + case Builtin::BI__sync_fetch_and_and_2: + case Builtin::BI__sync_fetch_and_and_4: + case Builtin::BI__sync_fetch_and_and_8: + case Builtin::BI__sync_fetch_and_and_16: + case Builtin::BI__sync_fetch_and_xor_1: + case Builtin::BI__sync_fetch_and_xor_2: + case Builtin::BI__sync_fetch_and_xor_4: + case Builtin::BI__sync_fetch_and_xor_8: + case Builtin::BI__sync_fetch_and_xor_16: + case Builtin::BI__sync_fetch_and_nand_1: + 
case Builtin::BI__sync_fetch_and_nand_2: + case Builtin::BI__sync_fetch_and_nand_4: + case Builtin::BI__sync_fetch_and_nand_8: + case Builtin::BI__sync_fetch_and_nand_16: + case Builtin::BI__sync_fetch_and_min: + case Builtin::BI__sync_fetch_and_max: + case Builtin::BI__sync_fetch_and_umin: + case Builtin::BI__sync_fetch_and_umax: + case Builtin::BI__sync_add_and_fetch_1: + case Builtin::BI__sync_add_and_fetch_2: + case Builtin::BI__sync_add_and_fetch_4: + case Builtin::BI__sync_add_and_fetch_8: + case Builtin::BI__sync_add_and_fetch_16: + case Builtin::BI__sync_sub_and_fetch_1: + case Builtin::BI__sync_sub_and_fetch_2: + case Builtin::BI__sync_sub_and_fetch_4: + case Builtin::BI__sync_sub_and_fetch_8: + case Builtin::BI__sync_sub_and_fetch_16: + case Builtin::BI__sync_and_and_fetch_1: + case Builtin::BI__sync_and_and_fetch_2: + case Builtin::BI__sync_and_and_fetch_4: + case Builtin::BI__sync_and_and_fetch_8: + case Builtin::BI__sync_and_and_fetch_16: + case Builtin::BI__sync_or_and_fetch_1: + case Builtin::BI__sync_or_and_fetch_2: + case Builtin::BI__sync_or_and_fetch_4: + case Builtin::BI__sync_or_and_fetch_8: + case Builtin::BI__sync_or_and_fetch_16: + case Builtin::BI__sync_xor_and_fetch_1: + case Builtin::BI__sync_xor_and_fetch_2: + case Builtin::BI__sync_xor_and_fetch_4: + case Builtin::BI__sync_xor_and_fetch_8: + case Builtin::BI__sync_xor_and_fetch_16: + case Builtin::BI__sync_nand_and_fetch_1: + case Builtin::BI__sync_nand_and_fetch_2: + case Builtin::BI__sync_nand_and_fetch_4: + case Builtin::BI__sync_nand_and_fetch_8: + case Builtin::BI__sync_nand_and_fetch_16: + case Builtin::BI__sync_val_compare_and_swap_1: + case Builtin::BI__sync_val_compare_and_swap_2: + case Builtin::BI__sync_val_compare_and_swap_4: + case Builtin::BI__sync_val_compare_and_swap_8: + case Builtin::BI__sync_val_compare_and_swap_16: + case Builtin::BI__sync_bool_compare_and_swap_1: + case Builtin::BI__sync_bool_compare_and_swap_2: + case Builtin::BI__sync_bool_compare_and_swap_4: + case Builtin::BI__sync_bool_compare_and_swap_8: + case Builtin::BI__sync_bool_compare_and_swap_16: + case Builtin::BI__sync_swap_1: + case Builtin::BI__sync_swap_2: + case Builtin::BI__sync_swap_4: + case Builtin::BI__sync_swap_8: + case Builtin::BI__sync_swap_16: + case Builtin::BI__sync_lock_test_and_set_1: + case Builtin::BI__sync_lock_test_and_set_2: + case Builtin::BI__sync_lock_test_and_set_4: + case Builtin::BI__sync_lock_test_and_set_8: + case Builtin::BI__sync_lock_test_and_set_16: + case Builtin::BI__sync_lock_release_1: + case Builtin::BI__sync_lock_release_2: + case Builtin::BI__sync_lock_release_4: + case Builtin::BI__sync_lock_release_8: + case Builtin::BI__sync_lock_release_16: + case Builtin::BI__sync_synchronize: + case Builtin::BI__builtin_nontemporal_load: + case Builtin::BI__builtin_nontemporal_store: + case Builtin::BI__c11_atomic_is_lock_free: + case Builtin::BI__atomic_is_lock_free: + case Builtin::BI__atomic_test_and_set: + case Builtin::BI__atomic_clear: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__atomic_thread_fence: + return RValue::get( + makeAtomicFenceValue(*this, e, cir::SyncScopeKind::System)); + case Builtin::BI__atomic_signal_fence: + return RValue::get( + makeAtomicFenceValue(*this, e, cir::SyncScopeKind::SingleThread)); + case Builtin::BI__c11_atomic_thread_fence: + case Builtin::BI__c11_atomic_signal_fence: + case Builtin::BI__scoped_atomic_thread_fence: + case Builtin::BI__builtin_signbit: + case Builtin::BI__builtin_signbitf: + case Builtin::BI__builtin_signbitl: + case 
Builtin::BI__warn_memset_zero_len: + case Builtin::BI__annotation: + case Builtin::BI__builtin_annotation: + case Builtin::BI__builtin_addcb: + case Builtin::BI__builtin_addcs: + case Builtin::BI__builtin_addc: + case Builtin::BI__builtin_addcl: + case Builtin::BI__builtin_addcll: + case Builtin::BI__builtin_subcb: + case Builtin::BI__builtin_subcs: + case Builtin::BI__builtin_subc: + case Builtin::BI__builtin_subcl: + case Builtin::BI__builtin_subcll: + return errorBuiltinNYI(*this, e, builtinID); + + case Builtin::BI__builtin_add_overflow: + case Builtin::BI__builtin_sub_overflow: + case Builtin::BI__builtin_mul_overflow: { + const clang::Expr *leftArg = e->getArg(0); + const clang::Expr *rightArg = e->getArg(1); + const clang::Expr *resultArg = e->getArg(2); + + clang::QualType resultQTy = + resultArg->getType()->castAs<clang::PointerType>()->getPointeeType(); + + WidthAndSignedness leftInfo = + getIntegerWidthAndSignedness(cgm.getASTContext(), leftArg->getType()); + WidthAndSignedness rightInfo = + getIntegerWidthAndSignedness(cgm.getASTContext(), rightArg->getType()); + WidthAndSignedness resultInfo = + getIntegerWidthAndSignedness(cgm.getASTContext(), resultQTy); + + // Note we compute the encompassing type with the consideration to the + // result type, so later in LLVM lowering we don't get redundant integral + // extension casts. + WidthAndSignedness encompassingInfo = + EncompassingIntegerType({leftInfo, rightInfo, resultInfo}); + + auto encompassingCIRTy = cir::IntType::get( + &getMLIRContext(), encompassingInfo.width, encompassingInfo.isSigned); + auto resultCIRTy = mlir::cast<cir::IntType>(cgm.convertType(resultQTy)); + + mlir::Value left = emitScalarExpr(leftArg); + mlir::Value right = emitScalarExpr(rightArg); + Address resultPtr = emitPointerWithAlignment(resultArg); + + // Extend each operand to the encompassing type, if necessary. + if (left.getType() != encompassingCIRTy) + left = + builder.createCast(cir::CastKind::integral, left, encompassingCIRTy); + if (right.getType() != encompassingCIRTy) + right = + builder.createCast(cir::CastKind::integral, right, encompassingCIRTy); + + // Perform the operation on the extended values. + cir::BinOpOverflowKind opKind; + switch (builtinID) { + default: + llvm_unreachable("Unknown overflow builtin id."); + case Builtin::BI__builtin_add_overflow: + opKind = cir::BinOpOverflowKind::Add; + break; + case Builtin::BI__builtin_sub_overflow: + opKind = cir::BinOpOverflowKind::Sub; + break; + case Builtin::BI__builtin_mul_overflow: + opKind = cir::BinOpOverflowKind::Mul; + break; + } + + mlir::Location loc = getLoc(e->getSourceRange()); + auto arithOp = cir::BinOpOverflowOp::create(builder, loc, resultCIRTy, + opKind, left, right); + + // Here is a slight difference from the original clang CodeGen: + // - In the original clang CodeGen, the checked arithmetic result is + // first computed as a value of the encompassing type, and then it is + // truncated to the actual result type with a second overflow checking. + // - In CIRGen, the checked arithmetic operation directly produce the + // checked arithmetic result in its expected type. + // + // So we don't need a truncation and a second overflow checking here. + + // Finally, store the result using the pointer. 
+ bool isVolatile = + resultArg->getType()->getPointeeType().isVolatileQualified(); + builder.createStore(loc, emitToMemory(arithOp.getResult(), resultQTy), + resultPtr, isVolatile); + + return RValue::get(arithOp.getOverflow()); + } + + case Builtin::BI__builtin_uadd_overflow: + case Builtin::BI__builtin_uaddl_overflow: + case Builtin::BI__builtin_uaddll_overflow: + case Builtin::BI__builtin_usub_overflow: + case Builtin::BI__builtin_usubl_overflow: + case Builtin::BI__builtin_usubll_overflow: + case Builtin::BI__builtin_umul_overflow: + case Builtin::BI__builtin_umull_overflow: + case Builtin::BI__builtin_umulll_overflow: + case Builtin::BI__builtin_sadd_overflow: + case Builtin::BI__builtin_saddl_overflow: + case Builtin::BI__builtin_saddll_overflow: + case Builtin::BI__builtin_ssub_overflow: + case Builtin::BI__builtin_ssubl_overflow: + case Builtin::BI__builtin_ssubll_overflow: + case Builtin::BI__builtin_smul_overflow: + case Builtin::BI__builtin_smull_overflow: + case Builtin::BI__builtin_smulll_overflow: { + // Scalarize our inputs. + mlir::Value x = emitScalarExpr(e->getArg(0)); + mlir::Value y = emitScalarExpr(e->getArg(1)); + + const clang::Expr *resultArg = e->getArg(2); + Address resultPtr = emitPointerWithAlignment(resultArg); + + // Decide which of the arithmetic operation we are lowering to: + cir::BinOpOverflowKind arithKind; + switch (builtinID) { + default: + llvm_unreachable("Unknown overflow builtin id."); + case Builtin::BI__builtin_uadd_overflow: + case Builtin::BI__builtin_uaddl_overflow: + case Builtin::BI__builtin_uaddll_overflow: + case Builtin::BI__builtin_sadd_overflow: + case Builtin::BI__builtin_saddl_overflow: + case Builtin::BI__builtin_saddll_overflow: + arithKind = cir::BinOpOverflowKind::Add; + break; + case Builtin::BI__builtin_usub_overflow: + case Builtin::BI__builtin_usubl_overflow: + case Builtin::BI__builtin_usubll_overflow: + case Builtin::BI__builtin_ssub_overflow: + case Builtin::BI__builtin_ssubl_overflow: + case Builtin::BI__builtin_ssubll_overflow: + arithKind = cir::BinOpOverflowKind::Sub; + break; + case Builtin::BI__builtin_umul_overflow: + case Builtin::BI__builtin_umull_overflow: + case Builtin::BI__builtin_umulll_overflow: + case Builtin::BI__builtin_smul_overflow: + case Builtin::BI__builtin_smull_overflow: + case Builtin::BI__builtin_smulll_overflow: + arithKind = cir::BinOpOverflowKind::Mul; + break; + } + + clang::QualType resultQTy = + resultArg->getType()->castAs<clang::PointerType>()->getPointeeType(); + auto resultCIRTy = mlir::cast<cir::IntType>(cgm.convertType(resultQTy)); + + mlir::Location loc = getLoc(e->getSourceRange()); + cir::BinOpOverflowOp arithOp = cir::BinOpOverflowOp::create( + builder, loc, resultCIRTy, arithKind, x, y); + + bool isVolatile = + resultArg->getType()->getPointeeType().isVolatileQualified(); + builder.createStore(loc, emitToMemory(arithOp.getResult(), resultQTy), + resultPtr, isVolatile); + + return RValue::get(arithOp.getOverflow()); + } + + case Builtin::BIaddressof: + case Builtin::BI__addressof: + case Builtin::BI__builtin_addressof: + case Builtin::BI__builtin_function_start: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__builtin_operator_new: + return emitNewOrDeleteBuiltinCall( + e->getCallee()->getType()->castAs<FunctionProtoType>(), e, OO_New); + case Builtin::BI__builtin_operator_delete: + emitNewOrDeleteBuiltinCall( + e->getCallee()->getType()->castAs<FunctionProtoType>(), e, OO_Delete); + return RValue::get(nullptr); + case Builtin::BI__builtin_is_aligned: + case 
Builtin::BI__builtin_align_up: + case Builtin::BI__builtin_align_down: + case Builtin::BI__noop: + case Builtin::BI__builtin_call_with_static_chain: + case Builtin::BI_InterlockedExchange8: + case Builtin::BI_InterlockedExchange16: + case Builtin::BI_InterlockedExchange: + case Builtin::BI_InterlockedExchangePointer: + case Builtin::BI_InterlockedCompareExchangePointer: + case Builtin::BI_InterlockedCompareExchangePointer_nf: + case Builtin::BI_InterlockedCompareExchange8: + case Builtin::BI_InterlockedCompareExchange16: + case Builtin::BI_InterlockedCompareExchange: + case Builtin::BI_InterlockedCompareExchange64: + case Builtin::BI_InterlockedIncrement16: + case Builtin::BI_InterlockedIncrement: + case Builtin::BI_InterlockedDecrement16: + case Builtin::BI_InterlockedDecrement: + case Builtin::BI_InterlockedAnd8: + case Builtin::BI_InterlockedAnd16: + case Builtin::BI_InterlockedAnd: + case Builtin::BI_InterlockedExchangeAdd8: + case Builtin::BI_InterlockedExchangeAdd16: + case Builtin::BI_InterlockedExchangeAdd: + case Builtin::BI_InterlockedExchangeSub8: + case Builtin::BI_InterlockedExchangeSub16: + case Builtin::BI_InterlockedExchangeSub: + case Builtin::BI_InterlockedOr8: + case Builtin::BI_InterlockedOr16: + case Builtin::BI_InterlockedOr: + case Builtin::BI_InterlockedXor8: + case Builtin::BI_InterlockedXor16: + case Builtin::BI_InterlockedXor: + case Builtin::BI_bittest64: + case Builtin::BI_bittest: + case Builtin::BI_bittestandcomplement64: + case Builtin::BI_bittestandcomplement: + case Builtin::BI_bittestandreset64: + case Builtin::BI_bittestandreset: + case Builtin::BI_bittestandset64: + case Builtin::BI_bittestandset: + case Builtin::BI_interlockedbittestandreset: + case Builtin::BI_interlockedbittestandreset64: + case Builtin::BI_interlockedbittestandreset64_acq: + case Builtin::BI_interlockedbittestandreset64_rel: + case Builtin::BI_interlockedbittestandreset64_nf: + case Builtin::BI_interlockedbittestandset64: + case Builtin::BI_interlockedbittestandset64_acq: + case Builtin::BI_interlockedbittestandset64_rel: + case Builtin::BI_interlockedbittestandset64_nf: + case Builtin::BI_interlockedbittestandset: + case Builtin::BI_interlockedbittestandset_acq: + case Builtin::BI_interlockedbittestandset_rel: + case Builtin::BI_interlockedbittestandset_nf: + case Builtin::BI_interlockedbittestandreset_acq: + case Builtin::BI_interlockedbittestandreset_rel: + case Builtin::BI_interlockedbittestandreset_nf: + case Builtin::BI__iso_volatile_load8: + case Builtin::BI__iso_volatile_load16: + case Builtin::BI__iso_volatile_load32: + case Builtin::BI__iso_volatile_load64: + case Builtin::BI__iso_volatile_store8: + case Builtin::BI__iso_volatile_store16: + case Builtin::BI__iso_volatile_store32: + case Builtin::BI__iso_volatile_store64: + case Builtin::BI__builtin_ptrauth_sign_constant: + case Builtin::BI__builtin_ptrauth_auth: + case Builtin::BI__builtin_ptrauth_auth_and_resign: + case Builtin::BI__builtin_ptrauth_blend_discriminator: + case Builtin::BI__builtin_ptrauth_sign_generic_data: + case Builtin::BI__builtin_ptrauth_sign_unauthenticated: + case Builtin::BI__builtin_ptrauth_strip: + case Builtin::BI__builtin_get_vtable_pointer: + case Builtin::BI__exception_code: + case Builtin::BI_exception_code: + case Builtin::BI__exception_info: + case Builtin::BI_exception_info: + case Builtin::BI__abnormal_termination: + case Builtin::BI_abnormal_termination: + case Builtin::BI_setjmpex: + case Builtin::BI_setjmp: + case Builtin::BImove: + case Builtin::BImove_if_noexcept: + case 
Builtin::BIforward: + case Builtin::BIforward_like: + case Builtin::BIas_const: + case Builtin::BI__GetExceptionInfo: + case Builtin::BI__fastfail: + case Builtin::BIread_pipe: + case Builtin::BIwrite_pipe: + case Builtin::BIreserve_read_pipe: + case Builtin::BIreserve_write_pipe: + case Builtin::BIwork_group_reserve_read_pipe: + case Builtin::BIwork_group_reserve_write_pipe: + case Builtin::BIsub_group_reserve_read_pipe: + case Builtin::BIsub_group_reserve_write_pipe: + case Builtin::BIcommit_read_pipe: + case Builtin::BIcommit_write_pipe: + case Builtin::BIwork_group_commit_read_pipe: + case Builtin::BIwork_group_commit_write_pipe: + case Builtin::BIsub_group_commit_read_pipe: + case Builtin::BIsub_group_commit_write_pipe: + case Builtin::BIget_pipe_num_packets: + case Builtin::BIget_pipe_max_packets: + case Builtin::BIto_global: + case Builtin::BIto_local: + case Builtin::BIto_private: + case Builtin::BIenqueue_kernel: + case Builtin::BIget_kernel_work_group_size: + case Builtin::BIget_kernel_preferred_work_group_size_multiple: + case Builtin::BIget_kernel_max_sub_group_size_for_ndrange: + case Builtin::BIget_kernel_sub_group_count_for_ndrange: + case Builtin::BI__builtin_store_half: + case Builtin::BI__builtin_store_halff: + case Builtin::BI__builtin_load_half: + case Builtin::BI__builtin_load_halff: + return errorBuiltinNYI(*this, e, builtinID); + case Builtin::BI__builtin_printf: + case Builtin::BIprintf: + break; + case Builtin::BI__builtin_canonicalize: + case Builtin::BI__builtin_canonicalizef: + case Builtin::BI__builtin_canonicalizef16: + case Builtin::BI__builtin_canonicalizel: + case Builtin::BI__builtin_thread_pointer: + case Builtin::BI__builtin_os_log_format: + case Builtin::BI__xray_customevent: + case Builtin::BI__xray_typedevent: + case Builtin::BI__builtin_ms_va_start: + case Builtin::BI__builtin_ms_va_end: + case Builtin::BI__builtin_ms_va_copy: + case Builtin::BI__builtin_get_device_side_mangled_name: + return errorBuiltinNYI(*this, e, builtinID); } // If this is an alias for a lib function (e.g. __builtin_sin), emit @@ -541,9 +1393,13 @@ static mlir::Value emitTargetArchBuiltinExpr(CIRGenFunction *cgf, case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: + // These are actually NYI, but that will be reported by emitBuiltinExpr. + // At this point, we don't even know that the builtin is target-specific. + return nullptr; case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: case llvm::Triple::aarch64_be: + return cgf->emitAArch64BuiltinExpr(builtinID, e, returnValue, arch); case llvm::Triple::bpfeb: case llvm::Triple::bpfel: // These are actually NYI, but that will be reported by emitBuiltinExpr. @@ -590,6 +1446,22 @@ CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID, const CallExpr *e, getTarget().getTriple().getArch()); } +mlir::Value CIRGenFunction::emitScalarOrConstFoldImmArg( + const unsigned iceArguments, const unsigned idx, const Expr *argExpr) { + mlir::Value arg = {}; + if ((iceArguments & (1 << idx)) == 0) { + arg = emitScalarExpr(argExpr); + } else { + // If this is required to be a constant, constant fold it so that we + // know that the generated intrinsic gets a ConstantInt. + const std::optional<llvm::APSInt> result = + argExpr->getIntegerConstantExpr(getContext()); + assert(result && "Expected argument to be a constant"); + arg = builder.getConstInt(getLoc(argExpr->getSourceRange()), *result); + } + return arg; +} + /// Given a builtin id for a function like "__builtin_fabsf", return a Function* /// for "fabsf". 
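For reference, the checked-arithmetic builtins lowered earlier in this hunk (the __builtin_{s,u}add/sub/mul{,l,ll}_overflow family) follow the usual Clang/GCC contract: two scalar operands, a pointer to the result slot, and a boolean overflow flag as the return value, which is exactly the shape the BinOpOverflowOp lowering preserves. A minimal C/C++ usage sketch (standard builtins, not code from this patch):

#include <stdio.h>

int main(void) {
  int sum = 0;
  // Returns true when the signed addition wraps; the truncated result is
  // still written through the third argument.
  if (__builtin_sadd_overflow(2147483647, 1, &sum))
    printf("signed add overflowed, wrapped result = %d\n", sum);

  unsigned long long prod = 0;
  // 2^32 * 2^31 = 2^63 still fits in 64 unsigned bits, so no overflow.
  if (!__builtin_umulll_overflow(1ULL << 32, 1ULL << 31, &prod))
    printf("product = %llu\n", prod);
  return 0;
}

The overflow flag is what the lowering returns as the call's RValue, while the computed result is stored through the (possibly volatile) pointee type of the third argument.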
cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd, @@ -641,3 +1513,42 @@ mlir::Value CIRGenFunction::emitVAArg(VAArgExpr *ve) { mlir::Value vaList = emitVAListRef(ve->getSubExpr()).getPointer(); return cir::VAArgOp::create(builder, loc, type, vaList); } + +mlir::Value CIRGenFunction::emitBuiltinObjectSize(const Expr *e, unsigned type, + cir::IntType resType, + mlir::Value emittedE, + bool isDynamic) { + assert(!cir::MissingFeatures::opCallImplicitObjectSizeArgs()); + + // LLVM can't handle type=3 appropriately, and __builtin_object_size shouldn't + // evaluate e for side-effects. In either case, just like original LLVM + // lowering, we shouldn't lower to `cir.objsize` but to a constant instead. + if (type == 3 || (!emittedE && e->HasSideEffects(getContext()))) + return builder.getConstInt(getLoc(e->getSourceRange()), resType, + (type & 2) ? 0 : -1); + + mlir::Value ptr = emittedE ? emittedE : emitScalarExpr(e); + assert(mlir::isa<cir::PointerType>(ptr.getType()) && + "Non-pointer passed to __builtin_object_size?"); + + assert(!cir::MissingFeatures::countedBySize()); + + // Extract the min/max mode from type. CIR only supports type 0 + // (max, whole object) and type 2 (min, whole object), not type 1 or 3 + // (closest subobject variants). + const bool min = ((type & 2) != 0); + // For GCC compatibility, __builtin_object_size treats NULL as unknown size. + auto op = + cir::ObjSizeOp::create(builder, getLoc(e->getSourceRange()), resType, ptr, + min, /*nullUnknown=*/true, isDynamic); + return op.getResult(); +} + +mlir::Value CIRGenFunction::evaluateOrEmitBuiltinObjectSize( + const Expr *e, unsigned type, cir::IntType resType, mlir::Value emittedE, + bool isDynamic) { + uint64_t objectSize; + if (!e->tryEvaluateObjectSize(objectSize, getContext(), type)) + return emitBuiltinObjectSize(e, type, resType, emittedE, isDynamic); + return builder.getConstInt(getLoc(e->getSourceRange()), resType, objectSize); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp new file mode 100644 index 0000000..5a9ae59 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -0,0 +1,1583 @@ +//===---- CIRGenBuiltinAArch64.cpp - Emit CIR for AArch64 builtins --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit ARM64 Builtin calls as CIR or a function call +// to be later resolved. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenFunction.h" +#include "clang/CIR/MissingFeatures.h" + +// TODO(cir): once all builtins are covered, decide whether we still +// need to use LLVM intrinsics or if there's a better approach to follow. Right +// now the intrinsics are reused to make it convenient to encode all thousands +// of them and passing down to LLVM lowering. 
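As context for the emitBuiltinObjectSize lowering above: in the builtin's mode argument, bit 1 (value 2) selects the minimum answer (0 when unknown) rather than the maximum ((size_t)-1 when unknown), and bit 0 (value 1) selects the closest-subobject variants that CIR does not model yet, which is why mode 3 and side-effecting arguments fold straight to a constant. A short C/C++ sketch of the source-level behaviour, assuming the compiler can see the whole object as it can here:

#include <stdio.h>

struct S { char header[16]; char payload[48]; };

int main(void) {
  struct S s;
  // Whole-object modes: from &s there are 64 bytes either way.
  printf("%zu %zu\n", __builtin_object_size(&s, 0),
         __builtin_object_size(&s, 2));
  // From s.header, mode 0 counts to the end of the enclosing object (64),
  // while subobject mode 1 stops at the header array itself (16).
  printf("%zu %zu\n", __builtin_object_size(s.header, 0),
         __builtin_object_size(s.header, 1));
  return 0;
}

When nothing is known about the pointee, mode 0 evaluates to (size_t)-1 and mode 2 to 0, matching the constant produced by the type == 3 / side-effect path above.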
+#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" + +#include "mlir/IR/Value.h" +#include "clang/AST/GlobalDecl.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" + +using namespace clang; +using namespace clang::CIRGen; +using namespace llvm; + +mlir::Value CIRGenFunction::emitAArch64SVEBuiltinExpr(unsigned builtinID, + const CallExpr *expr) { + if (builtinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && + builtinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + assert(!cir::MissingFeatures::aarch64SVEIntrinsics()); + + switch (builtinID) { + default: + return {}; + + case SVE::BI__builtin_sve_svreinterpret_b: + case SVE::BI__builtin_sve_svreinterpret_c: + case SVE::BI__builtin_sve_svpsel_lane_b8: + case SVE::BI__builtin_sve_svpsel_lane_b16: + case SVE::BI__builtin_sve_svpsel_lane_b32: + case SVE::BI__builtin_sve_svpsel_lane_b64: + case SVE::BI__builtin_sve_svpsel_lane_c8: + case SVE::BI__builtin_sve_svpsel_lane_c16: + case SVE::BI__builtin_sve_svpsel_lane_c32: + case SVE::BI__builtin_sve_svpsel_lane_c64: + case SVE::BI__builtin_sve_svmov_b_z: + case SVE::BI__builtin_sve_svnot_b_z: + case SVE::BI__builtin_sve_svmovlb_u16: + case SVE::BI__builtin_sve_svmovlb_u32: + case SVE::BI__builtin_sve_svmovlb_u64: + case SVE::BI__builtin_sve_svmovlb_s16: + case SVE::BI__builtin_sve_svmovlb_s32: + case SVE::BI__builtin_sve_svmovlb_s64: + case SVE::BI__builtin_sve_svmovlt_u16: + case SVE::BI__builtin_sve_svmovlt_u32: + case SVE::BI__builtin_sve_svmovlt_u64: + case SVE::BI__builtin_sve_svmovlt_s16: + case SVE::BI__builtin_sve_svmovlt_s32: + case SVE::BI__builtin_sve_svmovlt_s64: + case SVE::BI__builtin_sve_svpmullt_u16: + case SVE::BI__builtin_sve_svpmullt_u64: + case SVE::BI__builtin_sve_svpmullt_n_u16: + case SVE::BI__builtin_sve_svpmullt_n_u64: + case SVE::BI__builtin_sve_svpmullb_u16: + case SVE::BI__builtin_sve_svpmullb_u64: + case SVE::BI__builtin_sve_svpmullb_n_u16: + case SVE::BI__builtin_sve_svpmullb_n_u64: + case SVE::BI__builtin_sve_svdup_n_b8: + case SVE::BI__builtin_sve_svdup_n_b16: + case SVE::BI__builtin_sve_svdup_n_b32: + case SVE::BI__builtin_sve_svdup_n_b64: + case SVE::BI__builtin_sve_svdupq_n_b8: + case SVE::BI__builtin_sve_svdupq_n_b16: + case SVE::BI__builtin_sve_svdupq_n_b32: + case SVE::BI__builtin_sve_svdupq_n_b64: + case SVE::BI__builtin_sve_svdupq_n_u8: + case SVE::BI__builtin_sve_svdupq_n_s8: + case SVE::BI__builtin_sve_svdupq_n_u64: + case SVE::BI__builtin_sve_svdupq_n_f64: + case SVE::BI__builtin_sve_svdupq_n_s64: + case SVE::BI__builtin_sve_svdupq_n_u16: + case SVE::BI__builtin_sve_svdupq_n_f16: + case SVE::BI__builtin_sve_svdupq_n_bf16: + case SVE::BI__builtin_sve_svdupq_n_s16: + case SVE::BI__builtin_sve_svdupq_n_u32: + case SVE::BI__builtin_sve_svdupq_n_f32: + case SVE::BI__builtin_sve_svdupq_n_s32: + case SVE::BI__builtin_sve_svpfalse_b: + case SVE::BI__builtin_sve_svpfalse_c: + case SVE::BI__builtin_sve_svlen_bf16: + case SVE::BI__builtin_sve_svlen_f16: + case SVE::BI__builtin_sve_svlen_f32: + case SVE::BI__builtin_sve_svlen_f64: + case SVE::BI__builtin_sve_svlen_s8: + case SVE::BI__builtin_sve_svlen_s16: + case SVE::BI__builtin_sve_svlen_s32: + case SVE::BI__builtin_sve_svlen_s64: + case SVE::BI__builtin_sve_svlen_u8: + case SVE::BI__builtin_sve_svlen_u16: + case SVE::BI__builtin_sve_svlen_u32: + case SVE::BI__builtin_sve_svlen_u64: + case 
SVE::BI__builtin_sve_svtbl2_u8: + case SVE::BI__builtin_sve_svtbl2_s8: + case SVE::BI__builtin_sve_svtbl2_u16: + case SVE::BI__builtin_sve_svtbl2_s16: + case SVE::BI__builtin_sve_svtbl2_u32: + case SVE::BI__builtin_sve_svtbl2_s32: + case SVE::BI__builtin_sve_svtbl2_u64: + case SVE::BI__builtin_sve_svtbl2_s64: + case SVE::BI__builtin_sve_svtbl2_f16: + case SVE::BI__builtin_sve_svtbl2_bf16: + case SVE::BI__builtin_sve_svtbl2_f32: + case SVE::BI__builtin_sve_svtbl2_f64: + case SVE::BI__builtin_sve_svset_neonq_s8: + case SVE::BI__builtin_sve_svset_neonq_s16: + case SVE::BI__builtin_sve_svset_neonq_s32: + case SVE::BI__builtin_sve_svset_neonq_s64: + case SVE::BI__builtin_sve_svset_neonq_u8: + case SVE::BI__builtin_sve_svset_neonq_u16: + case SVE::BI__builtin_sve_svset_neonq_u32: + case SVE::BI__builtin_sve_svset_neonq_u64: + case SVE::BI__builtin_sve_svset_neonq_f16: + case SVE::BI__builtin_sve_svset_neonq_f32: + case SVE::BI__builtin_sve_svset_neonq_f64: + case SVE::BI__builtin_sve_svset_neonq_bf16: + case SVE::BI__builtin_sve_svget_neonq_s8: + case SVE::BI__builtin_sve_svget_neonq_s16: + case SVE::BI__builtin_sve_svget_neonq_s32: + case SVE::BI__builtin_sve_svget_neonq_s64: + case SVE::BI__builtin_sve_svget_neonq_u8: + case SVE::BI__builtin_sve_svget_neonq_u16: + case SVE::BI__builtin_sve_svget_neonq_u32: + case SVE::BI__builtin_sve_svget_neonq_u64: + case SVE::BI__builtin_sve_svget_neonq_f16: + case SVE::BI__builtin_sve_svget_neonq_f32: + case SVE::BI__builtin_sve_svget_neonq_f64: + case SVE::BI__builtin_sve_svget_neonq_bf16: + case SVE::BI__builtin_sve_svdup_neonq_s8: + case SVE::BI__builtin_sve_svdup_neonq_s16: + case SVE::BI__builtin_sve_svdup_neonq_s32: + case SVE::BI__builtin_sve_svdup_neonq_s64: + case SVE::BI__builtin_sve_svdup_neonq_u8: + case SVE::BI__builtin_sve_svdup_neonq_u16: + case SVE::BI__builtin_sve_svdup_neonq_u32: + case SVE::BI__builtin_sve_svdup_neonq_u64: + case SVE::BI__builtin_sve_svdup_neonq_f16: + case SVE::BI__builtin_sve_svdup_neonq_f32: + case SVE::BI__builtin_sve_svdup_neonq_f64: + case SVE::BI__builtin_sve_svdup_neonq_bf16: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Unreachable: All cases in the switch above return. +} + +mlir::Value CIRGenFunction::emitAArch64SMEBuiltinExpr(unsigned builtinID, + const CallExpr *expr) { + assert(!cir::MissingFeatures::aarch64SMEIntrinsics()); + + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; +} + +// Some intrinsics are equivalent for codegen. 
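For instance, the table that follows maps the FP16-specific NEON builtin IDs onto their generic counterparts, so a source-level vabs_f16 needs only one lowering path. A small arm_neon.h usage sketch, assuming an AArch64 target compiled with FP16 vector arithmetic enabled (e.g. -march=armv8.2-a+fp16):

#include <arm_neon.h>

float16x4_t abs_half4(float16x4_t v) {
  // This reaches codegen as NEON::BI__builtin_neon_vabs_f16, which the
  // equivalence table below remaps to NEON::BI__builtin_neon_vabs_v.
  return vabs_f16(v);
}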
+static const std::pair<unsigned, unsigned> neonEquivalentIntrinsicMap[] = { + { + NEON::BI__builtin_neon_splat_lane_bf16, + NEON::BI__builtin_neon_splat_lane_v, + }, + { + NEON::BI__builtin_neon_splat_laneq_bf16, + NEON::BI__builtin_neon_splat_laneq_v, + }, + { + NEON::BI__builtin_neon_splatq_lane_bf16, + NEON::BI__builtin_neon_splatq_lane_v, + }, + { + NEON::BI__builtin_neon_splatq_laneq_bf16, + NEON::BI__builtin_neon_splatq_laneq_v, + }, + { + NEON::BI__builtin_neon_vabd_f16, + NEON::BI__builtin_neon_vabd_v, + }, + { + NEON::BI__builtin_neon_vabdq_f16, + NEON::BI__builtin_neon_vabdq_v, + }, + { + NEON::BI__builtin_neon_vabs_f16, + NEON::BI__builtin_neon_vabs_v, + }, + { + NEON::BI__builtin_neon_vabsq_f16, + NEON::BI__builtin_neon_vabsq_v, + }, + { + NEON::BI__builtin_neon_vcage_f16, + NEON::BI__builtin_neon_vcage_v, + }, + { + NEON::BI__builtin_neon_vcageq_f16, + NEON::BI__builtin_neon_vcageq_v, + }, + { + NEON::BI__builtin_neon_vcagt_f16, + NEON::BI__builtin_neon_vcagt_v, + }, + { + NEON::BI__builtin_neon_vcagtq_f16, + NEON::BI__builtin_neon_vcagtq_v, + }, + { + NEON::BI__builtin_neon_vcale_f16, + NEON::BI__builtin_neon_vcale_v, + }, + { + NEON::BI__builtin_neon_vcaleq_f16, + NEON::BI__builtin_neon_vcaleq_v, + }, + { + NEON::BI__builtin_neon_vcalt_f16, + NEON::BI__builtin_neon_vcalt_v, + }, + { + NEON::BI__builtin_neon_vcaltq_f16, + NEON::BI__builtin_neon_vcaltq_v, + }, + { + NEON::BI__builtin_neon_vceqz_f16, + NEON::BI__builtin_neon_vceqz_v, + }, + { + NEON::BI__builtin_neon_vceqzq_f16, + NEON::BI__builtin_neon_vceqzq_v, + }, + { + NEON::BI__builtin_neon_vcgez_f16, + NEON::BI__builtin_neon_vcgez_v, + }, + { + NEON::BI__builtin_neon_vcgezq_f16, + NEON::BI__builtin_neon_vcgezq_v, + }, + { + NEON::BI__builtin_neon_vcgtz_f16, + NEON::BI__builtin_neon_vcgtz_v, + }, + { + NEON::BI__builtin_neon_vcgtzq_f16, + NEON::BI__builtin_neon_vcgtzq_v, + }, + { + NEON::BI__builtin_neon_vclez_f16, + NEON::BI__builtin_neon_vclez_v, + }, + { + NEON::BI__builtin_neon_vclezq_f16, + NEON::BI__builtin_neon_vclezq_v, + }, + { + NEON::BI__builtin_neon_vcltz_f16, + NEON::BI__builtin_neon_vcltz_v, + }, + { + NEON::BI__builtin_neon_vcltzq_f16, + NEON::BI__builtin_neon_vcltzq_v, + }, + { + NEON::BI__builtin_neon_vfma_f16, + NEON::BI__builtin_neon_vfma_v, + }, + { + NEON::BI__builtin_neon_vfma_lane_f16, + NEON::BI__builtin_neon_vfma_lane_v, + }, + { + NEON::BI__builtin_neon_vfma_laneq_f16, + NEON::BI__builtin_neon_vfma_laneq_v, + }, + { + NEON::BI__builtin_neon_vfmaq_f16, + NEON::BI__builtin_neon_vfmaq_v, + }, + { + NEON::BI__builtin_neon_vfmaq_lane_f16, + NEON::BI__builtin_neon_vfmaq_lane_v, + }, + { + NEON::BI__builtin_neon_vfmaq_laneq_f16, + NEON::BI__builtin_neon_vfmaq_laneq_v, + }, + {NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v}, + {NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v}, + {NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v}, + {NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v}, + {NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v}, + {NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v}, + {NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v}, + {NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v}, + {NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v}, + {NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v}, + {NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v}, + 
{NEON::BI__builtin_neon_vld1q_lane_bf16, + NEON::BI__builtin_neon_vld1q_lane_v}, + {NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v}, + {NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v}, + {NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v}, + {NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v}, + {NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v}, + {NEON::BI__builtin_neon_vld2q_lane_bf16, + NEON::BI__builtin_neon_vld2q_lane_v}, + {NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v}, + {NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v}, + {NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v}, + {NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v}, + {NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v}, + {NEON::BI__builtin_neon_vld3q_lane_bf16, + NEON::BI__builtin_neon_vld3q_lane_v}, + {NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v}, + {NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v}, + {NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v}, + {NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v}, + {NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v}, + {NEON::BI__builtin_neon_vld4q_lane_bf16, + NEON::BI__builtin_neon_vld4q_lane_v}, + { + NEON::BI__builtin_neon_vmax_f16, + NEON::BI__builtin_neon_vmax_v, + }, + { + NEON::BI__builtin_neon_vmaxnm_f16, + NEON::BI__builtin_neon_vmaxnm_v, + }, + { + NEON::BI__builtin_neon_vmaxnmq_f16, + NEON::BI__builtin_neon_vmaxnmq_v, + }, + { + NEON::BI__builtin_neon_vmaxq_f16, + NEON::BI__builtin_neon_vmaxq_v, + }, + { + NEON::BI__builtin_neon_vmin_f16, + NEON::BI__builtin_neon_vmin_v, + }, + { + NEON::BI__builtin_neon_vminnm_f16, + NEON::BI__builtin_neon_vminnm_v, + }, + { + NEON::BI__builtin_neon_vminnmq_f16, + NEON::BI__builtin_neon_vminnmq_v, + }, + { + NEON::BI__builtin_neon_vminq_f16, + NEON::BI__builtin_neon_vminq_v, + }, + { + NEON::BI__builtin_neon_vmulx_f16, + NEON::BI__builtin_neon_vmulx_v, + }, + { + NEON::BI__builtin_neon_vmulxq_f16, + NEON::BI__builtin_neon_vmulxq_v, + }, + { + NEON::BI__builtin_neon_vpadd_f16, + NEON::BI__builtin_neon_vpadd_v, + }, + { + NEON::BI__builtin_neon_vpaddq_f16, + NEON::BI__builtin_neon_vpaddq_v, + }, + { + NEON::BI__builtin_neon_vpmax_f16, + NEON::BI__builtin_neon_vpmax_v, + }, + { + NEON::BI__builtin_neon_vpmaxnm_f16, + NEON::BI__builtin_neon_vpmaxnm_v, + }, + { + NEON::BI__builtin_neon_vpmaxnmq_f16, + NEON::BI__builtin_neon_vpmaxnmq_v, + }, + { + NEON::BI__builtin_neon_vpmaxq_f16, + NEON::BI__builtin_neon_vpmaxq_v, + }, + { + NEON::BI__builtin_neon_vpmin_f16, + NEON::BI__builtin_neon_vpmin_v, + }, + { + NEON::BI__builtin_neon_vpminnm_f16, + NEON::BI__builtin_neon_vpminnm_v, + }, + { + NEON::BI__builtin_neon_vpminnmq_f16, + NEON::BI__builtin_neon_vpminnmq_v, + }, + { + NEON::BI__builtin_neon_vpminq_f16, + NEON::BI__builtin_neon_vpminq_v, + }, + { + NEON::BI__builtin_neon_vrecpe_f16, + NEON::BI__builtin_neon_vrecpe_v, + }, + { + NEON::BI__builtin_neon_vrecpeq_f16, + NEON::BI__builtin_neon_vrecpeq_v, + }, + { + NEON::BI__builtin_neon_vrecps_f16, + NEON::BI__builtin_neon_vrecps_v, + }, + { + NEON::BI__builtin_neon_vrecpsq_f16, + NEON::BI__builtin_neon_vrecpsq_v, + }, + { + NEON::BI__builtin_neon_vrnd_f16, + NEON::BI__builtin_neon_vrnd_v, + }, + { + NEON::BI__builtin_neon_vrnda_f16, + NEON::BI__builtin_neon_vrnda_v, + }, 
+ { + NEON::BI__builtin_neon_vrndaq_f16, + NEON::BI__builtin_neon_vrndaq_v, + }, + { + NEON::BI__builtin_neon_vrndi_f16, + NEON::BI__builtin_neon_vrndi_v, + }, + { + NEON::BI__builtin_neon_vrndiq_f16, + NEON::BI__builtin_neon_vrndiq_v, + }, + { + NEON::BI__builtin_neon_vrndm_f16, + NEON::BI__builtin_neon_vrndm_v, + }, + { + NEON::BI__builtin_neon_vrndmq_f16, + NEON::BI__builtin_neon_vrndmq_v, + }, + { + NEON::BI__builtin_neon_vrndn_f16, + NEON::BI__builtin_neon_vrndn_v, + }, + { + NEON::BI__builtin_neon_vrndnq_f16, + NEON::BI__builtin_neon_vrndnq_v, + }, + { + NEON::BI__builtin_neon_vrndp_f16, + NEON::BI__builtin_neon_vrndp_v, + }, + { + NEON::BI__builtin_neon_vrndpq_f16, + NEON::BI__builtin_neon_vrndpq_v, + }, + { + NEON::BI__builtin_neon_vrndq_f16, + NEON::BI__builtin_neon_vrndq_v, + }, + { + NEON::BI__builtin_neon_vrndx_f16, + NEON::BI__builtin_neon_vrndx_v, + }, + { + NEON::BI__builtin_neon_vrndxq_f16, + NEON::BI__builtin_neon_vrndxq_v, + }, + { + NEON::BI__builtin_neon_vrsqrte_f16, + NEON::BI__builtin_neon_vrsqrte_v, + }, + { + NEON::BI__builtin_neon_vrsqrteq_f16, + NEON::BI__builtin_neon_vrsqrteq_v, + }, + { + NEON::BI__builtin_neon_vrsqrts_f16, + NEON::BI__builtin_neon_vrsqrts_v, + }, + { + NEON::BI__builtin_neon_vrsqrtsq_f16, + NEON::BI__builtin_neon_vrsqrtsq_v, + }, + { + NEON::BI__builtin_neon_vsqrt_f16, + NEON::BI__builtin_neon_vsqrt_v, + }, + { + NEON::BI__builtin_neon_vsqrtq_f16, + NEON::BI__builtin_neon_vsqrtq_v, + }, + {NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v}, + {NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v}, + {NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v}, + {NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v}, + {NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v}, + {NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v}, + {NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v}, + {NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v}, + {NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v}, + {NEON::BI__builtin_neon_vst1q_lane_bf16, + NEON::BI__builtin_neon_vst1q_lane_v}, + {NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v}, + {NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v}, + {NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v}, + {NEON::BI__builtin_neon_vst2q_lane_bf16, + NEON::BI__builtin_neon_vst2q_lane_v}, + {NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v}, + {NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v}, + {NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v}, + {NEON::BI__builtin_neon_vst3q_lane_bf16, + NEON::BI__builtin_neon_vst3q_lane_v}, + {NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v}, + {NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v}, + {NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v}, + {NEON::BI__builtin_neon_vst4q_lane_bf16, + NEON::BI__builtin_neon_vst4q_lane_v}, + // The mangling rules cause us to have one ID for each type for + // vldap1(q)_lane and vstl1(q)_lane, but codegen is equivalent for all of + // them. Choose an arbitrary one to be handled as tha canonical variation. 
+ {NEON::BI__builtin_neon_vldap1_lane_u64, + NEON::BI__builtin_neon_vldap1_lane_s64}, + {NEON::BI__builtin_neon_vldap1_lane_f64, + NEON::BI__builtin_neon_vldap1_lane_s64}, + {NEON::BI__builtin_neon_vldap1_lane_p64, + NEON::BI__builtin_neon_vldap1_lane_s64}, + {NEON::BI__builtin_neon_vldap1q_lane_u64, + NEON::BI__builtin_neon_vldap1q_lane_s64}, + {NEON::BI__builtin_neon_vldap1q_lane_f64, + NEON::BI__builtin_neon_vldap1q_lane_s64}, + {NEON::BI__builtin_neon_vldap1q_lane_p64, + NEON::BI__builtin_neon_vldap1q_lane_s64}, + {NEON::BI__builtin_neon_vstl1_lane_u64, + NEON::BI__builtin_neon_vstl1_lane_s64}, + {NEON::BI__builtin_neon_vstl1_lane_f64, + NEON::BI__builtin_neon_vstl1_lane_s64}, + {NEON::BI__builtin_neon_vstl1_lane_p64, + NEON::BI__builtin_neon_vstl1_lane_s64}, + {NEON::BI__builtin_neon_vstl1q_lane_u64, + NEON::BI__builtin_neon_vstl1q_lane_s64}, + {NEON::BI__builtin_neon_vstl1q_lane_f64, + NEON::BI__builtin_neon_vstl1q_lane_s64}, + {NEON::BI__builtin_neon_vstl1q_lane_p64, + NEON::BI__builtin_neon_vstl1q_lane_s64}, +}; + +mlir::Value +CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, + ReturnValueSlot returnValue, + llvm::Triple::ArchType arch) { + if (builtinID >= clang::AArch64::FirstSVEBuiltin && + builtinID <= clang::AArch64::LastSVEBuiltin) + return emitAArch64SVEBuiltinExpr(builtinID, expr); + + if (builtinID >= clang::AArch64::FirstSMEBuiltin && + builtinID <= clang::AArch64::LastSMEBuiltin) + return emitAArch64SMEBuiltinExpr(builtinID, expr); + + if (builtinID == Builtin::BI__builtin_cpu_supports) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + switch (builtinID) { + default: + break; + case clang::AArch64::BI__builtin_arm_nop: + case clang::AArch64::BI__builtin_arm_yield: + case clang::AArch64::BI__yield: + case clang::AArch64::BI__builtin_arm_wfe: + case clang::AArch64::BI__wfe: + case clang::AArch64::BI__builtin_arm_wfi: + case clang::AArch64::BI__wfi: + case clang::AArch64::BI__builtin_arm_sev: + case clang::AArch64::BI__sev: + case clang::AArch64::BI__builtin_arm_sevl: + case clang::AArch64::BI__sevl: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_trap) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_get_sme_state) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rbit) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + if (builtinID == clang::AArch64::BI__builtin_arm_rbit64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_clz || + builtinID == clang::AArch64::BI__builtin_arm_clz64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == 
clang::AArch64::BI__builtin_arm_cls) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + if (builtinID == clang::AArch64::BI__builtin_arm_cls64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rint32zf || + builtinID == clang::AArch64::BI__builtin_arm_rint32z) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rint64zf || + builtinID == clang::AArch64::BI__builtin_arm_rint64z) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rint32xf || + builtinID == clang::AArch64::BI__builtin_arm_rint32x) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rint64xf || + builtinID == clang::AArch64::BI__builtin_arm_rint64x) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_jcvt) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_ld64b || + builtinID == clang::AArch64::BI__builtin_arm_st64b || + builtinID == clang::AArch64::BI__builtin_arm_st64bv || + builtinID == clang::AArch64::BI__builtin_arm_st64bv0) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_rndr || + builtinID == clang::AArch64::BI__builtin_arm_rndrrs) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__clear_cache) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if ((builtinID == clang::AArch64::BI__builtin_arm_ldrex || + builtinID == clang::AArch64::BI__builtin_arm_ldaex) && + getContext().getTypeSize(expr->getType()) == 128) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + if (builtinID == clang::AArch64::BI__builtin_arm_ldrex || + builtinID == clang::AArch64::BI__builtin_arm_ldaex) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if ((builtinID == clang::AArch64::BI__builtin_arm_strex || + builtinID == clang::AArch64::BI__builtin_arm_stlex) && + getContext().getTypeSize(expr->getArg(0)->getType()) == 128) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + 
getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_strex || + builtinID == clang::AArch64::BI__builtin_arm_stlex) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__getReg) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__break) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_arm_clrex) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI_ReadWriteBarrier) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // CRC32 + Intrinsic::ID crcIntrinsicID = Intrinsic::not_intrinsic; + switch (builtinID) { + case clang::AArch64::BI__builtin_arm_crc32b: + crcIntrinsicID = Intrinsic::aarch64_crc32b; + break; + case clang::AArch64::BI__builtin_arm_crc32cb: + crcIntrinsicID = Intrinsic::aarch64_crc32cb; + break; + case clang::AArch64::BI__builtin_arm_crc32h: + crcIntrinsicID = Intrinsic::aarch64_crc32h; + break; + case clang::AArch64::BI__builtin_arm_crc32ch: + crcIntrinsicID = Intrinsic::aarch64_crc32ch; + break; + case clang::AArch64::BI__builtin_arm_crc32w: + crcIntrinsicID = Intrinsic::aarch64_crc32w; + break; + case clang::AArch64::BI__builtin_arm_crc32cw: + crcIntrinsicID = Intrinsic::aarch64_crc32cw; + break; + case clang::AArch64::BI__builtin_arm_crc32d: + crcIntrinsicID = Intrinsic::aarch64_crc32x; + break; + case clang::AArch64::BI__builtin_arm_crc32cd: + crcIntrinsicID = Intrinsic::aarch64_crc32cx; + break; + } + + if (crcIntrinsicID != Intrinsic::not_intrinsic) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Memory Operations (MOPS) + if (builtinID == AArch64::BI__builtin_arm_mops_memset_tag) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Memory Tagging Extensions (MTE) Intrinsics + Intrinsic::ID mteIntrinsicID = Intrinsic::not_intrinsic; + switch (builtinID) { + case clang::AArch64::BI__builtin_arm_irg: + mteIntrinsicID = Intrinsic::aarch64_irg; + break; + case clang::AArch64::BI__builtin_arm_addg: + mteIntrinsicID = Intrinsic::aarch64_addg; + break; + case clang::AArch64::BI__builtin_arm_gmi: + mteIntrinsicID = Intrinsic::aarch64_gmi; + break; + case clang::AArch64::BI__builtin_arm_ldg: + mteIntrinsicID = Intrinsic::aarch64_ldg; + break; + case clang::AArch64::BI__builtin_arm_stg: + mteIntrinsicID = Intrinsic::aarch64_stg; + break; + case clang::AArch64::BI__builtin_arm_subp: + mteIntrinsicID = Intrinsic::aarch64_subp; + break; + } + + if (mteIntrinsicID != Intrinsic::not_intrinsic) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == 
clang::AArch64::BI__builtin_arm_rsr || + builtinID == clang::AArch64::BI__builtin_arm_rsr64 || + builtinID == clang::AArch64::BI__builtin_arm_rsr128 || + builtinID == clang::AArch64::BI__builtin_arm_rsrp || + builtinID == clang::AArch64::BI__builtin_arm_wsr || + builtinID == clang::AArch64::BI__builtin_arm_wsr64 || + builtinID == clang::AArch64::BI__builtin_arm_wsr128 || + builtinID == clang::AArch64::BI__builtin_arm_wsrp) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI_ReadStatusReg || + builtinID == clang::AArch64::BI_WriteStatusReg || + builtinID == clang::AArch64::BI__sys) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI_AddressOfReturnAddress) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__builtin_sponentry) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == clang::AArch64::BI__mulh || + builtinID == clang::AArch64::BI__umulh) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI__writex18byte || + builtinID == AArch64::BI__writex18word || + builtinID == AArch64::BI__writex18dword || + builtinID == AArch64::BI__writex18qword) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI__readx18byte || + builtinID == AArch64::BI__readx18word || + builtinID == AArch64::BI__readx18dword || + builtinID == AArch64::BI__readx18qword) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI__addx18byte || + builtinID == AArch64::BI__addx18word || + builtinID == AArch64::BI__addx18dword || + builtinID == AArch64::BI__addx18qword || + builtinID == AArch64::BI__incx18byte || + builtinID == AArch64::BI__incx18word || + builtinID == AArch64::BI__incx18dword || + builtinID == AArch64::BI__incx18qword) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI_CopyDoubleFromInt64 || + builtinID == AArch64::BI_CopyFloatFromInt32 || + builtinID == AArch64::BI_CopyInt32FromFloat || + builtinID == AArch64::BI_CopyInt64FromDouble) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI_CountLeadingOnes || + builtinID == AArch64::BI_CountLeadingOnes64 || + builtinID == AArch64::BI_CountLeadingZeros || + builtinID == AArch64::BI_CountLeadingZeros64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == 
AArch64::BI_CountLeadingSigns || + builtinID == AArch64::BI_CountLeadingSigns64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI_CountOneBits || + builtinID == AArch64::BI_CountOneBits64) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI__prefetch) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == AArch64::BI__hlt) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + if (builtinID == NEON::BI__builtin_neon_vcvth_bf16_f32) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Handle MSVC intrinsics before argument evaluation to prevent double + // evaluation. + assert(!cir::MissingFeatures::msvcBuiltins()); + + // Some intrinsics are equivalent - if they are use the base intrinsic ID. + auto it = llvm::find_if(neonEquivalentIntrinsicMap, [builtinID](auto &p) { + return p.first == builtinID; + }); + if (it != end(neonEquivalentIntrinsicMap)) + builtinID = it->second; + + // Find out if any arguments are required to be integer constant + // expressions. + assert(!cir::MissingFeatures::handleBuiltinICEArguments()); + + assert(!cir::MissingFeatures::neonSISDIntrinsics()); + + // Handle non-overloaded intrinsics first. 
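Nearly every unimplemented case in this file, including the non-overloaded switch that follows, reports the same "unimplemented AArch64 builtin call" diagnostic. Purely as a hypothetical refactoring sketch (not something the patch introduces, and assuming cgm and getContext() remain reachable from such a helper, e.g. by making it a CIRGenFunction member instead), the repetition could be folded into one place:

// Hypothetical helper: report an AArch64 builtin as not-yet-implemented
// and yield a null value, mirroring the inline pattern used throughout.
static mlir::Value emitAArch64BuiltinNYI(CIRGenFunction &cgf,
                                         const CallExpr *expr,
                                         unsigned builtinID) {
  cgf.cgm.errorNYI(expr->getSourceRange(),
                   std::string("unimplemented AArch64 builtin call: ") +
                       cgf.getContext().BuiltinInfo.getName(builtinID));
  return {};
}

Each `return emitAArch64BuiltinNYI(*this, expr, builtinID);` would then stand in for the four-line blocks above and below.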
+ switch (builtinID) { + default: + break; + case NEON::BI__builtin_neon_vabsh_f16: + case NEON::BI__builtin_neon_vaddq_p128: + case NEON::BI__builtin_neon_vldrq_p128: + case NEON::BI__builtin_neon_vstrq_p128: + case NEON::BI__builtin_neon_vcvts_f32_u32: + case NEON::BI__builtin_neon_vcvtd_f64_u64: + case NEON::BI__builtin_neon_vcvts_f32_s32: + case NEON::BI__builtin_neon_vcvtd_f64_s64: + case NEON::BI__builtin_neon_vcvth_f16_u16: + case NEON::BI__builtin_neon_vcvth_f16_u32: + case NEON::BI__builtin_neon_vcvth_f16_u64: + case NEON::BI__builtin_neon_vcvth_f16_s16: + case NEON::BI__builtin_neon_vcvth_f16_s32: + case NEON::BI__builtin_neon_vcvth_f16_s64: + case NEON::BI__builtin_neon_vcvtah_u16_f16: + case NEON::BI__builtin_neon_vcvtmh_u16_f16: + case NEON::BI__builtin_neon_vcvtnh_u16_f16: + case NEON::BI__builtin_neon_vcvtph_u16_f16: + case NEON::BI__builtin_neon_vcvth_u16_f16: + case NEON::BI__builtin_neon_vcvtah_s16_f16: + case NEON::BI__builtin_neon_vcvtmh_s16_f16: + case NEON::BI__builtin_neon_vcvtnh_s16_f16: + case NEON::BI__builtin_neon_vcvtph_s16_f16: + case NEON::BI__builtin_neon_vcvth_s16_f16: + case NEON::BI__builtin_neon_vcaleh_f16: + case NEON::BI__builtin_neon_vcalth_f16: + case NEON::BI__builtin_neon_vcageh_f16: + case NEON::BI__builtin_neon_vcagth_f16: + case NEON::BI__builtin_neon_vcvth_n_s16_f16: + case NEON::BI__builtin_neon_vcvth_n_u16_f16: + case NEON::BI__builtin_neon_vcvth_n_f16_s16: + case NEON::BI__builtin_neon_vcvth_n_f16_u16: + case NEON::BI__builtin_neon_vpaddd_s64: + case NEON::BI__builtin_neon_vpaddd_f64: + case NEON::BI__builtin_neon_vpadds_f32: + case NEON::BI__builtin_neon_vceqzd_s64: + case NEON::BI__builtin_neon_vceqzd_f64: + case NEON::BI__builtin_neon_vceqzs_f32: + case NEON::BI__builtin_neon_vceqzh_f16: + case NEON::BI__builtin_neon_vcgezd_s64: + case NEON::BI__builtin_neon_vcgezd_f64: + case NEON::BI__builtin_neon_vcgezs_f32: + case NEON::BI__builtin_neon_vcgezh_f16: + case NEON::BI__builtin_neon_vclezd_s64: + case NEON::BI__builtin_neon_vclezd_f64: + case NEON::BI__builtin_neon_vclezs_f32: + case NEON::BI__builtin_neon_vclezh_f16: + case NEON::BI__builtin_neon_vcgtzd_s64: + case NEON::BI__builtin_neon_vcgtzd_f64: + case NEON::BI__builtin_neon_vcgtzs_f32: + case NEON::BI__builtin_neon_vcgtzh_f16: + case NEON::BI__builtin_neon_vcltzd_s64: + case NEON::BI__builtin_neon_vcltzd_f64: + case NEON::BI__builtin_neon_vcltzs_f32: + case NEON::BI__builtin_neon_vcltzh_f16: + case NEON::BI__builtin_neon_vceqzd_u64: + case NEON::BI__builtin_neon_vceqd_f64: + case NEON::BI__builtin_neon_vcled_f64: + case NEON::BI__builtin_neon_vcltd_f64: + case NEON::BI__builtin_neon_vcged_f64: + case NEON::BI__builtin_neon_vcgtd_f64: + case NEON::BI__builtin_neon_vceqs_f32: + case NEON::BI__builtin_neon_vcles_f32: + case NEON::BI__builtin_neon_vclts_f32: + case NEON::BI__builtin_neon_vcges_f32: + case NEON::BI__builtin_neon_vcgts_f32: + case NEON::BI__builtin_neon_vceqh_f16: + case NEON::BI__builtin_neon_vcleh_f16: + case NEON::BI__builtin_neon_vclth_f16: + case NEON::BI__builtin_neon_vcgeh_f16: + case NEON::BI__builtin_neon_vcgth_f16: + case NEON::BI__builtin_neon_vceqd_s64: + case NEON::BI__builtin_neon_vceqd_u64: + case NEON::BI__builtin_neon_vcgtd_s64: + case NEON::BI__builtin_neon_vcgtd_u64: + case NEON::BI__builtin_neon_vcltd_s64: + case NEON::BI__builtin_neon_vcltd_u64: + case NEON::BI__builtin_neon_vcged_u64: + case NEON::BI__builtin_neon_vcged_s64: + case NEON::BI__builtin_neon_vcled_u64: + case NEON::BI__builtin_neon_vcled_s64: + case NEON::BI__builtin_neon_vtstd_s64: + case 
NEON::BI__builtin_neon_vtstd_u64: + case NEON::BI__builtin_neon_vset_lane_i8: + case NEON::BI__builtin_neon_vset_lane_i16: + case NEON::BI__builtin_neon_vset_lane_i32: + case NEON::BI__builtin_neon_vset_lane_i64: + case NEON::BI__builtin_neon_vset_lane_bf16: + case NEON::BI__builtin_neon_vset_lane_f32: + case NEON::BI__builtin_neon_vsetq_lane_i8: + case NEON::BI__builtin_neon_vsetq_lane_i16: + case NEON::BI__builtin_neon_vsetq_lane_i32: + case NEON::BI__builtin_neon_vsetq_lane_i64: + case NEON::BI__builtin_neon_vsetq_lane_bf16: + case NEON::BI__builtin_neon_vsetq_lane_f32: + case NEON::BI__builtin_neon_vset_lane_f64: + case NEON::BI__builtin_neon_vset_lane_mf8: + case NEON::BI__builtin_neon_vsetq_lane_mf8: + case NEON::BI__builtin_neon_vsetq_lane_f64: + case NEON::BI__builtin_neon_vget_lane_i8: + case NEON::BI__builtin_neon_vdupb_lane_i8: + case NEON::BI__builtin_neon_vgetq_lane_i8: + case NEON::BI__builtin_neon_vdupb_laneq_i8: + case NEON::BI__builtin_neon_vget_lane_mf8: + case NEON::BI__builtin_neon_vdupb_lane_mf8: + case NEON::BI__builtin_neon_vgetq_lane_mf8: + case NEON::BI__builtin_neon_vdupb_laneq_mf8: + case NEON::BI__builtin_neon_vget_lane_i16: + case NEON::BI__builtin_neon_vduph_lane_i16: + case NEON::BI__builtin_neon_vgetq_lane_i16: + case NEON::BI__builtin_neon_vduph_laneq_i16: + case NEON::BI__builtin_neon_vget_lane_i32: + case NEON::BI__builtin_neon_vdups_lane_i32: + case NEON::BI__builtin_neon_vdups_lane_f32: + case NEON::BI__builtin_neon_vgetq_lane_i32: + case NEON::BI__builtin_neon_vdups_laneq_i32: + case NEON::BI__builtin_neon_vget_lane_i64: + case NEON::BI__builtin_neon_vdupd_lane_i64: + case NEON::BI__builtin_neon_vdupd_lane_f64: + case NEON::BI__builtin_neon_vgetq_lane_i64: + case NEON::BI__builtin_neon_vdupd_laneq_i64: + case NEON::BI__builtin_neon_vget_lane_f32: + case NEON::BI__builtin_neon_vget_lane_f64: + case NEON::BI__builtin_neon_vgetq_lane_f32: + case NEON::BI__builtin_neon_vdups_laneq_f32: + case NEON::BI__builtin_neon_vgetq_lane_f64: + case NEON::BI__builtin_neon_vdupd_laneq_f64: + case NEON::BI__builtin_neon_vaddh_f16: + case NEON::BI__builtin_neon_vsubh_f16: + case NEON::BI__builtin_neon_vmulh_f16: + case NEON::BI__builtin_neon_vdivh_f16: + case NEON::BI__builtin_neon_vfmah_f16: + case NEON::BI__builtin_neon_vfmsh_f16: + case NEON::BI__builtin_neon_vaddd_s64: + case NEON::BI__builtin_neon_vaddd_u64: + case NEON::BI__builtin_neon_vsubd_s64: + case NEON::BI__builtin_neon_vsubd_u64: + case NEON::BI__builtin_neon_vqdmlalh_s16: + case NEON::BI__builtin_neon_vqdmlslh_s16: + case NEON::BI__builtin_neon_vqshlud_n_s64: + case NEON::BI__builtin_neon_vqshld_n_u64: + case NEON::BI__builtin_neon_vqshld_n_s64: + case NEON::BI__builtin_neon_vrshrd_n_u64: + case NEON::BI__builtin_neon_vrshrd_n_s64: + case NEON::BI__builtin_neon_vrsrad_n_u64: + case NEON::BI__builtin_neon_vrsrad_n_s64: + case NEON::BI__builtin_neon_vshld_n_s64: + case NEON::BI__builtin_neon_vshld_n_u64: + case NEON::BI__builtin_neon_vshrd_n_s64: + case NEON::BI__builtin_neon_vshrd_n_u64: + case NEON::BI__builtin_neon_vsrad_n_s64: + case NEON::BI__builtin_neon_vsrad_n_u64: + case NEON::BI__builtin_neon_vqdmlalh_lane_s16: + case NEON::BI__builtin_neon_vqdmlalh_laneq_s16: + case NEON::BI__builtin_neon_vqdmlslh_lane_s16: + case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: + case NEON::BI__builtin_neon_vqdmlals_s32: + case NEON::BI__builtin_neon_vqdmlsls_s32: + case NEON::BI__builtin_neon_vqdmlals_lane_s32: + case NEON::BI__builtin_neon_vqdmlals_laneq_s32: + case NEON::BI__builtin_neon_vqdmlsls_lane_s32: + case 
NEON::BI__builtin_neon_vqdmlsls_laneq_s32: + case NEON::BI__builtin_neon_vget_lane_bf16: + case NEON::BI__builtin_neon_vduph_lane_bf16: + case NEON::BI__builtin_neon_vduph_lane_f16: + case NEON::BI__builtin_neon_vgetq_lane_bf16: + case NEON::BI__builtin_neon_vduph_laneq_bf16: + case NEON::BI__builtin_neon_vduph_laneq_f16: + case NEON::BI__builtin_neon_vcvt_bf16_f32: + case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: + case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: + case clang::AArch64::BI_InterlockedAdd: + case clang::AArch64::BI_InterlockedAdd_acq: + case clang::AArch64::BI_InterlockedAdd_rel: + case clang::AArch64::BI_InterlockedAdd_nf: + case clang::AArch64::BI_InterlockedAdd64: + case clang::AArch64::BI_InterlockedAdd64_acq: + case clang::AArch64::BI_InterlockedAdd64_rel: + case clang::AArch64::BI_InterlockedAdd64_nf: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Not all intrinsics handled by the common case work for AArch64 yet, so only + // defer to common code if it's been added to our special map. + assert(!cir::MissingFeatures::aarch64SIMDIntrinsics()); + + assert(!cir::MissingFeatures::aarch64TblBuiltinExpr()); + + switch (builtinID) { + default: + return {}; + case NEON::BI__builtin_neon_vbsl_v: + case NEON::BI__builtin_neon_vbslq_v: + case NEON::BI__builtin_neon_vfma_lane_v: + case NEON::BI__builtin_neon_vfmaq_lane_v: + case NEON::BI__builtin_neon_vfma_laneq_v: + case NEON::BI__builtin_neon_vfmaq_laneq_v: + case NEON::BI__builtin_neon_vfmah_lane_f16: + case NEON::BI__builtin_neon_vfmas_lane_f32: + case NEON::BI__builtin_neon_vfmah_laneq_f16: + case NEON::BI__builtin_neon_vfmas_laneq_f32: + case NEON::BI__builtin_neon_vfmad_lane_f64: + case NEON::BI__builtin_neon_vfmad_laneq_f64: + case NEON::BI__builtin_neon_vmull_v: + case NEON::BI__builtin_neon_vmax_v: + case NEON::BI__builtin_neon_vmaxq_v: + case NEON::BI__builtin_neon_vmaxh_f16: + case NEON::BI__builtin_neon_vmin_v: + case NEON::BI__builtin_neon_vminq_v: + case NEON::BI__builtin_neon_vminh_f16: + case NEON::BI__builtin_neon_vabd_v: + case NEON::BI__builtin_neon_vabdq_v: + case NEON::BI__builtin_neon_vpadal_v: + case NEON::BI__builtin_neon_vpadalq_v: + case NEON::BI__builtin_neon_vpmin_v: + case NEON::BI__builtin_neon_vpminq_v: + case NEON::BI__builtin_neon_vpmax_v: + case NEON::BI__builtin_neon_vpmaxq_v: + case NEON::BI__builtin_neon_vminnm_v: + case NEON::BI__builtin_neon_vminnmq_v: + case NEON::BI__builtin_neon_vminnmh_f16: + case NEON::BI__builtin_neon_vmaxnm_v: + case NEON::BI__builtin_neon_vmaxnmq_v: + case NEON::BI__builtin_neon_vmaxnmh_f16: + case NEON::BI__builtin_neon_vrecpss_f32: + case NEON::BI__builtin_neon_vrecpsd_f64: + case NEON::BI__builtin_neon_vrecpsh_f16: + case NEON::BI__builtin_neon_vqshrun_n_v: + case NEON::BI__builtin_neon_vqrshrun_n_v: + case NEON::BI__builtin_neon_vqshrn_n_v: + case NEON::BI__builtin_neon_vrshrn_n_v: + case NEON::BI__builtin_neon_vqrshrn_n_v: + case NEON::BI__builtin_neon_vrndah_f16: + case NEON::BI__builtin_neon_vrnda_v: + case NEON::BI__builtin_neon_vrndaq_v: + case NEON::BI__builtin_neon_vrndih_f16: + case NEON::BI__builtin_neon_vrndmh_f16: + case NEON::BI__builtin_neon_vrndm_v: + case NEON::BI__builtin_neon_vrndmq_v: + case NEON::BI__builtin_neon_vrndnh_f16: + case NEON::BI__builtin_neon_vrndn_v: + case NEON::BI__builtin_neon_vrndnq_v: + case NEON::BI__builtin_neon_vrndns_f32: + case NEON::BI__builtin_neon_vrndph_f16: + case NEON::BI__builtin_neon_vrndp_v: + 
case NEON::BI__builtin_neon_vrndpq_v: + case NEON::BI__builtin_neon_vrndxh_f16: + case NEON::BI__builtin_neon_vrndx_v: + case NEON::BI__builtin_neon_vrndxq_v: + case NEON::BI__builtin_neon_vrndh_f16: + case NEON::BI__builtin_neon_vrnd32x_f32: + case NEON::BI__builtin_neon_vrnd32xq_f32: + case NEON::BI__builtin_neon_vrnd32x_f64: + case NEON::BI__builtin_neon_vrnd32xq_f64: + case NEON::BI__builtin_neon_vrnd32z_f32: + case NEON::BI__builtin_neon_vrnd32zq_f32: + case NEON::BI__builtin_neon_vrnd32z_f64: + case NEON::BI__builtin_neon_vrnd32zq_f64: + case NEON::BI__builtin_neon_vrnd64x_f32: + case NEON::BI__builtin_neon_vrnd64xq_f32: + case NEON::BI__builtin_neon_vrnd64x_f64: + case NEON::BI__builtin_neon_vrnd64xq_f64: + case NEON::BI__builtin_neon_vrnd64z_f32: + case NEON::BI__builtin_neon_vrnd64zq_f32: + case NEON::BI__builtin_neon_vrnd64z_f64: + case NEON::BI__builtin_neon_vrnd64zq_f64: + case NEON::BI__builtin_neon_vrnd_v: + case NEON::BI__builtin_neon_vrndq_v: + case NEON::BI__builtin_neon_vcvt_f64_v: + case NEON::BI__builtin_neon_vcvtq_f64_v: + case NEON::BI__builtin_neon_vcvt_f64_f32: + case NEON::BI__builtin_neon_vcvt_f32_f64: + case NEON::BI__builtin_neon_vcvt_s32_v: + case NEON::BI__builtin_neon_vcvt_u32_v: + case NEON::BI__builtin_neon_vcvt_s64_v: + case NEON::BI__builtin_neon_vcvt_u64_v: + case NEON::BI__builtin_neon_vcvt_s16_f16: + case NEON::BI__builtin_neon_vcvt_u16_f16: + case NEON::BI__builtin_neon_vcvtq_s32_v: + case NEON::BI__builtin_neon_vcvtq_u32_v: + case NEON::BI__builtin_neon_vcvtq_s64_v: + case NEON::BI__builtin_neon_vcvtq_u64_v: + case NEON::BI__builtin_neon_vcvtq_s16_f16: + case NEON::BI__builtin_neon_vcvtq_u16_f16: + case NEON::BI__builtin_neon_vcvta_s16_f16: + case NEON::BI__builtin_neon_vcvta_u16_f16: + case NEON::BI__builtin_neon_vcvta_s32_v: + case NEON::BI__builtin_neon_vcvtaq_s16_f16: + case NEON::BI__builtin_neon_vcvtaq_s32_v: + case NEON::BI__builtin_neon_vcvta_u32_v: + case NEON::BI__builtin_neon_vcvtaq_u16_f16: + case NEON::BI__builtin_neon_vcvtaq_u32_v: + case NEON::BI__builtin_neon_vcvta_s64_v: + case NEON::BI__builtin_neon_vcvtaq_s64_v: + case NEON::BI__builtin_neon_vcvta_u64_v: + case NEON::BI__builtin_neon_vcvtaq_u64_v: + case NEON::BI__builtin_neon_vcvtm_s16_f16: + case NEON::BI__builtin_neon_vcvtm_s32_v: + case NEON::BI__builtin_neon_vcvtmq_s16_f16: + case NEON::BI__builtin_neon_vcvtmq_s32_v: + case NEON::BI__builtin_neon_vcvtm_u16_f16: + case NEON::BI__builtin_neon_vcvtm_u32_v: + case NEON::BI__builtin_neon_vcvtmq_u16_f16: + case NEON::BI__builtin_neon_vcvtmq_u32_v: + case NEON::BI__builtin_neon_vcvtm_s64_v: + case NEON::BI__builtin_neon_vcvtmq_s64_v: + case NEON::BI__builtin_neon_vcvtm_u64_v: + case NEON::BI__builtin_neon_vcvtmq_u64_v: + case NEON::BI__builtin_neon_vcvtn_s16_f16: + case NEON::BI__builtin_neon_vcvtn_s32_v: + case NEON::BI__builtin_neon_vcvtnq_s16_f16: + case NEON::BI__builtin_neon_vcvtnq_s32_v: + case NEON::BI__builtin_neon_vcvtn_u16_f16: + case NEON::BI__builtin_neon_vcvtn_u32_v: + case NEON::BI__builtin_neon_vcvtnq_u16_f16: + case NEON::BI__builtin_neon_vcvtnq_u32_v: + case NEON::BI__builtin_neon_vcvtn_s64_v: + case NEON::BI__builtin_neon_vcvtnq_s64_v: + case NEON::BI__builtin_neon_vcvtn_u64_v: + case NEON::BI__builtin_neon_vcvtnq_u64_v: + case NEON::BI__builtin_neon_vcvtp_s16_f16: + case NEON::BI__builtin_neon_vcvtp_s32_v: + case NEON::BI__builtin_neon_vcvtpq_s16_f16: + case NEON::BI__builtin_neon_vcvtpq_s32_v: + case NEON::BI__builtin_neon_vcvtp_u16_f16: + case NEON::BI__builtin_neon_vcvtp_u32_v: + case 
NEON::BI__builtin_neon_vcvtpq_u16_f16: + case NEON::BI__builtin_neon_vcvtpq_u32_v: + case NEON::BI__builtin_neon_vcvtp_s64_v: + case NEON::BI__builtin_neon_vcvtpq_s64_v: + case NEON::BI__builtin_neon_vcvtp_u64_v: + case NEON::BI__builtin_neon_vcvtpq_u64_v: + case NEON::BI__builtin_neon_vmulx_v: + case NEON::BI__builtin_neon_vmulxq_v: + case NEON::BI__builtin_neon_vmulxh_lane_f16: + case NEON::BI__builtin_neon_vmulxh_laneq_f16: + case NEON::BI__builtin_neon_vmul_lane_v: + case NEON::BI__builtin_neon_vmul_laneq_v: + case NEON::BI__builtin_neon_vnegd_s64: + case NEON::BI__builtin_neon_vnegh_f16: + case NEON::BI__builtin_neon_vpmaxnm_v: + case NEON::BI__builtin_neon_vpmaxnmq_v: + case NEON::BI__builtin_neon_vpminnm_v: + case NEON::BI__builtin_neon_vpminnmq_v: + case NEON::BI__builtin_neon_vsqrth_f16: + case NEON::BI__builtin_neon_vsqrt_v: + case NEON::BI__builtin_neon_vsqrtq_v: + case NEON::BI__builtin_neon_vrbit_v: + case NEON::BI__builtin_neon_vrbitq_v: + case NEON::BI__builtin_neon_vmaxv_f16: + case NEON::BI__builtin_neon_vmaxvq_f16: + case NEON::BI__builtin_neon_vminv_f16: + case NEON::BI__builtin_neon_vminvq_f16: + case NEON::BI__builtin_neon_vmaxnmv_f16: + case NEON::BI__builtin_neon_vmaxnmvq_f16: + case NEON::BI__builtin_neon_vminnmv_f16: + case NEON::BI__builtin_neon_vminnmvq_f16: + case NEON::BI__builtin_neon_vmul_n_f64: + case NEON::BI__builtin_neon_vaddlv_u8: + case NEON::BI__builtin_neon_vaddlv_u16: + case NEON::BI__builtin_neon_vaddlvq_u8: + case NEON::BI__builtin_neon_vaddlvq_u16: + case NEON::BI__builtin_neon_vaddlv_s8: + case NEON::BI__builtin_neon_vaddlv_s16: + case NEON::BI__builtin_neon_vaddlvq_s8: + case NEON::BI__builtin_neon_vaddlvq_s16: + case NEON::BI__builtin_neon_vsri_n_v: + case NEON::BI__builtin_neon_vsriq_n_v: + case NEON::BI__builtin_neon_vsli_n_v: + case NEON::BI__builtin_neon_vsliq_n_v: + case NEON::BI__builtin_neon_vsra_n_v: + case NEON::BI__builtin_neon_vsraq_n_v: + case NEON::BI__builtin_neon_vrsra_n_v: + case NEON::BI__builtin_neon_vrsraq_n_v: + case NEON::BI__builtin_neon_vld1_v: + case NEON::BI__builtin_neon_vld1q_v: + case NEON::BI__builtin_neon_vst1_v: + case NEON::BI__builtin_neon_vst1q_v: + case NEON::BI__builtin_neon_vld1_lane_v: + case NEON::BI__builtin_neon_vld1q_lane_v: + case NEON::BI__builtin_neon_vldap1_lane_s64: + case NEON::BI__builtin_neon_vldap1q_lane_s64: + case NEON::BI__builtin_neon_vld1_dup_v: + case NEON::BI__builtin_neon_vld1q_dup_v: + case NEON::BI__builtin_neon_vst1_lane_v: + case NEON::BI__builtin_neon_vst1q_lane_v: + case NEON::BI__builtin_neon_vstl1_lane_s64: + case NEON::BI__builtin_neon_vstl1q_lane_s64: + case NEON::BI__builtin_neon_vld2_v: + case NEON::BI__builtin_neon_vld2q_v: + case NEON::BI__builtin_neon_vld3_v: + case NEON::BI__builtin_neon_vld3q_v: + case NEON::BI__builtin_neon_vld4_v: + case NEON::BI__builtin_neon_vld4q_v: + case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2q_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3q_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld4q_dup_v: + case NEON::BI__builtin_neon_vld2_lane_v: + case NEON::BI__builtin_neon_vld2q_lane_v: + case NEON::BI__builtin_neon_vld3_lane_v: + case NEON::BI__builtin_neon_vld3q_lane_v: + case NEON::BI__builtin_neon_vld4_lane_v: + case NEON::BI__builtin_neon_vld4q_lane_v: + case NEON::BI__builtin_neon_vst2_v: + case NEON::BI__builtin_neon_vst2q_v: + case NEON::BI__builtin_neon_vst2_lane_v: + case NEON::BI__builtin_neon_vst2q_lane_v: + case 
NEON::BI__builtin_neon_vst3_v: + case NEON::BI__builtin_neon_vst3q_v: + case NEON::BI__builtin_neon_vst3_lane_v: + case NEON::BI__builtin_neon_vst3q_lane_v: + case NEON::BI__builtin_neon_vst4_v: + case NEON::BI__builtin_neon_vst4q_v: + case NEON::BI__builtin_neon_vst4_lane_v: + case NEON::BI__builtin_neon_vst4q_lane_v: + case NEON::BI__builtin_neon_vtrn_v: + case NEON::BI__builtin_neon_vtrnq_v: + case NEON::BI__builtin_neon_vuzp_v: + case NEON::BI__builtin_neon_vuzpq_v: + case NEON::BI__builtin_neon_vzip_v: + case NEON::BI__builtin_neon_vzipq_v: + case NEON::BI__builtin_neon_vqtbl1q_v: + case NEON::BI__builtin_neon_vqtbl2q_v: + case NEON::BI__builtin_neon_vqtbl3q_v: + case NEON::BI__builtin_neon_vqtbl4q_v: + case NEON::BI__builtin_neon_vqtbx1q_v: + case NEON::BI__builtin_neon_vqtbx2q_v: + case NEON::BI__builtin_neon_vqtbx3q_v: + case NEON::BI__builtin_neon_vqtbx4q_v: + case NEON::BI__builtin_neon_vsqadd_v: + case NEON::BI__builtin_neon_vsqaddq_v: + case NEON::BI__builtin_neon_vuqadd_v: + case NEON::BI__builtin_neon_vuqaddq_v: + case NEON::BI__builtin_neon_vluti2_laneq_mf8: + case NEON::BI__builtin_neon_vluti2_laneq_bf16: + case NEON::BI__builtin_neon_vluti2_laneq_f16: + case NEON::BI__builtin_neon_vluti2_laneq_p16: + case NEON::BI__builtin_neon_vluti2_laneq_p8: + case NEON::BI__builtin_neon_vluti2_laneq_s16: + case NEON::BI__builtin_neon_vluti2_laneq_s8: + case NEON::BI__builtin_neon_vluti2_laneq_u16: + case NEON::BI__builtin_neon_vluti2_laneq_u8: + case NEON::BI__builtin_neon_vluti2q_laneq_mf8: + case NEON::BI__builtin_neon_vluti2q_laneq_bf16: + case NEON::BI__builtin_neon_vluti2q_laneq_f16: + case NEON::BI__builtin_neon_vluti2q_laneq_p16: + case NEON::BI__builtin_neon_vluti2q_laneq_p8: + case NEON::BI__builtin_neon_vluti2q_laneq_s16: + case NEON::BI__builtin_neon_vluti2q_laneq_s8: + case NEON::BI__builtin_neon_vluti2q_laneq_u16: + case NEON::BI__builtin_neon_vluti2q_laneq_u8: + case NEON::BI__builtin_neon_vluti2_lane_mf8: + case NEON::BI__builtin_neon_vluti2_lane_bf16: + case NEON::BI__builtin_neon_vluti2_lane_f16: + case NEON::BI__builtin_neon_vluti2_lane_p16: + case NEON::BI__builtin_neon_vluti2_lane_p8: + case NEON::BI__builtin_neon_vluti2_lane_s16: + case NEON::BI__builtin_neon_vluti2_lane_s8: + case NEON::BI__builtin_neon_vluti2_lane_u16: + case NEON::BI__builtin_neon_vluti2_lane_u8: + case NEON::BI__builtin_neon_vluti2q_lane_mf8: + case NEON::BI__builtin_neon_vluti2q_lane_bf16: + case NEON::BI__builtin_neon_vluti2q_lane_f16: + case NEON::BI__builtin_neon_vluti2q_lane_p16: + case NEON::BI__builtin_neon_vluti2q_lane_p8: + case NEON::BI__builtin_neon_vluti2q_lane_s16: + case NEON::BI__builtin_neon_vluti2q_lane_s8: + case NEON::BI__builtin_neon_vluti2q_lane_u16: + case NEON::BI__builtin_neon_vluti2q_lane_u8: + case NEON::BI__builtin_neon_vluti4q_lane_mf8: + case NEON::BI__builtin_neon_vluti4q_lane_p8: + case NEON::BI__builtin_neon_vluti4q_lane_s8: + case NEON::BI__builtin_neon_vluti4q_lane_u8: + case NEON::BI__builtin_neon_vluti4q_laneq_mf8: + case NEON::BI__builtin_neon_vluti4q_laneq_p8: + case NEON::BI__builtin_neon_vluti4q_laneq_s8: + case NEON::BI__builtin_neon_vluti4q_laneq_u8: + case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_f16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_p16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_s16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2: + case 
NEON::BI__builtin_neon_vluti4q_laneq_p16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: + case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: + case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: + case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: + case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: + case NEON::BI__builtin_neon_vdot_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdot_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm: + case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vamin_f16: + case NEON::BI__builtin_neon_vaminq_f16: + case NEON::BI__builtin_neon_vamin_f32: + case NEON::BI__builtin_neon_vaminq_f32: + case NEON::BI__builtin_neon_vaminq_f64: + case NEON::BI__builtin_neon_vamax_f16: + case NEON::BI__builtin_neon_vamaxq_f16: + case NEON::BI__builtin_neon_vamax_f32: + case NEON::BI__builtin_neon_vamaxq_f32: + case NEON::BI__builtin_neon_vamaxq_f64: + case NEON::BI__builtin_neon_vscale_f16: + case NEON::BI__builtin_neon_vscaleq_f16: + case NEON::BI__builtin_neon_vscale_f32: + case NEON::BI__builtin_neon_vscaleq_f32: + case NEON::BI__builtin_neon_vscaleq_f64: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } + + // Unreachable: All cases in the switch above 
return. +} diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 0198a9d..62836ce 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -11,28 +11,388 @@ // //===----------------------------------------------------------------------===// +#include "CIRGenBuilder.h" #include "CIRGenFunction.h" #include "CIRGenModule.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/ValueRange.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" -#include "llvm/IR/IntrinsicsX86.h" using namespace clang; using namespace clang::CIRGen; +template <typename... Operands> +static mlir::Value emitIntrinsicCallOp(CIRGenBuilderTy &builder, + mlir::Location loc, const StringRef str, + const mlir::Type &resTy, + Operands &&...op) { + return cir::LLVMIntrinsicCallOp::create(builder, loc, + builder.getStringAttr(str), resTy, + std::forward<Operands>(op)...) + .getResult(); +} + +// OG has unordered comparison as a form of optimization in addition to +// ordered comparison, while CIR doesn't. +// +// This means that we can't encode the comparison code of UGT (unordered +// greater than), at least not at the CIR level. +// +// The boolean shouldInvert compensates for this. +// For example: to get to the comparison code UGT, we pass in +// emitVectorFCmp (OLE, shouldInvert = true) since OLE is the inverse of UGT. + +// There are several ways to support this otherwise: +// - register extra CmpOpKind for unordered comparison types and build the +// translation code for +// to go from CIR -> LLVM dialect. Notice we get this naturally with +// shouldInvert, benefiting from existing infrastructure, albeit having to +// generate an extra `not` at CIR). +// - Just add extra comparison code to a new VecCmpOpKind instead of +// cluttering CmpOpKind. +// - Add a boolean in VecCmpOp to indicate if it's doing unordered or ordered +// comparison +// - Just emit the intrinsics call instead of calling this helper, see how the +// LLVM lowering handles this. +static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder, + llvm::SmallVector<mlir::Value> &ops, + mlir::Location loc, cir::CmpOpKind pred, + bool shouldInvert) { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + // TODO(cir): Add isSignaling boolean once emitConstrainedFPCall implemented + assert(!cir::MissingFeatures::emitConstrainedFPCall()); + mlir::Value cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]); + mlir::Value bitCast = builder.createBitcast( + shouldInvert ? builder.createNot(cmp) : cmp, ops[0].getType()); + return bitCast; +} + +static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc, + mlir::Value mask, unsigned numElems) { + auto maskTy = cir::VectorType::get( + builder.getUIntNTy(1), cast<cir::IntType>(mask.getType()).getWidth()); + mlir::Value maskVec = builder.createBitcast(mask, maskTy); + + // If we have less than 8 elements, then the starting mask was an i8 and + // we need to extract down to the right number of elements. + if (numElems < 8) { + SmallVector<mlir::Attribute, 4> indices; + mlir::Type i32Ty = builder.getSInt32Ty(); + for (auto i : llvm::seq<unsigned>(0, numElems)) + indices.push_back(cir::IntAttr::get(i32Ty, i)); + + maskVec = builder.createVecShuffle(loc, maskVec, maskVec, indices); + } + return maskVec; +} + +// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins. 
+// +// The vector is split into lanes of 8 word elements (16 bits). The lower or +// upper half of each lane, controlled by `isLow`, is shuffled in the following +// way: The immediate is truncated to 8 bits, separated into 4 2-bit fields. The +// i-th field's value represents the resulting index of the i-th element in the +// half lane after shuffling. The other half of the lane remains unchanged. +static cir::VecShuffleOp emitPshufWord(CIRGenBuilderTy &builder, + const mlir::Value vec, + const mlir::Value immediate, + const mlir::Location loc, + const bool isLow) { + uint32_t imm = CIRGenFunction::getZExtIntValueFromConstOp(immediate); + + auto vecTy = cast<cir::VectorType>(vec.getType()); + unsigned numElts = vecTy.getSize(); + + unsigned firstHalfStart = isLow ? 0 : 4; + unsigned secondHalfStart = 4 - firstHalfStart; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[32]; + for (unsigned l = 0; l != numElts; l += 8) { + for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) { + indices[l + i] = l + (imm & 3) + firstHalfStart; + imm >>= 2; + } + for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i) + indices[l + i] = l + i; + } + + return builder.createVecShuffle(loc, vec, ArrayRef(indices, numElts)); +} + +// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins. +// The shuffle mask is written to outIndices. +static void +computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec, + uint32_t imm, const bool isShufP, + llvm::SmallVectorImpl<int64_t> &outIndices) { + auto vecTy = cast<cir::VectorType>(vec.getType()); + unsigned numElts = vecTy.getSize(); + unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128; + unsigned numLaneElts = numElts / numLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
+ imm = (imm & 0xff) * 0x01010101; + + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + uint32_t idx = imm % numLaneElts; + imm /= numLaneElts; + if (isShufP && i >= (numLaneElts / 2)) + idx += numElts; + outIndices[l + i] = l + idx; + } + } + + outIndices.resize(numElts); +} +static mlir::Value emitX86CompressExpand(CIRGenBuilderTy &builder, + mlir::Location loc, mlir::Value source, + mlir::Value mask, + mlir::Value inputVector, + const std::string &id) { + auto resultTy = cast<cir::VectorType>(mask.getType()); + mlir::Value maskValue = getMaskVecValue( + builder, loc, inputVector, cast<cir::VectorType>(resultTy).getSize()); + return emitIntrinsicCallOp(builder, loc, id, resultTy, + mlir::ValueRange{source, mask, maskValue}); +} + +static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder, + mlir::Location loc, + const std::string &intrinsicName, + SmallVectorImpl<mlir::Value> &ops) { + + auto intTy = cast<cir::IntType>(ops[0].getType()); + unsigned numElts = intTy.getWidth(); + mlir::Value lhsVec = getMaskVecValue(builder, loc, ops[0], numElts); + mlir::Value rhsVec = getMaskVecValue(builder, loc, ops[1], numElts); + mlir::Type vecTy = lhsVec.getType(); + mlir::Value resVec = emitIntrinsicCallOp(builder, loc, intrinsicName, vecTy, + mlir::ValueRange{lhsVec, rhsVec}); + return builder.createBitcast(resVec, ops[0].getType()); +} + +static mlir::Value emitX86MaskUnpack(CIRGenBuilderTy &builder, + mlir::Location loc, + const std::string &intrinsicName, + SmallVectorImpl<mlir::Value> &ops) { + unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth(); + + // Convert both operands to mask vectors. + mlir::Value lhs = getMaskVecValue(builder, loc, ops[0], numElems); + mlir::Value rhs = getMaskVecValue(builder, loc, ops[1], numElems); + + mlir::Type i32Ty = builder.getSInt32Ty(); + + // Create indices for extracting the first half of each vector. + SmallVector<mlir::Attribute, 32> halfIndices; + for (auto i : llvm::seq<unsigned>(0, numElems / 2)) + halfIndices.push_back(cir::IntAttr::get(i32Ty, i)); + + // Extract first half of each vector. This gives better codegen than + // doing it in a single shuffle. + mlir::Value lhsHalf = builder.createVecShuffle(loc, lhs, lhs, halfIndices); + mlir::Value rhsHalf = builder.createVecShuffle(loc, rhs, rhs, halfIndices); + + // Create indices for concatenating the vectors. + // NOTE: Operands are swapped to match the intrinsic definition. + // After the half extraction, both vectors have numElems/2 elements. + // In createVecShuffle(rhsHalf, lhsHalf, indices), indices [0..numElems/2-1] + // select from rhsHalf, and indices [numElems/2..numElems-1] select from + // lhsHalf. + SmallVector<mlir::Attribute, 64> concatIndices; + for (auto i : llvm::seq<unsigned>(0, numElems)) + concatIndices.push_back(cir::IntAttr::get(i32Ty, i)); + + // Concat the vectors (RHS first, then LHS). 
+ mlir::Value res = + builder.createVecShuffle(loc, rhsHalf, lhsHalf, concatIndices); + return builder.createBitcast(res, ops[0].getType()); +} + +static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder, + mlir::Location loc, + cir::BinOpKind binOpKind, + SmallVectorImpl<mlir::Value> &ops, + bool invertLHS = false) { + unsigned numElts = cast<cir::IntType>(ops[0].getType()).getWidth(); + mlir::Value lhs = getMaskVecValue(builder, loc, ops[0], numElts); + mlir::Value rhs = getMaskVecValue(builder, loc, ops[1], numElts); + + if (invertLHS) + lhs = builder.createNot(lhs); + return builder.createBitcast(builder.createBinop(loc, lhs, binOpKind, rhs), + ops[0].getType()); +} + +static mlir::Value emitX86MaskTest(CIRGenBuilderTy &builder, mlir::Location loc, + const std::string &intrinsicName, + SmallVectorImpl<mlir::Value> &ops) { + auto intTy = cast<cir::IntType>(ops[0].getType()); + unsigned numElts = intTy.getWidth(); + mlir::Value lhsVec = getMaskVecValue(builder, loc, ops[0], numElts); + mlir::Value rhsVec = getMaskVecValue(builder, loc, ops[1], numElts); + mlir::Type resTy = builder.getSInt32Ty(); + return emitIntrinsicCallOp(builder, loc, intrinsicName, resTy, + mlir::ValueRange{lhsVec, rhsVec}); +} + +static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc, + mlir::Value vec, mlir::Value value, + mlir::Value indexOp) { + unsigned numElts = cast<cir::VectorType>(vec.getType()).getSize(); + + uint64_t index = + indexOp.getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue(); + + index &= numElts - 1; + + cir::ConstantOp indexVal = builder.getUInt64(index, loc); + + return cir::VecInsertOp::create(builder, loc, vec, value, indexVal); +} + +static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder, + mlir::Location location, mlir::Value &op0, + mlir::Value &op1, mlir::Value &amt, + bool isRight) { + mlir::Type op0Ty = op0.getType(); + + // Amount may be scalar immediate, in which case create a splat vector. + // Funnel shift amounts are treated as modulo and types are all power-of-2 + // so we only care about the lowest log2 bits anyway. + if (amt.getType() != op0Ty) { + auto vecTy = mlir::cast<cir::VectorType>(op0Ty); + uint64_t numElems = vecTy.getSize(); + + auto amtTy = mlir::cast<cir::IntType>(amt.getType()); + auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType()); + + // If signed, cast to the same width but unsigned first to + // ensure zero-extension when casting to a bigger unsigned `vecElemTy`. + if (amtTy.isSigned()) { + cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth()); + amt = builder.createIntCast(amt, unsignedAmtTy); + } + cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth()); + amt = builder.createIntCast(amt, unsignedVecElemType); + amt = cir::VecSplatOp::create( + builder, location, cir::VectorType::get(unsignedVecElemType, numElems), + amt); + } + + const StringRef intrinsicName = isRight ?
"fshr" : "fshl"; + return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty, + mlir::ValueRange{op0, op1, amt}); +} + +static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc, + bool isSigned, + SmallVectorImpl<mlir::Value> &ops, + unsigned opTypePrimitiveSizeInBits) { + mlir::Type ty = cir::VectorType::get(builder.getSInt64Ty(), + opTypePrimitiveSizeInBits / 64); + mlir::Value lhs = builder.createBitcast(loc, ops[0], ty); + mlir::Value rhs = builder.createBitcast(loc, ops[1], ty); + if (isSigned) { + cir::ConstantOp shiftAmt = + builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32)); + cir::VecSplatOp shiftSplatVecOp = + cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult()); + mlir::Value shiftSplatValue = shiftSplatVecOp.getResult(); + // In CIR, right-shift operations are automatically lowered to either an + // arithmetic or logical shift depending on the operand type. The purpose + // of the shifts here is to propagate the sign bit of the 32-bit input + // into the upper bits of each vector lane. + lhs = builder.createShift(loc, lhs, shiftSplatValue, true); + lhs = builder.createShift(loc, lhs, shiftSplatValue, false); + rhs = builder.createShift(loc, rhs, shiftSplatValue, true); + rhs = builder.createShift(loc, rhs, shiftSplatValue, false); + } else { + cir::ConstantOp maskScalar = builder.getConstant( + loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff)); + cir::VecSplatOp mask = + cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult()); + // Clear the upper bits + lhs = builder.createAnd(loc, lhs, mask); + rhs = builder.createAnd(loc, rhs, mask); + } + return builder.createMul(loc, lhs, rhs); +} + +static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc, + llvm::SmallVector<mlir::Value> ops, + bool isSigned) { + mlir::Value op0 = ops[0]; + mlir::Value op1 = ops[1]; + + cir::VectorType ty = cast<cir::VectorType>(op0.getType()); + cir::IntType elementTy = cast<cir::IntType>(ty.getElementType()); + + uint64_t imm = CIRGenFunction::getZExtIntValueFromConstOp(ops[2]) & 0x7; + + cir::CmpOpKind pred; + switch (imm) { + case 0x0: + pred = cir::CmpOpKind::lt; + break; + case 0x1: + pred = cir::CmpOpKind::le; + break; + case 0x2: + pred = cir::CmpOpKind::gt; + break; + case 0x3: + pred = cir::CmpOpKind::ge; + break; + case 0x4: + pred = cir::CmpOpKind::eq; + break; + case 0x5: + pred = cir::CmpOpKind::ne; + break; + case 0x6: + return builder.getNullValue(ty, loc); // FALSE + case 0x7: { + llvm::APInt allOnes = llvm::APInt::getAllOnes(elementTy.getWidth()); + return cir::VecSplatOp::create( + builder, loc, ty, + builder.getConstAPInt(loc, elementTy, allOnes)); // TRUE + } + default: + llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate"); + } + + if ((!isSigned && elementTy.isSigned()) || + (isSigned && elementTy.isUnsigned())) { + elementTy = elementTy.isSigned() ? 
builder.getUIntNTy(elementTy.getWidth()) + : builder.getSIntNTy(elementTy.getWidth()); + ty = cir::VectorType::get(elementTy, ty.getSize()); + op0 = builder.createBitcast(op0, ty); + op1 = builder.createBitcast(op1, ty); + } + + return builder.createVecCompare(loc, pred, op0, op1); +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, - const CallExpr *e) { + const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_supports) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_init) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init"); return {}; } @@ -43,26 +403,74 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, // Find out if any arguments are required to be integer constant expressions. assert(!cir::MissingFeatures::handleBuiltinICEArguments()); + // The operands of the builtin call + llvm::SmallVector<mlir::Value> ops; + + // `ICEArguments` is a bitmap indicating whether the argument at the i-th bit + // is required to be a constant integer expression. + unsigned iceArguments = 0; + ASTContext::GetBuiltinTypeError error; + getContext().GetBuiltinType(builtinID, error, &iceArguments); + assert(error == ASTContext::GE_None && "Error while getting builtin type."); + + for (auto [idx, arg] : llvm::enumerate(expr->arguments())) + ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg)); + + CIRGenBuilderTy &builder = getBuilder(); + mlir::Type voidTy = builder.getVoidTy(); + switch (builtinID) { default: return {}; - case X86::BI_mm_prefetch: case X86::BI_mm_clflush: + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.sse2.clflush", voidTy, ops[0]); case X86::BI_mm_lfence: + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.sse2.lfence", voidTy); case X86::BI_mm_pause: + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.sse2.pause", voidTy); case X86::BI_mm_mfence: + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.sse2.mfence", voidTy); case X86::BI_mm_sfence: + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.sse.sfence", voidTy); + case X86::BI_mm_prefetch: case X86::BI__rdtsc: - case X86::BI__builtin_ia32_rdtscp: + case X86::BI__builtin_ia32_rdtscp: { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } case X86::BI__builtin_ia32_lzcnt_u16: case X86::BI__builtin_ia32_lzcnt_u32: - case X86::BI__builtin_ia32_lzcnt_u64: + case X86::BI__builtin_ia32_lzcnt_u64: { + mlir::Location loc = getLoc(expr->getExprLoc()); + mlir::Value isZeroPoison = builder.getFalse(loc); + return emitIntrinsicCallOp(builder, loc, "ctlz", ops[0].getType(), + mlir::ValueRange{ops[0], isZeroPoison}); + } case X86::BI__builtin_ia32_tzcnt_u16: case X86::BI__builtin_ia32_tzcnt_u32: - case X86::BI__builtin_ia32_tzcnt_u64: + case X86::BI__builtin_ia32_tzcnt_u64: { + mlir::Location loc = getLoc(expr->getExprLoc()); + mlir::Value isZeroPoison = builder.getFalse(loc); + return emitIntrinsicCallOp(builder, loc, "cttz", ops[0].getType(), + mlir::ValueRange{ops[0], isZeroPoison}); + } case 
X86::BI__builtin_ia32_undef128: case X86::BI__builtin_ia32_undef256: case X86::BI__builtin_ia32_undef512: + // The x86 definition of "undef" is not the same as the LLVM definition + // (PR32176). We leave optimizing away an unnecessary zero constant to the + // IR optimizer and backend. + // TODO: If we had a "freeze" IR instruction to generate a fixed undef + // value, we should use that here instead of a zero. + return builder.getNullValue(convertType(expr->getType()), + getLoc(expr->getExprLoc())); case X86::BI__builtin_ia32_vec_ext_v4hi: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v8hi: @@ -72,7 +480,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vec_ext_v32qi: case X86::BI__builtin_ia32_vec_ext_v16hi: case X86::BI__builtin_ia32_vec_ext_v8si: - case X86::BI__builtin_ia32_vec_ext_v4di: + case X86::BI__builtin_ia32_vec_ext_v4di: { + unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + + uint64_t index = getZExtIntValueFromConstOp(ops[1]); + index &= numElts - 1; + + cir::ConstantOp indexVal = + builder.getUInt64(index, getLoc(expr->getExprLoc())); + + // These builtins exist so we can ensure the index is an ICE and in range. + // Otherwise we could just do this in the header file. + return cir::VecExtractOp::create(builder, getLoc(expr->getExprLoc()), + ops[0], indexVal); + } case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v8hi: @@ -81,11 +502,35 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vec_set_v32qi: case X86::BI__builtin_ia32_vec_set_v16hi: case X86::BI__builtin_ia32_vec_set_v8si: - case X86::BI__builtin_ia32_vec_set_v4di: + case X86::BI__builtin_ia32_vec_set_v4di: { + return emitVecInsert(builder, getLoc(expr->getExprLoc()), ops[0], ops[1], + ops[2]); + } + case X86::BI__builtin_ia32_kunpckhi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackb", ops); + case X86::BI__builtin_ia32_kunpcksi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackw", ops); + case X86::BI__builtin_ia32_kunpckdi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackd", ops); case X86::BI_mm_setcsr: - case X86::BI__builtin_ia32_ldmxcsr: + case X86::BI__builtin_ia32_ldmxcsr: { + mlir::Location loc = getLoc(expr->getExprLoc()); + Address tmp = createMemTemp(expr->getArg(0)->getType(), loc); + builder.createStore(loc, ops[0], tmp); + return emitIntrinsicCallOp(builder, loc, "x86.sse.ldmxcsr", + builder.getVoidTy(), tmp.getPointer()); + } case X86::BI_mm_getcsr: - case X86::BI__builtin_ia32_stmxcsr: + case X86::BI__builtin_ia32_stmxcsr: { + mlir::Location loc = getLoc(expr->getExprLoc()); + Address tmp = createMemTemp(expr->getType(), loc); + emitIntrinsicCallOp(builder, loc, "x86.sse.stmxcsr", builder.getVoidTy(), + tmp.getPointer()); + return builder.createLoad(loc, tmp); + } case X86::BI__builtin_ia32_xsave: case X86::BI__builtin_ia32_xsave64: case X86::BI__builtin_ia32_xrstor: @@ -99,9 +544,78 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_xsaves: case X86::BI__builtin_ia32_xsaves64: case X86::BI__builtin_ia32_xsetbv: - case X86::BI_xsetbv: + case X86::BI_xsetbv: { + mlir::Location loc = getLoc(expr->getExprLoc()); + StringRef intrinsicName; + switch (builtinID) { + default: + llvm_unreachable("Unexpected builtin"); + case 
X86::BI__builtin_ia32_xsave: + intrinsicName = "x86.xsave"; + break; + case X86::BI__builtin_ia32_xsave64: + intrinsicName = "x86.xsave64"; + break; + case X86::BI__builtin_ia32_xrstor: + intrinsicName = "x86.xrstor"; + break; + case X86::BI__builtin_ia32_xrstor64: + intrinsicName = "x86.xrstor64"; + break; + case X86::BI__builtin_ia32_xsaveopt: + intrinsicName = "x86.xsaveopt"; + break; + case X86::BI__builtin_ia32_xsaveopt64: + intrinsicName = "x86.xsaveopt64"; + break; + case X86::BI__builtin_ia32_xrstors: + intrinsicName = "x86.xrstors"; + break; + case X86::BI__builtin_ia32_xrstors64: + intrinsicName = "x86.xrstors64"; + break; + case X86::BI__builtin_ia32_xsavec: + intrinsicName = "x86.xsavec"; + break; + case X86::BI__builtin_ia32_xsavec64: + intrinsicName = "x86.xsavec64"; + break; + case X86::BI__builtin_ia32_xsaves: + intrinsicName = "x86.xsaves"; + break; + case X86::BI__builtin_ia32_xsaves64: + intrinsicName = "x86.xsaves64"; + break; + case X86::BI__builtin_ia32_xsetbv: + case X86::BI_xsetbv: + intrinsicName = "x86.xsetbv"; + break; + } + + // The xsave family of instructions take a 64-bit mask that specifies + // which processor state components to save/restore. The hardware expects + // this mask split into two 32-bit registers: EDX (high 32 bits) and + // EAX (low 32 bits). + mlir::Type i32Ty = builder.getSInt32Ty(); + + // Mhi = (uint32_t)(ops[1] >> 32) - extract high 32 bits via right shift + cir::ConstantOp shift32 = builder.getSInt64(32, loc); + mlir::Value mhi = builder.createShift(loc, ops[1], shift32.getResult(), + /*isShiftLeft=*/false); + mhi = builder.createIntCast(mhi, i32Ty); + + // Mlo = (uint32_t)ops[1] - extract low 32 bits by truncation + mlir::Value mlo = builder.createIntCast(ops[1], i32Ty); + + return emitIntrinsicCallOp(builder, loc, intrinsicName, voidTy, + mlir::ValueRange{ops[0], mhi, mlo}); + } case X86::BI__builtin_ia32_xgetbv: case X86::BI_xgetbv: + // xgetbv reads the extended control register specified by ops[0] (ECX) + // and returns the 64-bit value + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.xgetbv", builder.getUInt64Ty(), ops[0]); case X86::BI__builtin_ia32_storedqudi128_mask: case X86::BI__builtin_ia32_storedqusi128_mask: case X86::BI__builtin_ia32_storedquhi128_mask: @@ -160,13 +674,9 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vcvtuw2ph512_mask: case X86::BI__builtin_ia32_vcvtudq2ph512_mask: case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: - case X86::BI__builtin_ia32_vfmaddss3: - case X86::BI__builtin_ia32_vfmaddsd3: case X86::BI__builtin_ia32_vfmaddsh3_mask: case X86::BI__builtin_ia32_vfmaddss3_mask: case X86::BI__builtin_ia32_vfmaddsd3_mask: - case X86::BI__builtin_ia32_vfmaddss: - case X86::BI__builtin_ia32_vfmaddsd: case X86::BI__builtin_ia32_vfmaddsh3_maskz: case X86::BI__builtin_ia32_vfmaddss3_maskz: case X86::BI__builtin_ia32_vfmaddsd3_maskz: @@ -282,6 +792,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compressstoreqi128_mask: case X86::BI__builtin_ia32_compressstoreqi256_mask: case X86::BI__builtin_ia32_compressstoreqi512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_expanddf128_mask: case X86::BI__builtin_ia32_expanddf256_mask: case X86::BI__builtin_ia32_expanddf512_mask: @@ -299,7 +813,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, 
case X86::BI__builtin_ia32_expandhi512_mask: case X86::BI__builtin_ia32_expandqi128_mask: case X86::BI__builtin_ia32_expandqi256_mask: - case X86::BI__builtin_ia32_expandqi512_mask: + case X86::BI__builtin_ia32_expandqi512_mask: { + mlir::Location loc = getLoc(expr->getExprLoc()); + return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2], + "x86.avx512.mask.expand"); + } case X86::BI__builtin_ia32_compressdf128_mask: case X86::BI__builtin_ia32_compressdf256_mask: case X86::BI__builtin_ia32_compressdf512_mask: @@ -317,7 +835,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compresshi512_mask: case X86::BI__builtin_ia32_compressqi128_mask: case X86::BI__builtin_ia32_compressqi256_mask: - case X86::BI__builtin_ia32_compressqi512_mask: + case X86::BI__builtin_ia32_compressqi512_mask: { + mlir::Location loc = getLoc(expr->getExprLoc()); + return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2], + "x86.avx512.mask.compress"); + } case X86::BI__builtin_ia32_gather3div2df: case X86::BI__builtin_ia32_gather3div2di: case X86::BI__builtin_ia32_gather3div4df: @@ -341,7 +863,93 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_gathersiv8di: case X86::BI__builtin_ia32_gathersiv16si: case X86::BI__builtin_ia32_gatherdiv8di: - case X86::BI__builtin_ia32_gatherdiv16si: + case X86::BI__builtin_ia32_gatherdiv16si: { + StringRef intrinsicName; + switch (builtinID) { + default: + llvm_unreachable("Unexpected builtin"); + case X86::BI__builtin_ia32_gather3div2df: + intrinsicName = "x86.avx512.mask.gather3div2.df"; + break; + case X86::BI__builtin_ia32_gather3div2di: + intrinsicName = "x86.avx512.mask.gather3div2.di"; + break; + case X86::BI__builtin_ia32_gather3div4df: + intrinsicName = "x86.avx512.mask.gather3div4.df"; + break; + case X86::BI__builtin_ia32_gather3div4di: + intrinsicName = "x86.avx512.mask.gather3div4.di"; + break; + case X86::BI__builtin_ia32_gather3div4sf: + intrinsicName = "x86.avx512.mask.gather3div4.sf"; + break; + case X86::BI__builtin_ia32_gather3div4si: + intrinsicName = "x86.avx512.mask.gather3div4.si"; + break; + case X86::BI__builtin_ia32_gather3div8sf: + intrinsicName = "x86.avx512.mask.gather3div8.sf"; + break; + case X86::BI__builtin_ia32_gather3div8si: + intrinsicName = "x86.avx512.mask.gather3div8.si"; + break; + case X86::BI__builtin_ia32_gather3siv2df: + intrinsicName = "x86.avx512.mask.gather3siv2.df"; + break; + case X86::BI__builtin_ia32_gather3siv2di: + intrinsicName = "x86.avx512.mask.gather3siv2.di"; + break; + case X86::BI__builtin_ia32_gather3siv4df: + intrinsicName = "x86.avx512.mask.gather3siv4.df"; + break; + case X86::BI__builtin_ia32_gather3siv4di: + intrinsicName = "x86.avx512.mask.gather3siv4.di"; + break; + case X86::BI__builtin_ia32_gather3siv4sf: + intrinsicName = "x86.avx512.mask.gather3siv4.sf"; + break; + case X86::BI__builtin_ia32_gather3siv4si: + intrinsicName = "x86.avx512.mask.gather3siv4.si"; + break; + case X86::BI__builtin_ia32_gather3siv8sf: + intrinsicName = "x86.avx512.mask.gather3siv8.sf"; + break; + case X86::BI__builtin_ia32_gather3siv8si: + intrinsicName = "x86.avx512.mask.gather3siv8.si"; + break; + case X86::BI__builtin_ia32_gathersiv8df: + intrinsicName = "x86.avx512.mask.gather.dpd.512"; + break; + case X86::BI__builtin_ia32_gathersiv16sf: + intrinsicName = "x86.avx512.mask.gather.dps.512"; + break; + case X86::BI__builtin_ia32_gatherdiv8df: + intrinsicName = "x86.avx512.mask.gather.qpd.512"; + break; + case 
X86::BI__builtin_ia32_gatherdiv16sf: + intrinsicName = "x86.avx512.mask.gather.qps.512"; + break; + case X86::BI__builtin_ia32_gathersiv8di: + intrinsicName = "x86.avx512.mask.gather.dpq.512"; + break; + case X86::BI__builtin_ia32_gathersiv16si: + intrinsicName = "x86.avx512.mask.gather.dpi.512"; + break; + case X86::BI__builtin_ia32_gatherdiv8di: + intrinsicName = "x86.avx512.mask.gather.qpq.512"; + break; + case X86::BI__builtin_ia32_gatherdiv16si: + intrinsicName = "x86.avx512.mask.gather.qpi.512"; + break; + } + + mlir::Location loc = getLoc(expr->getExprLoc()); + unsigned minElts = + std::min(cast<cir::VectorType>(ops[0].getType()).getSize(), + cast<cir::VectorType>(ops[2].getType()).getSize()); + ops[3] = getMaskVecValue(builder, loc, ops[3], minElts); + return emitIntrinsicCallOp(builder, loc, intrinsicName, + convertType(expr->getType()), ops); + } case X86::BI__builtin_ia32_scattersiv8df: case X86::BI__builtin_ia32_scattersiv16sf: case X86::BI__builtin_ia32_scatterdiv8df: @@ -365,7 +973,94 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_scattersiv4sf: case X86::BI__builtin_ia32_scattersiv4si: case X86::BI__builtin_ia32_scattersiv8sf: - case X86::BI__builtin_ia32_scattersiv8si: + case X86::BI__builtin_ia32_scattersiv8si: { + llvm::StringRef intrinsicName; + switch (builtinID) { + default: + llvm_unreachable("Unexpected builtin"); + case X86::BI__builtin_ia32_scattersiv8df: + intrinsicName = "x86.avx512.mask.scatter.dpd.512"; + break; + case X86::BI__builtin_ia32_scattersiv16sf: + intrinsicName = "x86.avx512.mask.scatter.dps.512"; + break; + case X86::BI__builtin_ia32_scatterdiv8df: + intrinsicName = "x86.avx512.mask.scatter.qpd.512"; + break; + case X86::BI__builtin_ia32_scatterdiv16sf: + intrinsicName = "x86.avx512.mask.scatter.qps.512"; + break; + case X86::BI__builtin_ia32_scattersiv8di: + intrinsicName = "x86.avx512.mask.scatter.dpq.512"; + break; + case X86::BI__builtin_ia32_scattersiv16si: + intrinsicName = "x86.avx512.mask.scatter.dpi.512"; + break; + case X86::BI__builtin_ia32_scatterdiv8di: + intrinsicName = "x86.avx512.mask.scatter.qpq.512"; + break; + case X86::BI__builtin_ia32_scatterdiv16si: + intrinsicName = "x86.avx512.mask.scatter.qpi.512"; + break; + case X86::BI__builtin_ia32_scatterdiv2df: + intrinsicName = "x86.avx512.mask.scatterdiv2.df"; + break; + case X86::BI__builtin_ia32_scatterdiv2di: + intrinsicName = "x86.avx512.mask.scatterdiv2.di"; + break; + case X86::BI__builtin_ia32_scatterdiv4df: + intrinsicName = "x86.avx512.mask.scatterdiv4.df"; + break; + case X86::BI__builtin_ia32_scatterdiv4di: + intrinsicName = "x86.avx512.mask.scatterdiv4.di"; + break; + case X86::BI__builtin_ia32_scatterdiv4sf: + intrinsicName = "x86.avx512.mask.scatterdiv4.sf"; + break; + case X86::BI__builtin_ia32_scatterdiv4si: + intrinsicName = "x86.avx512.mask.scatterdiv4.si"; + break; + case X86::BI__builtin_ia32_scatterdiv8sf: + intrinsicName = "x86.avx512.mask.scatterdiv8.sf"; + break; + case X86::BI__builtin_ia32_scatterdiv8si: + intrinsicName = "x86.avx512.mask.scatterdiv8.si"; + break; + case X86::BI__builtin_ia32_scattersiv2df: + intrinsicName = "x86.avx512.mask.scattersiv2.df"; + break; + case X86::BI__builtin_ia32_scattersiv2di: + intrinsicName = "x86.avx512.mask.scattersiv2.di"; + break; + case X86::BI__builtin_ia32_scattersiv4df: + intrinsicName = "x86.avx512.mask.scattersiv4.df"; + break; + case X86::BI__builtin_ia32_scattersiv4di: + intrinsicName = "x86.avx512.mask.scattersiv4.di"; + break; + case 
X86::BI__builtin_ia32_scattersiv4sf: + intrinsicName = "x86.avx512.mask.scattersiv4.sf"; + break; + case X86::BI__builtin_ia32_scattersiv4si: + intrinsicName = "x86.avx512.mask.scattersiv4.si"; + break; + case X86::BI__builtin_ia32_scattersiv8sf: + intrinsicName = "x86.avx512.mask.scattersiv8.sf"; + break; + case X86::BI__builtin_ia32_scattersiv8si: + intrinsicName = "x86.avx512.mask.scattersiv8.si"; + break; + } + + mlir::Location loc = getLoc(expr->getExprLoc()); + unsigned minElts = + std::min(cast<cir::VectorType>(ops[2].getType()).getSize(), + cast<cir::VectorType>(ops[3].getType()).getSize()); + ops[1] = getMaskVecValue(builder, loc, ops[1], minElts); + + return emitIntrinsicCallOp(builder, loc, intrinsicName, + convertType(expr->getType()), ops); + } case X86::BI__builtin_ia32_vextractf128_pd256: case X86::BI__builtin_ia32_vextractf128_ps256: case X86::BI__builtin_ia32_vextractf128_si256: @@ -408,12 +1103,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_pblendw256: case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: + return emitPshufWord(builder, ops[0], ops[1], getLoc(expr->getExprLoc()), + true); case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: + return emitPshufWord(builder, ops[0], ops[1], getLoc(expr->getExprLoc()), + false); case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: @@ -422,13 +1125,28 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vpermilpd256: case X86::BI__builtin_ia32_vpermilps256: case X86::BI__builtin_ia32_vpermilpd512: - case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_vpermilps512: { + const uint32_t imm = getSExtIntValueFromConstOp(ops[1]); + + llvm::SmallVector<int64_t, 16> mask(16); + computeFullLaneShuffleMask(*this, ops[0], imm, false, mask); + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask); + } case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: case X86::BI__builtin_ia32_shufpd512: case X86::BI__builtin_ia32_shufps: case X86::BI__builtin_ia32_shufps256: - case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_shufps512: { + const uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + + llvm::SmallVector<int64_t, 16> mask(16); + computeFullLaneShuffleMask(*this, ops[0], imm, true, mask); + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + mask); + } case X86::BI__builtin_ia32_permdi256: case X86::BI__builtin_ia32_permdf256: case X86::BI__builtin_ia32_permdi512: @@ -460,14 +1178,58 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_psrldqi128_byteshift: case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_kshiftliqi: case X86::BI__builtin_ia32_kshiftlihi: case X86::BI__builtin_ia32_kshiftlisi: - case X86::BI__builtin_ia32_kshiftlidi: + case 
X86::BI__builtin_ia32_kshiftlidi: { + mlir::Location loc = getLoc(expr->getExprLoc()); + unsigned shiftVal = + ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue() & + 0xff; + unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth(); + + if (shiftVal >= numElems) + return builder.getNullValue(ops[0].getType(), loc); + + mlir::Value in = getMaskVecValue(builder, loc, ops[0], numElems); + + SmallVector<mlir::Attribute, 64> indices; + mlir::Type i32Ty = builder.getSInt32Ty(); + for (auto i : llvm::seq<unsigned>(0, numElems)) + indices.push_back(cir::IntAttr::get(i32Ty, numElems + i - shiftVal)); + + mlir::Value zero = builder.getNullValue(in.getType(), loc); + mlir::Value sv = builder.createVecShuffle(loc, zero, in, indices); + return builder.createBitcast(sv, ops[0].getType()); + } case X86::BI__builtin_ia32_kshiftriqi: case X86::BI__builtin_ia32_kshiftrihi: case X86::BI__builtin_ia32_kshiftrisi: - case X86::BI__builtin_ia32_kshiftridi: + case X86::BI__builtin_ia32_kshiftridi: { + mlir::Location loc = getLoc(expr->getExprLoc()); + unsigned shiftVal = + ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue() & + 0xff; + unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth(); + + if (shiftVal >= numElems) + return builder.getNullValue(ops[0].getType(), loc); + + mlir::Value in = getMaskVecValue(builder, loc, ops[0], numElems); + + SmallVector<mlir::Attribute, 64> indices; + mlir::Type i32Ty = builder.getSInt32Ty(); + for (auto i : llvm::seq<unsigned>(0, numElems)) + indices.push_back(cir::IntAttr::get(i32Ty, i + shiftVal)); + + mlir::Value zero = builder.getNullValue(in.getType(), loc); + mlir::Value sv = builder.createVecShuffle(loc, in, zero, indices); + return builder.createBitcast(sv, ops[0].getType()); + } case X86::BI__builtin_ia32_vprotbi: case X86::BI__builtin_ia32_vprotwi: case X86::BI__builtin_ia32_vprotdi: @@ -478,12 +1240,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_prolq128: case X86::BI__builtin_ia32_prolq256: case X86::BI__builtin_ia32_prolq512: + return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0], + ops[0], ops[1], false); case X86::BI__builtin_ia32_prord128: case X86::BI__builtin_ia32_prord256: case X86::BI__builtin_ia32_prord512: case X86::BI__builtin_ia32_prorq128: case X86::BI__builtin_ia32_prorq256: case X86::BI__builtin_ia32_prorq512: + return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0], + ops[0], ops[1], true); case X86::BI__builtin_ia32_selectb_128: case X86::BI__builtin_ia32_selectb_256: case X86::BI__builtin_ia32_selectb_512: @@ -536,86 +1302,166 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_ucmpq128_mask: case X86::BI__builtin_ia32_ucmpq256_mask: case X86::BI__builtin_ia32_ucmpq512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_vpcomb: case X86::BI__builtin_ia32_vpcomw: case X86::BI__builtin_ia32_vpcomd: case X86::BI__builtin_ia32_vpcomq: + return emitX86vpcom(builder, getLoc(expr->getExprLoc()), ops, true); case X86::BI__builtin_ia32_vpcomub: case X86::BI__builtin_ia32_vpcomuw: case X86::BI__builtin_ia32_vpcomud: case X86::BI__builtin_ia32_vpcomuq: + return emitX86vpcom(builder, getLoc(expr->getExprLoc()), ops, false); case X86::BI__builtin_ia32_kortestcqi: case X86::BI__builtin_ia32_kortestchi: case X86::BI__builtin_ia32_kortestcsi: - 
case X86::BI__builtin_ia32_kortestcdi: + case X86::BI__builtin_ia32_kortestcdi: { + mlir::Location loc = getLoc(expr->getExprLoc()); + cir::IntType ty = cast<cir::IntType>(ops[0].getType()); + mlir::Value allOnesOp = + builder.getConstAPInt(loc, ty, APInt::getAllOnes(ty.getWidth())); + mlir::Value orOp = emitX86MaskLogic(builder, loc, cir::BinOpKind::Or, ops); + mlir::Value cmp = + cir::CmpOp::create(builder, loc, cir::CmpOpKind::eq, orOp, allOnesOp); + return builder.createCast(cir::CastKind::bool_to_int, cmp, + cgm.convertType(expr->getType())); + } case X86::BI__builtin_ia32_kortestzqi: case X86::BI__builtin_ia32_kortestzhi: case X86::BI__builtin_ia32_kortestzsi: - case X86::BI__builtin_ia32_kortestzdi: + case X86::BI__builtin_ia32_kortestzdi: { + mlir::Location loc = getLoc(expr->getExprLoc()); + cir::IntType ty = cast<cir::IntType>(ops[0].getType()); + mlir::Value allZerosOp = builder.getNullValue(ty, loc).getResult(); + mlir::Value orOp = emitX86MaskLogic(builder, loc, cir::BinOpKind::Or, ops); + mlir::Value cmp = + cir::CmpOp::create(builder, loc, cir::CmpOpKind::eq, orOp, allZerosOp); + return builder.createCast(cir::CastKind::bool_to_int, cmp, + cgm.convertType(expr->getType())); + } case X86::BI__builtin_ia32_ktestcqi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestc.b", ops); case X86::BI__builtin_ia32_ktestzqi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestz.b", ops); case X86::BI__builtin_ia32_ktestchi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestc.w", ops); case X86::BI__builtin_ia32_ktestzhi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestz.w", ops); case X86::BI__builtin_ia32_ktestcsi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestc.d", ops); case X86::BI__builtin_ia32_ktestzsi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestz.d", ops); case X86::BI__builtin_ia32_ktestcdi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestc.q", ops); case X86::BI__builtin_ia32_ktestzdi: + return emitX86MaskTest(builder, getLoc(expr->getExprLoc()), + "x86.avx512.ktestz.q", ops); case X86::BI__builtin_ia32_kaddqi: + return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kadd.b", ops); case X86::BI__builtin_ia32_kaddhi: + return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kadd.w", ops); case X86::BI__builtin_ia32_kaddsi: + return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kadd.d", ops); case X86::BI__builtin_ia32_kadddi: + return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kadd.q", ops); case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: case X86::BI__builtin_ia32_kandsi: case X86::BI__builtin_ia32_kanddi: + return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()), + cir::BinOpKind::And, ops); case X86::BI__builtin_ia32_kandnqi: case X86::BI__builtin_ia32_kandnhi: case X86::BI__builtin_ia32_kandnsi: case X86::BI__builtin_ia32_kandndi: + return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()), + cir::BinOpKind::And, ops, true); case X86::BI__builtin_ia32_korqi: case X86::BI__builtin_ia32_korhi: case X86::BI__builtin_ia32_korsi: case X86::BI__builtin_ia32_kordi: + return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()), + cir::BinOpKind::Or, ops); case X86::BI__builtin_ia32_kxnorqi: case X86::BI__builtin_ia32_kxnorhi: case 
X86::BI__builtin_ia32_kxnorsi: case X86::BI__builtin_ia32_kxnordi: + return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()), + cir::BinOpKind::Xor, ops, true); case X86::BI__builtin_ia32_kxorqi: case X86::BI__builtin_ia32_kxorhi: case X86::BI__builtin_ia32_kxorsi: case X86::BI__builtin_ia32_kxordi: + return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()), + cir::BinOpKind::Xor, ops); case X86::BI__builtin_ia32_knotqi: case X86::BI__builtin_ia32_knothi: case X86::BI__builtin_ia32_knotsi: - case X86::BI__builtin_ia32_knotdi: + case X86::BI__builtin_ia32_knotdi: { + cir::IntType intTy = cast<cir::IntType>(ops[0].getType()); + unsigned numElts = intTy.getWidth(); + mlir::Value resVec = + getMaskVecValue(builder, getLoc(expr->getExprLoc()), ops[0], numElts); + return builder.createBitcast(builder.createNot(resVec), ops[0].getType()); + } case X86::BI__builtin_ia32_kmovb: case X86::BI__builtin_ia32_kmovw: case X86::BI__builtin_ia32_kmovd: - case X86::BI__builtin_ia32_kmovq: - case X86::BI__builtin_ia32_kunpckdi: - case X86::BI__builtin_ia32_kunpcksi: - case X86::BI__builtin_ia32_kunpckhi: + case X86::BI__builtin_ia32_kmovq: { + // Bitcast to vXi1 type and then back to integer. This gets the mask + // register type into the IR, but might be optimized out depending on + // what's around it. + cir::IntType intTy = cast<cir::IntType>(ops[0].getType()); + unsigned numElts = intTy.getWidth(); + mlir::Value resVec = + getMaskVecValue(builder, getLoc(expr->getExprLoc()), ops[0], numElts); + return builder.createBitcast(resVec, ops[0].getType()); + } case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: - case X86::BI__builtin_ia32_sqrtpd256: - case X86::BI__builtin_ia32_sqrtpd: - case X86::BI__builtin_ia32_sqrtps256: - case X86::BI__builtin_ia32_sqrtps: - case X86::BI__builtin_ia32_sqrtph256: - case X86::BI__builtin_ia32_sqrtph: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_sqrtph512: - case X86::BI__builtin_ia32_vsqrtbf16256: - case X86::BI__builtin_ia32_vsqrtbf16: - case X86::BI__builtin_ia32_vsqrtbf16512: case X86::BI__builtin_ia32_sqrtps512: - case X86::BI__builtin_ia32_sqrtpd512: + case X86::BI__builtin_ia32_sqrtpd512: { + mlir::Location loc = getLoc(expr->getExprLoc()); + mlir::Value arg = ops[0]; + return cir::SqrtOp::create(builder, loc, arg.getType(), arg).getResult(); + } case X86::BI__builtin_ia32_pmuludq128: case X86::BI__builtin_ia32_pmuludq256: - case X86::BI__builtin_ia32_pmuludq512: + case X86::BI__builtin_ia32_pmuludq512: { + unsigned opTypePrimitiveSizeInBits = + cgm.getDataLayout().getTypeSizeInBits(ops[0].getType()); + return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ false, + ops, opTypePrimitiveSizeInBits); + } case X86::BI__builtin_ia32_pmuldq128: case X86::BI__builtin_ia32_pmuldq256: - case X86::BI__builtin_ia32_pmuldq512: + case X86::BI__builtin_ia32_pmuldq512: { + unsigned opTypePrimitiveSizeInBits = + cgm.getDataLayout().getTypeSizeInBits(ops[0].getType()); + return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ true, + ops, opTypePrimitiveSizeInBits); + } case X86::BI__builtin_ia32_pternlogd512_mask: case X86::BI__builtin_ia32_pternlogq512_mask: case X86::BI__builtin_ia32_pternlogd128_mask: @@ -710,10 +1556,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_cmpunordpd: 
case X86::BI__builtin_ia32_cmpneqps: case X86::BI__builtin_ia32_cmpneqpd: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_cmpnltps: case X86::BI__builtin_ia32_cmpnltpd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::lt, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpnleps: case X86::BI__builtin_ia32_cmpnlepd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::le, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: case X86::BI__builtin_ia32_cmpph128_mask: @@ -752,7 +1606,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vcvtph2ps256_mask: case X86::BI__builtin_ia32_vcvtph2ps512_mask: case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: - case X86::BI__builtin_ia32_cvtsbf162ss_32: case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: case X86::BI__cpuid: @@ -798,7 +1651,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: case X86::BI__builtin_ia32_prefetchi: - cgm.errorNYI(e->getSourceRange(), + cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented X86 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return {}; diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index a3e2081..71568ec 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -135,15 +135,27 @@ static void emitDeclDestroy(CIRGenFunction &cgf, const VarDecl *vd, // call right here. auto gd = GlobalDecl(dtor, Dtor_Complete); fnOp = cgm.getAddrAndTypeOfCXXStructor(gd).second; - cgf.getBuilder().createCallOp( - cgf.getLoc(vd->getSourceRange()), - mlir::FlatSymbolRefAttr::get(fnOp.getSymNameAttr()), - mlir::ValueRange{cgm.getAddrOfGlobalVar(vd)}); + builder.createCallOp(cgf.getLoc(vd->getSourceRange()), + mlir::FlatSymbolRefAttr::get(fnOp.getSymNameAttr()), + mlir::ValueRange{cgm.getAddrOfGlobalVar(vd)}); + assert(fnOp && "expected cir.func"); + // TODO(cir): This doesn't do anything but check for unhandled conditions. + // What it is meant to do should really be happening in LoweringPrepare. + cgm.getCXXABI().registerGlobalDtor(vd, fnOp, nullptr); } else { - cgm.errorNYI(vd->getSourceRange(), "array destructor"); + // Otherwise, a custom destroyer is needed. Classic codegen creates a helper + // function here and emits the destroy into the helper function, which is + // called from __cxa_atexit. + // In CIR, we just emit the destroy into the dtor region. It will be moved + // into a separate function during the LoweringPrepare pass. + // FIXME(cir): We should create a new operation here to explicitly get the + // address of the global into whose dtor region we are emitting the destroy. + // The same applies to code above where it is calling getAddrOfGlobalVar.
+ mlir::Value globalVal = builder.createGetGlobal(addr); + CharUnits alignment = cgf.getContext().getDeclAlign(vd); + Address globalAddr{globalVal, cgf.convertTypeForMem(type), alignment}; + cgf.emitDestroy(globalAddr, type, cgf.getDestroyer(dtorKind)); } - assert(fnOp && "expected cir.func"); - cgm.getCXXABI().registerGlobalDtor(vd, fnOp, nullptr); builder.setInsertionPointToEnd(block); if (block->empty()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h index 13dc9f3..57b1a1f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h @@ -192,6 +192,9 @@ public: QualType elementType, const CXXDestructorDecl *dtor) = 0; + virtual size_t getSrcArgforCopyCtor(const CXXConstructorDecl *, + FunctionArgList &args) const = 0; + /// Checks if ABI requires extra virtual offset for vtable field. virtual bool isVirtualOffsetNeededForVTableField(CIRGenFunction &cgf, diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index 50d4c03..17f0c6d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -465,12 +465,47 @@ static cir::CIRCallOpInterface emitCallLikeOp(CIRGenFunction &cgf, mlir::Location callLoc, cir::FuncType indirectFuncTy, mlir::Value indirectFuncVal, cir::FuncOp directFuncOp, - const SmallVectorImpl<mlir::Value> &cirCallArgs, + const SmallVectorImpl<mlir::Value> &cirCallArgs, bool isInvoke, const mlir::NamedAttrList &attrs) { CIRGenBuilderTy &builder = cgf.getBuilder(); assert(!cir::MissingFeatures::opCallSurroundingTry()); - assert(!cir::MissingFeatures::invokeOp()); + + if (isInvoke) { + // This call may throw and requires catch and/or cleanup handling. + // If this call does not appear within the `try` region of an existing + // TryOp, we must create a synthetic TryOp to contain the call. This + // happens when a call that may throw appears within a cleanup + // scope. + + // In OG, we build the landing pad for this scope. In CIR, we emit a + // synthetic cir.try because this didn't come from code generating from a + // try/catch in C++. 
+ assert(cgf.curLexScope && "expected scope"); + cir::TryOp tryOp = cgf.curLexScope->getClosestTryParent(); + if (!tryOp) { + cgf.cgm.errorNYI( + "emitCallLikeOp: call does not have an associated cir.try"); + return {}; + } + + if (tryOp.getSynthetic()) { + cgf.cgm.errorNYI("emitCallLikeOp: tryOp synthetic"); + return {}; + } + + cir::CallOp callOpWithExceptions; + if (indirectFuncTy) { + cgf.cgm.errorNYI("emitCallLikeOp: indirect function type"); + return {}; + } + + callOpWithExceptions = + builder.createCallOp(callLoc, directFuncOp, cirCallArgs); + + cgf.populateCatchHandlersIfRequired(tryOp); + return callOpWithExceptions; + } assert(builder.getInsertionBlock() && "expected valid basic block"); @@ -601,8 +636,6 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo, assert(!cir::MissingFeatures::opCallAttrs()); cgm.constructAttributeList(callee.getAbstractInfo(), attrs); - assert(!cir::MissingFeatures::invokeOp()); - cir::FuncType indirectFuncTy; mlir::Value indirectFuncVal; cir::FuncOp directFuncOp; @@ -628,10 +661,17 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo, indirectFuncVal = calleePtr->getResult(0); } + assert(!cir::MissingFeatures::msvcCXXPersonality()); + assert(!cir::MissingFeatures::functionUsesSEHTry()); + assert(!cir::MissingFeatures::nothrowAttr()); + + bool cannotThrow = attrs.getNamed("nothrow").has_value(); + bool isInvoke = !cannotThrow && isCatchOrCleanupRequired(); + mlir::Location callLoc = loc; cir::CIRCallOpInterface theCall = emitCallLikeOp(*this, loc, indirectFuncTy, indirectFuncVal, directFuncOp, - cirCallArgs, attrs); + cirCallArgs, isInvoke, attrs); if (callOp) *callOp = theCall; diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index a829678..2a26e38 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -18,6 +18,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/RecordLayout.h" #include "clang/AST/Type.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" #include "clang/CIR/MissingFeatures.h" using namespace clang; @@ -110,8 +111,23 @@ static void emitMemberInitializer(CIRGenFunction &cgf, // NOTE(cir): CodeGen allows record types to be memcpy'd if applicable, // whereas ClangIR wants to represent all object construction explicitly. if (!baseElementTy->isRecordType()) { - cgf.cgm.errorNYI(memberInit->getSourceRange(), - "emitMemberInitializer: array of non-record type"); + unsigned srcArgIndex = + cgf.cgm.getCXXABI().getSrcArgforCopyCtor(constructor, args); + cir::LoadOp srcPtr = cgf.getBuilder().createLoad( + cgf.getLoc(memberInit->getSourceLocation()), + cgf.getAddrOfLocalVar(args[srcArgIndex])); + LValue thisRhslv = cgf.makeNaturalAlignAddrLValue(srcPtr, recordTy); + LValue src = cgf.emitLValueForFieldInitialization(thisRhslv, field, + field->getName()); + + // Copy the aggregate. + cgf.emitAggregateCopy(lhs, src, fieldType, + cgf.getOverlapForFieldInit(field), + lhs.isVolatileQualified()); + // Ensure that we destroy the objects if an exception is thrown later in + // the constructor. 
+ assert(!cgf.needsEHCleanup(fieldType.isDestructedType()) && + "Arrays of non-record types shouldn't need EH cleanup"); return; } } @@ -133,7 +149,7 @@ struct CallBaseDtor final : EHScopeStack::Cleanup { CallBaseDtor(const CXXRecordDecl *base, bool baseIsVirtual) : baseClass(base), baseIsVirtual(baseIsVirtual) {} - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { const CXXRecordDecl *derivedClass = cast<CXXMethodDecl>(cgf.curFuncDecl)->getParent(); @@ -786,6 +802,8 @@ void CIRGenFunction::emitImplicitAssignmentOperatorBody(FunctionArgList &args) { "Body of an implicit assignment operator should be compound stmt."); const auto *rootCS = cast<CompoundStmt>(rootS); + cgm.setCXXSpecialMemberAttr(cast<cir::FuncOp>(curFn), assignOp); + assert(!cir::MissingFeatures::incrementProfileCounter()); assert(!cir::MissingFeatures::runCleanupsScope()); @@ -906,7 +924,7 @@ mlir::Value loadThisForDtorDelete(CIRGenFunction &cgf, struct CallDtorDelete final : EHScopeStack::Cleanup { CallDtorDelete() {} - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { const CXXDestructorDecl *dtor = cast<CXXDestructorDecl>(cgf.curFuncDecl); const CXXRecordDecl *classDecl = dtor->getParent(); cgf.emitDeleteCall(dtor->getOperatorDelete(), @@ -923,7 +941,7 @@ public: DestroyField(const FieldDecl *field, CIRGenFunction::Destroyer *destroyer) : field(field), destroyer(destroyer) {} - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { // Find the address of the field. Address thisValue = cgf.loadCXXThisAddress(); CanQualType recordTy = @@ -932,7 +950,7 @@ public: LValue lv = cgf.emitLValueForField(thisLV, field); assert(lv.isSimple()); - assert(!cir::MissingFeatures::ehCleanupFlags()); + assert(!cir::MissingFeatures::useEHCleanupForArray()); cgf.emitDestroy(lv.getAddress(), field->getType(), destroyer); } }; @@ -1029,7 +1047,7 @@ void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd, continue; CleanupKind cleanupKind = getCleanupKind(dtorKind); - assert(!cir::MissingFeatures::ehCleanupFlags()); + assert(!cir::MissingFeatures::useEHCleanupForArray()); ehStack.pushCleanup<DestroyField>(cleanupKind, field, getDestroyer(dtorKind)); } @@ -1110,6 +1128,25 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase, } } +Address CIRGenFunction::getAddressOfDerivedClass( + mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived, + llvm::iterator_range<CastExpr::path_const_iterator> path, + bool nullCheckValue) { + assert(!path.empty() && "Base path should not be empty!"); + + QualType derivedTy = getContext().getCanonicalTagType(derived); + mlir::Type derivedValueTy = convertType(derivedTy); + CharUnits nonVirtualOffset = + cgm.computeNonVirtualBaseClassOffset(derived, path); + + // Note that in OG, no offset (nonVirtualOffset.getQuantity() == 0) means it + // just gives the address back. In CIR a `cir.derived_class` is created and + // made into a nop later on during lowering. 
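For reference, getAddressOfDerivedClass corresponds to a base-to-derived address adjustment such as the cast below (an assumed example, not taken from this patch):

    struct Base { int b; };
    struct Derived : Base { int d; };
    Derived *downcast(Base *p) {
      // cir.derived_class computes the derived address; with a zero
      // non-virtual offset it becomes a no-op during lowering.
      return static_cast<Derived *>(p);
    }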
+ return builder.createDerivedClassAddr(loc, baseAddr, derivedValueTy, + nonVirtualOffset.getQuantity(), + /*assumeNotNull=*/!nullCheckValue); +} + Address CIRGenFunction::getAddressOfBaseClass( Address value, const CXXRecordDecl *derived, llvm::iterator_range<CastExpr::path_const_iterator> path, diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 851328a..6c6cb40 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) - EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + EHCleanupScope *scope = new (buffer) EHCleanupScope( + size, branchFixups.size(), innermostNormalCleanup, innermostEHScope); if (isNormalCleanup) innermostNormalCleanup = stable_begin(); @@ -188,18 +188,34 @@ void EHScopeStack::popCleanup() { } } +bool EHScopeStack::requiresCatchOrCleanup() const { + for (stable_iterator si = getInnermostEHScope(); si != stable_end();) { + if (auto *cleanup = dyn_cast<EHCleanupScope>(&*find(si))) { + if (cleanup->isLifetimeMarker()) { + // Skip lifetime markers and continue from the enclosing EH scope + assert(!cir::MissingFeatures::emitLifetimeMarkers()); + continue; + } + } + return true; + } + return false; +} + EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); - assert(!cir::MissingFeatures::innermostEHScope()); - EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + EHCatchScope *scope = + new (buffer) EHCatchScope(numHandlers, innermostEHScope); + innermostEHScope = stable_begin(); return scope; } -static void emitCleanup(CIRGenFunction &cgf, EHScopeStack::Cleanup *cleanup) { +static void emitCleanup(CIRGenFunction &cgf, EHScopeStack::Cleanup *cleanup, + EHScopeStack::Cleanup::Flags flags) { // Ask the cleanup to emit itself. assert(cgf.haveInsertPoint() && "expected insertion point"); - assert(!cir::MissingFeatures::ehCleanupFlags()); - cleanup->emit(cgf); + assert(!cir::MissingFeatures::ehCleanupActiveFlag()); + cleanup->emit(cgf, flags); assert(cgf.haveInsertPoint() && "cleanup ended with no insertion point?"); } @@ -269,7 +285,11 @@ void CIRGenFunction::popCleanupBlock() { reinterpret_cast<EHScopeStack::Cleanup *>(cleanupBufferHeap.get()); } - assert(!cir::MissingFeatures::ehCleanupFlags()); + EHScopeStack::Cleanup::Flags cleanupFlags; + if (scope.isNormalCleanup()) + cleanupFlags.setIsNormalCleanupKind(); + if (scope.isEHCleanup()) + cleanupFlags.setIsEHCleanupKind(); // If we have a fallthrough and no other need for the cleanup, // emit it directly. @@ -277,7 +297,7 @@ void CIRGenFunction::popCleanupBlock() { assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); ehStack.popCleanup(); scope.markEmitted(); - emitCleanup(*this, cleanup); + emitCleanup(*this, cleanup, cleanupFlags); } else { // Otherwise, the best approach is to thread everything through // the cleanup block and then try to clean up after ourselves. @@ -339,7 +359,7 @@ void CIRGenFunction::popCleanupBlock() { ehStack.popCleanup(); assert(ehStack.hasNormalCleanups() == hasEnclosingCleanups); - emitCleanup(*this, cleanup); + emitCleanup(*this, cleanup, cleanupFlags); // Append the prepared cleanup prologue from above. 
assert(!cir::MissingFeatures::cleanupAppendInsts()); diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 61a09a5..85dbde4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -18,6 +18,7 @@ #include "CIRGenModule.h" #include "EHScopeStack.h" #include "mlir/IR/Value.h" +#include "clang/AST/StmtCXX.h" namespace clang::CIRGen { @@ -30,12 +31,16 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. class EHScope { + EHScopeStack::stable_iterator enclosingEHScope; + class CommonBitFields { friend class EHScope; unsigned kind : 3; }; enum { NumCommonBits = 3 }; + bool scopeMayThrow; + protected: class CatchBitFields { friend class EHCatchScope; @@ -79,7 +84,10 @@ protected: public: enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind) { commonBits.kind = kind; } + EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) + : enclosingEHScope(enclosingEHScope) { + commonBits.kind = kind; + } Kind getKind() const { return static_cast<Kind>(commonBits.kind); } @@ -87,8 +95,13 @@ public: // Traditional LLVM codegen also checks for `!block->use_empty()`, but // in CIRGen the block content is not important, just used as a way to // signal `hasEHBranches`. - assert(!cir::MissingFeatures::ehstackBranches()); - return false; + return scopeMayThrow; + } + + void setMayThrow(bool mayThrow) { scopeMayThrow = mayThrow; } + + EHScopeStack::stable_iterator getEnclosingEHScope() const { + return enclosingEHScope; } }; @@ -111,6 +124,11 @@ public: /// The catch handler for this type. mlir::Region *region; + + /// The catch handler stmt. + const CXXCatchStmt *stmt; + + bool isCatchAll() const { return type.rtti == nullptr; } }; private: @@ -118,22 +136,36 @@ private: Handler *getHandlers() { return reinterpret_cast<Handler *>(this + 1); } + const Handler *getHandlers() const { + return reinterpret_cast<const Handler *>(this + 1); + } + public: static size_t getSizeForNumHandlers(unsigned n) { return sizeof(EHCatchScope) + n * sizeof(Handler); } - EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + EHCatchScope(unsigned numHandlers, + EHScopeStack::stable_iterator enclosingEHScope) + : EHScope(Catch, enclosingEHScope) { catchBits.numHandlers = numHandlers; assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); } unsigned getNumHandlers() const { return catchBits.numHandlers; } - void setHandler(unsigned i, CatchTypeInfo type, mlir::Region *region) { + void setHandler(unsigned i, CatchTypeInfo type, mlir::Region *region, + const CXXCatchStmt *stmt) { assert(i < getNumHandlers()); - getHandlers()[i].type = type; - getHandlers()[i].region = region; + Handler *handler = &getHandlers()[i]; + handler->type = type; + handler->region = region; + handler->stmt = stmt; + } + + const Handler &getHandler(unsigned i) const { + assert(i < getNumHandlers()); + return getHandlers()[i]; } // Clear all handler blocks. @@ -144,6 +176,10 @@ public: // The blocks are owned by TryOp, nothing to delete. 
} + using iterator = const Handler *; + iterator begin() const { return getHandlers(); } + iterator end() const { return getHandlers() + getNumHandlers(); } + static bool classof(const EHScope *scope) { return scope->getKind() == Catch; } @@ -176,9 +212,10 @@ public: } EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, - EHScopeStack::stable_iterator enclosingNormal) - : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), - fixupDepth(fixupDepth) { + EHScopeStack::stable_iterator enclosingNormal, + EHScopeStack::stable_iterator enclosingEH) + : EHScope(EHScope::Cleanup, enclosingEH), + enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. cleanupBits.isNormalCleanup = true; @@ -200,10 +237,13 @@ public: void setNormalBlock(mlir::Block *bb) { normalBlock = bb; } bool isNormalCleanup() const { return cleanupBits.isNormalCleanup; } + bool isEHCleanup() const { return cleanupBits.isEHCleanup; } bool isActive() const { return cleanupBits.isActive; } void setActive(bool isActive) { cleanupBits.isActive = isActive; } + bool isLifetimeMarker() const { return cleanupBits.isLifetimeMarker; } + unsigned getFixupDepth() const { return fixupDepth; } EHScopeStack::stable_iterator getEnclosingNormalCleanup() const { return enclosingNormal; @@ -235,13 +275,45 @@ public: EHScope *get() const { return reinterpret_cast<EHScope *>(ptr); } + EHScope *operator->() const { return get(); } EHScope &operator*() const { return *get(); } + + iterator &operator++() { + size_t size; + switch (get()->getKind()) { + case EHScope::Catch: + size = EHCatchScope::getSizeForNumHandlers( + static_cast<const EHCatchScope *>(get())->getNumHandlers()); + break; + + case EHScope::Filter: + llvm_unreachable("EHScopeStack::iterator Filter"); + break; + + case EHScope::Cleanup: + llvm_unreachable("EHScopeStack::iterator Cleanup"); + break; + + case EHScope::Terminate: + llvm_unreachable("EHScopeStack::iterator Terminate"); + break; + } + ptr += llvm::alignTo(size, ScopeStackAlignment); + return *this; + } + + bool operator==(iterator other) const { return ptr == other.ptr; } + bool operator!=(iterator other) const { return ptr != other.ptr; } }; inline EHScopeStack::iterator EHScopeStack::begin() const { return iterator(startOfData); } +inline EHScopeStack::iterator EHScopeStack::end() const { + return iterator(endOfBuffer); +} + inline EHScopeStack::iterator EHScopeStack::find(stable_iterator savePoint) const { assert(savePoint.isValid() && "finding invalid savepoint"); @@ -254,7 +326,7 @@ inline void EHScopeStack::popCatch() { assert(!empty() && "popping exception stack when not empty"); EHCatchScope &scope = llvm::cast<EHCatchScope>(*begin()); - assert(!cir::MissingFeatures::innermostEHScope()); + innermostEHScope = scope.getEnclosingEHScope(); deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp index 930ae55..b4f185d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp @@ -13,6 +13,7 @@ #include "CIRGenFunction.h" #include "mlir/Support/LLVM.h" #include "clang/AST/StmtCXX.h" +#include "clang/AST/StmtVisitor.h" #include "clang/Basic/TargetInfo.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" @@ -21,18 +22,93 @@ using namespace clang; using namespace clang::CIRGen; struct 
clang::CIRGen::CGCoroData { + // What is the current await expression kind and how many + // await/yield expressions were encountered so far. + // These are used to generate pretty labels for await expressions in LLVM IR. + cir::AwaitKind currentAwaitKind = cir::AwaitKind::Init; // Stores the __builtin_coro_id emitted in the function so that we can supply // it as the first argument to other builtins. cir::CallOp coroId = nullptr; // Stores the result of __builtin_coro_begin call. mlir::Value coroBegin = nullptr; + + // The promise type's 'unhandled_exception' handler, if it defines one. + Stmt *exceptionHandler = nullptr; }; // Defining these here allows to keep CGCoroData private to this file. CIRGenFunction::CGCoroInfo::CGCoroInfo() {} CIRGenFunction::CGCoroInfo::~CGCoroInfo() {} +namespace { +// FIXME: both GetParamRef and ParamReferenceReplacerRAII are good template +// candidates to be shared among LLVM / CIR codegen. + +// Hunts for the parameter reference in the parameter copy/move declaration. +struct GetParamRef : public StmtVisitor<GetParamRef> { +public: + DeclRefExpr *expr = nullptr; + GetParamRef() {} + void VisitDeclRefExpr(DeclRefExpr *e) { + assert(expr == nullptr && "multilple declref in param move"); + expr = e; + } + void VisitStmt(Stmt *s) { + for (Stmt *c : s->children()) { + if (c) + Visit(c); + } + } +}; + +// This class replaces references to parameters to their copies by changing +// the addresses in CGF.LocalDeclMap and restoring back the original values in +// its destructor. +struct ParamReferenceReplacerRAII { + CIRGenFunction::DeclMapTy savedLocals; + CIRGenFunction::DeclMapTy &localDeclMap; + + ParamReferenceReplacerRAII(CIRGenFunction::DeclMapTy &localDeclMap) + : localDeclMap(localDeclMap) {} + + void addCopy(const DeclStmt *pm) { + // Figure out what param it refers to. + + assert(pm->isSingleDecl()); + const VarDecl *vd = static_cast<const VarDecl *>(pm->getSingleDecl()); + const Expr *initExpr = vd->getInit(); + GetParamRef visitor; + visitor.Visit(const_cast<Expr *>(initExpr)); + assert(visitor.expr); + DeclRefExpr *dreOrig = visitor.expr; + auto *pd = dreOrig->getDecl(); + + auto it = localDeclMap.find(pd); + assert(it != localDeclMap.end() && "parameter is not found"); + savedLocals.insert({pd, it->second}); + + auto copyIt = localDeclMap.find(vd); + assert(copyIt != localDeclMap.end() && "parameter copy is not found"); + it->second = copyIt->getSecond(); + } + + ~ParamReferenceReplacerRAII() { + for (auto &&savedLocal : savedLocals) { + localDeclMap.insert({savedLocal.first, savedLocal.second}); + } + } +}; +} // namespace + +RValue CIRGenFunction::emitCoroutineFrame() { + if (curCoro.data && curCoro.data->coroBegin) { + return RValue::get(curCoro.data->coroBegin); + } + cgm.errorNYI("NYI"); + return RValue(); +} + static void createCoroData(CIRGenFunction &cgf, CIRGenFunction::CGCoroInfo &curCoro, cir::CallOp coroId) { @@ -149,7 +225,203 @@ CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) { if (s.getReturnStmtOnAllocFailure()) cgm.errorNYI("handle coroutine return alloc failure"); - assert(!cir::MissingFeatures::generateDebugInfo()); - assert(!cir::MissingFeatures::emitBodyAndFallthrough()); + { + assert(!cir::MissingFeatures::generateDebugInfo()); + ParamReferenceReplacerRAII paramReplacer(localDeclMap); + // Create mapping between parameters and copy-params for coroutine + // function. 
+ llvm::ArrayRef<const Stmt *> paramMoves = s.getParamMoves();
+ assert((paramMoves.size() == 0 || (paramMoves.size() == fnArgs.size())) &&
+ "ParamMoves and FnArgs should be the same size for coroutine "
+ "function");
+ // For zipping the arg map into debug info.
+ assert(!cir::MissingFeatures::generateDebugInfo());
+
+ // Create parameter copies. We do it before creating a promise, since an
+ // evolution of coroutine TS may allow promise constructor to observe
+ // parameter copies.
+ assert(!cir::MissingFeatures::coroOutsideFrameMD());
+ for (auto *pm : paramMoves) {
+ if (emitStmt(pm, /*useCurrentScope=*/true).failed())
+ return mlir::failure();
+ paramReplacer.addCopy(cast<DeclStmt>(pm));
+ }
+
+ if (emitStmt(s.getPromiseDeclStmt(), /*useCurrentScope=*/true).failed())
+ return mlir::failure();
+ // returnValue should be valid as long as the coroutine's return type
+ // is not void. The assertion could help us to reduce the check later.
+ assert(returnValue.isValid() == (bool)s.getReturnStmt());
+ // Now we have the promise, initialize the GRO.
+ // We need to emit `get_return_object` first. According to:
+ // [dcl.fct.def.coroutine]p7
+ // The call to get_return_object is sequenced before the call to
+ // initial_suspend and is invoked at most once.
+ //
+ // So we couldn't emit the return value when we emit the return statement,
+ // otherwise the call to get_return_object wouldn't be in front
+ // of initial_suspend.
+ if (returnValue.isValid())
+ emitAnyExprToMem(s.getReturnValue(), returnValue,
+ s.getReturnValue()->getType().getQualifiers(),
+ /*isInit*/ true);
+
+ assert(!cir::MissingFeatures::ehCleanupScope());
+ // FIXME(cir): EHStack.pushCleanup<CallCoroEnd>(EHCleanup);
+ curCoro.data->currentAwaitKind = cir::AwaitKind::Init;
+ if (emitStmt(s.getInitSuspendStmt(), /*useCurrentScope=*/true).failed())
+ return mlir::failure();
+ assert(!cir::MissingFeatures::emitBodyAndFallthrough());
+ }
return mlir::success();
}
+
+static bool memberCallExpressionCanThrow(const Expr *e) {
+ if (const auto *ce = dyn_cast<CXXMemberCallExpr>(e))
+ if (const auto *proto =
+ ce->getMethodDecl()->getType()->getAs<FunctionProtoType>())
+ if (isNoexceptExceptionSpec(proto->getExceptionSpecType()) &&
+ proto->canThrow() == CT_Cannot)
+ return false;
+ return true;
+}
+
+// Given a suspend expression which roughly looks like:
+//
+// auto && x = CommonExpr();
+// if (!x.await_ready()) {
+// x.await_suspend(...); (*)
+// }
+// x.await_resume();
+//
+// where the result of the entire expression is the result of x.await_resume()
+//
+// (*) If x.await_suspend's return type is bool, it allows the suspend to be
+// vetoed:
+// if (x.await_suspend(...))
+// llvm_coro_suspend();
+//
+// This is higher level than LLVM codegen, for that one see llvm's
+// docs/Coroutines.rst for more details.
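A minimal coroutine that exercises the sequence emitted above (parameter copy, promise declaration, get_return_object before the initial suspend). The Task type is an assumed example, not something defined by this patch:

    #include <coroutine>
    struct Task {
      struct promise_type {
        Task get_return_object() { return {}; } // emitted before initial_suspend
        std::suspend_always initial_suspend() { return {}; }
        std::suspend_never final_suspend() noexcept { return {}; }
        void return_void() {}
        void unhandled_exception() {}
      };
    };
    Task example(int byValue) { // byValue gets a param move into the frame
      co_return;
    }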
+namespace { +struct LValueOrRValue { + LValue lv; + RValue rv; +}; +} // namespace + +static LValueOrRValue +emitSuspendExpression(CIRGenFunction &cgf, CGCoroData &coro, + CoroutineSuspendExpr const &s, cir::AwaitKind kind, + AggValueSlot aggSlot, bool ignoreResult, + mlir::Block *scopeParentBlock, + mlir::Value &tmpResumeRValAddr, bool forLValue) { + [[maybe_unused]] mlir::LogicalResult awaitBuild = mlir::success(); + LValueOrRValue awaitRes; + + CIRGenFunction::OpaqueValueMapping binder = + CIRGenFunction::OpaqueValueMapping(cgf, s.getOpaqueValue()); + CIRGenBuilderTy &builder = cgf.getBuilder(); + [[maybe_unused]] cir::AwaitOp awaitOp = cir::AwaitOp::create( + builder, cgf.getLoc(s.getSourceRange()), kind, + /*readyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + Expr *condExpr = s.getReadyExpr()->IgnoreParens(); + builder.createCondition(cgf.evaluateExprAsBool(condExpr)); + }, + /*suspendBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // Note that differently from LLVM codegen we do not emit coro.save + // and coro.suspend here, that should be done as part of lowering this + // to LLVM dialect (or some other MLIR dialect) + + // A invalid suspendRet indicates "void returning await_suspend" + mlir::Value suspendRet = cgf.emitScalarExpr(s.getSuspendExpr()); + + // Veto suspension if requested by bool returning await_suspend. + if (suspendRet) { + cgf.cgm.errorNYI("Veto await_suspend"); + } + + // Signals the parent that execution flows to next region. + cir::YieldOp::create(builder, loc); + }, + /*resumeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // Exception handling requires additional IR. If the 'await_resume' + // function is marked as 'noexcept', we avoid generating this additional + // IR. + CXXTryStmt *tryStmt = nullptr; + if (coro.exceptionHandler && kind == cir::AwaitKind::Init && + memberCallExpressionCanThrow(s.getResumeExpr())) + cgf.cgm.errorNYI("Coro resume Exception"); + + // FIXME(cir): the alloca for the resume expr should be placed in the + // enclosing cir.scope instead. + if (forLValue) { + assert(!cir::MissingFeatures::coroCoYield()); + } else { + awaitRes.rv = + cgf.emitAnyExpr(s.getResumeExpr(), aggSlot, ignoreResult); + if (!awaitRes.rv.isIgnored()) + // Create the alloca in the block before the scope wrapping + // cir.await. + assert(!cir::MissingFeatures::coroCoReturn()); + } + + if (tryStmt) + cgf.cgm.errorNYI("Coro tryStmt"); + + // Returns control back to parent. + cir::YieldOp::create(builder, loc); + }); + + assert(awaitBuild.succeeded() && "Should know how to codegen"); + return awaitRes; +} + +static RValue emitSuspendExpr(CIRGenFunction &cgf, + const CoroutineSuspendExpr &e, + cir::AwaitKind kind, AggValueSlot aggSlot, + bool ignoreResult) { + RValue rval; + mlir::Location scopeLoc = cgf.getLoc(e.getSourceRange()); + + // Since we model suspend / resume as an inner region, we must store + // resume scalar results in a tmp alloca, and load it after we build the + // suspend expression. An alternative way to do this would be to make + // every region return a value when promise.return_value() is used, but + // it's a bit awkward given that resume is the only region that actually + // returns a value. + mlir::Block *currEntryBlock = cgf.curLexScope->getEntryBlock(); + [[maybe_unused]] mlir::Value tmpResumeRValAddr; + + // No need to explicitly wrap this into a scope since the AST already uses a + // ExprWithCleanups, which will wrap this into a cir.scope anyways. 
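The ready/suspend/resume builders above map one-to-one onto the members of an awaiter; a minimal awaiter for reference (assumed example):

    #include <coroutine>
    struct Awaiter {
      bool await_ready() { return false; }           // ready region
      void await_suspend(std::coroutine_handle<>) {} // suspend region; void => no veto
      int await_resume() { return 42; }              // resume region; value of co_await
    };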
+ rval = emitSuspendExpression(cgf, *cgf.curCoro.data, e, kind, aggSlot, + ignoreResult, currEntryBlock, tmpResumeRValAddr, + /*forLValue*/ false) + .rv; + + if (ignoreResult || rval.isIgnored()) + return rval; + + if (rval.isScalar()) { + rval = RValue::get(cir::LoadOp::create(cgf.getBuilder(), scopeLoc, + rval.getValue().getType(), + tmpResumeRValAddr)); + } else if (rval.isAggregate()) { + // This is probably already handled via AggSlot, remove this assertion + // once we have a testcase and prove all pieces work. + cgf.cgm.errorNYI("emitSuspendExpr Aggregate"); + } else { // complex + cgf.cgm.errorNYI("emitSuspendExpr Complex"); + } + return rval; +} + +RValue CIRGenFunction::emitCoawaitExpr(const CoawaitExpr &e, + AggValueSlot aggSlot, + bool ignoreResult) { + return emitSuspendExpr(*this, e, curCoro.data->currentAwaitKind, aggSlot, + ignoreResult); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index aeea0ef..12b153a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -50,6 +50,41 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, Address address = Address::invalid(); if (ty->isConstantSizeType()) { + // If this value is an array, struct, or vector with a statically + // determinable constant initializer, there are optimizations we can do. + // + // TODO: We should constant-evaluate the initializer of any variable, + // as long as it is initialized by a constant expression. Currently, + // isConstantInitializer produces wrong answers for structs with + // reference or bitfield members, and a few other cases, and checking + // for POD-ness protects us from some of these. + if (d.getInit() && + (ty->isArrayType() || ty->isRecordType() || ty->isVectorType()) && + (d.isConstexpr() || + ((ty.isPODType(getContext()) || + getContext().getBaseElementType(ty)->isObjCObjectPointerType()) && + d.getInit()->isConstantInitializer(getContext(), false)))) { + + // If the variable's a const type, and it's neither an NRVO + // candidate nor a __block variable and has no mutable members, + // emit it as a global instead. + // Exception is if a variable is located in non-constant address space + // in OpenCL. + // TODO(cir): perhaps we don't need this at all at CIR since this can + // be done as part of lowering down to LLVM. + bool needsDtor = + d.needsDestruction(getContext()) == QualType::DK_cxx_destructor; + if ((!getContext().getLangOpts().OpenCL || + ty.getAddressSpace() == LangAS::opencl_constant) && + (cgm.getCodeGenOpts().MergeAllConstants && !nrvo && + !d.isEscapingByref() && + ty.isConstantStorage(getContext(), true, !needsDtor))) { + cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: type constant"); + } + // Otherwise, tell the initialization code that we're in this case. + emission.isConstantAggregate = true; + } + // A normal fixed sized variable becomes an alloca in the entry block, // unless: // - it's an NRVO variable. @@ -63,8 +98,22 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, if (const RecordDecl *rd = ty->getAsRecordDecl()) { if (const auto *cxxrd = dyn_cast<CXXRecordDecl>(rd); (cxxrd && !cxxrd->hasTrivialDestructor()) || - rd->isNonTrivialToPrimitiveDestroy()) - cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: set NRVO flag"); + rd->isNonTrivialToPrimitiveDestroy()) { + // In LLVM: Create a flag that is used to indicate when the NRVO was + // applied to this variable. Set it to zero to indicate that NRVO was + // not applied. 
For now, use the same approach for CIRGen until we can
+ // be sure it's worth doing something more aggressive.
+ cir::ConstantOp falseNRVO = builder.getFalse(loc);
+ Address nrvoFlag = createTempAlloca(falseNRVO.getType(),
+ CharUnits::One(), loc, "nrvo",
+ /*arraySize=*/nullptr);
+ assert(builder.getInsertionBlock());
+ builder.createStore(loc, falseNRVO, nrvoFlag);
+
+ // Record the NRVO flag for this variable.
+ nrvoFlags[&d] = nrvoFlag.getPointer();
+ emission.nrvoFlag = nrvoFlag.getPointer();
+ }
}
} else {
// A normal fixed sized variable becomes an alloca in the entry block,
@@ -131,6 +180,47 @@ bool CIRGenFunction::isTrivialInitializer(const Expr *init) {
return false;
}
+static void emitStoresForConstant(CIRGenModule &cgm, const VarDecl &d,
+ Address addr, bool isVolatile,
+ CIRGenBuilderTy &builder,
+ mlir::TypedAttr constant) {
+ mlir::Type ty = constant.getType();
+ cir::CIRDataLayout layout{cgm.getModule()};
+ uint64_t constantSize = layout.getTypeAllocSize(ty);
+ if (!constantSize)
+ return;
+ assert(!cir::MissingFeatures::addAutoInitAnnotation());
+ assert(!cir::MissingFeatures::vectorConstants());
+ assert(!cir::MissingFeatures::shouldUseBZeroPlusStoresToInitialize());
+ assert(!cir::MissingFeatures::shouldUseMemSetToInitialize());
+ assert(!cir::MissingFeatures::shouldSplitConstantStore());
+ assert(!cir::MissingFeatures::shouldCreateMemCpyFromGlobal());
+ // In CIR we want to emit a store for the whole thing, later lowering
+ // prepare to LLVM should unwrap this into the best policy (see asserts
+ // above).
+ //
+ // FIXME(cir): This is closer to memcpy behavior but less optimal, instead of
+ // copying from a global, we just create a cir.const out of it.
+
+ if (addr.getElementType() != ty)
+ addr = addr.withElementType(builder, ty);
+
+ // If the address is an alloca, set the init attribute.
+ // The address is usually an alloca, but there is at least one case where
+ // emitAutoVarInit is called from the OpenACC codegen with an address that
+ // is not an alloca.
+ auto allocaOp = addr.getDefiningOp<cir::AllocaOp>();
+ if (allocaOp)
+ allocaOp.setInitAttr(mlir::UnitAttr::get(&cgm.getMLIRContext()));
+
+ // There are cases where OpenACC codegen calls emitAutoVarInit with a
+ // temporary decl that doesn't have a source range set.
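A local that takes the single-store path sketched in emitStoresForConstant, for illustration:

    void constInit() {
      // The whole initializer becomes one cir.const feeding one store; the
      // LoweringPrepare pass can later turn it into memset/memcpy as needed.
      int table[4] = {1, 2, 3, 4};
      (void)table;
    }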
+ mlir::Location loc = builder.getUnknownLoc(); + if (d.getSourceRange().isValid()) + loc = cgm.getLoc(d.getSourceRange()); + builder.createStore(loc, builder.getConstant(loc, constant), addr); +} + void CIRGenFunction::emitAutoVarInit( const CIRGenFunction::AutoVarEmission &emission) { assert(emission.variable && "emission was not valid!"); @@ -237,6 +327,9 @@ void CIRGenFunction::emitAutoVarInit( return emitStoreThroughLValue( RValue::get(builder.getConstant(initLoc, typedConstant)), lv); } + + emitStoresForConstant(cgm, d, addr, type.isVolatileQualified(), builder, + typedConstant); } void CIRGenFunction::emitAutoVarCleanups( @@ -361,7 +454,8 @@ CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d, if (supportsCOMDAT() && gv.isWeakForLinker()) gv.setComdat(true); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); + if (d.getTLSKind()) + errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: TLS"); setGVProperties(gv, &d); @@ -447,7 +541,8 @@ cir::GlobalOp CIRGenFunction::addInitializerToStaticVarDecl( bool needsDtor = d.needsDestruction(getContext()) == QualType::DK_cxx_destructor; - assert(!cir::MissingFeatures::opGlobalConstant()); + gv.setConstant(d.getType().isConstantStorage( + getContext(), /*ExcludeCtor=*/true, !needsDtor)); gv.setInitialValueAttr(init); emitter.finalize(gv); @@ -718,21 +813,72 @@ namespace { struct DestroyObject final : EHScopeStack::Cleanup { DestroyObject(Address addr, QualType type, CIRGenFunction::Destroyer *destroyer) - : addr(addr), type(type), destroyer(destroyer) {} + : addr(addr), type(type), destroyer(destroyer) { + assert(!cir::MissingFeatures::useEHCleanupForArray()); + } Address addr; QualType type; CIRGenFunction::Destroyer *destroyer; - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { + assert(!cir::MissingFeatures::useEHCleanupForArray()); cgf.emitDestroy(addr, type, destroyer); } }; +template <class Derived> struct DestroyNRVOVariable : EHScopeStack::Cleanup { + DestroyNRVOVariable(Address addr, QualType type, mlir::Value nrvoFlag) + : nrvoFlag(nrvoFlag), addr(addr), ty(type) {} + + mlir::Value nrvoFlag; + Address addr; + QualType ty; + + void emit(CIRGenFunction &cgf, Flags flags) override { + // Along the exceptions path we always execute the dtor. + bool nrvo = flags.isForNormalCleanup() && nrvoFlag; + + CIRGenBuilderTy &builder = cgf.getBuilder(); + mlir::OpBuilder::InsertionGuard guard(builder); + if (nrvo) { + // If we exited via NRVO, we skip the destructor call. 
+ mlir::Location loc = addr.getPointer().getLoc(); + mlir::Value didNRVO = builder.createFlagLoad(loc, nrvoFlag); + mlir::Value notNRVO = builder.createNot(didNRVO); + cir::IfOp::create(builder, loc, notNRVO, /*withElseRegion=*/false, + [&](mlir::OpBuilder &b, mlir::Location) { + static_cast<Derived *>(this)->emitDestructorCall(cgf); + builder.createYield(loc); + }); + } else { + static_cast<Derived *>(this)->emitDestructorCall(cgf); + } + } + + virtual ~DestroyNRVOVariable() = default; +}; + +struct DestroyNRVOVariableCXX final + : DestroyNRVOVariable<DestroyNRVOVariableCXX> { + DestroyNRVOVariableCXX(Address addr, QualType type, + const CXXDestructorDecl *dtor, mlir::Value nrvoFlag) + : DestroyNRVOVariable<DestroyNRVOVariableCXX>(addr, type, nrvoFlag), + dtor(dtor) {} + + const CXXDestructorDecl *dtor; + + void emitDestructorCall(CIRGenFunction &cgf) { + cgf.emitCXXDestructorCall(dtor, Dtor_Complete, + /*forVirtualBase=*/false, + /*delegating=*/false, addr, ty); + } +}; + struct CallStackRestore final : EHScopeStack::Cleanup { Address stack; CallStackRestore(Address stack) : stack(stack) {} - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { mlir::Location loc = stack.getPointer().getLoc(); mlir::Value v = cgf.getBuilder().createLoad(loc, stack); cgf.getBuilder().createStackRestore(loc, v); @@ -834,6 +980,7 @@ void CIRGenFunction::emitDestroy(Address addr, QualType type, return; mlir::Value begin = addr.getPointer(); + assert(!cir::MissingFeatures::useEHCleanupForArray()); emitArrayDestroy(begin, length, type, elementAlign, destroyer); // If the array destroy didn't use the length op, we can erase it. @@ -885,7 +1032,10 @@ void CIRGenFunction::emitAutoVarTypeCleanup( // If there's an NRVO flag on the emission, we need a different // cleanup. if (emission.nrvoFlag) { - cgm.errorNYI(var->getSourceRange(), "emitAutoVarTypeCleanup: NRVO"); + assert(!type->isArrayType()); + CXXDestructorDecl *dtor = type->getAsCXXRecordDecl()->getDestructor(); + ehStack.pushCleanup<DestroyNRVOVariableCXX>(cleanupKind, addr, type, dtor, + emission.nrvoFlag); return; } // Otherwise, this is handled below. 
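A sketch of a function where the NRVO flag matters: only one return statement actually returns the NRVO candidate, so the destructor call pushed above is guarded by the flag (assumed example):

    struct Widget { ~Widget(); };
    Widget make(bool fallback) {
      Widget w;          // NRVO candidate; the "nrvo" flag starts out false
      if (fallback)
        return Widget{}; // w is destroyed normally on this path
      return w;          // flag is set; the guarded cleanup skips ~Widget()
    }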
@@ -903,7 +1053,7 @@ void CIRGenFunction::emitAutoVarTypeCleanup( if (!destroyer) destroyer = getDestroyer(dtorKind); - assert(!cir::MissingFeatures::ehCleanupFlags()); + assert(!cir::MissingFeatures::useEHCleanupForArray()); ehStack.pushCleanup<DestroyObject>(cleanupKind, addr, type, destroyer); } diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp index b588a50..8e6a693 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp @@ -11,24 +11,411 @@ //===----------------------------------------------------------------------===// #include "CIRGenFunction.h" +#include "CIRGenOpenACCHelpers.h" + +#include "mlir/Dialect/OpenACC/OpenACC.h" #include "clang/AST/DeclOpenACC.h" +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace clang::CIRGen; +namespace { +struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { + mlir::acc::DeclareEnterOp enterOp; + + OpenACCDeclareCleanup(mlir::acc::DeclareEnterOp enterOp) : enterOp(enterOp) {} + + template <typename OutTy, typename InTy> + void createOutOp(CIRGenFunction &cgf, InTy inOp) { + if constexpr (std::is_same_v<OutTy, mlir::acc::DeleteOp>) { + auto outOp = + OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, + inOp.getStructured(), inOp.getImplicit(), + llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); + outOp.setDataClause(inOp.getDataClause()); + outOp.setModifiers(inOp.getModifiers()); + } else { + auto outOp = + OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, inOp.getVarPtr(), + inOp.getStructured(), inOp.getImplicit(), + llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); + outOp.setDataClause(inOp.getDataClause()); + outOp.setModifiers(inOp.getModifiers()); + } + } + + void emit(CIRGenFunction &cgf, Flags flags) override { + auto exitOp = mlir::acc::DeclareExitOp::create( + cgf.getBuilder(), enterOp.getLoc(), enterOp, {}); + + // Some data clauses need to be referenced in 'exit', AND need to have an + // operation after the exit. Copy these from the enter operation. + for (mlir::Value val : enterOp.getDataClauseOperands()) { + if (auto copyin = val.getDefiningOp<mlir::acc::CopyinOp>()) { + switch (copyin.getDataClause()) { + default: + llvm_unreachable( + "OpenACC local declare clause copyin unexpected data clause"); + break; + case mlir::acc::DataClause::acc_copy: + createOutOp<mlir::acc::CopyoutOp>(cgf, copyin); + break; + case mlir::acc::DataClause::acc_copyin: + createOutOp<mlir::acc::DeleteOp>(cgf, copyin); + break; + } + } else if (auto create = val.getDefiningOp<mlir::acc::CreateOp>()) { + switch (create.getDataClause()) { + default: + llvm_unreachable( + "OpenACC local declare clause create unexpected data clause"); + break; + case mlir::acc::DataClause::acc_copyout: + createOutOp<mlir::acc::CopyoutOp>(cgf, create); + break; + case mlir::acc::DataClause::acc_create: + createOutOp<mlir::acc::DeleteOp>(cgf, create); + break; + } + } else if (auto present = val.getDefiningOp<mlir::acc::PresentOp>()) { + createOutOp<mlir::acc::DeleteOp>(cgf, present); + } else if (auto dev_res = + val.getDefiningOp<mlir::acc::DeclareDeviceResidentOp>()) { + createOutOp<mlir::acc::DeleteOp>(cgf, dev_res); + } else if (val.getDefiningOp<mlir::acc::DeclareLinkOp>()) { + // Link has no exit clauses, and shouldn't be copied. + continue; + } else if (val.getDefiningOp<mlir::acc::DevicePtrOp>()) { + // DevicePtr has no exit clauses, and shouldn't be copied. 
+ continue; + } else { + llvm_unreachable("OpenACC local declare clause unexpected defining op"); + continue; + } + exitOp.getDataClauseOperandsMutable().append(val); + } + } +}; +} // namespace + +void CIRGenModule::emitGlobalOpenACCDecl(const OpenACCConstructDecl *d) { + if (const auto *rd = dyn_cast<OpenACCRoutineDecl>(d)) + emitGlobalOpenACCRoutineDecl(rd); + else + emitGlobalOpenACCDeclareDecl(cast<OpenACCDeclareDecl>(d)); +} + void CIRGenFunction::emitOpenACCDeclare(const OpenACCDeclareDecl &d) { - getCIRGenModule().errorNYI(d.getSourceRange(), "OpenACC Declare Construct"); + mlir::Location exprLoc = cgm.getLoc(d.getBeginLoc()); + auto enterOp = mlir::acc::DeclareEnterOp::create( + builder, exprLoc, mlir::acc::DeclareTokenType::get(&cgm.getMLIRContext()), + {}); + + emitOpenACCClauses(enterOp, OpenACCDirectiveKind::Declare, d.clauses()); + + ehStack.pushCleanup<OpenACCDeclareCleanup>(CleanupKind::NormalCleanup, + enterOp); +} + +// Helper function that gets the declaration referenced by the declare clause. +// This is a simplified verison of the work that `getOpenACCDataOperandInfo` +// does, as it only has to get forms that 'declare' does. +static const Decl *getDeclareReferencedDecl(const Expr *e) { + const Expr *curVarExpr = e->IgnoreParenImpCasts(); + + // Since we allow array sections, we have to unpack the array sections here. + // We don't have to worry about other bounds, since only variable or array + // name (plus array sections as an extension) are permitted. + while (const auto *ase = dyn_cast<ArraySectionExpr>(curVarExpr)) + curVarExpr = ase->getBase()->IgnoreParenImpCasts(); + + if (const auto *dre = dyn_cast<DeclRefExpr>(curVarExpr)) + return dre->getFoundDecl()->getCanonicalDecl(); + + // MemberExpr is allowed when it is implicit 'this'. + return cast<MemberExpr>(curVarExpr)->getMemberDecl()->getCanonicalDecl(); +} + +template <typename BeforeOpTy, typename DataClauseTy> +void CIRGenModule::emitGlobalOpenACCDeclareDataOperands( + const Expr *varOperand, DataClauseTy dataClause, + OpenACCModifierKind modifiers, bool structured, bool implicit, + bool requiresDtor) { + // This is a template argument so that we don't have to include all of + // mlir::acc into CIRGenModule. + static_assert(std::is_same_v<DataClauseTy, mlir::acc::DataClause>); + mlir::Location exprLoc = getLoc(varOperand->getBeginLoc()); + const Decl *refedDecl = getDeclareReferencedDecl(varOperand); + StringRef varName = getMangledName(GlobalDecl{cast<VarDecl>(refedDecl)}); + + // We have to emit two separate functions in this case, an acc_ctor and an + // acc_dtor. These two sections are/should remain reasonably equal, however + // the order of the clauses/vs-enter&exit in them makes combining these two + // sections not particularly attractive, so we have a bit of repetition. + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + auto ctorOp = mlir::acc::GlobalConstructorOp::create( + builder, exprLoc, (varName + "_acc_ctor").str()); + getModule().push_back(ctorOp); + mlir::Block *block = builder.createBlock(&ctorOp.getRegion(), + ctorOp.getRegion().end(), {}, {}); + builder.setInsertionPointToEnd(block); + // These things are close enough to a function handling-wise we can just + // create this here. 
+ CIRGenFunction cgf{*this, builder, true}; + llvm::SaveAndRestore<CIRGenFunction *> savedCGF(curCGF, &cgf); + cgf.curFn = ctorOp; + CIRGenFunction::SourceLocRAIIObject fnLoc{cgf, exprLoc}; + + // This gets the information we need, PLUS emits the bounds correctly, so we + // have to do this in both enter and exit. + CIRGenFunction::OpenACCDataOperandInfo inf = + cgf.getOpenACCDataOperandInfo(varOperand); + auto beforeOp = + BeforeOpTy::create(builder, exprLoc, inf.varValue, structured, implicit, + inf.name, inf.bounds); + beforeOp.setDataClause(dataClause); + beforeOp.setModifiers(convertOpenACCModifiers(modifiers)); + + mlir::acc::DeclareEnterOp::create( + builder, exprLoc, mlir::acc::DeclareTokenType::get(&getMLIRContext()), + beforeOp.getResult()); + + mlir::acc::TerminatorOp::create(builder, exprLoc); + } + + // copyin, create, and device_resident require a destructor, link does not. In + // the case of the first three, they are all a 'getdeviceptr', followed by the + // declare_exit, followed by a delete op in the destructor region. + if (requiresDtor) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + auto ctorOp = mlir::acc::GlobalDestructorOp::create( + builder, exprLoc, (varName + "_acc_dtor").str()); + getModule().push_back(ctorOp); + mlir::Block *block = builder.createBlock(&ctorOp.getRegion(), + ctorOp.getRegion().end(), {}, {}); + builder.setInsertionPointToEnd(block); + + // These things are close enough to a function handling-wise we can just + // create this here. + CIRGenFunction cgf{*this, builder, true}; + llvm::SaveAndRestore<CIRGenFunction *> savedCGF(curCGF, &cgf); + cgf.curFn = ctorOp; + CIRGenFunction::SourceLocRAIIObject fnLoc{cgf, exprLoc}; + + CIRGenFunction::OpenACCDataOperandInfo inf = + cgf.getOpenACCDataOperandInfo(varOperand); + auto getDevPtr = mlir::acc::GetDevicePtrOp::create( + builder, exprLoc, inf.varValue, structured, implicit, inf.name, + inf.bounds); + getDevPtr.setDataClause(dataClause); + getDevPtr.setModifiers(convertOpenACCModifiers(modifiers)); + + mlir::acc::DeclareExitOp::create(builder, exprLoc, /*token=*/mlir::Value{}, + getDevPtr.getResult()); + auto deleteOp = mlir::acc::DeleteOp::create( + builder, exprLoc, getDevPtr, structured, implicit, inf.name, {}); + deleteOp.setDataClause(dataClause); + deleteOp.setModifiers(convertOpenACCModifiers(modifiers)); + mlir::acc::TerminatorOp::create(builder, exprLoc); + } +} +namespace { +// This class emits all of the information for a 'declare' at a global/ns/class +// scope. Each clause results in its own acc_ctor and acc_dtor for the variable. +// This class creates those and emits them properly. +// This behavior is unique/special enough from the emission of statement-level +// clauses that it doesn't really make sense to use that clause visitor. 
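For reference, a file-scope declare of the kind this emitter handles; each clause yields an acc_ctor and, where needed, an acc_dtor for the named variable (variable names are illustrative):

    int big_table[1024];
    #pragma acc declare copyin(big_table) // acc_ctor: copyin + declare_enter;
                                          // acc_dtor: getdeviceptr + declare_exit + delete
    float linked_var;
    #pragma acc declare link(linked_var)  // acc_ctor only; link has no exit clause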
+class OpenACCGlobalDeclareClauseEmitter final + : public OpenACCClauseVisitor<OpenACCGlobalDeclareClauseEmitter> { + CIRGenModule &cgm; + +public: + OpenACCGlobalDeclareClauseEmitter(CIRGenModule &cgm) : cgm(cgm) {} + + void VisitClause(const OpenACCClause &clause) { + llvm_unreachable("Invalid OpenACC clause on global Declare"); + } + + void emitClauses(ArrayRef<const OpenACCClause *> clauses) { + this->VisitClauseList(clauses); + } + + void VisitCopyInClause(const OpenACCCopyInClause &clause) { + for (const Expr *var : clause.getVarList()) + cgm.emitGlobalOpenACCDeclareDataOperands<mlir::acc::CopyinOp>( + var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false, /*requiresDtor=*/true); + } + + void VisitCreateClause(const OpenACCCreateClause &clause) { + for (const Expr *var : clause.getVarList()) + cgm.emitGlobalOpenACCDeclareDataOperands<mlir::acc::CreateOp>( + var, mlir::acc::DataClause::acc_create, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false, /*requiresDtor=*/true); + } + + void VisitDeviceResidentClause(const OpenACCDeviceResidentClause &clause) { + for (const Expr *var : clause.getVarList()) + cgm.emitGlobalOpenACCDeclareDataOperands< + mlir::acc::DeclareDeviceResidentOp>( + var, mlir::acc::DataClause::acc_declare_device_resident, {}, + /*structured=*/true, + /*implicit=*/false, /*requiresDtor=*/true); + } + + void VisitLinkClause(const OpenACCLinkClause &clause) { + for (const Expr *var : clause.getVarList()) + cgm.emitGlobalOpenACCDeclareDataOperands<mlir::acc::DeclareLinkOp>( + var, mlir::acc::DataClause::acc_declare_link, {}, + /*structured=*/true, + /*implicit=*/false, /*requiresDtor=*/false); + } +}; +} // namespace + +void CIRGenModule::emitGlobalOpenACCDeclareDecl(const OpenACCDeclareDecl *d) { + // Declare creates 1 'acc_ctor' and 0-1 'acc_dtor' per clause, since it needs + // a unique one on a per-variable basis. We can just use a clause emitter to + // do all the work. + mlir::OpBuilder::InsertionGuard guardCase(builder); + OpenACCGlobalDeclareClauseEmitter em{*this}; + em.emitClauses(d->clauses()); } void CIRGenFunction::emitOpenACCRoutine(const OpenACCRoutineDecl &d) { - getCIRGenModule().errorNYI(d.getSourceRange(), "OpenACC Routine Construct"); + // Do nothing here. The OpenACCRoutineDeclAttr handles the implicit name + // cases, and the end-of-TU handling manages the named cases. This is + // necessary because these references aren't necessarily emitted themselves, + // but can be named anywhere. } -void CIRGenModule::emitGlobalOpenACCDecl(const OpenACCConstructDecl *d) { - if (isa<OpenACCRoutineDecl>(d)) - errorNYI(d->getSourceRange(), "OpenACC Routine Construct"); - else if (isa<OpenACCDeclareDecl>(d)) - errorNYI(d->getSourceRange(), "OpenACC Declare Construct"); - else - llvm_unreachable("unknown OpenACC declaration kind?"); +void CIRGenModule::emitGlobalOpenACCRoutineDecl(const OpenACCRoutineDecl *d) { + // Do nothing here. The OpenACCRoutineDeclAttr handles the implicit name + // cases, and the end-of-TU handling manages the named cases. This is + // necessary because these references aren't necessarily emitted themselves, + // but can be named anywhere. 
+} + +namespace { +class OpenACCRoutineClauseEmitter final + : public OpenACCClauseVisitor<OpenACCRoutineClauseEmitter> { + CIRGenModule &cgm; + CIRGen::CIRGenBuilderTy &builder; + mlir::acc::RoutineOp routineOp; + llvm::SmallVector<mlir::acc::DeviceType> lastDeviceTypeValues; + +public: + OpenACCRoutineClauseEmitter(CIRGenModule &cgm, + CIRGen::CIRGenBuilderTy &builder, + mlir::acc::RoutineOp routineOp) + : cgm(cgm), builder(builder), routineOp(routineOp) {} + + void emitClauses(ArrayRef<const OpenACCClause *> clauses) { + this->VisitClauseList(clauses); + } + + void VisitClause(const OpenACCClause &clause) { + llvm_unreachable("Invalid OpenACC clause on routine"); + } + + void VisitSeqClause(const OpenACCSeqClause &clause) { + routineOp.addSeq(builder.getContext(), lastDeviceTypeValues); + } + void VisitWorkerClause(const OpenACCWorkerClause &clause) { + routineOp.addWorker(builder.getContext(), lastDeviceTypeValues); + } + void VisitVectorClause(const OpenACCVectorClause &clause) { + routineOp.addVector(builder.getContext(), lastDeviceTypeValues); + } + + void VisitNoHostClause(const OpenACCNoHostClause &clause) { + routineOp.setNohost(/*attrValue=*/true); + } + + void VisitGangClause(const OpenACCGangClause &clause) { + // Gang has an optional 'dim' value, which is a constant int of 1, 2, or 3. + // If we don't store any expressions in the clause, there are none, else we + // expect there is 1, since Sema should enforce that the single 'dim' is the + // only valid value. + if (clause.getNumExprs() == 0) { + routineOp.addGang(builder.getContext(), lastDeviceTypeValues); + } else { + assert(clause.getNumExprs() == 1); + auto [kind, expr] = clause.getExpr(0); + assert(kind == OpenACCGangKind::Dim); + + llvm::APSInt curValue = expr->EvaluateKnownConstInt(cgm.getASTContext()); + // The value is 1, 2, or 3, but 64 bit seems right enough. + curValue = curValue.sextOrTrunc(64); + routineOp.addGang(builder.getContext(), lastDeviceTypeValues, + curValue.getZExtValue()); + } + } + + void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) { + lastDeviceTypeValues.clear(); + + for (const DeviceTypeArgument &arg : clause.getArchitectures()) + lastDeviceTypeValues.push_back(decodeDeviceType(arg.getIdentifierInfo())); + } + + void VisitBindClause(const OpenACCBindClause &clause) { + if (clause.isStringArgument()) { + mlir::StringAttr value = + builder.getStringAttr(clause.getStringArgument()->getString()); + + routineOp.addBindStrName(builder.getContext(), lastDeviceTypeValues, + value); + } else { + assert(clause.isIdentifierArgument()); + cgm.errorNYI(clause.getSourceRange(), + "Bind with an identifier argument is not yet supported"); + } + } +}; +} // namespace + +void CIRGenModule::emitOpenACCRoutineDecl( + const clang::FunctionDecl *funcDecl, cir::FuncOp func, + SourceLocation pragmaLoc, ArrayRef<const OpenACCClause *> clauses) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + // These need to appear at the global module. + builder.setInsertionPointToEnd(&getModule().getBodyRegion().front()); + + mlir::Location routineLoc = getLoc(pragmaLoc); + + std::stringstream routineNameSS; + // This follows the same naming format as Flang. + routineNameSS << "acc_routine_" << routineCounter++; + std::string routineName = routineNameSS.str(); + + // There isn't a good constructor for RoutineOp that just takes a location + + // name + function, so we use one that creates an otherwise RoutineOp and + // count on the visitor/emitter to fill these in. 
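Routine directives of the shape handled here, including the optional gang dim value and a string bind; the function names are assumed:

    #pragma acc routine gang(dim:2)
    void worker_fn(int *data, int n);

    #pragma acc routine seq bind("device_helper")
    void helper_fn(int x);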
+ auto routineOp = mlir::acc::RoutineOp::create( + builder, routineLoc, routineName, + mlir::SymbolRefAttr::get(builder.getContext(), func.getName()), + /*implicit=*/false); + + // We have to add a pointer going the other direction via an acc.routine_info, + // from the func to the routine. + llvm::SmallVector<mlir::SymbolRefAttr> funcRoutines; + if (auto routineInfo = + func.getOperation()->getAttrOfType<mlir::acc::RoutineInfoAttr>( + mlir::acc::getRoutineInfoAttrName())) + funcRoutines.append(routineInfo.getAccRoutines().begin(), + routineInfo.getAccRoutines().end()); + + funcRoutines.push_back( + mlir::SymbolRefAttr::get(builder.getContext(), routineName)); + func.getOperation()->setAttr( + mlir::acc::getRoutineInfoAttrName(), + mlir::acc::RoutineInfoAttr::get(func.getContext(), funcRoutines)); + + OpenACCRoutineClauseEmitter emitter{*this, builder, routineOp}; + emitter.emitClauses(clauses); } diff --git a/clang/lib/CIR/CodeGen/CIRGenException.cpp b/clang/lib/CIR/CodeGen/CIRGenException.cpp index 67f46ff..3758284 100644 --- a/clang/lib/CIR/CodeGen/CIRGenException.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenException.cpp @@ -14,6 +14,8 @@ #include "CIRGenFunction.h" #include "clang/AST/StmtVisitor.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace clang::CIRGen; @@ -343,7 +345,8 @@ void CIRGenFunction::enterCXXTryStmt(const CXXTryStmt &s, cir::TryOp tryOp, // No exception decl indicates '...', a catch-all. mlir::Region *handler = &tryOp.getHandlerRegions()[i]; - catchScope->setHandler(i, cgm.getCXXABI().getCatchAllTypeInfo(), handler); + catchScope->setHandler(i, cgm.getCXXABI().getCatchAllTypeInfo(), handler, + s.getHandler(i)); // Under async exceptions, catch(...) needs to catch HW exception too // Mark scope with SehTryBegin as a SEH __try scope @@ -382,5 +385,268 @@ void CIRGenFunction::exitCXXTryStmt(const CXXTryStmt &s, bool isFnTryBlock) { return; } - cgm.errorNYI("exitCXXTryStmt: Required catch"); + // Copy the handler blocks off before we pop the EH stack. Emitting + // the handlers might scribble on this memory. + SmallVector<EHCatchScope::Handler> handlers(catchScope.begin(), + catchScope.begin() + numHandlers); + + ehStack.popCatch(); + + // Determine if we need an implicit rethrow for all these catch handlers; + // see the comment below. + bool doImplicitRethrow = + isFnTryBlock && isa<CXXDestructorDecl, CXXConstructorDecl>(curCodeDecl); + + // Wasm uses Windows-style EH instructions, but merges all catch clauses into + // one big catchpad. So we save the old funclet pad here before we traverse + // each catch handler. + if (EHPersonality::get(*this).isWasmPersonality()) { + cgm.errorNYI("exitCXXTryStmt: WASM personality"); + return; + } + + bool hasCatchAll = false; + for (auto &handler : llvm::reverse(handlers)) { + hasCatchAll |= handler.isCatchAll(); + mlir::Region *catchRegion = handler.region; + const CXXCatchStmt *catchStmt = handler.stmt; + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&catchRegion->front()); + + // Enter a cleanup scope, including the catch variable and the + // end-catch. + RunCleanupsScope catchScope(*this); + + // Initialize the catch variable and set up the cleanups. + assert(!cir::MissingFeatures::catchParamOp()); + + // Emit the PGO counter increment. + assert(!cir::MissingFeatures::incrementProfileCounter()); + + // Perform the body of the catch. 
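A minimal source-level example of what is being emitted here, using a catch-all handler as set up by the setHandler call above:

    void mayThrow();
    void guarded() {
      try {
        mayThrow();
      } catch (...) { // handler body is emitted into the cir.try handler
                      // region, followed by an implicit cir.yield
      }
    }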
+ [[maybe_unused]] mlir::LogicalResult emitResult = + emitStmt(catchStmt->getHandlerBlock(), /*useCurrentScope=*/true); + assert(emitResult.succeeded() && "failed to emit catch handler block"); + + assert(!cir::MissingFeatures::catchParamOp()); + cir::YieldOp::create(builder, tryOp->getLoc()); + + // [except.handle]p11: + // The currently handled exception is rethrown if control + // reaches the end of a handler of the function-try-block of a + // constructor or destructor. + + // It is important that we only do this on fallthrough and not on + // return. Note that it's illegal to put a return in a + // constructor function-try-block's catch handler (p14), so this + // really only applies to destructors. + if (doImplicitRethrow) { + cgm.errorNYI("exitCXXTryStmt: doImplicitRethrow"); + return; + } + + // Fall out through the catch cleanups. + catchScope.forceCleanup(); + } + + // Because in wasm we merge all catch clauses into one big catchpad, in case + // none of the types in catch handlers matches after we test against each of + // them, we should unwind to the next EH enclosing scope. We generate a call + // to rethrow function here to do that. + if (EHPersonality::get(*this).isWasmPersonality() && !hasCatchAll) { + cgm.errorNYI("exitCXXTryStmt: WASM personality without catch all"); + } + + assert(!cir::MissingFeatures::incrementProfileCounter()); +} + +void CIRGenFunction::populateCatchHandlers(cir::TryOp tryOp) { + assert(ehStack.requiresCatchOrCleanup()); + assert(!cgm.getLangOpts().IgnoreExceptions && + "LandingPad should not be emitted when -fignore-exceptions are in " + "effect."); + + EHScope &innermostEHScope = *ehStack.find(ehStack.getInnermostEHScope()); + switch (innermostEHScope.getKind()) { + case EHScope::Terminate: + cgm.errorNYI("populateCatchHandlers: terminate"); + return; + + case EHScope::Catch: + case EHScope::Cleanup: + case EHScope::Filter: + // CIR does not cache landing pads. + break; + } + + // If there's an existing TryOp, it means we got a `cir.try` scope + // that leads to this "landing pad" creation site. Otherwise, exceptions + // are enabled but a throwing function is called anyways (common pattern + // with function local static initializers). + mlir::ArrayAttr handlerTypesAttr = tryOp.getHandlerTypesAttr(); + if (!handlerTypesAttr || handlerTypesAttr.empty()) { + // Accumulate all the handlers in scope. + bool hasCatchAll = false; + llvm::SmallVector<mlir::Attribute, 4> handlerAttrs; + for (EHScopeStack::iterator i = ehStack.begin(), e = ehStack.end(); i != e; + ++i) { + switch (i->getKind()) { + case EHScope::Cleanup: + cgm.errorNYI("emitLandingPad: Cleanup"); + return; + + case EHScope::Filter: + cgm.errorNYI("emitLandingPad: Filter"); + return; + + case EHScope::Terminate: + cgm.errorNYI("emitLandingPad: Terminate"); + return; + + case EHScope::Catch: + break; + } // end switch + + EHCatchScope &catchScope = cast<EHCatchScope>(*i); + for (const EHCatchScope::Handler &handler : + llvm::make_range(catchScope.begin(), catchScope.end())) { + assert(handler.type.flags == 0 && + "landingpads do not support catch handler flags"); + + // If this is a catch-all, register that and abort. 
+ if (handler.isCatchAll()) { + assert(!hasCatchAll); + hasCatchAll = true; + break; + } + + cgm.errorNYI("emitLandingPad: non catch-all"); + return; + } + + if (hasCatchAll) + break; + } + + if (hasCatchAll) { + handlerAttrs.push_back(cir::CatchAllAttr::get(&getMLIRContext())); + } else { + cgm.errorNYI("emitLandingPad: non catch-all"); + return; + } + + // Add final array of clauses into TryOp. + tryOp.setHandlerTypesAttr( + mlir::ArrayAttr::get(&getMLIRContext(), handlerAttrs)); + } + + // In traditional LLVM codegen. this tells the backend how to generate the + // landing pad by generating a branch to the dispatch block. In CIR, + // this is used to populate blocks for later filing during + // cleanup handling. + populateEHCatchRegions(ehStack.getInnermostEHScope(), tryOp); +} + +// Differently from LLVM traditional codegen, there are no dispatch blocks +// to look at given cir.try_call does not jump to blocks like invoke does. +// However. +void CIRGenFunction::populateEHCatchRegions(EHScopeStack::stable_iterator scope, + cir::TryOp tryOp) { + if (EHPersonality::get(*this).usesFuncletPads()) { + cgm.errorNYI("getEHDispatchBlock: usesFuncletPads"); + return; + } + + // Otherwise, we should look at the actual scope. + EHScope &ehScope = *ehStack.find(scope); + bool mayThrow = ehScope.mayThrow(); + + mlir::Block *originalBlock = nullptr; + if (mayThrow && tryOp) { + // If the dispatch is cached but comes from a different tryOp, make sure: + // - Populate current `tryOp` with a new dispatch block regardless. + // - Update the map to enqueue new dispatchBlock to also get a cleanup. See + // code at the end of the function. + cgm.errorNYI("getEHDispatchBlock: mayThrow & tryOp"); + return; + } + + if (!mayThrow) { + switch (ehScope.getKind()) { + case EHScope::Catch: { + // LLVM does some optimization with branches here, CIR just keep track of + // the corresponding calls. + EHCatchScope &catchScope = cast<EHCatchScope>(ehScope); + if (catchScope.getNumHandlers() == 1 && + catchScope.getHandler(0).isCatchAll()) { + mayThrow = true; + break; + } + cgm.errorNYI("getEHDispatchBlock: mayThrow non-catch all"); + return; + } + case EHScope::Cleanup: { + cgm.errorNYI("getEHDispatchBlock: mayThrow & cleanup"); + return; + } + case EHScope::Filter: { + cgm.errorNYI("getEHDispatchBlock: mayThrow & Filter"); + return; + } + case EHScope::Terminate: { + cgm.errorNYI("getEHDispatchBlock: mayThrow & Terminate"); + return; + } + } + } + + if (originalBlock) { + cgm.errorNYI("getEHDispatchBlock: originalBlock"); + return; + } + + ehScope.setMayThrow(mayThrow); +} + +// in classic codegen this function is mapping to `isInvokeDest` previously and +// currently it's mapping to the conditions that performs early returns in +// `getInvokeDestImpl`, in CIR we need the condition to know if the EH scope may +// throw exception or now. +bool CIRGenFunction::isCatchOrCleanupRequired() { + // If exceptions are disabled/ignored and SEH is not in use, then there is no + // invoke destination. SEH "works" even if exceptions are off. In practice, + // this means that C++ destructors and other EH cleanups don't run, which is + // consistent with MSVC's behavior, except in the presence of -EHa + const LangOptions &lo = cgm.getLangOpts(); + if (!lo.Exceptions || lo.IgnoreExceptions) { + if (!lo.Borland && !lo.MicrosoftExt) + return false; + cgm.errorNYI("isInvokeDest: no exceptions or ignore exception"); + return false; + } + + // CUDA device code doesn't have exceptions. 
+ if (lo.CUDA && lo.CUDAIsDevice) + return false; + + return ehStack.requiresCatchOrCleanup(); +} + +// In classic codegen this function is equivalent to `getInvokeDestImpl`, in +// ClangIR we don't need to return to return any landing pad, we just need to +// populate the catch handlers if they are required +void CIRGenFunction::populateCatchHandlersIfRequired(cir::TryOp tryOp) { + assert(ehStack.requiresCatchOrCleanup()); + assert(!ehStack.empty()); + + assert(!cir::MissingFeatures::setFunctionPersonality()); + + // CIR does not cache landing pads. + const EHPersonality &personality = EHPersonality::get(*this); + if (personality.usesFuncletPads()) { + cgm.errorNYI("getInvokeDestImpl: usesFuncletPads"); + } else { + populateCatchHandlers(tryOp); + } } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5ccb431..cac046c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -22,7 +22,11 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" +#include "clang/Basic/AddressSpaces.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" #include <optional> @@ -184,6 +188,7 @@ Address CIRGenFunction::emitPointerWithAlignment(const Expr *expr, case CK_HLSLArrayRValue: case CK_HLSLElementwiseCast: case CK_HLSLVectorTruncation: + case CK_HLSLMatrixTruncation: case CK_IntToOCLSampler: case CK_IntegralCast: case CK_IntegralComplexCast: @@ -277,7 +282,6 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, QualType t = e->getType(); // If it's thread_local, emit a call to its wrapper function instead. - assert(!cir::MissingFeatures::opGlobalThreadLocal()); if (vd->getTLSKind() == VarDecl::TLS_Dynamic) cgf.cgm.errorNYI(e->getSourceRange(), "emitGlobalVarDeclLValue: thread_local variable"); @@ -311,8 +315,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { - assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) { // Boolean vectors use `iN` as storage type. @@ -333,7 +337,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
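As a small illustration of the new branch in emitStoreOfScalar above, a store through an atomic l-value (C11 _Atomic shown) is now routed through emitAtomicStore rather than a plain store:

  _Atomic int counter;

  void set(int v) {
    counter = v;   /* ty->isAtomicType() is true for this l-value */
  }
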
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,13 +560,15 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, QualType ty, SourceLocation loc, LValueBaseInfo baseInfo) { - assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); + // Traditional LLVM codegen handles thread local separately, CIR handles + // as part of getAddrOfGlobalVar (GetGlobalOp). mlir::Type eltTy = addr.getElementType(); if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) { @@ -619,10 +631,82 @@ RValue CIRGenFunction::emitLoadOfLValue(LValue lv, SourceLocation loc) { lv.getVectorIdx())); } + if (lv.isExtVectorElt()) + return emitLoadOfExtVectorElementLValue(lv); + cgm.errorNYI(loc, "emitLoadOfLValue"); return RValue::get(nullptr); } +int64_t CIRGenFunction::getAccessedFieldNo(unsigned int idx, + const mlir::ArrayAttr elts) { + auto elt = mlir::cast<mlir::IntegerAttr>(elts[idx]); + return elt.getInt(); +} + +// If this is a reference to a subset of the elements of a vector, create an +// appropriate shufflevector. +RValue CIRGenFunction::emitLoadOfExtVectorElementLValue(LValue lv) { + mlir::Location loc = lv.getExtVectorPointer().getLoc(); + mlir::Value vec = builder.createLoad(loc, lv.getExtVectorAddress()); + + // HLSL allows treating scalars as one-element vectors. Converting the scalar + // IR value to a vector here allows the rest of codegen to behave as normal. + if (getLangOpts().HLSL && !mlir::isa<cir::VectorType>(vec.getType())) { + cgm.errorNYI(loc, "emitLoadOfExtVectorElementLValue: HLSL"); + return {}; + } + + const mlir::ArrayAttr elts = lv.getExtVectorElts(); + + // If the result of the expression is a non-vector type, we must be extracting + // a single element. Just codegen as an extractelement. + const auto *exprVecTy = lv.getType()->getAs<clang::VectorType>(); + if (!exprVecTy) { + int64_t indexValue = getAccessedFieldNo(0, elts); + cir::ConstantOp index = + builder.getConstInt(loc, builder.getSInt64Ty(), indexValue); + return RValue::get(cir::VecExtractOp::create(builder, loc, vec, index)); + } + + // Always use shuffle vector to try to retain the original program structure + SmallVector<int64_t> mask; + for (auto i : llvm::seq<unsigned>(0, exprVecTy->getNumElements())) + mask.push_back(getAccessedFieldNo(i, elts)); + + cir::VecShuffleOp resultVec = builder.createVecShuffle(loc, vec, mask); + if (lv.getType()->isExtVectorBoolType()) { + cgm.errorNYI(loc, "emitLoadOfExtVectorElementLValue: ExtVectorBoolType"); + return {}; + } + + return RValue::get(resultVec); +} + +/// Generates lvalue for partial ext_vector access. 
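A sketch of source (not from this patch) that exercises emitLoadOfExtVectorElementLValue: a scalar-result access becomes a single VecExtractOp, while a multi-element swizzle becomes a VecShuffleOp.

  typedef float float4 __attribute__((ext_vector_type(4)));
  typedef float float2 __attribute__((ext_vector_type(2)));

  float2 sample(float4 v, float *out) {
    *out = v.x;    // non-vector result: VecExtractOp on the loaded vector
    return v.xy;   // subset of elements: VecShuffleOp with mask {0, 1}
  }
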
+Address CIRGenFunction::emitExtVectorElementLValue(LValue lv, + mlir::Location loc) { + Address vectorAddress = lv.getExtVectorAddress(); + QualType elementTy = lv.getType()->castAs<VectorType>()->getElementType(); + mlir::Type vectorElementTy = cgm.getTypes().convertType(elementTy); + Address castToPointerElement = + vectorAddress.withElementType(builder, vectorElementTy); + + mlir::ArrayAttr extVecElts = lv.getExtVectorElts(); + unsigned idx = getAccessedFieldNo(0, extVecElts); + mlir::Value idxValue = + builder.getConstInt(loc, mlir::cast<cir::IntType>(ptrDiffTy), idx); + + mlir::Value elementValue = builder.getArrayElement( + loc, loc, castToPointerElement.getPointer(), vectorElementTy, idxValue, + /*shouldDecay=*/false); + + const CharUnits eltSize = getContext().getTypeSizeInChars(elementTy); + const CharUnits alignment = + castToPointerElement.getAlignment().alignmentAtOffset(idx * eltSize); + return Address(elementValue, vectorElementTy, alignment); +} + static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) { assert(!cir::MissingFeatures::weakRefReference()); return cgm.getAddrOfFunction(gd); @@ -1021,12 +1105,6 @@ static Address emitArraySubscriptPtr(CIRGenFunction &cgf, LValue CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { - if (isa<ExtVectorElementExpr>(e->getBase())) { - cgm.errorNYI(e->getSourceRange(), - "emitArraySubscriptExpr: ExtVectorElementExpr"); - return LValue::makeAddr(Address::invalid(), e->getType(), LValueBaseInfo()); - } - if (getContext().getAsVariableArrayType(e->getType())) { cgm.errorNYI(e->getSourceRange(), "emitArraySubscriptExpr: VariableArrayType"); @@ -1056,15 +1134,30 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { // If the base is a vector type, then we are forming a vector element // with this subscript. - if (e->getBase()->getType()->isVectorType() && + if (e->getBase()->getType()->isSubscriptableVectorType() && !isa<ExtVectorElementExpr>(e->getBase())) { const mlir::Value idx = emitIdxAfterBase(/*promote=*/false); - const LValue lhs = emitLValue(e->getBase()); - return LValue::makeVectorElt(lhs.getAddress(), idx, e->getBase()->getType(), - lhs.getBaseInfo()); + const LValue lv = emitLValue(e->getBase()); + return LValue::makeVectorElt(lv.getAddress(), idx, e->getBase()->getType(), + lv.getBaseInfo()); } const mlir::Value idx = emitIdxAfterBase(/*promote=*/true); + + // Handle the extvector case we ignored above. + if (isa<ExtVectorElementExpr>(e->getBase())) { + const LValue lv = emitLValue(e->getBase()); + Address addr = emitExtVectorElementLValue(lv, cgm.getLoc(e->getExprLoc())); + + QualType elementType = lv.getType()->castAs<VectorType>()->getElementType(); + addr = emitArraySubscriptPtr(*this, cgm.getLoc(e->getBeginLoc()), + cgm.getLoc(e->getEndLoc()), addr, e->getType(), + idx, cgm.getLoc(e->getExprLoc()), + /*shouldDecay=*/false); + + return makeAddrLValue(addr, elementType, lv.getBaseInfo()); + } + if (const Expr *array = getSimpleArrayDecayOperand(e->getBase())) { LValue arrayLV; if (const auto *ase = dyn_cast<ArraySubscriptExpr>(array)) @@ -1108,6 +1201,62 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { return lv; } +LValue CIRGenFunction::emitExtVectorElementExpr(const ExtVectorElementExpr *e) { + // Emit the base vector as an l-value. + LValue base; + + // ExtVectorElementExpr's base can either be a vector or pointer to vector. 
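For the vector-subscript path above, a plain element access is enough to form the vector-element l-value; subscripts whose base is itself an element access take the ExtVectorElementExpr branch handled separately. Illustrative only:

  typedef float float4 __attribute__((ext_vector_type(4)));

  float nth(float4 v, int i) {
    return v[i];   // vector base: LValue::makeVectorElt, then a load of the element
  }
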
+ if (e->isArrow()) { + // If it is a pointer to a vector, emit the address and form an lvalue with + // it. + LValueBaseInfo baseInfo; + Address ptr = emitPointerWithAlignment(e->getBase(), &baseInfo); + const auto *clangPtrTy = + e->getBase()->getType()->castAs<clang::PointerType>(); + base = makeAddrLValue(ptr, clangPtrTy->getPointeeType(), baseInfo); + base.getQuals().removeObjCGCAttr(); + } else if (e->getBase()->isGLValue()) { + // Otherwise, if the base is an lvalue ( as in the case of foo.x.x), + // emit the base as an lvalue. + assert(e->getBase()->getType()->isVectorType()); + base = emitLValue(e->getBase()); + } else { + // Otherwise, the base is a normal rvalue (as in (V+V).x), emit it as such. + assert(e->getBase()->getType()->isVectorType() && + "Result must be a vector"); + mlir::Value vec = emitScalarExpr(e->getBase()); + + // Store the vector to memory (because LValue wants an address). + QualType baseTy = e->getBase()->getType(); + Address vecMem = createMemTemp(baseTy, vec.getLoc(), "tmp"); + if (!getLangOpts().HLSL && baseTy->isExtVectorBoolType()) { + cgm.errorNYI(e->getSourceRange(), + "emitExtVectorElementExpr: ExtVectorBoolType & !HLSL"); + return {}; + } + builder.createStore(vec.getLoc(), vec, vecMem); + base = makeAddrLValue(vecMem, baseTy, AlignmentSource::Decl); + } + + QualType type = + e->getType().withCVRQualifiers(base.getQuals().getCVRQualifiers()); + + // Encode the element access list into a vector of unsigned indices. + SmallVector<uint32_t, 4> indices; + e->getEncodedElementAccess(indices); + + if (base.isSimple()) { + SmallVector<int64_t> attrElts(indices.begin(), indices.end()); + mlir::ArrayAttr elts = builder.getI64ArrayAttr(attrElts); + return LValue::makeExtVectorElt(base.getAddress(), elts, type, + base.getBaseInfo()); + } + + cgm.errorNYI(e->getSourceRange(), + "emitExtVectorElementExpr: isSimple is false"); + return {}; +} + LValue CIRGenFunction::emitStringLiteralLValue(const StringLiteral *e, llvm::StringRef name) { cir::GlobalOp globalOp = cgm.getGlobalForStringLiteral(e, name); @@ -1174,6 +1323,7 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { case CK_IntegralToFixedPoint: case CK_MatrixCast: case CK_HLSLVectorTruncation: + case CK_HLSLMatrixTruncation: case CK_HLSLArrayRValue: case CK_HLSLElementwiseCast: case CK_HLSLAggregateSplatCast: @@ -1196,8 +1346,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { case CK_NonAtomicToAtomic: case CK_AtomicToNonAtomic: case CK_ToUnion: - case CK_BaseToDerived: - case CK_AddressSpaceConversion: case CK_ObjCObjectLValueCast: case CK_VectorSplat: case CK_ConstructorConversion: @@ -1211,6 +1359,27 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { return {}; } + case CK_AddressSpaceConversion: { + LValue lv = emitLValue(e->getSubExpr()); + QualType destTy = getContext().getPointerType(e->getType()); + + clang::LangAS srcLangAS = e->getSubExpr()->getType().getAddressSpace(); + cir::TargetAddressSpaceAttr srcAS; + if (clang::isTargetAddressSpace(srcLangAS)) + srcAS = cir::toCIRTargetAddressSpace(getMLIRContext(), srcLangAS); + else + cgm.errorNYI( + e->getSourceRange(), + "emitCastLValue: address space conversion from unknown address " + "space"); + + mlir::Value v = getTargetHooks().performAddrSpaceCast( + *this, lv.getPointer(), srcAS, convertType(destTy)); + + return makeAddrLValue(Address(v, convertTypeForMem(e->getType()), + lv.getAddress().getAlignment()), + e->getType(), lv.getBaseInfo()); + } case CK_LValueBitCast: { // This must be a reinterpret_cast (or 
c-style equivalent). @@ -1263,6 +1432,22 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { return makeAddrLValue(baseAddr, e->getType(), lv.getBaseInfo()); } + case CK_BaseToDerived: { + const auto *derivedClassDecl = e->getType()->castAsCXXRecordDecl(); + LValue lv = emitLValue(e->getSubExpr()); + + // Perform the base-to-derived conversion + Address derived = getAddressOfDerivedClass( + getLoc(e->getSourceRange()), lv.getAddress(), derivedClassDecl, + e->path(), /*NullCheckValue=*/false); + // C++11 [expr.static.cast]p2: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. + assert(!cir::MissingFeatures::sanitizers()); + + assert(!cir::MissingFeatures::opTBAA()); + return makeAddrLValue(derived, e->getType(), lv.getBaseInfo()); + } + case CK_ZeroToOCLOpaqueType: llvm_unreachable("NULL to OpenCL opaque type lvalue cast is not valid"); } @@ -1630,7 +1815,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: - return RValue::get(emitScalarExpr(e)); + return RValue::get(emitScalarExpr(e, ignoreResult)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { @@ -1658,11 +1843,7 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { const auto *fd = cast<FunctionDecl>(gd.getDecl()); if (unsigned builtinID = fd->getBuiltinID()) { - if (fd->getAttr<AsmLabelAttr>()) { - cgm.errorNYI("AsmLabelAttr"); - } - - StringRef ident = fd->getName(); + StringRef ident = cgm.getMangledName(gd); std::string fdInlineName = (ident + ".inline").str(); bool isPredefinedLibFunction = @@ -1690,8 +1871,7 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { clone.setLinkageAttr(cir::GlobalLinkageKindAttr::get( &cgm.getMLIRContext(), cir::GlobalLinkageKind::InternalLinkage)); clone.setSymVisibility("private"); - clone.setInlineKindAttr(cir::InlineAttr::get( - &cgm.getMLIRContext(), cir::InlineKind::AlwaysInline)); + clone.setInlineKind(cir::InlineKind::AlwaysInline); } return CIRGenCallee::forDirect(clone, gd); } @@ -2126,79 +2306,6 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce, ce, md, returnValue, hasQualifier, qualifier, isArrow, base); } -void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e, - AggValueSlot dest) { - assert(!dest.isIgnored() && "Must have a destination!"); - const CXXConstructorDecl *cd = e->getConstructor(); - - // If we require zero initialization before (or instead of) calling the - // constructor, as can be the case with a non-user-provided default - // constructor, emit the zero initialization now, unless destination is - // already zeroed. - if (e->requiresZeroInitialization() && !dest.isZeroed()) { - switch (e->getConstructionKind()) { - case CXXConstructionKind::Delegating: - case CXXConstructionKind::Complete: - emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(), - e->getType()); - break; - case CXXConstructionKind::VirtualBase: - case CXXConstructionKind::NonVirtualBase: - cgm.errorNYI(e->getSourceRange(), - "emitCXXConstructExpr: base requires initialization"); - break; - } - } - - // If this is a call to a trivial default constructor, do nothing. 
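The CK_BaseToDerived l-value case added above is what a glvalue downcast produces; a minimal sketch, not taken from the patch:

  struct Base { int b; };
  struct Derived : Base { int d; };

  int readDerived(Base &br) {
    // Downcast of a glvalue: lowered through getAddressOfDerivedClass.
    return static_cast<Derived &>(br).d;
  }
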
- if (cd->isTrivial() && cd->isDefaultConstructor()) - return; - - // Elide the constructor if we're constructing from a temporary - if (getLangOpts().ElideConstructors && e->isElidable()) { - // FIXME: This only handles the simplest case, where the source object is - // passed directly as the first argument to the constructor. This - // should also handle stepping through implicit casts and conversion - // sequences which involve two steps, with a conversion operator - // follwed by a converting constructor. - const Expr *srcObj = e->getArg(0); - assert(srcObj->isTemporaryObject(getContext(), cd->getParent())); - assert( - getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType())); - emitAggExpr(srcObj, dest); - return; - } - - if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) { - assert(!cir::MissingFeatures::sanitizers()); - emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false); - } else { - - clang::CXXCtorType type = Ctor_Complete; - bool forVirtualBase = false; - bool delegating = false; - - switch (e->getConstructionKind()) { - case CXXConstructionKind::Complete: - type = Ctor_Complete; - break; - case CXXConstructionKind::Delegating: - // We should be emitting a constructor; GlobalDecl will assert this - type = curGD.getCtorType(); - delegating = true; - break; - case CXXConstructionKind::VirtualBase: - forVirtualBase = true; - [[fallthrough]]; - case CXXConstructionKind::NonVirtualBase: - type = Ctor_Base; - break; - } - - emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e); - } -} - RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) { // Emit the expression as an lvalue. LValue lv = emitLValue(e); @@ -2298,6 +2405,8 @@ Address CIRGenFunction::createTempAllocaWithoutCast( /// This creates a alloca and inserts it into the entry block. The alloca is /// casted to default address space if necessary. +// TODO(cir): Implement address space casting to match classic codegen's +// CreateTempAlloca behavior with DestLangAS parameter Address CIRGenFunction::createTempAlloca(mlir::Type ty, CharUnits align, mlir::Location loc, const Twine &name, mlir::Value arraySize, @@ -2312,7 +2421,21 @@ Address CIRGenFunction::createTempAlloca(mlir::Type ty, CharUnits align, // be different from the type defined by the language. For example, // in C++ the auto variables are in the default address space. Therefore // cast alloca to the default address space when necessary. - assert(!cir::MissingFeatures::addressSpace()); + + LangAS allocaAS = alloca.getAddressSpace() + ? 
clang::getLangASFromTargetAS( + alloca.getAddressSpace().getValue().getUInt()) + : clang::LangAS::Default; + LangAS dstTyAS = clang::LangAS::Default; + if (getCIRAllocaAddressSpace()) { + dstTyAS = clang::getLangASFromTargetAS( + getCIRAllocaAddressSpace().getValue().getUInt()); + } + + if (dstTyAS != allocaAS) { + getTargetHooks().performAddrSpaceCast(*this, v, getCIRAllocaAddressSpace(), + builder.getPointerTo(ty, dstTyAS)); + } return Address(v, ty, align); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 3d3030c..872fc8d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -343,8 +343,8 @@ public: cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); } void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getSourceRange(), - "AggExprEmitter: VisitCXXDefaultArgExpr"); + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + Visit(dae->getExpr()); } void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), @@ -779,8 +779,8 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr( Expr *e, ArrayRef<Expr *> args, FieldDecl *initializedFieldInUnion, Expr *arrayFiller) { - const AggValueSlot dest = - ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType()); + const mlir::Location loc = cgf.getLoc(e->getSourceRange()); + const AggValueSlot dest = ensureSlot(loc, e->getType()); if (e->getType()->isConstantArrayType()) { cir::ArrayType arrayTy = @@ -819,13 +819,29 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr( if (auto *cxxrd = dyn_cast<CXXRecordDecl>(record)) { assert(numInitElements >= cxxrd->getNumBases() && "missing initializer for base class"); - if (cxxrd->getNumBases() > 0) { - cgf.cgm.errorNYI(e->getSourceRange(), - "visitCXXParenListOrInitListExpr base class init"); - return; + for (auto &base : cxxrd->bases()) { + assert(!base.isVirtual() && "should not see vbases here"); + CXXRecordDecl *baseRD = base.getType()->getAsCXXRecordDecl(); + Address address = cgf.getAddressOfDirectBaseInCompleteClass( + loc, dest.getAddress(), cxxrd, baseRD, + /*baseIsVirtual=*/false); + assert(!cir::MissingFeatures::aggValueSlotGC()); + AggValueSlot aggSlot = AggValueSlot::forAddr( + address, Qualifiers(), AggValueSlot::IsDestructed, + AggValueSlot::IsNotAliased, + cgf.getOverlapForBaseInit(cxxrd, baseRD, false)); + cgf.emitAggExpr(args[curInitIndex++], aggSlot); + if (base.getType().isDestructedType()) { + cgf.cgm.errorNYI(e->getSourceRange(), + "push deferred deactivation cleanup"); + return; + } } } + // Prepare a 'this' for CXXDefaultInitExprs. 
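The base-initializer loop above handles C++17 aggregates with base classes; an illustrative case:

  struct A { int x; };
  struct B : A { int y; };

  B b{{1}, 2};   // {1} initializes the A subobject via
                 // getAddressOfDirectBaseInCompleteClass, then 2 initializes B::y
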
+ CIRGenFunction::FieldConstructionScope fcScope(cgf, dest.getAddress()); + LValue destLV = cgf.makeAddrLValue(dest.getAddress(), e->getType()); if (record->isUnion()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 9dd9b6d..a4acbac 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -16,6 +16,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/ExprCXX.h" +#include "clang/Basic/OperatorKinds.h" #include "clang/CIR/MissingFeatures.h" using namespace clang; @@ -234,6 +235,127 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall( return emitCall(fnInfo, callee, returnValue, args, nullptr, loc); } +static void emitNullBaseClassInitialization(CIRGenFunction &cgf, + Address destPtr, + const CXXRecordDecl *base) { + if (base->isEmpty()) + return; + + const ASTRecordLayout &layout = cgf.getContext().getASTRecordLayout(base); + CharUnits nvSize = layout.getNonVirtualSize(); + + // We cannot simply zero-initialize the entire base sub-object if vbptrs are + // present, they are initialized by the most derived class before calling the + // constructor. + SmallVector<std::pair<CharUnits, CharUnits>, 1> stores; + stores.emplace_back(CharUnits::Zero(), nvSize); + + // Each store is split by the existence of a vbptr. + // TODO(cir): This only needs handling for the MS CXXABI. + assert(!cir::MissingFeatures::msabi()); + + // If the type contains a pointer to data member we can't memset it to zero. + // Instead, create a null constant and copy it to the destination. + // TODO: there are other patterns besides zero that we can usefully memset, + // like -1, which happens to be the pattern used by member-pointers. + // TODO: isZeroInitializable can be over-conservative in the case where a + // virtual base contains a member pointer. + mlir::TypedAttr nullConstantForBase = cgf.cgm.emitNullConstantForBase(base); + if (!cgf.getBuilder().isNullValue(nullConstantForBase)) { + cgf.cgm.errorNYI( + base->getSourceRange(), + "emitNullBaseClassInitialization: base constant is not null"); + } else { + // Otherwise, just memset the whole thing to zero. This is legal + // because in LLVM, all default initializers (other than the ones we just + // handled above) are guaranteed to have a bit pattern of all zeros. + // TODO(cir): When the MS CXXABI is supported, we will need to iterate over + // `stores` and create a separate memset for each one. For now, we know that + // there will only be one store and it will begin at offset zero, so that + // simplifies this code considerably. + assert(stores.size() == 1 && "Expected only one store"); + assert(stores[0].first == CharUnits::Zero() && + "Expected store to begin at offset zero"); + CIRGenBuilderTy builder = cgf.getBuilder(); + mlir::Location loc = cgf.getLoc(base->getBeginLoc()); + builder.createStore(loc, builder.getConstant(loc, nullConstantForBase), + destPtr); + } +} + +void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e, + AggValueSlot dest) { + assert(!dest.isIgnored() && "Must have a destination!"); + const CXXConstructorDecl *cd = e->getConstructor(); + + // If we require zero initialization before (or instead of) calling the + // constructor, as can be the case with a non-user-provided default + // constructor, emit the zero initialization now, unless destination is + // already zeroed. 
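For reference, value-initialization with a defaulted (non-user-provided) constructor is a typical way requiresZeroInitialization() becomes true; a sketch, illustrative only:

  struct S {
    S() = default;   // not user-provided
    int x;
  };

  S *p = new S();    // value-init: the storage is zero-initialized first,
                     // then the trivial default constructor does nothing
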
+ if (e->requiresZeroInitialization() && !dest.isZeroed()) { + switch (e->getConstructionKind()) { + case CXXConstructionKind::Delegating: + case CXXConstructionKind::Complete: + emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(), + e->getType()); + break; + case CXXConstructionKind::VirtualBase: + case CXXConstructionKind::NonVirtualBase: + emitNullBaseClassInitialization(*this, dest.getAddress(), + cd->getParent()); + break; + } + } + + // If this is a call to a trivial default constructor, do nothing. + if (cd->isTrivial() && cd->isDefaultConstructor()) + return; + + // Elide the constructor if we're constructing from a temporary + if (getLangOpts().ElideConstructors && e->isElidable()) { + // FIXME: This only handles the simplest case, where the source object is + // passed directly as the first argument to the constructor. This + // should also handle stepping through implicit casts and conversion + // sequences which involve two steps, with a conversion operator + // follwed by a converting constructor. + const Expr *srcObj = e->getArg(0); + assert(srcObj->isTemporaryObject(getContext(), cd->getParent())); + assert( + getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType())); + emitAggExpr(srcObj, dest); + return; + } + + if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) { + assert(!cir::MissingFeatures::sanitizers()); + emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false); + } else { + + clang::CXXCtorType type = Ctor_Complete; + bool forVirtualBase = false; + bool delegating = false; + + switch (e->getConstructionKind()) { + case CXXConstructionKind::Complete: + type = Ctor_Complete; + break; + case CXXConstructionKind::Delegating: + // We should be emitting a constructor; GlobalDecl will assert this + type = curGD.getCtorType(); + delegating = true; + break; + case CXXConstructionKind::VirtualBase: + forVirtualBase = true; + [[fallthrough]]; + case CXXConstructionKind::NonVirtualBase: + type = Ctor_Base; + break; + } + + emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e); + } +} + static CharUnits calculateCookiePadding(CIRGenFunction &cgf, const CXXNewExpr *e) { if (!e->isArray()) @@ -305,7 +427,7 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, const llvm::APInt &count = mlir::cast<cir::IntAttr>(constNumElements).getValue(); - unsigned numElementsWidth = count.getBitWidth(); + [[maybe_unused]] unsigned numElementsWidth = count.getBitWidth(); bool hasAnyOverflow = false; // The equivalent code in CodeGen/CGExprCXX.cpp handles these cases as @@ -527,6 +649,36 @@ static RValue emitNewDeleteCall(CIRGenFunction &cgf, return rv; } +RValue CIRGenFunction::emitNewOrDeleteBuiltinCall(const FunctionProtoType *type, + const CallExpr *callExpr, + OverloadedOperatorKind op) { + CallArgList args; + emitCallArgs(args, type, callExpr->arguments()); + // Find the allocation or deallocation function that we're calling. + ASTContext &astContext = getContext(); + assert(op == OO_New || op == OO_Delete); + DeclarationName name = astContext.DeclarationNames.getCXXOperatorName(op); + + clang::DeclContextLookupResult lookupResult = + astContext.getTranslationUnitDecl()->lookup(name); + for (const auto *decl : lookupResult) { + if (const auto *funcDecl = dyn_cast<FunctionDecl>(decl)) { + if (astContext.hasSameType(funcDecl->getType(), QualType(type, 0))) { + if (sanOpts.has(SanitizerKind::AllocToken)) { + // TODO: Set !alloc_token metadata. 
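Assuming emitNewOrDeleteBuiltinCall mirrors classic codegen's handling of the allocation builtins (an assumption, not stated in this hunk), the translation-unit lookup above resolves calls such as:

  #include <cstddef>

  void *acquire(std::size_t n) {
    return __builtin_operator_new(n);    // found via the TU-scope lookup of 'operator new'
  }

  void release(void *p) {
    __builtin_operator_delete(p);
  }
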
+ assert(!cir::MissingFeatures::allocToken()); + cgm.errorNYI("Alloc token sanitizer not yet supported!"); + } + + // Emit the call to operator new/delete. + return emitNewDeleteCall(*this, funcDecl, type, args); + } + } + } + + llvm_unreachable("predeclared global operator new/delete is missing"); +} + namespace { /// Calls the given 'operator delete' on a single object. struct CallObjectDelete final : EHScopeStack::Cleanup { @@ -538,7 +690,7 @@ struct CallObjectDelete final : EHScopeStack::Cleanup { QualType elementType) : ptr(ptr), operatorDelete(operatorDelete), elementType(elementType) {} - void emit(CIRGenFunction &cgf) override { + void emit(CIRGenFunction &cgf, Flags flags) override { cgf.emitDeleteCall(operatorDelete, ptr, elementType); } }; @@ -654,8 +806,27 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { Address allocation = Address::invalid(); CallArgList allocatorArgs; if (allocator->isReservedGlobalPlacementOperator()) { - cgm.errorNYI(e->getSourceRange(), - "emitCXXNewExpr: reserved global placement operator"); + // If the allocator is a global placement operator, just + // "inline" it directly. + assert(e->getNumPlacementArgs() == 1); + const Expr *arg = *e->placement_arguments().begin(); + + LValueBaseInfo baseInfo; + allocation = emitPointerWithAlignment(arg, &baseInfo); + + // The pointer expression will, in many cases, be an opaque void*. + // In these cases, discard the computed alignment and use the + // formal alignment of the allocated type. + if (baseInfo.getAlignmentSource() != AlignmentSource::Decl) + allocation = allocation.withAlignment(allocAlign); + + // Set up allocatorArgs for the call to operator delete if it's not + // the reserved global operator. + if (e->getOperatorDelete() && + !e->getOperatorDelete()->isReservedGlobalPlacementOperator()) { + cgm.errorNYI(e->getSourceRange(), + "emitCXXNewExpr: reserved placement new with delete"); + } } else { const FunctionProtoType *allocatorType = allocator->getType()->castAs<FunctionProtoType>(); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 047f359..d112425 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -339,7 +339,7 @@ mlir::Value ComplexExprEmitter::emitLoadOfLValue(LValue lv, cgf.cgm.errorNYI(loc, "emitLoadOfLValue with Atomic LV"); const Address srcAddr = lv.getAddress(); - return builder.createLoad(cgf.getLoc(loc), srcAddr); + return builder.createLoad(cgf.getLoc(loc), srcAddr, lv.isVolatileQualified()); } /// EmitStoreOfComplex - Store the specified real/imag parts into the @@ -353,7 +353,7 @@ void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val, } const Address destAddr = lv.getAddress(); - builder.createStore(loc, val, destAddr); + builder.createStore(loc, val, destAddr, lv.isVolatileQualified()); } //===----------------------------------------------------------------------===// @@ -400,8 +400,13 @@ mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { } mlir::Value ComplexExprEmitter::VisitStmtExpr(const StmtExpr *e) { - cgf.cgm.errorNYI(e->getExprLoc(), "ComplexExprEmitter VisitExpr"); - return {}; + CIRGenFunction::StmtExprEvaluation eval(cgf); + Address retAlloca = + cgf.createMemTemp(e->getType(), cgf.getLoc(e->getSourceRange())); + (void)cgf.emitCompoundStmt(*e->getSubStmt(), &retAlloca); + assert(retAlloca.isValid() && "Expected complex return value"); + return emitLoadOfLValue(cgf.makeAddrLValue(retAlloca, 
e->getType()), + e->getExprLoc()); } mlir::Value ComplexExprEmitter::emitComplexToComplexCast(mlir::Value val, @@ -534,6 +539,7 @@ mlir::Value ComplexExprEmitter::emitCast(CastKind ck, Expr *op, case CK_IntegralToFixedPoint: case CK_MatrixCast: case CK_HLSLVectorTruncation: + case CK_HLSLMatrixTruncation: case CK_HLSLArrayRValue: case CK_HLSLElementwiseCast: case CK_HLSLAggregateSplatCast: diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 6af87a0..329fd08 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -1012,6 +1012,7 @@ public: case CK_MatrixCast: case CK_HLSLArrayRValue: case CK_HLSLVectorTruncation: + case CK_HLSLMatrixTruncation: case CK_HLSLElementwiseCast: case CK_HLSLAggregateSplatCast: return {}; @@ -1521,6 +1522,101 @@ ConstantEmitter::~ConstantEmitter() { "not finalized after being initialized for non-abstract emission"); } +static mlir::TypedAttr emitNullConstantForBase(CIRGenModule &cgm, + mlir::Type baseType, + const CXXRecordDecl *baseDecl); + +static mlir::TypedAttr emitNullConstant(CIRGenModule &cgm, const RecordDecl *rd, + bool asCompleteObject) { + const CIRGenRecordLayout &layout = cgm.getTypes().getCIRGenRecordLayout(rd); + mlir::Type ty = (asCompleteObject ? layout.getCIRType() + : layout.getBaseSubobjectCIRType()); + auto recordTy = mlir::cast<cir::RecordType>(ty); + + unsigned numElements = recordTy.getNumElements(); + SmallVector<mlir::Attribute> elements(numElements); + + auto *cxxrd = dyn_cast<CXXRecordDecl>(rd); + // Fill in all the bases. + if (cxxrd) { + for (const CXXBaseSpecifier &base : cxxrd->bases()) { + if (base.isVirtual()) { + // Ignore virtual bases; if we're laying out for a complete + // object, we'll lay these out later. + continue; + } + + const auto *baseDecl = base.getType()->castAsCXXRecordDecl(); + // Ignore empty bases. + if (isEmptyRecordForLayout(cgm.getASTContext(), base.getType()) || + cgm.getASTContext() + .getASTRecordLayout(baseDecl) + .getNonVirtualSize() + .isZero()) + continue; + + unsigned fieldIndex = layout.getNonVirtualBaseCIRFieldNo(baseDecl); + mlir::Type baseType = recordTy.getElementType(fieldIndex); + elements[fieldIndex] = emitNullConstantForBase(cgm, baseType, baseDecl); + } + } + + // Fill in all the fields. + for (const FieldDecl *field : rd->fields()) { + // Fill in non-bitfields. (Bitfields always use a zero pattern, which we + // will fill in later.) + if (!field->isBitField() && + !isEmptyFieldForLayout(cgm.getASTContext(), field)) { + unsigned fieldIndex = layout.getCIRFieldNo(field); + elements[fieldIndex] = cgm.emitNullConstantAttr(field->getType()); + } + + // For unions, stop after the first named field. + if (rd->isUnion()) { + if (field->getIdentifier()) + break; + if (const auto *fieldRD = field->getType()->getAsRecordDecl()) + if (fieldRD->findFirstNamedDataMember()) + break; + } + } + + // Fill in the virtual bases, if we're working with the complete object. + if (cxxrd && asCompleteObject) { + for ([[maybe_unused]] const CXXBaseSpecifier &vbase : cxxrd->vbases()) { + cgm.errorNYI(vbase.getSourceRange(), "emitNullConstant: virtual base"); + return {}; + } + } + + // Now go through all other fields and zero them out. 
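The pointer-to-data-member caveat above exists because the null member pointer is not an all-zero bit pattern (it is -1 under the Itanium ABI); for example:

  struct HasMemPtr {
    int HasMemPtr::*field;   // null representation is -1, not 0
    int n;
  };

  HasMemPtr zeroed{};        // its null constant cannot be a plain all-zero pattern
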
+ for (unsigned i = 0; i != numElements; ++i) { + if (!elements[i]) { + cgm.errorNYI(rd->getSourceRange(), "emitNullConstant: field not zeroed"); + return {}; + } + } + + mlir::MLIRContext *mlirContext = recordTy.getContext(); + return cir::ConstRecordAttr::get(recordTy, + mlir::ArrayAttr::get(mlirContext, elements)); +} + +/// Emit the null constant for a base subobject. +static mlir::TypedAttr emitNullConstantForBase(CIRGenModule &cgm, + mlir::Type baseType, + const CXXRecordDecl *baseDecl) { + const CIRGenRecordLayout &baseLayout = + cgm.getTypes().getCIRGenRecordLayout(baseDecl); + + // Just zero out bases that don't have any pointer to data members. + if (baseLayout.isZeroInitializableAsBase()) + return cgm.getBuilder().getZeroInitAttr(baseType); + + // Otherwise, we can just use its null constant. + return emitNullConstant(cgm, baseDecl, /*asCompleteObject=*/false); +} + mlir::Attribute ConstantEmitter::tryEmitPrivateForVarInit(const VarDecl &d) { // Make a quick check if variable can be default NULL initialized // and avoid going through rest of code which may do, for c++11, @@ -1820,23 +1916,32 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value, } mlir::Value CIRGenModule::emitNullConstant(QualType t, mlir::Location loc) { - if (t->getAs<PointerType>()) { - return builder.getNullPtr(getTypes().convertTypeForMem(t), loc); - } + return builder.getConstant(loc, emitNullConstantAttr(t)); +} + +mlir::TypedAttr CIRGenModule::emitNullConstantAttr(QualType t) { + if (t->getAs<PointerType>()) + return builder.getConstNullPtrAttr(getTypes().convertTypeForMem(t)); if (getTypes().isZeroInitializable(t)) - return builder.getNullValue(getTypes().convertTypeForMem(t), loc); + return builder.getZeroInitAttr(getTypes().convertTypeForMem(t)); if (getASTContext().getAsConstantArrayType(t)) { - errorNYI("CIRGenModule::emitNullConstant ConstantArrayType"); + errorNYI("CIRGenModule::emitNullConstantAttr ConstantArrayType"); + return {}; } - if (t->isRecordType()) - errorNYI("CIRGenModule::emitNullConstant RecordType"); + if (const RecordType *rt = t->getAs<RecordType>()) + return ::emitNullConstant(*this, rt->getDecl(), /*asCompleteObject=*/true); assert(t->isMemberDataPointerType() && "Should only see pointers to data members here!"); - errorNYI("CIRGenModule::emitNullConstant unsupported type"); + errorNYI("CIRGenModule::emitNullConstantAttr unsupported type"); return {}; } + +mlir::TypedAttr +CIRGenModule::emitNullConstantForBase(const CXXRecordDecl *record) { + return ::emitNullConstant(*this, record, false); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 119314f..25ce1ba 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -15,6 +15,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/StmtVisitor.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" #include "mlir/IR/Location.h" @@ -78,15 +79,20 @@ struct BinOpInfo { class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { CIRGenFunction &cgf; CIRGenBuilderTy &builder; + // Unlike classic codegen we set this to false or use std::exchange to read + // the value instead of calling TestAndClearIgnoreResultAssign to make it + // explicit when the value is used bool ignoreResultAssign; public: - ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} + ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + 
bool ignoreResultAssign = false) + : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {} //===--------------------------------------------------------------------===// // Utilities //===--------------------------------------------------------------------===// + mlir::Type convertType(QualType ty) { return cgf.convertType(ty); } mlir::Value emitComplexToScalarConversion(mlir::Location loc, mlir::Value value, CastKind kind, @@ -133,6 +139,11 @@ public: return {}; } + mlir::Value VisitConstantExpr(ConstantExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: constant expr"); + return {}; + } + mlir::Value VisitPackIndexingExpr(PackIndexingExpr *e) { return Visit(e->getSelectedExpr()); } @@ -150,6 +161,18 @@ public: return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue(); } + mlir::Value VisitCoawaitExpr(CoawaitExpr *s) { + return cgf.emitCoawaitExpr(*s).getValue(); + } + mlir::Value VisitCoyieldExpr(CoyieldExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: coyield"); + return {}; + } + mlir::Value VisitUnaryCoawait(const UnaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: unary coawait"); + return {}; + } + mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) { return cgf.emitLoadOfLValue(lv, loc).getValue(); } @@ -162,12 +185,38 @@ public: return emitLoadOfLValue(e); } + mlir::Value VisitAddrLabelExpr(const AddrLabelExpr *e) { + auto func = cast<cir::FuncOp>(cgf.curFn); + cir::BlockAddrInfoAttr blockInfoAttr = cir::BlockAddrInfoAttr::get( + &cgf.getMLIRContext(), func.getSymName(), e->getLabel()->getName()); + cir::BlockAddressOp blockAddressOp = cir::BlockAddressOp::create( + builder, cgf.getLoc(e->getSourceRange()), cgf.convertType(e->getType()), + blockInfoAttr); + cir::LabelOp resolvedLabel = cgf.cgm.lookupBlockAddressInfo(blockInfoAttr); + if (!resolvedLabel) { + cgf.cgm.mapUnresolvedBlockAddress(blockAddressOp); + // Still add the op to maintain insertion order it will be resolved in + // resolveBlockAddresses + cgf.cgm.mapResolvedBlockAddress(blockAddressOp, nullptr); + } else { + cgf.cgm.mapResolvedBlockAddress(blockAddressOp, resolvedLabel); + } + cgf.instantiateIndirectGotoBlock(); + return blockAddressOp; + } + mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), cir::IntAttr::get(type, e->getValue())); } + mlir::Value VisitFixedPointLiteral(const FixedPointLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: fixed point literal"); + return {}; + } + mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); assert(mlir::isa<cir::FPTypeInterface>(type) && @@ -193,6 +242,29 @@ public: return emitNullValue(e->getType(), cgf.getLoc(e->getSourceRange())); } + mlir::Value VisitGNUNullExpr(const GNUNullExpr *e) { + return emitNullValue(e->getType(), cgf.getLoc(e->getSourceRange())); + } + + mlir::Value VisitOffsetOfExpr(OffsetOfExpr *e); + + mlir::Value VisitSizeOfPackExpr(SizeOfPackExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: size of pack"); + return {}; + } + mlir::Value VisitPseudoObjectExpr(PseudoObjectExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: pseudo object"); + return {}; + } + mlir::Value VisitSYCLUniqueStableNameExpr(SYCLUniqueStableNameExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: sycl unique stable name"); + 
return {}; + } + mlir::Value VisitEmbedExpr(EmbedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: embed"); + return {}; + } mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { if (e->isGLValue()) return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), @@ -202,6 +274,38 @@ public: return cgf.getOrCreateOpaqueRValueMapping(e).getValue(); } + mlir::Value VisitObjCSelectorExpr(ObjCSelectorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc selector"); + return {}; + } + mlir::Value VisitObjCProtocolExpr(ObjCProtocolExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc protocol"); + return {}; + } + mlir::Value VisitObjCIVarRefExpr(ObjCIvarRefExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc ivar ref"); + return {}; + } + mlir::Value VisitObjCMessageExpr(ObjCMessageExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc message"); + return {}; + } + mlir::Value VisitObjCIsaExpr(ObjCIsaExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc isa"); + return {}; + } + mlir::Value VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc availability check"); + return {}; + } + + mlir::Value VisitMatrixSubscriptExpr(MatrixSubscriptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: matrix subscript"); + return {}; + } + mlir::Value VisitCastExpr(CastExpr *e); mlir::Value VisitCallExpr(const CallExpr *e); @@ -221,6 +325,8 @@ public: } mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) { + ignoreResultAssign = false; + if (e->getBase()->getType()->isVectorType()) { assert(!cir::MissingFeatures::scalableVectors()); @@ -271,6 +377,8 @@ public: e->getSourceRange().getBegin()); } + mlir::Value VisitExtVectorElementExpr(Expr *e) { return emitLoadOfLValue(e); } + mlir::Value VisitMemberExpr(MemberExpr *e); mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { @@ -279,6 +387,18 @@ public: mlir::Value VisitInitListExpr(InitListExpr *e); + mlir::Value VisitArrayInitIndexExpr(ArrayInitIndexExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array init index"); + return {}; + } + + mlir::Value VisitImplicitValueInitExpr(const ImplicitValueInitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: implicit value init"); + return {}; + } + mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) { return VisitCastExpr(e); } @@ -432,6 +552,10 @@ public: return cgf.emitVAArg(ve); } + mlir::Value VisitCXXRewrittenBinaryOperator(CXXRewrittenBinaryOperator *e) { + return Visit(e->getSemanticForm()); + } + mlir::Value VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *e); mlir::Value VisitAbstractConditionalOperator(const AbstractConditionalOperator *e); @@ -607,11 +731,8 @@ public: } mlir::Value VisitUnaryAddrOf(const UnaryOperator *e) { - if (llvm::isa<MemberPointerType>(e->getType())) { - cgf.cgm.errorNYI(e->getSourceRange(), "Address of member pointer"); - return builder.getNullPtr(cgf.convertType(e->getType()), - cgf.getLoc(e->getExprLoc())); - } + if (llvm::isa<MemberPointerType>(e->getType())) + return cgf.cgm.emitMemberPointerConstant(e); return cgf.emitLValue(e->getSubExpr()).getPointer(); } @@ -682,6 +803,20 @@ public: return Visit(e->getSubExpr()); } + // C++ + mlir::Value VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: materialize 
temporary"); + return {}; + } + mlir::Value VisitSourceLocExpr(SourceLocExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: source loc"); + return {}; + } + mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + return Visit(dae->getExpr()); + } mlir::Value VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die) { CIRGenFunction::CXXDefaultInitExprScope scope(cgf, die); return Visit(die->getExpr()); @@ -697,11 +832,43 @@ public: cgf.emitCXXDeleteExpr(e); return {}; } - + mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: type trait"); + return {}; + } + mlir::Value + VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: concept specialization"); + return {}; + } + mlir::Value VisitRequiresExpr(const RequiresExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: requires"); + return {}; + } + mlir::Value VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array type trait"); + return {}; + } + mlir::Value VisitExpressionTraitExpr(const ExpressionTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: expression trait"); + return {}; + } + mlir::Value VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: cxx pseudo destructor"); + return {}; + } mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) { cgf.emitCXXThrowExpr(e); return {}; } + mlir::Value VisitCXXNoexceptExpr(CXXNoexceptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: cxx noexcept"); + return {}; + } /// Emit a conversion from the specified type to the specified destination /// type, both of which are CIR scalar types. @@ -839,6 +1006,7 @@ public: BinOpInfo emitBinOps(const BinaryOperator *e, QualType promotionType = QualType()) { + ignoreResultAssign = false; BinOpInfo result; result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType); result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType); @@ -924,6 +1092,7 @@ public: #undef HANDLEBINOP mlir::Value emitCmp(const BinaryOperator *e) { + ignoreResultAssign = false; const mlir::Location loc = cgf.getLoc(e->getExprLoc()); mlir::Value result; QualType lhsTy = e->getLHS()->getType(); @@ -1163,6 +1332,52 @@ public: return maybePromoteBoolResult(resOp.getResult(), resTy); } + mlir::Value VisitBinPtrMemD(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem d"); + return {}; + } + + mlir::Value VisitBinPtrMemI(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem i"); + return {}; + } + + // Other Operators. 
+ mlir::Value VisitBlockExpr(const BlockExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: block"); + return {}; + } + + mlir::Value VisitChooseExpr(ChooseExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: choose"); + return {}; + } + + mlir::Value VisitObjCStringLiteral(const ObjCStringLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc string literal"); + return {}; + } + mlir::Value VisitObjCBoxedExpr(ObjCBoxedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc boxed"); + return {}; + } + mlir::Value VisitObjCArrayLiteral(ObjCArrayLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc array literal"); + return {}; + } + mlir::Value VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc dictionary literal"); + return {}; + } + + mlir::Value VisitAsTypeExpr(AsTypeExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: as type"); + return {}; + } + mlir::Value VisitAtomicExpr(AtomicExpr *e) { return cgf.emitAtomicExpr(e).getValue(); } @@ -1406,11 +1621,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) { } /// Emit the computation of the specified expression of scalar type. -mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) { +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e, + bool ignoreResultAssign) { assert(e && hasScalarEvaluationKind(e->getType()) && "Invalid scalar expression to emit"); - return ScalarExprEmitter(*this, builder).Visit(const_cast<Expr *>(e)); + return ScalarExprEmitter(*this, builder, ignoreResultAssign) + .Visit(const_cast<Expr *>(e)); } mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e, @@ -1874,6 +2091,35 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return cgf.getBuilder().createBitcast(cgf.getLoc(subExpr->getSourceRange()), src, dstTy); } + case CK_AddressSpaceConversion: { + Expr::EvalResult result; + if (subExpr->EvaluateAsRValue(result, cgf.getContext()) && + result.Val.isNullPointer()) { + // If e has side effect, it is emitted even if its final result is a + // null pointer. In that case, a DCE pass should be able to + // eliminate the useless instructions emitted during translating E. + if (result.HasSideEffects) + Visit(subExpr); + return cgf.cgm.emitNullConstant(destTy, + cgf.getLoc(subExpr->getExprLoc())); + } + + clang::QualType srcTy = subExpr->IgnoreImpCasts()->getType(); + if (srcTy->isPointerType() || srcTy->isReferenceType()) + srcTy = srcTy->getPointeeType(); + + clang::LangAS srcLangAS = srcTy.getAddressSpace(); + cir::TargetAddressSpaceAttr subExprAS; + if (clang::isTargetAddressSpace(srcLangAS)) + subExprAS = cir::toCIRTargetAddressSpace(cgf.getMLIRContext(), srcLangAS); + else + cgf.cgm.errorNYI(subExpr->getSourceRange(), + "non-target address space conversion"); + // Since target may map different address spaces in AST to the same address + // space, an address space conversion may end up as a bitcast. 
+ return cgf.cgm.getTargetCIRGenInfo().performAddrSpaceCast( + cgf, Visit(subExpr), subExprAS, convertType(destTy)); + } case CK_AtomicToNonAtomic: { cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(), @@ -1919,6 +2165,27 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return builder.createIntToPtr(middleVal, destCIRTy); } + case CK_BaseToDerived: { + const CXXRecordDecl *derivedClassDecl = destTy->getPointeeCXXRecordDecl(); + assert(derivedClassDecl && "BaseToDerived arg isn't a C++ object pointer!"); + Address base = cgf.emitPointerWithAlignment(subExpr); + Address derived = cgf.getAddressOfDerivedClass( + cgf.getLoc(ce->getSourceRange()), base, derivedClassDecl, ce->path(), + cgf.shouldNullCheckClassCastValue(ce)); + + // C++11 [expr.static.cast]p11: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. + assert(!cir::MissingFeatures::sanitizers()); + + return cgf.getAsNaturalPointerTo(derived, ce->getType()->getPointeeType()); + } + case CK_UncheckedDerivedToBase: + case CK_DerivedToBase: { + // The EmitPointerWithAlignment path does this fine; just discard + // the alignment. + return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce), + ce->getType()->getPointeeType()); + } case CK_Dynamic: { Address v = cgf.emitPointerWithAlignment(subExpr); const auto *dce = cast<CXXDynamicCastExpr>(ce); @@ -2054,6 +2321,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { const unsigned numInitElements = e->getNumInits(); + [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false); + assert((ignore == false || + (numInitElements == 0 && e->getType()->isVoidType())) && + "init list ignored"); + if (e->hadArrayRangeDesignator()) { cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator"); return {}; @@ -2145,6 +2417,21 @@ mlir::Value ScalarExprEmitter::VisitUnaryLNot(const UnaryOperator *e) { return maybePromoteBoolResult(boolVal, cgf.convertType(e->getType())); } +mlir::Value ScalarExprEmitter::VisitOffsetOfExpr(OffsetOfExpr *e) { + // Try folding the offsetof to a constant. 
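The CK_BaseToDerived pointer case above performs a real address adjustment when the base subobject is not at offset zero; a sketch (a null check is inserted when shouldNullCheckClassCastValue requires it):

  struct A { int a; };
  struct B { int b; };
  struct C : A, B { int c; };

  C *fromB(B *bp) {
    // getAddressOfDerivedClass subtracts the offset of the B subobject within C.
    return static_cast<C *>(bp);
  }
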
+ Expr::EvalResult evalResult; + if (e->EvaluateAsInt(evalResult, cgf.getContext())) { + mlir::Type type = cgf.convertType(e->getType()); + llvm::APSInt value = evalResult.Val.getInt(); + return builder.getConstAPInt(cgf.getLoc(e->getExprLoc()), type, value); + } + + cgf.getCIRGenModule().errorNYI( + e->getSourceRange(), + "ScalarExprEmitter::VisitOffsetOfExpr Can't eval expr as int"); + return {}; +} + mlir::Value ScalarExprEmitter::VisitUnaryReal(const UnaryOperator *e) { QualType promotionTy = getPromotionType(e->getSubExpr()->getType()); mlir::Value result = VisitRealImag(e, promotionTy); @@ -2214,14 +2501,50 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( const QualType typeToSize = e->getTypeOfArgument(); const mlir::Location loc = cgf.getLoc(e->getSourceRange()); if (auto kind = e->getKind(); - kind == UETT_SizeOf || kind == UETT_DataSizeOf) { - if (cgf.getContext().getAsVariableArrayType(typeToSize)) { - cgf.getCIRGenModule().errorNYI(e->getSourceRange(), - "sizeof operator for VariableArrayType", - e->getStmtClassName()); - return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, - llvm::APSInt(llvm::APInt(64, 1), true))); + kind == UETT_SizeOf || kind == UETT_DataSizeOf || kind == UETT_CountOf) { + if (const VariableArrayType *vat = + cgf.getContext().getAsVariableArrayType(typeToSize)) { + // For _Countof, we only want to evaluate if the extent is actually + // variable as opposed to a multi-dimensional array whose extent is + // constant but whose element type is variable. + bool evaluateExtent = true; + if (kind == UETT_CountOf && vat->getElementType()->isArrayType()) { + evaluateExtent = + !vat->getSizeExpr()->isIntegerConstantExpr(cgf.getContext()); + } + + if (evaluateExtent) { + if (e->isArgumentType()) { + // sizeof(type) - make sure to emit the VLA size. + cgf.emitVariablyModifiedType(typeToSize); + } else { + // C99 6.5.3.4p2: If the argument is an expression of type + // VLA, it is evaluated. + cgf.emitIgnoredExpr(e->getArgumentExpr()); + } + + // For _Countof, we just want to return the size of a single dimension. + if (kind == UETT_CountOf) + return cgf.getVLAElements1D(vat).numElts; + + // For sizeof and __datasizeof, we need to scale the number of elements + // by the size of the array element type. + CIRGenFunction::VlaSizePair vlaSize = cgf.getVLASize(vat); + mlir::Value numElts = vlaSize.numElts; + + // Scale the number of non-VLA elements by the non-VLA element size. + CharUnits eltSize = cgf.getContext().getTypeSizeInChars(vlaSize.type); + if (!eltSize.isOne()) { + mlir::Location loc = cgf.getLoc(e->getSourceRange()); + mlir::Value eltSizeValue = + builder.getConstAPInt(numElts.getLoc(), numElts.getType(), + cgf.cgm.getSize(eltSize).getValue()); + return builder.createMul(loc, eltSizeValue, numElts, + cir::OverflowBehavior::NoUnsignedWrap); + } + + return numElts; + } } } else if (e->getKind() == UETT_OpenMPRequiredSimdAlign) { cgf.getCIRGenModule().errorNYI( @@ -2285,9 +2608,9 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( // type, so evaluating it returns a null Value. However, a conditional // with non-void type must return a non-null Value. 
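A minimal source-level case that reaches the branch below is a conditional whose discarded arm is a throw-expression: visiting that arm yields no value even though the conditional itself has scalar type. Illustrative C++ only, not part of this patch; the function name is made up.

#include <stdexcept>

// The throw arm has type void and produces no scalar result, but the
// conditional as a whole is an int, so the emitter needs a stand-in value
// for that arm (a poison constant after this change) instead of a real
// computation.
int checked(bool ok, int v) {
  return ok ? v : throw std::invalid_argument("bad value");
}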
if (!result && !e->getType()->isVoidType()) { - cgf.cgm.errorNYI(e->getSourceRange(), - "throw expression in conditional operator"); - result = {}; + result = builder.getConstant( + loc, cir::PoisonAttr::get(builder.getContext(), + cgf.convertType(e->getType()))); } return result; diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 5d5209b..6b2e60a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -16,6 +16,7 @@ #include "CIRGenCall.h" #include "CIRGenValue.h" #include "mlir/IR/Location.h" +#include "clang/AST/Attr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/GlobalDecl.h" #include "clang/CIR/MissingFeatures.h" @@ -412,28 +413,45 @@ void CIRGenFunction::LexicalScope::emitImplicitReturn() { (void)emitReturn(localScope->endLoc); } -void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, - cir::FuncOp fn, cir::FuncType funcType, - FunctionArgList args, SourceLocation loc, - SourceLocation startLoc) { - assert(!curFn && - "CIRGenFunction can only be used for one function at a time"); +cir::TryOp CIRGenFunction::LexicalScope::getClosestTryParent() { + LexicalScope *scope = this; + while (scope) { + if (scope->isTry()) + return scope->getTry(); + scope = scope->parentScope; + } + return nullptr; +} - curFn = fn; +/// An argument came in as a promoted argument; demote it back to its +/// declared type. +static mlir::Value emitArgumentDemotion(CIRGenFunction &cgf, const VarDecl *var, + mlir::Value value) { + mlir::Type ty = cgf.convertType(var->getType()); - const Decl *d = gd.getDecl(); + // This can happen with promotions that actually don't change the + // underlying type, like the enum promotions. + if (value.getType() == ty) + return value; - didCallStackSave = false; - curCodeDecl = d; - const auto *fd = dyn_cast_or_null<FunctionDecl>(d); - curFuncDecl = d->getNonClosureContext(); + assert((mlir::isa<cir::IntType>(ty) || cir::isAnyFloatingPointType(ty)) && + "unexpected promotion type"); - prologueCleanupDepth = ehStack.stable_begin(); + if (mlir::isa<cir::IntType>(ty)) + return cgf.getBuilder().CIRBaseBuilderTy::createIntCast(value, ty); - mlir::Block *entryBB = &fn.getBlocks().front(); - builder.setInsertionPointToStart(entryBB); + return cgf.getBuilder().createFloatingCast(value, ty); +} + +void CIRGenFunction::emitFunctionProlog(const FunctionArgList &args, + mlir::Block *entryBB, + const FunctionDecl *fd, + SourceLocation bodyBeginLoc) { + // Naked functions don't have prologues. + if (fd && fd->hasAttr<NakedAttr>()) { + cgm.errorNYI(bodyBeginLoc, "naked function decl"); + } - // TODO(cir): this should live in `emitFunctionProlog // Declare all the function arguments in the symbol table. for (const auto nameValue : llvm::zip(args, entryBB->getArguments())) { const VarDecl *paramVar = std::get<0>(nameValue); @@ -456,20 +474,64 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, cast<ParmVarDecl>(paramVar)->isKNRPromoted(); assert(!cir::MissingFeatures::constructABIArgDirectExtend()); if (isPromoted) - cgm.errorNYI(fd->getSourceRange(), "Function argument demotion"); + paramVal = emitArgumentDemotion(*this, paramVar, paramVal); // Location of the store to the param storage tracked as beginning of // the function body. 
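The argument-demotion path added above is reached for old-style (K&R) C definitions, where a parameter declared as float is passed as double after the default argument promotions and must be narrowed back in the prologue. A hedged illustration in C (pre-C23 only, since the construct does not exist in C++; names made up, not part of this patch):

/* The caller promotes 'x' to double, so the prologue receives a double and
 * emitArgumentDemotion narrows it back to the declared float before the
 * store into the parameter's alloca. */
double scale(x)
    float x;
{
  return x * 2.0;
}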
- mlir::Location fnBodyBegin = getLoc(fd->getBody()->getBeginLoc()); + mlir::Location fnBodyBegin = getLoc(bodyBeginLoc); builder.CIRBaseBuilderTy::createStore(fnBodyBegin, paramVal, addrVal); } assert(builder.getInsertionBlock() && "Should be valid"); +} + +void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, + cir::FuncOp fn, cir::FuncType funcType, + FunctionArgList args, SourceLocation loc, + SourceLocation startLoc) { + assert(!curFn && + "CIRGenFunction can only be used for one function at a time"); + + curFn = fn; + + const Decl *d = gd.getDecl(); + + didCallStackSave = false; + curCodeDecl = d; + const auto *fd = dyn_cast_or_null<FunctionDecl>(d); + curFuncDecl = d->getNonClosureContext(); + + prologueCleanupDepth = ehStack.stable_begin(); + + mlir::Block *entryBB = &fn.getBlocks().front(); + builder.setInsertionPointToStart(entryBB); + + // Determine the function body begin location for the prolog. + // If fd is null or has no body, use startLoc as fallback. + SourceLocation bodyBeginLoc = startLoc; + if (fd) { + if (Stmt *body = fd->getBody()) + bodyBeginLoc = body->getBeginLoc(); + else + bodyBeginLoc = fd->getLocation(); + } + + emitFunctionProlog(args, entryBB, fd, bodyBeginLoc); // When the current function is not void, create an address to store the // result value. - if (!returnType->isVoidType()) - emitAndUpdateRetAlloca(returnType, getLoc(fd->getBody()->getEndLoc()), + if (!returnType->isVoidType()) { + // Determine the function body end location. + // If fd is null or has no body, use loc as fallback. + SourceLocation bodyEndLoc = loc; + if (fd) { + if (Stmt *body = fd->getBody()) + bodyEndLoc = body->getEndLoc(); + else + bodyEndLoc = fd->getLocation(); + } + emitAndUpdateRetAlloca(returnType, getLoc(bodyEndLoc), getContext().getTypeAlignInChars(returnType)); + } if (isa_and_nonnull<CXXMethodDecl>(d) && cast<CXXMethodDecl>(d)->isInstance()) { @@ -521,7 +583,48 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, } } +void CIRGenFunction::resolveBlockAddresses() { + for (cir::BlockAddressOp &blockAddress : cgm.unresolvedBlockAddressToLabel) { + cir::LabelOp labelOp = + cgm.lookupBlockAddressInfo(blockAddress.getBlockAddrInfo()); + assert(labelOp && "expected cir.labelOp to already be emitted"); + cgm.updateResolvedBlockAddress(blockAddress, labelOp); + } + cgm.unresolvedBlockAddressToLabel.clear(); +} + +void CIRGenFunction::finishIndirectBranch() { + if (!indirectGotoBlock) + return; + llvm::SmallVector<mlir::Block *> succesors; + llvm::SmallVector<mlir::ValueRange> rangeOperands; + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(indirectGotoBlock); + for (auto &[blockAdd, labelOp] : cgm.blockAddressToLabel) { + succesors.push_back(labelOp->getBlock()); + rangeOperands.push_back(labelOp->getBlock()->getArguments()); + } + cir::IndirectBrOp::create(builder, builder.getUnknownLoc(), + indirectGotoBlock->getArgument(0), false, + rangeOperands, succesors); + cgm.blockAddressToLabel.clear(); +} + void CIRGenFunction::finishFunction(SourceLocation endLoc) { + // Resolve block address-to-label mappings, then emit the indirect branch + // with the corresponding targets. + resolveBlockAddresses(); + finishIndirectBranch(); + + // If a label address was taken but no indirect goto was used, we can't remove + // the block argument here. 
Instead, we mark the 'indirectbr' op + // as poison so that the cleanup can be deferred to lowering, since the + // verifier doesn't allow the 'indirectbr' target address to be null. + if (indirectGotoBlock && indirectGotoBlock->hasNoPredecessors()) { + auto indrBr = cast<cir::IndirectBrOp>(indirectGotoBlock->front()); + indrBr.setPoison(true); + } + // Pop any cleanups that might have been associated with the // parameters. Do this in whatever block we're currently in; it's // important to do this before we enter the return block or return @@ -560,7 +663,7 @@ static void eraseEmptyAndUnusedBlocks(cir::FuncOp func) { cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, cir::FuncType funcType) { - const auto funcDecl = cast<FunctionDecl>(gd.getDecl()); + const auto *funcDecl = cast<FunctionDecl>(gd.getDecl()); curGD = gd; if (funcDecl->isInlineBuiltinDeclaration()) { @@ -630,8 +733,13 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, { LexicalScope lexScope(*this, fusedLoc, entryBB); + // Emit the standard function prologue. startFunction(gd, retTy, fn, funcType, args, loc, bodyRange.getBegin()); + // Save parameters for coroutine function. + if (body && isa_and_nonnull<CoroutineBodyStmt>(body)) + llvm::append_range(fnArgs, funcDecl->parameters()); + if (isa<CXXDestructorDecl>(funcDecl)) { emitDestructorBody(args); } else if (isa<CXXConstructorDecl>(funcDecl)) { @@ -652,6 +760,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, // copy-constructors. emitImplicitAssignmentOperatorBody(args); } else if (body) { + // Emit standard function body. if (mlir::failed(emitFunctionBody(body))) { return nullptr; } @@ -679,6 +788,8 @@ void CIRGenFunction::emitConstructorBody(FunctionArgList &args) { ctorType == Ctor_Complete) && "can only generate complete ctor for this ABI"); + cgm.setCXXSpecialMemberAttr(cast<cir::FuncOp>(curFn), ctor); + if (ctorType == Ctor_Complete && isConstructorDelegationValid(ctor) && cgm.getTarget().getCXXABI().hasConstructorVariants()) { emitDelegateCXXConstructorCall(ctor, Ctor_Base, args, ctor->getEndLoc()); @@ -717,6 +828,8 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) { const CXXDestructorDecl *dtor = cast<CXXDestructorDecl>(curGD.getDecl()); CXXDtorType dtorType = curGD.getDtorType(); + cgm.setCXXSpecialMemberAttr(cast<cir::FuncOp>(curFn), dtor); + // For an abstract class, non-base destructors are never used (and can't // be emitted in general, because vbase dtors may not have been validated // by Sema), but the Itanium ABI doesn't make them optional and Clang may @@ -883,6 +996,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitConditionalOperatorLValue(cast<BinaryConditionalOperator>(e)); case Expr::ArraySubscriptExprClass: return emitArraySubscriptExpr(cast<ArraySubscriptExpr>(e)); + case Expr::ExtVectorElementExprClass: + return emitExtVectorElementExpr(cast<ExtVectorElementExpr>(e)); case Expr::UnaryOperatorClass: return emitUnaryOpLValue(cast<UnaryOperator>(e)); case Expr::StringLiteralClass: @@ -912,6 +1027,18 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { case Expr::CXXOperatorCallExprClass: case Expr::UserDefinedLiteralClass: return emitCallExprLValue(cast<CallExpr>(e)); + case Expr::ExprWithCleanupsClass: { + const auto *cleanups = cast<ExprWithCleanups>(e); + RunCleanupsScope scope(*this); + LValue lv = emitLValue(cleanups->getSubExpr()); + assert(!cir::MissingFeatures::cleanupWithPreservedValues()); + return lv; + } + case 
Expr::CXXDefaultArgExprClass: { + auto *dae = cast<CXXDefaultArgExpr>(e); + CXXDefaultArgExprScope scope(*this, dae); + return emitLValue(dae->getExpr()); + } case Expr::ParenExprClass: return emitLValue(cast<ParenExpr>(e)->getSubExpr()); case Expr::GenericSelectionExprClass: @@ -1052,6 +1179,17 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType, return builder.getConstInt(*currSrcLoc, sizeTy, countFromCLAs); } +void CIRGenFunction::instantiateIndirectGotoBlock() { + // If we already made the indirect branch for indirect goto, return its block. + if (indirectGotoBlock) + return; + + mlir::OpBuilder::InsertionGuard guard(builder); + indirectGotoBlock = + builder.createBlock(builder.getBlock()->getParent(), {}, {voidPtrTy}, + {builder.getUnknownLoc()}); +} + mlir::Value CIRGenFunction::emitAlignmentAssumption( mlir::Value ptrValue, QualType ty, SourceLocation loc, SourceLocation assumptionLoc, int64_t alignment, mlir::Value offsetValue) { @@ -1104,6 +1242,14 @@ CIRGenFunction::getVLASize(const VariableArrayType *type) { return {numElements, elementType}; } +CIRGenFunction::VlaSizePair +CIRGenFunction::getVLAElements1D(const VariableArrayType *vla) { + mlir::Value vlaSize = vlaSizeMap[vla->getSizeExpr()]; + assert(vlaSize && "no size for VLA!"); + assert(vlaSize.getType() == sizeTy); + return {vlaSize, vla->getElementType()}; +} + // TODO(cir): Most of this function can be shared between CIRGen // and traditional LLVM codegen void CIRGenFunction::emitVariablyModifiedType(QualType type) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e5cecaa5..15322ee 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -30,6 +30,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/Stmt.h" #include "clang/AST/Type.h" +#include "clang/Basic/OperatorKinds.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" #include "clang/CIR/MissingFeatures.h" #include "clang/CIR/TypeEvaluationKind.h" @@ -122,6 +123,10 @@ public: GlobalDecl curSEHParent; + /// A mapping from NRVO variables to the flags used to indicate + /// when the NRVO has been applied to this variable. + llvm::DenseMap<const VarDecl *, mlir::Value> nrvoFlags; + llvm::DenseMap<const clang::ValueDecl *, clang::FieldDecl *> lambdaCaptureFields; clang::FieldDecl *lambdaThisCaptureField = nullptr; @@ -152,6 +157,9 @@ public: /// global initializers. mlir::Operation *curFn = nullptr; + /// Save Parameter Decl for coroutine. + llvm::SmallVector<const ParmVarDecl *> fnArgs; + using DeclMapTy = llvm::DenseMap<const clang::Decl *, Address>; /// This keeps track of the CIR allocas or globals for local C /// declarations. @@ -199,6 +207,22 @@ public: return convertType(getContext().getTypeDeclType(t)); } + /// Get integer from a mlir::Value that is an int constant or a constant op. + static int64_t getSExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && "getIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getSExtValue(); + } + + /// Get zero-extended integer from a mlir::Value that is an int constant or a + /// constant op. + static int64_t getZExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && + "getZeroExtendedIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getZExtValue(); + } + /// Return the cir::TypeEvaluationKind of QualType \c type. 
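For reference, the indirect-goto machinery above (instantiateIndirectGotoBlock, the deferred cir.indirectbr, and the blockaddress-to-label maps) is driven by the GNU labels-as-values extension. A hedged illustration, names made up, not part of this patch:

// Each &&label becomes a blockaddress that must later resolve to the label's
// block; 'goto *p' funnels through the single indirect-goto block, and the
// cir.indirectbr emitted at the end of the function lists every
// address-taken label as a possible successor.
void dispatch(int op) {
  static void *table[] = { &&add_one, &&done };
  goto *table[op];
add_one:
  op += 1;
done:
  return;
}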
static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type); @@ -219,6 +243,10 @@ public: const TargetInfo &getTarget() const { return cgm.getTarget(); } mlir::MLIRContext &getMLIRContext() { return cgm.getMLIRContext(); } + const TargetCIRGenInfo &getTargetHooks() const { + return cgm.getTargetCIRGenInfo(); + } + // --------------------- // Opaque value handling // --------------------- @@ -490,6 +518,10 @@ public: VlaSizePair(mlir::Value num, QualType ty) : numElts(num), type(ty) {} }; + /// Return the number of elements for a single dimension + /// for the given array type. + VlaSizePair getVLAElements1D(const VariableArrayType *vla); + /// Returns an MLIR::Value+QualType pair that corresponds to the size, /// in non-variably-sized elements, of a variable length array type, /// plus that largest non-variably-sized element type. Assumes that @@ -497,6 +529,12 @@ public: VlaSizePair getVLASize(const VariableArrayType *type); VlaSizePair getVLASize(QualType type); + Address getAsNaturalAddressOf(Address addr, QualType pointeeTy); + + mlir::Value getAsNaturalPointerTo(Address addr, QualType pointeeType) { + return getAsNaturalAddressOf(addr, pointeeType).getBasePointer(); + } + void finishFunction(SourceLocation endLoc); /// Determine whether the given initializer is trivial in the sense @@ -517,6 +555,8 @@ public: /// that we can just remove the code. bool containsLabel(const clang::Stmt *s, bool ignoreCaseStmts = false); + Address emitExtVectorElementLValue(LValue lv, mlir::Location loc); + class ConstantEmission { // Cannot use mlir::TypedAttr directly here because of bit availability. llvm::PointerIntPair<mlir::Attribute, 1, bool> valueAndIsReference; @@ -618,6 +658,14 @@ public: return JumpDest(target, ehStack.getInnermostNormalCleanup(), nextCleanupDestIndex++); } + /// IndirectBranch - The first time an indirect goto is seen we create a block + /// reserved for the indirect branch. Unlike before,the actual 'indirectbr' + /// is emitted at the end of the function, once all block destinations have + /// been resolved. + mlir::Block *indirectGotoBlock = nullptr; + + void resolveBlockAddresses(); + void finishIndirectBranch(); /// Perform the usual unary conversions on the specified expression and /// compare the result against zero, returning an Int1Ty value. @@ -810,6 +858,11 @@ public: llvm::iterator_range<CastExpr::path_const_iterator> path, bool nullCheckValue, SourceLocation loc); + Address getAddressOfDerivedClass( + mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived, + llvm::iterator_range<CastExpr::path_const_iterator> path, + bool nullCheckValue); + /// Return the VTT parameter that should be passed to a base /// constructor/destructor with virtual bases. /// FIXME: VTTs are Itanium ABI-specific, so the definition should move @@ -901,6 +954,11 @@ public: clang::QualType buildFunctionArgList(clang::GlobalDecl gd, FunctionArgList &args); + /// Emit the function prologue: declare function arguments in the symbol + /// table. + void emitFunctionProlog(const FunctionArgList &args, mlir::Block *entryBB, + const FunctionDecl *fd, SourceLocation bodyBeginLoc); + /// Emit code for the start of a function. /// \param loc The location to be associated with the function. /// \param startLoc The location of the function body. @@ -916,10 +974,16 @@ public: return false; } + void populateEHCatchRegions(EHScopeStack::stable_iterator scope, + cir::TryOp tryOp); + /// The cleanup depth enclosing all the cleanups associated with the /// parameters. 
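The ext-vector hooks added to CIRGenFunction.h in this patch (emitExtVectorElementExpr, emitExtVectorElementLValue, emitLoadOfExtVectorElementLValue) correspond to Clang's ext_vector_type element and swizzle accesses. A hedged C++ illustration; typedef and function names are made up, not part of this patch:

typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float2 __attribute__((ext_vector_type(2)));

// 'v.x' used as an l-value and 'v.xy' as a swizzle read are both
// ExtVectorElementExprs rather than ordinary member accesses.
float2 head(float4 v) {
  v.x = 1.0f;    // single element as an l-value
  return v.xy;   // two-element swizzle
}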
EHScopeStack::stable_iterator prologueCleanupDepth; + bool isCatchOrCleanupRequired(); + void populateCatchHandlersIfRequired(cir::TryOp tryOp); + /// Takes the old cleanup stack size and emits the cleanup blocks /// that have been added. void popCleanupBlocks(EHScopeStack::stable_iterator oldCleanupStackDepth); @@ -1063,7 +1127,7 @@ public: bool isSwitch() { return scopeKind == Kind::Switch; } bool isTernary() { return scopeKind == Kind::Ternary; } bool isTry() { return scopeKind == Kind::Try; } - + cir::TryOp getClosestTryParent(); void setAsGlobalInit() { scopeKind = Kind::GlobalInit; } void setAsSwitch() { scopeKind = Kind::Switch; } void setAsTernary() { scopeKind = Kind::Ternary; } @@ -1189,6 +1253,14 @@ public: /// CIR emit functions /// ---------------------- public: + mlir::Value emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, + ReturnValueSlot returnValue, + llvm::Triple::ArchType arch); + mlir::Value emitAArch64SMEBuiltinExpr(unsigned builtinID, + const CallExpr *expr); + mlir::Value emitAArch64SVEBuiltinExpr(unsigned builtinID, + const CallExpr *expr); + mlir::Value emitAlignmentAssumption(mlir::Value ptrValue, QualType ty, SourceLocation loc, SourceLocation assumptionLoc, @@ -1264,6 +1336,8 @@ public: QualType &baseType, Address &addr); LValue emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e); + LValue emitExtVectorElementExpr(const ExtVectorElementExpr *e); + Address emitArrayToPointerDecay(const Expr *e, LValueBaseInfo *baseInfo = nullptr); @@ -1271,6 +1345,9 @@ public: RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1304,6 +1381,32 @@ public: RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, const clang::CallExpr *e, ReturnValueSlot returnValue); + /// Returns a Value corresponding to the size of the given expression by + /// emitting a `cir.objsize` operation. + /// + /// \param e The expression whose object size to compute + /// \param type Determines the semantics of the object size computation. + /// The type parameter is a 2-bit value where: + /// bit 0 (type & 1): 0 = whole object, 1 = closest subobject + /// bit 1 (type & 2): 0 = maximum size, 2 = minimum size + /// \param resType The result type for the size value + /// \param emittedE Optional pre-emitted pointer value. If non-null, we'll + /// call `cir.objsize` on this value rather than emitting e. 
+ /// \param isDynamic If true, allows runtime evaluation via dynamic mode + mlir::Value emitBuiltinObjectSize(const clang::Expr *e, unsigned type, + cir::IntType resType, mlir::Value emittedE, + bool isDynamic); + + mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e, + unsigned type, + cir::IntType resType, + mlir::Value emittedE, + bool isDynamic); + + int64_t getAccessedFieldNo(unsigned idx, mlir::ArrayAttr elts); + + void instantiateIndirectGotoBlock(); + RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, @@ -1362,6 +1465,7 @@ public: cir::CallOp emitCoroAllocBuiltinCall(mlir::Location loc); cir::CallOp emitCoroBeginBuiltinCall(mlir::Location loc, mlir::Value coroframeAddr); + RValue emitCoroutineFrame(); void emitDestroy(Address addr, QualType type, Destroyer *destroyer); @@ -1434,6 +1538,10 @@ public: RValue emitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *expr); + RValue emitNewOrDeleteBuiltinCall(const FunctionProtoType *type, + const CallExpr *callExpr, + OverloadedOperatorKind op); + void emitCXXTemporary(const CXXTemporary *temporary, QualType tempType, Address ptr); @@ -1482,6 +1590,8 @@ public: mlir::LogicalResult emitGotoStmt(const clang::GotoStmt &s); + mlir::LogicalResult emitIndirectGotoStmt(const IndirectGotoStmt &s); + void emitImplicitAssignmentOperatorBody(FunctionArgList &args); void emitInitializerForField(clang::FieldDecl *field, LValue lhs, @@ -1501,7 +1611,8 @@ public: llvm::ArrayRef<mlir::Value> args = {}); /// Emit the computation of the specified expression of scalar type. - mlir::Value emitScalarExpr(const clang::Expr *e); + mlir::Value emitScalarExpr(const clang::Expr *e, + bool ignoreResultAssign = false); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, cir::UnaryOpKind kind, bool isPre); @@ -1522,6 +1633,9 @@ public: void emitForwardingCallToLambda(const CXXMethodDecl *lambdaCallOperator, CallArgList &callArgs); + RValue emitCoawaitExpr(const CoawaitExpr &e, + AggValueSlot aggSlot = AggValueSlot::ignored(), + bool ignoreResult = false); /// Emit the computation of the specified expression of complex type, /// returning the result. mlir::Value emitComplexExpr(const Expr *e); @@ -1587,6 +1701,8 @@ public: void emitLambdaDelegatingInvokeBody(const CXXMethodDecl *md); void emitLambdaStaticInvokeBody(const CXXMethodDecl *md); + void populateCatchHandlers(cir::TryOp tryOp); + mlir::LogicalResult emitIfStmt(const clang::IfStmt &s); /// Emit code to compute the specified expression, @@ -1598,6 +1714,8 @@ public: /// Load a complex number from the specified l-value. mlir::Value emitLoadOfComplex(LValue src, SourceLocation loc); + RValue emitLoadOfExtVectorElementLValue(LValue lv); + /// Given an expression that represents a value lvalue, this method emits /// the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. 
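A hedged illustration of the 'type' bit encoding documented for emitBuiltinObjectSize above; the struct and function names are made up, and the byte counts assume the obvious layout with no padding. Not part of this patch:

#include <cstddef>

struct Packet {
  char header[8];
  char payload[56];
};

// Bit 0 picks the enclosing object: with an otherwise unknown 'p', type 0
// measures from 'header' to the end of the whole Packet (64 bytes), while
// type 1 stops at the end of the 'header' subobject (8 bytes). Setting
// bit 1 (types 2 and 3) asks for a minimum instead of a maximum.
std::size_t demo(Packet *p) {
  std::size_t to_end_of_object = __builtin_object_size(p->header, 0);
  std::size_t header_only      = __builtin_object_size(p->header, 1);
  return to_end_of_object + header_only;
}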
@@ -1673,14 +1791,17 @@ public: void emitScalarInit(const clang::Expr *init, mlir::Location loc, LValue lvalue, bool capturedByInit = false); + mlir::Value emitScalarOrConstFoldImmArg(unsigned iceArguments, unsigned idx, + const Expr *argExpr); + void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage); void emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest, bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified @@ -1740,7 +1861,7 @@ public: mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); - mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr); /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. @@ -1899,25 +2020,23 @@ public: private: template <typename Op> Op emitOpenACCOp(mlir::Location start, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, llvm::ArrayRef<const OpenACCClause *> clauses); // Function to do the basic implementation of an operation with an Associated // Statement. Models AssociatedStmtConstruct. template <typename Op, typename TermOp> - mlir::LogicalResult emitOpenACCOpAssociatedStmt( - mlir::Location start, mlir::Location end, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, llvm::ArrayRef<const OpenACCClause *> clauses, - const Stmt *associatedStmt); + mlir::LogicalResult + emitOpenACCOpAssociatedStmt(mlir::Location start, mlir::Location end, + OpenACCDirectiveKind dirKind, + llvm::ArrayRef<const OpenACCClause *> clauses, + const Stmt *associatedStmt); template <typename Op, typename TermOp> mlir::LogicalResult emitOpenACCOpCombinedConstruct( mlir::Location start, mlir::Location end, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, llvm::ArrayRef<const OpenACCClause *> clauses, - const Stmt *loopStmt); + llvm::ArrayRef<const OpenACCClause *> clauses, const Stmt *loopStmt); template <typename Op> void emitOpenACCClauses(Op &op, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, ArrayRef<const OpenACCClause *> clauses); // The second template argument doesn't need to be a template, since it should // always be an mlir::acc::LoopOp, but as this is a template anyway, we make @@ -1927,7 +2046,7 @@ private: // instantiated 3x. 
template <typename ComputeOp, typename LoopOp> void emitOpenACCClauses(ComputeOp &op, LoopOp &loopOp, - OpenACCDirectiveKind dirKind, SourceLocation dirLoc, + OpenACCDirectiveKind dirKind, ArrayRef<const OpenACCClause *> clauses); // The OpenACC LoopOp requires that we have auto, seq, or independent on all diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index f603f5ec..7e145f2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -123,6 +123,12 @@ public: return true; } + size_t getSrcArgforCopyCtor(const CXXConstructorDecl *, + FunctionArgList &args) const override { + assert(!args.empty() && "expected the arglist to not be empty!"); + return args.size() - 1; + } + void emitBadCastCall(CIRGenFunction &cgf, mlir::Location loc) override; mlir::Value @@ -459,7 +465,8 @@ void CIRGenItaniumCXXABI::emitVTableDefinitions(CIRGenVTables &cgvt, "emitVTableDefinitions: __fundamental_type_info"); } - auto vtableAsGlobalValue = dyn_cast<cir::CIRGlobalValueInterface>(*vtable); + [[maybe_unused]] auto vtableAsGlobalValue = + dyn_cast<cir::CIRGlobalValueInterface>(*vtable); assert(vtableAsGlobalValue && "VTable must support CIRGlobalValueInterface"); // Always emit type metadata on non-available_externally definitions, and on // available_externally definitions if we are performing whole program diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 9f9b2db..eaa9e94 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -675,12 +675,18 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty, errorNYI(d->getSourceRange(), "OpenMP target global variable"); gv.setAlignmentAttr(getSize(astContext.getDeclAlign(d))); - assert(!cir::MissingFeatures::opGlobalConstant()); + // FIXME: This code is overly simple and should be merged with other global + // handling. + gv.setConstant(d->getType().isConstantStorage( + astContext, /*ExcludeCtor=*/false, /*ExcludeDtor=*/false)); setLinkageForGV(gv, d); - if (d->getTLSKind()) - errorNYI(d->getSourceRange(), "thread local global variable"); + if (d->getTLSKind()) { + if (d->getTLSKind() == VarDecl::TLS_Dynamic) + errorNYI(d->getSourceRange(), "TLS dynamic"); + setTLSMode(gv, *d); + } setGVProperties(gv, d); @@ -735,12 +741,11 @@ mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty, if (!ty) ty = getTypes().convertTypeForMem(astTy); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); - + bool tlsAccess = d->getTLSKind() != VarDecl::TLS_None; cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition); mlir::Type ptrTy = builder.getPointerTo(g.getSymType()); return cir::GetGlobalOp::create(builder, getLoc(d->getSourceRange()), ptrTy, - g.getSymName()); + g.getSymNameAttr(), tlsAccess); } cir::GlobalViewAttr CIRGenModule::getAddrOfGlobalVarAttr(const VarDecl *d) { @@ -864,7 +869,11 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, if (emitter) emitter->finalize(gv); - assert(!cir::MissingFeatures::opGlobalConstant()); + // If it is safe to mark the global 'constant', do so now. + gv.setConstant((vd->hasAttr<CUDAConstantAttr>() && langOpts.CUDAIsDevice) || + (!needsGlobalCtor && !needsGlobalDtor && + vd->getType().isConstantStorage( + astContext, /*ExcludeCtor=*/true, /*ExcludeDtor=*/true))); assert(!cir::MissingFeatures::opGlobalSection()); // Set CIR's linkage type as appropriate. 
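A hedged illustration of which definitions the constant-marking logic above applies to; the type and variable names are made up, not part of this patch:

struct Logger {
  Logger();
  ~Logger();
  int fd;
};

// 'answer' has constant storage and needs no constructor or destructor, so
// its cir.global can be marked constant. 'logger' is const-qualified but
// requires a runtime constructor and a destructor, so it must remain
// writable and is not flagged constant here.
const int answer = 42;
const Logger logger;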
@@ -876,8 +885,17 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, // FIXME(cir): setLinkage should likely set MLIR's visibility automatically. gv.setVisibility(getMLIRVisibilityFromCIRLinkage(linkage)); assert(!cir::MissingFeatures::opGlobalDLLImportExport()); - if (linkage == cir::GlobalLinkageKind::CommonLinkage) - errorNYI(initExpr->getSourceRange(), "common linkage"); + if (linkage == cir::GlobalLinkageKind::CommonLinkage) { + // common vars aren't constant even if declared const. + gv.setConstant(false); + // Tentative definition of global variables may be initialized with + // non-zero null pointers. In this case they should have weak linkage + // since common linkage must have zero initializer and must not have + // explicit section therefore cannot have non-zero initial value. + std::optional<mlir::Attribute> initializer = gv.getInitialValue(); + if (initializer && !getBuilder().isNullValue(*initializer)) + gv.setLinkage(cir::GlobalLinkageKind::WeakAnyLinkage); + } setNonAliasAttributes(vd, gv); @@ -1231,10 +1249,8 @@ cir::GlobalLinkageKind CIRGenModule::getCIRLinkageForDeclarator( // linkage. if (!getLangOpts().CPlusPlus && isa<VarDecl>(dd) && !isVarDeclStrongDefinition(astContext, *this, cast<VarDecl>(dd), - getCodeGenOpts().NoCommon)) { - errorNYI(dd->getBeginLoc(), "common linkage", dd->getDeclKindName()); + getCodeGenOpts().NoCommon)) return cir::GlobalLinkageKind::CommonLinkage; - } // selectany symbols are externally visible, so use weak instead of // linkonce. MSVC optimizes away references to const selectany globals, so @@ -1424,6 +1440,23 @@ CIRGenModule::getAddrOfConstantStringFromLiteral(const StringLiteral *s, return builder.getGlobalViewAttr(ptrTy, gv); } +// TODO(cir): this could be a common AST helper for both CIR and LLVM codegen. +LangAS CIRGenModule::getLangTempAllocaAddressSpace() const { + if (getLangOpts().OpenCL) + return LangAS::opencl_private; + + // For temporaries inside functions, CUDA treats them as normal variables. + // LangAS::cuda_device, on the other hand, is reserved for those variables + // explicitly marked with __device__. + if (getLangOpts().CUDAIsDevice) + return LangAS::Default; + + if (getLangOpts().SYCLIsDevice || + (getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice)) + errorNYI("SYCL or OpenMP temp address space"); + return LangAS::Default; +} + void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e, CIRGenFunction *cgf) { if (cgf && e->getType()->isVariablyModifiedType()) @@ -1433,6 +1466,26 @@ void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e, "emitExplicitCastExprType"); } +mlir::Value CIRGenModule::emitMemberPointerConstant(const UnaryOperator *e) { + assert(!cir::MissingFeatures::cxxABI()); + + mlir::Location loc = getLoc(e->getSourceRange()); + + const auto *decl = cast<DeclRefExpr>(e->getSubExpr())->getDecl(); + + // A member function pointer. + if (isa<CXXMethodDecl>(decl)) { + errorNYI(e->getSourceRange(), "emitMemberPointerConstant: method pointer"); + return {}; + } + + // Otherwise, a member data pointer. 
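For illustration (not part of this patch; the struct name is made up), the expression this branch constant-folds is a pointer to a non-static data member:

struct Widget {
  int id;
  int weight;
};

// '&Widget::weight' is a UnaryOperator over a DeclRefExpr that names the
// FieldDecl; it folds to a cir.const carrying a data-member attribute that
// records which field is selected, rather than a run-time address.
int Widget::*selected = &Widget::weight;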
+ auto ty = mlir::cast<cir::DataMemberType>(convertType(e->getType())); + const auto *fieldDecl = cast<FieldDecl>(decl); + return cir::ConstantOp::create( + builder, loc, builder.getDataMemberAttr(ty, fieldDecl->getFieldIndex())); +} + void CIRGenModule::emitDeclContext(const DeclContext *dc) { for (Decl *decl : dc->decls()) { // Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope @@ -1482,10 +1535,10 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) { break; } case Decl::OpenACCRoutine: - emitGlobalOpenACCDecl(cast<OpenACCRoutineDecl>(decl)); + emitGlobalOpenACCRoutineDecl(cast<OpenACCRoutineDecl>(decl)); break; case Decl::OpenACCDeclare: - emitGlobalOpenACCDecl(cast<OpenACCDeclareDecl>(decl)); + emitGlobalOpenACCDeclareDecl(cast<OpenACCDeclareDecl>(decl)); break; case Decl::Enum: case Decl::Using: // using X; [C++] @@ -1525,10 +1578,14 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) { break; case Decl::ClassTemplateSpecialization: - case Decl::CXXRecord: + case Decl::CXXRecord: { + CXXRecordDecl *crd = cast<CXXRecordDecl>(decl); assert(!cir::MissingFeatures::generateDebugInfo()); - assert(!cir::MissingFeatures::cxxRecordStaticMembers()); + for (auto *childDecl : crd->decls()) + if (isa<VarDecl, CXXRecordDecl, EnumDecl, OpenACCDeclareDecl>(childDecl)) + emitTopLevelDecl(childDecl); break; + } case Decl::FileScopeAsm: // File-scope asm is ignored during device-side CUDA compilation. @@ -1898,6 +1955,33 @@ void CIRGenModule::setGVPropertiesAux(mlir::Operation *op, assert(!cir::MissingFeatures::opGlobalPartition()); } +cir::TLS_Model CIRGenModule::getDefaultCIRTLSModel() const { + switch (getCodeGenOpts().getDefaultTLSModel()) { + case CodeGenOptions::GeneralDynamicTLSModel: + return cir::TLS_Model::GeneralDynamic; + case CodeGenOptions::LocalDynamicTLSModel: + return cir::TLS_Model::LocalDynamic; + case CodeGenOptions::InitialExecTLSModel: + return cir::TLS_Model::InitialExec; + case CodeGenOptions::LocalExecTLSModel: + return cir::TLS_Model::LocalExec; + } + llvm_unreachable("Invalid TLS model!"); +} + +void CIRGenModule::setTLSMode(mlir::Operation *op, const VarDecl &d) { + assert(d.getTLSKind() && "setting TLS mode on non-TLS var!"); + + cir::TLS_Model tlm = getDefaultCIRTLSModel(); + + // Override the TLS model if it is explicitly specified. + if (d.getAttr<TLSModelAttr>()) + errorNYI(d.getSourceRange(), "TLS model attribute"); + + auto global = cast<cir::GlobalOp>(op); + global.setTlsModel(tlm); +} + void CIRGenModule::setFunctionAttributes(GlobalDecl globalDecl, cir::FuncOp func, bool isIncompleteFunction, @@ -1940,7 +2024,6 @@ void CIRGenModule::setCIRFunctionAttributesForDefinition( existingInlineKind && *existingInlineKind == cir::InlineKind::NoInline; bool isAlwaysInline = existingInlineKind && *existingInlineKind == cir::InlineKind::AlwaysInline; - if (!decl) { assert(!cir::MissingFeatures::hlsl()); @@ -1949,8 +2032,7 @@ void CIRGenModule::setCIRFunctionAttributesForDefinition( // If inlining is disabled and we don't have a declaration to control // inlining, mark the function as 'noinline' unless it is explicitly // marked as 'alwaysinline'. - f.setInlineKindAttr( - cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + f.setInlineKind(cir::InlineKind::NoInline); } return; @@ -1967,19 +2049,16 @@ void CIRGenModule::setCIRFunctionAttributesForDefinition( // Handle inline attributes if (decl->hasAttr<NoInlineAttr>() && !isAlwaysInline) { // Add noinline if the function isn't always_inline. 
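The TLS handling added above (setTLSMode together with getDefaultCIRTLSModel) is exercised by thread-local globals; an explicit tls_model attribute still hits errorNYI. A hedged C++ illustration, names made up, not part of this patch:

// Each thread gets its own copy. With no -ftls-model override, the default
// model from CodeGenOptions (general-dynamic unless the driver narrows it)
// is translated into the cir.global's tls_model.
thread_local int per_thread_counter = 0;

int bump() { return ++per_thread_counter; }

// Still NYI in CIR at this point:
//   [[gnu::tls_model("initial-exec")]] thread_local int fast_counter;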
- f.setInlineKindAttr( - cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + f.setInlineKind(cir::InlineKind::NoInline); } else if (decl->hasAttr<AlwaysInlineAttr>() && !isNoInline) { // Don't override AlwaysInline with NoInline, or vice versa, since we can't // specify both in IR. - f.setInlineKindAttr( - cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::AlwaysInline)); + f.setInlineKind(cir::InlineKind::AlwaysInline); } else if (codeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) { // If inlining is disabled, force everything that isn't always_inline // to carry an explicit noinline attribute. if (!isAlwaysInline) { - f.setInlineKindAttr( - cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + f.setInlineKind(cir::InlineKind::NoInline); } } else { // Otherwise, propagate the inline hint attribute and potentially use its @@ -2001,13 +2080,11 @@ void CIRGenModule::setCIRFunctionAttributesForDefinition( return any_of(pattern->redecls(), checkRedeclForInline); }; if (checkForInline(fd)) { - f.setInlineKindAttr(cir::InlineAttr::get(&getMLIRContext(), - cir::InlineKind::InlineHint)); + f.setInlineKind(cir::InlineKind::InlineHint); } else if (codeGenOpts.getInlining() == CodeGenOptions::OnlyHintInlining && !fd->isInlined() && !isAlwaysInline) { - f.setInlineKindAttr( - cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + f.setInlineKind(cir::InlineKind::NoInline); } } } @@ -2194,8 +2271,20 @@ CIRGenModule::createCIRFunction(mlir::Location loc, StringRef name, assert(!cir::MissingFeatures::opFuncExtraAttrs()); + // Mark C++ special member functions (Constructor, Destructor etc.) + setCXXSpecialMemberAttr(func, funcDecl); + if (!cgf) theModule.push_back(func); + + if (this->getLangOpts().OpenACC) { + // We only have to handle this attribute, since OpenACCAnnotAttrs are + // handled via the end-of-TU work. 
+ for (const auto *attr : + funcDecl->specific_attrs<OpenACCRoutineDeclAttr>()) + emitOpenACCRoutineDecl(funcDecl, func, attr->getLocation(), + attr->Clauses); + } } return func; } @@ -2209,6 +2298,58 @@ CIRGenModule::createCIRBuiltinFunction(mlir::Location loc, StringRef name, return fnOp; } +static cir::CtorKind getCtorKindFromDecl(const CXXConstructorDecl *ctor) { + if (ctor->isDefaultConstructor()) + return cir::CtorKind::Default; + if (ctor->isCopyConstructor()) + return cir::CtorKind::Copy; + if (ctor->isMoveConstructor()) + return cir::CtorKind::Move; + return cir::CtorKind::Custom; +} + +static cir::AssignKind getAssignKindFromDecl(const CXXMethodDecl *method) { + if (method->isCopyAssignmentOperator()) + return cir::AssignKind::Copy; + if (method->isMoveAssignmentOperator()) + return cir::AssignKind::Move; + llvm_unreachable("not a copy or move assignment operator"); +} + +void CIRGenModule::setCXXSpecialMemberAttr( + cir::FuncOp funcOp, const clang::FunctionDecl *funcDecl) { + if (!funcDecl) + return; + + if (const auto *dtor = dyn_cast<CXXDestructorDecl>(funcDecl)) { + auto cxxDtor = cir::CXXDtorAttr::get( + convertType(getASTContext().getCanonicalTagType(dtor->getParent())), + dtor->isTrivial()); + funcOp.setCxxSpecialMemberAttr(cxxDtor); + return; + } + + if (const auto *ctor = dyn_cast<CXXConstructorDecl>(funcDecl)) { + cir::CtorKind kind = getCtorKindFromDecl(ctor); + auto cxxCtor = cir::CXXCtorAttr::get( + convertType(getASTContext().getCanonicalTagType(ctor->getParent())), + kind, ctor->isTrivial()); + funcOp.setCxxSpecialMemberAttr(cxxCtor); + return; + } + + const auto *method = dyn_cast<CXXMethodDecl>(funcDecl); + if (method && (method->isCopyAssignmentOperator() || + method->isMoveAssignmentOperator())) { + cir::AssignKind assignKind = getAssignKindFromDecl(method); + auto cxxAssign = cir::CXXAssignAttr::get( + convertType(getASTContext().getCanonicalTagType(method->getParent())), + assignKind, method->isTrivial()); + funcOp.setCxxSpecialMemberAttr(cxxAssign); + return; + } +} + cir::FuncOp CIRGenModule::createRuntimeFunction(cir::FuncType ty, StringRef name, mlir::ArrayAttr, [[maybe_unused]] bool isLocal, @@ -2406,3 +2547,39 @@ DiagnosticBuilder CIRGenModule::errorNYI(SourceRange loc, llvm::StringRef feature) { return errorNYI(loc.getBegin(), feature) << loc; } + +void CIRGenModule::mapBlockAddress(cir::BlockAddrInfoAttr blockInfo, + cir::LabelOp label) { + [[maybe_unused]] auto result = + blockAddressInfoToLabel.try_emplace(blockInfo, label); + assert(result.second && + "attempting to map a blockaddress info that is already mapped"); +} + +void CIRGenModule::mapUnresolvedBlockAddress(cir::BlockAddressOp op) { + [[maybe_unused]] auto result = unresolvedBlockAddressToLabel.insert(op); + assert(result.second && + "attempting to map a blockaddress operation that is already mapped"); +} + +void CIRGenModule::mapResolvedBlockAddress(cir::BlockAddressOp op, + cir::LabelOp label) { + [[maybe_unused]] auto result = blockAddressToLabel.try_emplace(op, label); + assert(result.second && + "attempting to map a blockaddress operation that is already mapped"); +} + +void CIRGenModule::updateResolvedBlockAddress(cir::BlockAddressOp op, + cir::LabelOp newLabel) { + auto *it = blockAddressToLabel.find(op); + assert(it != blockAddressToLabel.end() && + "trying to update a blockaddress not previously mapped"); + assert(!it->second && "blockaddress already has a resolved label"); + + it->second = newLabel; +} + +cir::LabelOp +CIRGenModule::lookupBlockAddressInfo(cir::BlockAddrInfoAttr 
blockInfo) { + return blockAddressInfoToLabel.lookup(blockInfo); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 186913d..de263f4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -126,7 +126,23 @@ public: /// the pointers are supposed to be uniqued, should be fine. Revisit this if /// it ends up taking too much memory. llvm::DenseMap<const clang::FieldDecl *, llvm::StringRef> lambdaFieldToName; - + /// Map BlockAddrInfoAttr (function name, label name) to the corresponding CIR + /// LabelOp. This provides the main lookup table used to resolve block + /// addresses into their label operations. + llvm::DenseMap<cir::BlockAddrInfoAttr, cir::LabelOp> blockAddressInfoToLabel; + /// Map CIR BlockAddressOps directly to their resolved LabelOps. + /// Used once a block address has been successfully lowered to a label. + llvm::MapVector<cir::BlockAddressOp, cir::LabelOp> blockAddressToLabel; + /// Track CIR BlockAddressOps that cannot be resolved immediately + /// because their LabelOp has not yet been emitted. These entries + /// are solved later once the corresponding label is available. + llvm::DenseSet<cir::BlockAddressOp> unresolvedBlockAddressToLabel; + cir::LabelOp lookupBlockAddressInfo(cir::BlockAddrInfoAttr blockInfo); + void mapBlockAddress(cir::BlockAddrInfoAttr blockInfo, cir::LabelOp label); + void mapUnresolvedBlockAddress(cir::BlockAddressOp op); + void mapResolvedBlockAddress(cir::BlockAddressOp op, cir::LabelOp); + void updateResolvedBlockAddress(cir::BlockAddressOp op, + cir::LabelOp newLabel); /// Tell the consumer that this variable has been instantiated. void handleCXXStaticMemberVarInstantiation(VarDecl *vd); @@ -297,6 +313,12 @@ public: getAddrOfConstantStringFromLiteral(const StringLiteral *s, llvm::StringRef name = ".str"); + /// Returns the address space for temporary allocations in the language. This + /// ensures that the allocated variable's address space matches the + /// expectations of the AST, rather than using the target's allocation address + /// space, which may lead to type mismatches in other parts of the IR. + LangAS getLangTempAllocaAddressSpace() const; + /// Set attributes which are common to any form of a global definition (alias, /// Objective-C method, function, global variable). /// @@ -425,6 +447,13 @@ public: void setGVProperties(mlir::Operation *op, const NamedDecl *d) const; void setGVPropertiesAux(mlir::Operation *op, const NamedDecl *d) const; + /// Set TLS mode for the given operation based on the given variable + /// declaration. + void setTLSMode(mlir::Operation *op, const VarDecl &d); + + /// Get TLS mode from CodeGenOptions. + cir::TLS_Model getDefaultCIRTLSModel() const; + /// Set function attributes for a function declaration. void setFunctionAttributes(GlobalDecl gd, cir::FuncOp f, bool isIncompleteFunction, bool isThunk); @@ -447,6 +476,20 @@ public: bool performInit); void emitGlobalOpenACCDecl(const clang::OpenACCConstructDecl *cd); + void emitGlobalOpenACCRoutineDecl(const clang::OpenACCRoutineDecl *cd); + void emitGlobalOpenACCDeclareDecl(const clang::OpenACCDeclareDecl *cd); + template <typename BeforeOpTy, typename DataClauseTy> + void emitGlobalOpenACCDeclareDataOperands(const Expr *varOperand, + DataClauseTy dataClause, + OpenACCModifierKind modifiers, + bool structured, bool implicit, + bool requiresDtor); + // Each of the acc.routine operations must have a unique name, so we just use + // an integer counter. 
This is how Flang does it, so it seems reasonable. + unsigned routineCounter = 0; + void emitOpenACCRoutineDecl(const clang::FunctionDecl *funcDecl, + cir::FuncOp func, SourceLocation pragmaLoc, + ArrayRef<const OpenACCClause *> clauses); // C++ related functions. void emitDeclContext(const DeclContext *dc); @@ -455,6 +498,14 @@ public: /// expression of the given type. mlir::Value emitNullConstant(QualType t, mlir::Location loc); + mlir::TypedAttr emitNullConstantAttr(QualType t); + + /// Return a null constant appropriate for zero-initializing a base class with + /// the given type. This is usually, but not always, an LLVM null constant. + mlir::TypedAttr emitNullConstantForBase(const CXXRecordDecl *record); + + mlir::Value emitMemberPointerConstant(const UnaryOperator *e); + llvm::StringRef getMangledName(clang::GlobalDecl gd); void emitTentativeDefinition(const VarDecl *d); @@ -491,6 +542,10 @@ public: cir::FuncType ty, const clang::FunctionDecl *fd); + /// Mark the function as a special member (e.g. constructor, destructor) + void setCXXSpecialMemberAttr(cir::FuncOp funcOp, + const clang::FunctionDecl *funcDecl); + cir::FuncOp createRuntimeFunction(cir::FuncType ty, llvm::StringRef name, mlir::ArrayAttr = {}, bool isLocal = false, bool assumeConvergent = false); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 527dfd2..8e7384ae 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -14,6 +14,7 @@ #include "CIRGenCXXABI.h" #include "CIRGenFunction.h" +#include "CIRGenOpenACCHelpers.h" #include "CIRGenOpenACCRecipe.h" #include "clang/AST/ExprCXX.h" @@ -60,9 +61,6 @@ class OpenACCClauseCIREmitter final // This is necessary since a few of the clauses emit differently based on the // directive kind they are attached to. OpenACCDirectiveKind dirKind; - // TODO(cir): This source location should be able to go away once the NYI - // diagnostics are gone. - SourceLocation dirLoc; llvm::SmallVector<mlir::acc::DeviceType> lastDeviceTypeValues; // Keep track of the async-clause so that we can shortcut updating the data @@ -71,10 +69,6 @@ class OpenACCClauseCIREmitter final // Keep track of the data operands so that we can update their async clauses. llvm::SmallVector<mlir::Operation *> dataOperands; - void clauseNotImplemented(const OpenACCClause &c) { - cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind()); - } - void setLastDeviceTypeClause(const OpenACCDeviceTypeClause &clause) { lastDeviceTypeValues.clear(); @@ -118,19 +112,6 @@ class OpenACCClauseCIREmitter final return createConstantInt(cgf.cgm.getLoc(loc), width, value); } - mlir::acc::DeviceType decodeDeviceType(const IdentifierInfo *ii) { - // '*' case leaves no identifier-info, just a nullptr. 
- if (!ii) - return mlir::acc::DeviceType::Star; - return llvm::StringSwitch<mlir::acc::DeviceType>(ii->getName()) - .CaseLower("default", mlir::acc::DeviceType::Default) - .CaseLower("host", mlir::acc::DeviceType::Host) - .CaseLower("multicore", mlir::acc::DeviceType::Multicore) - .CasesLower({"nvidia", "acc_device_nvidia"}, - mlir::acc::DeviceType::Nvidia) - .CaseLower("radeon", mlir::acc::DeviceType::Radeon); - } - mlir::acc::GangArgType decodeGangType(OpenACCGangKind gk) { switch (gk) { case OpenACCGangKind::Num: @@ -149,7 +130,7 @@ class OpenACCClauseCIREmitter final mlir::OpBuilder::InsertionGuard guardCase(builder); builder.setInsertionPoint(operation.loopOp); OpenACCClauseCIREmitter<mlir::acc::LoopOp> loopEmitter{ - operation.loopOp, recipeInsertLocation, cgf, builder, dirKind, dirLoc}; + operation.loopOp, recipeInsertLocation, cgf, builder, dirKind}; loopEmitter.lastDeviceTypeValues = lastDeviceTypeValues; loopEmitter.Visit(&c); } @@ -160,12 +141,7 @@ class OpenACCClauseCIREmitter final mlir::OpBuilder::InsertionGuard guardCase(builder); builder.setInsertionPoint(operation.computeOp); OpenACCClauseCIREmitter<typename OpTy::ComputeOpTy> computeEmitter{ - operation.computeOp, - recipeInsertLocation, - cgf, - builder, - dirKind, - dirLoc}; + operation.computeOp, recipeInsertLocation, cgf, builder, dirKind}; computeEmitter.lastDeviceTypeValues = lastDeviceTypeValues; @@ -182,33 +158,6 @@ class OpenACCClauseCIREmitter final dataOperands.append(computeEmitter.dataOperands); } - mlir::acc::DataClauseModifier - convertModifiers(OpenACCModifierKind modifiers) { - using namespace mlir::acc; - static_assert(static_cast<int>(OpenACCModifierKind::Zero) == - static_cast<int>(DataClauseModifier::zero) && - static_cast<int>(OpenACCModifierKind::Readonly) == - static_cast<int>(DataClauseModifier::readonly) && - static_cast<int>(OpenACCModifierKind::AlwaysIn) == - static_cast<int>(DataClauseModifier::alwaysin) && - static_cast<int>(OpenACCModifierKind::AlwaysOut) == - static_cast<int>(DataClauseModifier::alwaysout) && - static_cast<int>(OpenACCModifierKind::Capture) == - static_cast<int>(DataClauseModifier::capture)); - - DataClauseModifier mlirModifiers{}; - - // The MLIR representation of this represents `always` as `alwaysin` + - // `alwaysout`. So do a small fixup here. - if (isOpenACCModifierBitSet(modifiers, OpenACCModifierKind::Always)) { - mlirModifiers = mlirModifiers | DataClauseModifier::always; - modifiers &= ~OpenACCModifierKind::Always; - } - - mlirModifiers = mlirModifiers | static_cast<DataClauseModifier>(modifiers); - return mlirModifiers; - } - template <typename BeforeOpTy, typename AfterOpTy> void addDataOperand(const Expr *varOperand, mlir::acc::DataClause dataClause, OpenACCModifierKind modifiers, bool structured, @@ -243,8 +192,8 @@ class OpenACCClauseCIREmitter final // Set the 'rest' of the info for both operations. beforeOp.setDataClause(dataClause); afterOp.setDataClause(dataClause); - beforeOp.setModifiers(convertModifiers(modifiers)); - afterOp.setModifiers(convertModifiers(modifiers)); + beforeOp.setModifiers(convertOpenACCModifiers(modifiers)); + afterOp.setModifiers(convertOpenACCModifiers(modifiers)); // Make sure we record these, so 'async' values can be updated later. dataOperands.push_back(beforeOp.getOperation()); @@ -264,7 +213,7 @@ class OpenACCClauseCIREmitter final // Set the 'rest' of the info for the operation. 
beforeOp.setDataClause(dataClause); - beforeOp.setModifiers(convertModifiers(modifiers)); + beforeOp.setModifiers(convertOpenACCModifiers(modifiers)); // Make sure we record these, so 'async' values can be updated later. dataOperands.push_back(beforeOp.getOperation()); @@ -368,12 +317,12 @@ public: mlir::OpBuilder::InsertPoint &recipeInsertLocation, CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, - OpenACCDirectiveKind dirKind, SourceLocation dirLoc) + OpenACCDirectiveKind dirKind) : operation(operation), recipeInsertLocation(recipeInsertLocation), - cgf(cgf), builder(builder), dirKind(dirKind), dirLoc(dirLoc) {} + cgf(cgf), builder(builder), dirKind(dirKind) {} void VisitClause(const OpenACCClause &clause) { - clauseNotImplemented(clause); + llvm_unreachable("Unknown/unhandled clause kind"); } // The entry point for the CIR emitter. All users should use this rather than @@ -432,9 +381,7 @@ public: // Nothing to do here either, combined constructs are just going to use // 'lastDeviceTypeValues' to set the value for the child visitor. } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. routine construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitDeviceTypeClause"); } } @@ -498,9 +445,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Combined constructs remain. update construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitAsyncClause"); } } @@ -627,7 +572,7 @@ public: } else { // TODO: When we've implemented this for everything, switch this to an // unreachable. update construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitWaitClause"); } } @@ -646,9 +591,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToLoopOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Routine construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitSeqClause"); } } @@ -658,9 +601,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToLoopOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Routine, construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitAutoClause"); } } @@ -670,9 +611,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToLoopOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Routine construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitIndependentClause"); } } @@ -732,9 +671,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToLoopOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Combined constructs remain. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitWorkerClause"); } } @@ -750,9 +687,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToLoopOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. Combined constructs remain. 
- return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitVectorClause"); } } @@ -800,12 +735,16 @@ public: var, mlir::acc::DataClause::acc_copy, clause.getModifierList(), /*structured=*/true, /*implicit=*/false); + } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::CopyinOp>( + var, mlir::acc::DataClause::acc_copy, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyClause"); } } @@ -822,12 +761,16 @@ public: addDataOperand<mlir::acc::CopyinOp>( var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::CopyinOp>( + var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyInClause"); } } @@ -845,12 +788,16 @@ public: var, mlir::acc::DataClause::acc_copyout, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::CreateOp>( + var, mlir::acc::DataClause::acc_copyout, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyOutClause"); } } @@ -867,12 +814,28 @@ public: addDataOperand<mlir::acc::CreateOp>( var, mlir::acc::DataClause::acc_create, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::CreateOp>( + var, mlir::acc::DataClause::acc_create, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. 
- return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCreateClause"); + } + } + + void VisitLinkClause(const OpenACCLinkClause &clause) { + if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::DeclareLinkOp>( + var, mlir::acc::DataClause::acc_declare_link, {}, + /*structured=*/true, + /*implicit=*/false); + } else { + llvm_unreachable("Unknown construct kind in VisitLinkClause"); } } @@ -921,7 +884,8 @@ public: void VisitDevicePtrClause(const OpenACCDevicePtrClause &clause) { if constexpr (isOneOfTypes<OpTy, mlir::acc::ParallelOp, mlir::acc::SerialOp, - mlir::acc::KernelsOp, mlir::acc::DataOp>) { + mlir::acc::KernelsOp, mlir::acc::DataOp, + mlir::acc::DeclareEnterOp>) { for (const Expr *var : clause.getVarList()) addDataOperand<mlir::acc::DevicePtrOp>( var, mlir::acc::DataClause::acc_deviceptr, {}, @@ -930,9 +894,7 @@ public: } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitDevicePtrClause"); } } @@ -957,12 +919,16 @@ public: addDataOperand<mlir::acc::PresentOp, mlir::acc::DeleteOp>( var, mlir::acc::DataClause::acc_present, {}, /*structured=*/true, /*implicit=*/false); + } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::PresentOp>( + var, mlir::acc::DataClause::acc_present, {}, + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType<OpTy>) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitPresentClause"); } } @@ -1104,6 +1070,18 @@ public: llvm_unreachable("Unknown construct kind in VisitReductionClause"); } } + + void VisitDeviceResidentClause(const OpenACCDeviceResidentClause &clause) { + if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) { + for (const Expr *var : clause.getVarList()) + addDataOperand<mlir::acc::DeclareDeviceResidentOp>( + var, mlir::acc::DataClause::acc_declare_device_resident, {}, + /*structured=*/true, + /*implicit=*/false); + } else { + llvm_unreachable("Unknown construct kind in VisitDeviceResidentClause"); + } + } }; template <typename OpTy> @@ -1111,29 +1089,28 @@ auto makeClauseEmitter(OpTy &op, mlir::OpBuilder::InsertPoint &recipeInsertLocation, CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, - OpenACCDirectiveKind dirKind, SourceLocation dirLoc) { + OpenACCDirectiveKind dirKind) { return OpenACCClauseCIREmitter<OpTy>(op, recipeInsertLocation, cgf, builder, - dirKind, dirLoc); + dirKind); } } // namespace template <typename Op> void CIRGenFunction::emitOpenACCClauses( - Op &op, OpenACCDirectiveKind dirKind, SourceLocation dirLoc, + Op &op, OpenACCDirectiveKind dirKind, ArrayRef<const OpenACCClause *> clauses) { mlir::OpBuilder::InsertionGuard guardCase(builder); // Sets insertion point before the 'op', since every new expression needs to // be before the operation. 
builder.setInsertionPoint(op); - makeClauseEmitter(op, lastRecipeLocation, *this, builder, dirKind, dirLoc) + makeClauseEmitter(op, lastRecipeLocation, *this, builder, dirKind) .emitClauses(clauses); } #define EXPL_SPEC(N) \ template void CIRGenFunction::emitOpenACCClauses<N>( \ - N &, OpenACCDirectiveKind, SourceLocation, \ - ArrayRef<const OpenACCClause *>); + N &, OpenACCDirectiveKind, ArrayRef<const OpenACCClause *>); EXPL_SPEC(mlir::acc::ParallelOp) EXPL_SPEC(mlir::acc::SerialOp) EXPL_SPEC(mlir::acc::KernelsOp) @@ -1151,25 +1128,26 @@ EXPL_SPEC(mlir::acc::AtomicReadOp) EXPL_SPEC(mlir::acc::AtomicWriteOp) EXPL_SPEC(mlir::acc::AtomicCaptureOp) EXPL_SPEC(mlir::acc::AtomicUpdateOp) +EXPL_SPEC(mlir::acc::DeclareEnterOp) #undef EXPL_SPEC template <typename ComputeOp, typename LoopOp> void CIRGenFunction::emitOpenACCClauses( ComputeOp &op, LoopOp &loopOp, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, ArrayRef<const OpenACCClause *> clauses) { + ArrayRef<const OpenACCClause *> clauses) { static_assert(std::is_same_v<mlir::acc::LoopOp, LoopOp>); CombinedConstructClauseInfo<ComputeOp> inf{op, loopOp}; // We cannot set the insertion point here and do so in the emitter, but make // sure we reset it with the 'guard' anyway. mlir::OpBuilder::InsertionGuard guardCase(builder); - makeClauseEmitter(inf, lastRecipeLocation, *this, builder, dirKind, dirLoc) + makeClauseEmitter(inf, lastRecipeLocation, *this, builder, dirKind) .emitClauses(clauses); } #define EXPL_SPEC(N) \ template void CIRGenFunction::emitOpenACCClauses<N, mlir::acc::LoopOp>( \ - N &, mlir::acc::LoopOp &, OpenACCDirectiveKind, SourceLocation, \ + N &, mlir::acc::LoopOp &, OpenACCDirectiveKind, \ ArrayRef<const OpenACCClause *>); EXPL_SPEC(mlir::acc::ParallelOp) diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCHelpers.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCHelpers.h new file mode 100644 index 0000000..639d148 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCHelpers.h @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains helpers for OpenACC emission that don't need to be in +// CIRGenModule, but can't live in a single .cpp file. +// +//===----------------------------------------------------------------------===// +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "clang/AST/DeclOpenACC.h" + +namespace clang::CIRGen { +inline mlir::acc::DataClauseModifier +convertOpenACCModifiers(OpenACCModifierKind modifiers) { + using namespace mlir::acc; + static_assert(static_cast<int>(OpenACCModifierKind::Zero) == + static_cast<int>(DataClauseModifier::zero) && + static_cast<int>(OpenACCModifierKind::Readonly) == + static_cast<int>(DataClauseModifier::readonly) && + static_cast<int>(OpenACCModifierKind::AlwaysIn) == + static_cast<int>(DataClauseModifier::alwaysin) && + static_cast<int>(OpenACCModifierKind::AlwaysOut) == + static_cast<int>(DataClauseModifier::alwaysout) && + static_cast<int>(OpenACCModifierKind::Capture) == + static_cast<int>(DataClauseModifier::capture)); + + DataClauseModifier mlirModifiers{}; + + // The MLIR representation of this represents `always` as `alwaysin` + + // `alwaysout`. So do a small fixup here. 
+ if (isOpenACCModifierBitSet(modifiers, OpenACCModifierKind::Always)) { + mlirModifiers = mlirModifiers | DataClauseModifier::always; + modifiers &= ~OpenACCModifierKind::Always; + } + + mlirModifiers = mlirModifiers | static_cast<DataClauseModifier>(modifiers); + return mlirModifiers; +} + +inline mlir::acc::DeviceType decodeDeviceType(const IdentifierInfo *ii) { + // '*' case leaves no identifier-info, just a nullptr. + if (!ii) + return mlir::acc::DeviceType::Star; + return llvm::StringSwitch<mlir::acc::DeviceType>(ii->getName()) + .CaseLower("default", mlir::acc::DeviceType::Default) + .CaseLower("host", mlir::acc::DeviceType::Host) + .CaseLower("multicore", mlir::acc::DeviceType::Multicore) + .CasesLower({"nvidia", "acc_device_nvidia"}, + mlir::acc::DeviceType::Nvidia) + .CaseLower("radeon", mlir::acc::DeviceType::Radeon); +} +} // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp new file mode 100644 index 0000000..20b0646 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp @@ -0,0 +1,23 @@ +//===--- CIRGenPointerAuth.cpp - CIR generation for ptr auth --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains common routines relating to the emission of +// pointer authentication operations. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenFunction.h" + +using namespace clang; +using namespace clang::CIRGen; + +Address CIRGenFunction::getAsNaturalAddressOf(Address addr, + QualType pointeeTy) { + assert(!cir::MissingFeatures::pointerAuthentication()); + return addr; +} diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h index bf0ddc5..c936497 100644 --- a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h @@ -184,6 +184,11 @@ public: return fieldIdxMap.lookup(fd); } + unsigned getNonVirtualBaseCIRFieldNo(const CXXRecordDecl *rd) const { + assert(nonVirtualBases.count(rd) && "Invalid non-virtual base!"); + return nonVirtualBases.lookup(rd); + } + /// Check whether this struct can be C++ zero-initialized /// with a zeroinitializer. 
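As a rough illustration of the decodeDeviceType helper added above (not part of the diff; the spellings are exactly the ones handled by its StringSwitch), common device_type arguments map as follows:
// Illustration only: '*' reaches decodeDeviceType as a null IdentifierInfo.
//   #pragma acc parallel device_type(*)                 -> mlir::acc::DeviceType::Star
//   #pragma acc parallel device_type(default)           -> mlir::acc::DeviceType::Default
//   #pragma acc parallel device_type(host)              -> mlir::acc::DeviceType::Host
//   #pragma acc parallel device_type(multicore)         -> mlir::acc::DeviceType::Multicore
//   #pragma acc parallel device_type(nvidia)            -> mlir::acc::DeviceType::Nvidia
//   #pragma acc parallel device_type(acc_device_nvidia) -> mlir::acc::DeviceType::Nvidia
//   #pragma acc parallel device_type(radeon)            -> mlir::acc::DeviceType::Radeon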
bool isZeroInitializable() const { return zeroInitializable; } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index 1eb7199..f13e7cb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -66,7 +66,7 @@ static mlir::LogicalResult emitStmtWithResult(CIRGenFunction &cgf, mlir::LogicalResult CIRGenFunction::emitCompoundStmtWithoutScope( const CompoundStmt &s, Address *lastValue, AggValueSlot slot) { mlir::LogicalResult result = mlir::success(); - const Stmt *exprResult = s.getStmtExprResult(); + const Stmt *exprResult = s.body_back(); assert((!lastValue || (lastValue && exprResult)) && "If lastValue is not null then the CompoundStmt must have a " "StmtExprResult"); @@ -159,6 +159,10 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, return emitCXXTryStmt(cast<CXXTryStmt>(*s)); case Stmt::CXXForRangeStmtClass: return emitCXXForRangeStmt(cast<CXXForRangeStmt>(*s), attr); + case Stmt::CoroutineBodyStmtClass: + return emitCoroutineBody(cast<CoroutineBodyStmt>(*s)); + case Stmt::IndirectGotoStmtClass: + return emitIndirectGotoStmt(cast<IndirectGotoStmt>(*s)); case Stmt::OpenACCComputeConstructClass: return emitOpenACCComputeConstruct(cast<OpenACCComputeConstruct>(*s)); case Stmt::OpenACCLoopConstructClass: @@ -199,10 +203,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::CaseStmtClass: case Stmt::SEHLeaveStmtClass: case Stmt::SYCLKernelCallStmtClass: - case Stmt::CoroutineBodyStmtClass: - return emitCoroutineBody(cast<CoroutineBodyStmt>(*s)); case Stmt::CoreturnStmtClass: - case Stmt::IndirectGotoStmtClass: case Stmt::OMPParallelDirectiveClass: case Stmt::OMPTaskwaitDirectiveClass: case Stmt::OMPTaskyieldDirectiveClass: @@ -458,7 +459,14 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { if (getContext().getLangOpts().ElideConstructors && s.getNRVOCandidate() && s.getNRVOCandidate()->isNRVOVariable()) { assert(!cir::MissingFeatures::openMP()); - assert(!cir::MissingFeatures::nrvo()); + // Apply the named return value optimization for this return statement, + // which means doing nothing: the appropriate result has already been + // constructed into the NRVO variable. + + // If there is an NRVO flag for this variable, set it to 1 to indicate + // that the cleanup code should not destroy the variable. + if (auto nrvoFlag = nrvoFlags[s.getNRVOCandidate()]) + builder.createFlagStore(loc, true, nrvoFlag); } else if (!rv) { // No return expression. Do nothing.
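The emitReturnStmt hunk above wires up the NRVO flag store; a minimal sketch of the C++ pattern it applies to (illustration only, not part of the diff; Widget is a made-up type):
// Illustration only: with conditional returns, the NRVO variable gets a flag;
// the new code stores true to it at the return so cleanup skips the destructor.
struct Widget { Widget(); ~Widget(); };
Widget make(bool flip) {
  Widget w; // constructed directly into the return slot (NRVO)
  if (flip)
    return w; // createFlagStore(loc, true, nrvoFlag) is emitted here
  return w;
}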
} else if (rv->getType()->isVoidType()) { @@ -556,6 +564,17 @@ mlir::LogicalResult CIRGenFunction::emitGotoStmt(const clang::GotoStmt &s) { } mlir::LogicalResult +CIRGenFunction::emitIndirectGotoStmt(const IndirectGotoStmt &s) { + mlir::Value val = emitScalarExpr(s.getTarget()); + assert(indirectGotoBlock && + "indirect goto block should already be emitted"); + cir::BrOp::create(builder, getLoc(s.getSourceRange()), indirectGotoBlock, + val); + builder.createBlock(builder.getBlock()->getParent()); + return mlir::success(); +} + +mlir::LogicalResult CIRGenFunction::emitContinueStmt(const clang::ContinueStmt &s) { builder.createContinue(getLoc(s.getKwLoc())); @@ -581,9 +600,14 @@ mlir::LogicalResult CIRGenFunction::emitLabel(const clang::LabelDecl &d) { } builder.setInsertionPointToEnd(labelBlock); - cir::LabelOp::create(builder, getLoc(d.getSourceRange()), d.getName()); + cir::LabelOp label = + cir::LabelOp::create(builder, getLoc(d.getSourceRange()), d.getName()); builder.setInsertionPointToEnd(labelBlock); - + auto func = cast<cir::FuncOp>(curFn); + cgm.mapBlockAddress(cir::BlockAddrInfoAttr::get(builder.getContext(), + func.getSymNameAttr(), + label.getLabelAttr()), + label); // FIXME: emit debug info for labels, incrementProfileCounter assert(!cir::MissingFeatures::ehstackBranches()); assert(!cir::MissingFeatures::incrementProfileCounter()); diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 9e55bd5..11aad17 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -24,15 +24,14 @@ using namespace mlir::acc; template <typename Op, typename TermOp> mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( mlir::Location start, mlir::Location end, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, llvm::ArrayRef<const OpenACCClause *> clauses, - const Stmt *associatedStmt) { + llvm::ArrayRef<const OpenACCClause *> clauses, const Stmt *associatedStmt) { mlir::LogicalResult res = mlir::success(); llvm::SmallVector<mlir::Type> retTy; llvm::SmallVector<mlir::Value> operands; auto op = Op::create(builder, start, retTy, operands); - emitOpenACCClauses(op, dirKind, dirLoc, clauses); + emitOpenACCClauses(op, dirKind, clauses); { mlir::Block &block = op.getRegion().emplaceBlock(); @@ -66,8 +65,7 @@ template <> struct CombinedType<KernelsOp> { template <typename Op, typename TermOp> mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( mlir::Location start, mlir::Location end, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc, llvm::ArrayRef<const OpenACCClause *> clauses, - const Stmt *loopStmt) { + llvm::ArrayRef<const OpenACCClause *> clauses, const Stmt *loopStmt) { mlir::LogicalResult res = mlir::success(); llvm::SmallVector<mlir::Type> retTy; @@ -102,7 +100,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( mlir::acc::YieldOp::create(builder, end); } - emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); + emitOpenACCClauses(computeOp, loopOp, dirKind, clauses); updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); @@ -114,13 +112,13 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( template <typename Op> Op CIRGenFunction::emitOpenACCOp( - mlir::Location start, OpenACCDirectiveKind dirKind, SourceLocation dirLoc, + mlir::Location start, OpenACCDirectiveKind dirKind, llvm::ArrayRef<const OpenACCClause *> clauses) { llvm::SmallVector<mlir::Type> retTy;
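For the emitIndirectGotoStmt support added in the CIRGenStmt.cpp hunk above, a minimal sketch of the GNU computed-goto source it lowers (illustration only, not part of the diff):
// Illustration only: 'goto *expr' now emits a cir.br that carries the target
// value into the function's shared indirect-goto block; '&&label' relies on
// the block-address mapping registered in emitLabel above.
void dispatch(int i) {
  static void *targets[] = {&&handle_a, &&handle_b};
  goto *targets[i];
handle_a:
  return;
handle_b:
  return;
}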
llvm::SmallVector<mlir::Value> operands; auto op = Op::create(builder, start, retTy, operands); - emitOpenACCClauses(op, dirKind, dirLoc, clauses); + emitOpenACCClauses(op, dirKind, clauses); return op; } @@ -132,16 +130,13 @@ CIRGenFunction::emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s) { switch (s.getDirectiveKind()) { case OpenACCDirectiveKind::Parallel: return emitOpenACCOpAssociatedStmt<ParallelOp, mlir::acc::YieldOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getStructuredBlock()); + start, end, s.getDirectiveKind(), s.clauses(), s.getStructuredBlock()); case OpenACCDirectiveKind::Serial: return emitOpenACCOpAssociatedStmt<SerialOp, mlir::acc::YieldOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getStructuredBlock()); + start, end, s.getDirectiveKind(), s.clauses(), s.getStructuredBlock()); case OpenACCDirectiveKind::Kernels: return emitOpenACCOpAssociatedStmt<KernelsOp, mlir::acc::TerminatorOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getStructuredBlock()); + start, end, s.getDirectiveKind(), s.clauses(), s.getStructuredBlock()); default: llvm_unreachable("invalid compute construct kind"); } @@ -153,39 +148,34 @@ CIRGenFunction::emitOpenACCDataConstruct(const OpenACCDataConstruct &s) { mlir::Location end = getLoc(s.getSourceRange().getEnd()); return emitOpenACCOpAssociatedStmt<DataOp, mlir::acc::TerminatorOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getStructuredBlock()); + start, end, s.getDirectiveKind(), s.clauses(), s.getStructuredBlock()); } mlir::LogicalResult CIRGenFunction::emitOpenACCInitConstruct(const OpenACCInitConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<InitOp>(start, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCOp<InitOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } mlir::LogicalResult CIRGenFunction::emitOpenACCSetConstruct(const OpenACCSetConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<SetOp>(start, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCOp<SetOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } mlir::LogicalResult CIRGenFunction::emitOpenACCShutdownConstruct( const OpenACCShutdownConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<ShutdownOp>(start, s.getDirectiveKind(), - s.getDirectiveLoc(), s.clauses()); + emitOpenACCOp<ShutdownOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } mlir::LogicalResult CIRGenFunction::emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - auto waitOp = emitOpenACCOp<WaitOp>(start, s.getDirectiveKind(), - s.getDirectiveLoc(), s.clauses()); + auto waitOp = emitOpenACCOp<WaitOp>(start, s.getDirectiveKind(), s.clauses()); auto createIntExpr = [this](const Expr *intExpr) { mlir::Value expr = emitScalarExpr(intExpr); @@ -225,16 +215,13 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCCombinedConstruct( switch (s.getDirectiveKind()) { case OpenACCDirectiveKind::ParallelLoop: return emitOpenACCOpCombinedConstruct<ParallelOp, mlir::acc::YieldOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getLoop()); + start, end, s.getDirectiveKind(), s.clauses(), s.getLoop()); case OpenACCDirectiveKind::SerialLoop: return 
emitOpenACCOpCombinedConstruct<SerialOp, mlir::acc::YieldOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getLoop()); + start, end, s.getDirectiveKind(), s.clauses(), s.getLoop()); case OpenACCDirectiveKind::KernelsLoop: return emitOpenACCOpCombinedConstruct<KernelsOp, mlir::acc::TerminatorOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getLoop()); + start, end, s.getDirectiveKind(), s.clauses(), s.getLoop()); default: llvm_unreachable("invalid compute construct kind"); } @@ -246,31 +233,27 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCHostDataConstruct( mlir::Location end = getLoc(s.getSourceRange().getEnd()); return emitOpenACCOpAssociatedStmt<HostDataOp, mlir::acc::TerminatorOp>( - start, end, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses(), - s.getStructuredBlock()); + start, end, s.getDirectiveKind(), s.clauses(), s.getStructuredBlock()); } mlir::LogicalResult CIRGenFunction::emitOpenACCEnterDataConstruct( const OpenACCEnterDataConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<EnterDataOp>(start, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCOp<EnterDataOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } mlir::LogicalResult CIRGenFunction::emitOpenACCExitDataConstruct( const OpenACCExitDataConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<ExitDataOp>(start, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCOp<ExitDataOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } mlir::LogicalResult CIRGenFunction::emitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); - emitOpenACCOp<UpdateOp>(start, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCOp<UpdateOp>(start, s.getDirectiveKind(), s.clauses()); return mlir::success(); } @@ -314,15 +297,80 @@ const VarDecl *getLValueDecl(const Expr *e) { return cast<VarDecl>(dre->getDecl()); } -mlir::LogicalResult -CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { - // For now, we are only support 'read'/'write'/'update', so diagnose. We can - // switch on the kind later once we implement the 'capture' form. - if (s.getAtomicKind() == OpenACCAtomicKind::Capture) { - cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); - return mlir::failure(); +static mlir::acc::AtomicReadOp +emitAtomicRead(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + mlir::Location start, + const OpenACCAtomicConstruct::SingleStmtInfo &inf) { + // Atomic 'read' only permits 'v = x', where v and x are both scalar L + // values. The getAssociatedStmtInfo strips off implicit casts, which + // includes implicit conversions and L-to-R-Value conversions, so we can + // just emit it as an L value. The Flang implementation has no problem with + // different types, so it appears that the dialect can handle the + // conversions. 
+ mlir::Value v = cgf.emitLValue(inf.V).getPointer(); + mlir::Value x = cgf.emitLValue(inf.X).getPointer(); + mlir::Type resTy = cgf.convertType(inf.V->getType()); + return mlir::acc::AtomicReadOp::create(builder, start, x, v, resTy, + /*ifCond=*/{}); +} + +static mlir::acc::AtomicWriteOp +emitAtomicWrite(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + mlir::Location start, + const OpenACCAtomicConstruct::SingleStmtInfo &inf) { + mlir::Value x = cgf.emitLValue(inf.X).getPointer(); + mlir::Value expr = cgf.emitAnyExpr(inf.RefExpr).getValue(); + return mlir::acc::AtomicWriteOp::create(builder, start, x, expr, + /*ifCond=*/{}); +} + +static std::pair<mlir::LogicalResult, mlir::acc::AtomicUpdateOp> +emitAtomicUpdate(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + mlir::Location start, mlir::Location end, + const OpenACCAtomicConstruct::SingleStmtInfo &inf) { + mlir::Value x = cgf.emitLValue(inf.X).getPointer(); + auto op = mlir::acc::AtomicUpdateOp::create(builder, start, x, /*ifCond=*/{}); + + mlir::LogicalResult res = mlir::success(); + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + mlir::Type argTy = cast<cir::PointerType>(x.getType()).getPointee(); + std::array<mlir::Type, 1> recipeType{argTy}; + std::array<mlir::Location, 1> recipeLoc{start}; + auto *recipeBlock = builder.createBlock( + &op.getRegion(), op.getRegion().end(), recipeType, recipeLoc); + builder.setInsertionPointToEnd(recipeBlock); + // Since we have an initial value that we know is a scalar type, we can + // just emit the entire statement here after sneaking-in our 'alloca' in + // the right place, then loading out of it. Flang does a lot less work + // (probably does its own emitting!), but we have more complicated AST + // nodes to worry about, so we can just count on opt to remove the extra + // alloca/load/store set. + auto alloca = cir::AllocaOp::create( + builder, start, x.getType(), argTy, "x_var", + cgf.cgm.getSize( + cgf.getContext().getTypeAlignInChars(inf.X->getType()))); + + alloca.setInitAttr(builder.getUnitAttr()); + builder.CIRBaseBuilderTy::createStore(start, recipeBlock->getArgument(0), + alloca); + + const VarDecl *xval = getLValueDecl(inf.X); + CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, xval}; + cgf.replaceAddrOfLocalVar( + xval, Address{alloca, argTy, cgf.getContext().getDeclAlign(xval)}); + + res = cgf.emitStmt(inf.WholeExpr, /*useCurrentScope=*/true); + + auto load = cir::LoadOp::create(builder, start, {alloca}); + mlir::acc::YieldOp::create(builder, end, {load}); } + return {res, op}; +} + +mlir::LogicalResult +CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { // While Atomic is an 'associated statement' construct, it 'steals' the // expression it is associated with rather than emitting it inside of it. So // it has custom emit logic. @@ -331,78 +379,85 @@ CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { OpenACCAtomicConstruct::StmtInfo inf = s.getAssociatedStmtInfo(); switch (s.getAtomicKind()) { - case OpenACCAtomicKind::Capture: - llvm_unreachable("Unimplemented atomic construct type, should have " - "diagnosed/returned above"); - return mlir::failure(); case OpenACCAtomicKind::Read: { - - // Atomic 'read' only permits 'v = x', where v and x are both scalar L - // values. The getAssociatedStmtInfo strips off implicit casts, which - // includes implicit conversions and L-to-R-Value conversions, so we can - // just emit it as an L value. 
The Flang implementation has no problem with - // different types, so it appears that the dialect can handle the - // conversions. - mlir::Value v = emitLValue(inf.V).getPointer(); - mlir::Value x = emitLValue(inf.X).getPointer(); - mlir::Type resTy = convertType(inf.V->getType()); - auto op = mlir::acc::AtomicReadOp::create(builder, start, x, v, resTy, - /*ifCond=*/{}); - emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + assert(inf.Form == OpenACCAtomicConstruct::StmtInfo::StmtForm::Read); + mlir::acc::AtomicReadOp op = + emitAtomicRead(*this, builder, start, inf.First); + emitOpenACCClauses(op, s.getDirectiveKind(), s.clauses()); return mlir::success(); } case OpenACCAtomicKind::Write: { - mlir::Value x = emitLValue(inf.X).getPointer(); - mlir::Value expr = emitAnyExpr(inf.RefExpr).getValue(); - auto op = mlir::acc::AtomicWriteOp::create(builder, start, x, expr, - /*ifCond=*/{}); - emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + assert(inf.Form == OpenACCAtomicConstruct::StmtInfo::StmtForm::Write); + auto op = emitAtomicWrite(*this, builder, start, inf.First); + emitOpenACCClauses(op, s.getDirectiveKind(), s.clauses()); return mlir::success(); } case OpenACCAtomicKind::None: case OpenACCAtomicKind::Update: { - mlir::Value x = emitLValue(inf.X).getPointer(); - auto op = - mlir::acc::AtomicUpdateOp::create(builder, start, x, /*ifCond=*/{}); - emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + assert(inf.Form == OpenACCAtomicConstruct::StmtInfo::StmtForm::Update); + auto [res, op] = emitAtomicUpdate(*this, builder, start, end, inf.First); + emitOpenACCClauses(op, s.getDirectiveKind(), s.clauses()); + return res; + } + case OpenACCAtomicKind::Capture: { + // Atomic-capture is made up of two statements, either an update = read, + // read + update, or read + write. As a result, the IR represents the + // capture region as having those two 'inside' of it. + auto op = mlir::acc::AtomicCaptureOp::create(builder, start, /*ifCond=*/{}); + emitOpenACCClauses(op, s.getDirectiveKind(), s.clauses()); mlir::LogicalResult res = mlir::success(); { mlir::OpBuilder::InsertionGuard guardCase(builder); - mlir::Type argTy = cast<cir::PointerType>(x.getType()).getPointee(); - std::array<mlir::Type, 1> recipeType{argTy}; - std::array<mlir::Location, 1> recipeLoc{start}; - mlir::Block *recipeBlock = builder.createBlock( - &op.getRegion(), op.getRegion().end(), recipeType, recipeLoc); - builder.setInsertionPointToEnd(recipeBlock); - - // Since we have an initial value that we know is a scalar type, we can - // just emit the entire statement here after sneaking-in our 'alloca' in - // the right place, then loading out of it. Flang does a lot less work - // (probably does its own emitting!), but we have more complicated AST - // nodes to worry about, so we can just count on opt to remove the extra - // alloca/load/store set. 
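The atomic 'capture' lowering in this hunk distinguishes three statement forms (read+write, read+update, update+read); a sketch of the corresponding OpenACC source (illustration only, not part of the diff):
// Illustration only: the structured-block forms the StmtInfo::StmtForm cases model.
void captures(int &v, int &x, int expr) {
#pragma acc atomic capture // read then write
  { v = x; x = expr; }
#pragma acc atomic capture // read then update
  { v = x; x += expr; }
#pragma acc atomic capture // update then read
  { x += expr; v = x; }
}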
- auto alloca = cir::AllocaOp::create( - builder, start, x.getType(), argTy, "x_var", - cgm.getSize(getContext().getTypeAlignInChars(inf.X->getType()))); - - alloca.setInitAttr(mlir::UnitAttr::get(&getMLIRContext())); - builder.CIRBaseBuilderTy::createStore(start, recipeBlock->getArgument(0), - alloca); - - const VarDecl *xval = getLValueDecl(inf.X); - CIRGenFunction::DeclMapRevertingRAII declMapRAII{*this, xval}; - replaceAddrOfLocalVar( - xval, Address{alloca, argTy, getContext().getDeclAlign(xval)}); - - res = emitStmt(s.getAssociatedStmt(), /*useCurrentScope=*/true); - - auto load = cir::LoadOp::create(builder, start, {alloca}); - mlir::acc::YieldOp::create(builder, end, {load}); - } + mlir::Block *block = + builder.createBlock(&op.getRegion(), op.getRegion().end(), {}, {}); + + builder.setInsertionPointToStart(block); + + auto terminator = mlir::acc::TerminatorOp::create(builder, end); + + // The AtomicCaptureOp only permits the two acc.atomic.* operations inside + // of it, so all other parts of the expression need to be emitted before + // the AtomicCaptureOp, then moved into place. + builder.setInsertionPoint(op); + + switch (inf.Form) { + default: + llvm_unreachable("invalid form for Capture"); + case OpenACCAtomicConstruct::StmtInfo::StmtForm::ReadWrite: { + mlir::acc::AtomicReadOp first = + emitAtomicRead(*this, builder, start, inf.First); + mlir::acc::AtomicWriteOp second = + emitAtomicWrite(*this, builder, start, inf.Second); + + first->moveBefore(terminator); + second->moveBefore(terminator); + break; + } + case OpenACCAtomicConstruct::StmtInfo::StmtForm::ReadUpdate: { + mlir::acc::AtomicReadOp first = + emitAtomicRead(*this, builder, start, inf.First); + auto [this_res, second] = + emitAtomicUpdate(*this, builder, start, end, inf.Second); + res = this_res; + + first->moveBefore(terminator); + second->moveBefore(terminator); + break; + } + case OpenACCAtomicConstruct::StmtInfo::StmtForm::UpdateRead: { + auto [this_res, first] = + emitAtomicUpdate(*this, builder, start, end, inf.First); + res = this_res; + mlir::acc::AtomicReadOp second = + emitAtomicRead(*this, builder, start, inf.Second); + + first->moveBefore(terminator); + second->moveBefore(terminator); + break; + } + } + } return res; } } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index c5b89bd..406b1a5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -117,8 +117,7 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { // // Emit all clauses. 
- emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), - s.clauses()); + emitOpenACCClauses(op, s.getDirectiveKind(), s.clauses()); updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(), s.getParentComputeConstructKind()); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index 03618d4..7f000ec 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -97,7 +97,8 @@ std::string CIRGenTypes::getRecordTypeName(const clang::RecordDecl *recordDecl, llvm::raw_svector_ostream outStream(typeName); PrintingPolicy policy = recordDecl->getASTContext().getPrintingPolicy(); - policy.SuppressInlineNamespace = false; + policy.SuppressInlineNamespace = + llvm::to_underlying(PrintingPolicy::SuppressInlineNamespaceMode::None); policy.AlwaysIncludeTypeForTemplateArgument = true; policy.PrintAsCanonical = true; policy.SuppressTagKeyword = true; @@ -404,7 +405,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { const ReferenceType *refTy = cast<ReferenceType>(ty); QualType elemTy = refTy->getPointeeType(); auto pointeeType = convertTypeForMem(elemTy); - resultType = builder.getPointerTo(pointeeType); + resultType = builder.getPointerTo(pointeeType, elemTy.getAddressSpace()); assert(resultType && "Cannot get pointer type?"); break; } @@ -481,6 +482,21 @@ mlir::Type CIRGenTypes::convertType(QualType type) { break; } + case Type::MemberPointer: { + const auto *mpt = cast<MemberPointerType>(ty); + + mlir::Type memberTy = convertType(mpt->getPointeeType()); + auto clsTy = mlir::cast<cir::RecordType>( + convertType(QualType(mpt->getQualifier().getAsType(), 0))); + if (mpt->isMemberDataPointer()) { + resultType = cir::DataMemberType::get(memberTy, clsTy); + } else { + assert(!cir::MissingFeatures::methodType()); + cgm.errorNYI(SourceLocation(), "MethodType"); + } + break; + } + case Type::FunctionNoProto: case Type::FunctionProto: resultType = convertFunctionTypeInternal(type); diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index ab245a77..2002bd7 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -49,6 +49,7 @@ public: bool isScalar() const { return flavor == Scalar; } bool isComplex() const { return flavor == Complex; } bool isAggregate() const { return flavor == Aggregate; } + bool isIgnored() const { return isScalar() && !getValue(); } bool isVolatileQualified() const { return isVolatile; } @@ -166,7 +167,8 @@ class LValue { // this is the alignment of the whole vector) unsigned alignment; mlir::Value v; - mlir::Value vectorIdx; // Index for vector subscript + mlir::Value vectorIdx; // Index for vector subscript + mlir::Attribute vectorElts; // ExtVector element subset: V.xyx mlir::Type elementType; LValueBaseInfo baseInfo; const CIRGenBitFieldInfo *bitFieldInfo{nullptr}; @@ -190,6 +192,7 @@ public: bool isSimple() const { return lvType == Simple; } bool isVectorElt() const { return lvType == VectorElt; } bool isBitField() const { return lvType == BitField; } + bool isExtVectorElt() const { return lvType == ExtVectorElt; } bool isGlobalReg() const { return lvType == GlobalReg; } bool isVolatile() const { return quals.hasVolatile(); } @@ -254,6 +257,22 @@ public: return vectorIdx; } + // extended vector elements. 
+ Address getExtVectorAddress() const { + assert(isExtVectorElt()); + return Address(getExtVectorPointer(), elementType, getAlignment()); + } + + mlir::Value getExtVectorPointer() const { + assert(isExtVectorElt()); + return v; + } + + mlir::ArrayAttr getExtVectorElts() const { + assert(isExtVectorElt()); + return mlir::cast<mlir::ArrayAttr>(vectorElts); + } + static LValue makeVectorElt(Address vecAddress, mlir::Value index, clang::QualType t, LValueBaseInfo baseInfo) { LValue r; @@ -265,6 +284,19 @@ public: return r; } + static LValue makeExtVectorElt(Address vecAddress, mlir::ArrayAttr elts, + clang::QualType type, + LValueBaseInfo baseInfo) { + LValue r; + r.lvType = ExtVectorElt; + r.v = vecAddress.getPointer(); + r.elementType = vecAddress.getElementType(); + r.vectorElts = elts; + r.initialize(type, type.getQualifiers(), vecAddress.getAlignment(), + baseInfo); + return r; + } + // bitfield lvalue Address getBitFieldAddress() const { return Address(getBitFieldPointer(), elementType, getAlignment()); diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp index aa4d9eb..0208eee 100644 --- a/clang/lib/CIR/CodeGen/CIRGenerator.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp @@ -166,6 +166,18 @@ void CIRGenerator::HandleCXXStaticMemberVarInstantiation(VarDecl *D) { cgm->handleCXXStaticMemberVarInstantiation(D); } +void CIRGenerator::HandleOpenACCRoutineReference(const FunctionDecl *FD, + const OpenACCRoutineDecl *RD) { + llvm::StringRef mangledName = cgm->getMangledName(FD); + cir::FuncOp entry = + mlir::dyn_cast_if_present<cir::FuncOp>(cgm->getGlobalValue(mangledName)); + + // if this wasn't generated, don't force it to be. + if (!entry) + return; + cgm->emitOpenACCRoutineDecl(FD, entry, RD->getBeginLoc(), RD->clauses()); +} + void CIRGenerator::CompleteTentativeDefinition(VarDecl *d) { if (diags.hasErrorOccurred()) return; diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 7c31bea..d6cd150 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangCIR CIRGenAtomic.cpp CIRGenBuilder.cpp CIRGenBuiltin.cpp + CIRGenBuiltinAArch64.cpp CIRGenBuiltinX86.cpp CIRGenCall.cpp CIRGenClass.cpp @@ -35,6 +36,7 @@ add_clang_library(clangCIR CIRGenOpenACC.cpp CIRGenOpenACCClause.cpp CIRGenOpenACCRecipe.cpp + CIRGenPointerAuth.cpp CIRGenRecordLayoutBuilder.cpp CIRGenStmt.cpp CIRGenStmtOpenACC.cpp diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 4198c23..2ebe6df 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -127,13 +127,43 @@ public: virtual ~Cleanup() = default; + /// Generation flags. + class Flags { + enum { + F_IsForEH = 0x1, + F_IsNormalCleanupKind = 0x2, + F_IsEHCleanupKind = 0x4, + F_HasExitSwitch = 0x8, + }; + unsigned flags = 0; + + public: + Flags() = default; + + /// isForEH - true if the current emission is for an EH cleanup. + bool isForEHCleanup() const { return flags & F_IsForEH; } + bool isForNormalCleanup() const { return !isForEHCleanup(); } + void setIsForEHCleanup() { flags |= F_IsForEH; } + + bool isNormalCleanupKind() const { return flags & F_IsNormalCleanupKind; } + void setIsNormalCleanupKind() { flags |= F_IsNormalCleanupKind; } + + /// isEHCleanupKind - true if the cleanup was pushed as an EH + /// cleanup. 
+ bool isEHCleanupKind() const { return flags & F_IsEHCleanupKind; } + void setIsEHCleanupKind() { flags |= F_IsEHCleanupKind; } + + bool hasExitSwitch() const { return flags & F_HasExitSwitch; } + void setHasExitSwitch() { flags |= F_HasExitSwitch; } + }; + /// Emit the cleanup. For normal cleanups, this is run in the /// same EH context as when the cleanup was pushed, i.e. the /// immediately-enclosing context of the cleanup scope. For /// EH cleanups, this is run in a terminate context. /// // \param flags cleanup kind. - virtual void emit(CIRGenFunction &cgf) = 0; + virtual void emit(CIRGenFunction &cgf, Flags flags) = 0; }; private: @@ -155,6 +185,9 @@ private: /// The innermost normal cleanup on the stack. stable_iterator innermostNormalCleanup = stable_end(); + /// The innermost EH scope on the stack. + stable_iterator innermostEHScope = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; @@ -214,6 +247,8 @@ public: /// Determines whether the exception-scopes stack is empty. bool empty() const { return startOfData == endOfBuffer; } + bool requiresCatchOrCleanup() const; + /// Determines whether there are any normal cleanups on the stack. bool hasNormalCleanups() const { return innermostNormalCleanup != stable_end(); @@ -226,6 +261,8 @@ public: } stable_iterator getInnermostActiveNormalCleanup() const; + stable_iterator getInnermostEHScope() const { return innermostEHScope; } + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -233,6 +270,9 @@ public: /// Returns an iterator pointing to the innermost EH scope. iterator begin() const; + /// Returns an iterator pointing to the outermost EH scope. + iterator end() const; + /// Create a stable reference to the top of the EH stack. The /// returned reference is valid until that scope is popped off the /// stack. diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp b/clang/lib/CIR/CodeGen/TargetInfo.cpp index 62a8c59..377c532 100644 --- a/clang/lib/CIR/CodeGen/TargetInfo.cpp +++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp @@ -1,5 +1,8 @@ #include "TargetInfo.h" #include "ABIInfo.h" +#include "CIRGenFunction.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" using namespace clang; using namespace clang::CIRGen; @@ -68,3 +71,14 @@ bool TargetCIRGenInfo::isNoProtoCallVariadic( // For everything else, we just prefer false unless we opt out. return false; } + +mlir::Value TargetCIRGenInfo::performAddrSpaceCast( + CIRGenFunction &cgf, mlir::Value v, cir::TargetAddressSpaceAttr srcAddr, + mlir::Type destTy, bool isNonNull) const { + // Since target may map different address spaces in AST to the same address + // space, an address space conversion may end up as a bitcast. + if (cir::GlobalOp globalOp = v.getDefiningOp<cir::GlobalOp>()) + cgf.cgm.errorNYI("Global op addrspace cast"); + // Try to preserve the source's name to make IR more readable. 
+ return cgf.getBuilder().createAddrSpaceCast(v, destTy); +} diff --git a/clang/lib/CIR/CodeGen/TargetInfo.h b/clang/lib/CIR/CodeGen/TargetInfo.h index dbb0312..7268264 100644 --- a/clang/lib/CIR/CodeGen/TargetInfo.h +++ b/clang/lib/CIR/CodeGen/TargetInfo.h @@ -17,6 +17,7 @@ #include "ABIInfo.h" #include "CIRGenTypes.h" #include "clang/Basic/AddressSpaces.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" #include <memory> #include <utility> @@ -33,6 +34,8 @@ bool isEmptyFieldForLayout(const ASTContext &context, const FieldDecl *fd); /// if the [[no_unique_address]] attribute would have made them empty. bool isEmptyRecordForLayout(const ASTContext &context, QualType t); +class CIRGenFunction; + class TargetCIRGenInfo { std::unique_ptr<ABIInfo> info; @@ -48,6 +51,15 @@ public: virtual cir::TargetAddressSpaceAttr getCIRAllocaAddressSpace() const { return {}; } + /// Perform an address space cast of an expression of pointer type. + /// \param V is the value to be cast to another address space. + /// \param DestTy is the destination pointer type. + /// \param srcAddr is the address space of \p V. + /// \param IsNonNull is the flag indicating \p V is known to be non-null. + virtual mlir::Value performAddrSpaceCast(CIRGenFunction &cgf, mlir::Value v, + cir::TargetAddressSpaceAttr srcAddr, + mlir::Type destTy, + bool isNonNull = false) const; /// Determine whether a call to an unprototyped functions under /// the given calling convention should use the variadic
