106 files changed, 4151 insertions, 408 deletions
diff --git a/bolt/unittests/Profile/PerfSpeEvents.cpp b/bolt/unittests/Profile/PerfSpeEvents.cpp index 8d023cd..4f060cd 100644 --- a/bolt/unittests/Profile/PerfSpeEvents.cpp +++ b/bolt/unittests/Profile/PerfSpeEvents.cpp @@ -161,4 +161,92 @@ TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) { parseAndCheckBrstackEvents(1234, ExpectedSamples); } +TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstackAndPbt) { + // Check perf input with SPE branch events in brstack format by + // combining it with the previous branch target address (PBT). + // Example collection command: + // ``` + // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY + // ``` + // How Bolt extracts the branch events: + // ``` + // perf script -F pid,brstack --itrace=bl + // ``` + + opts::ArmSPE = true; + opts::ReadPerfEvents = + // "<PID> <SRC>/<DEST>/PN/-/-/10/COND/- <NULL>/<PBT>/-/-/-/0//-\n" + " 4567 0xa002/0xa003/PN/-/-/10/COND/- 0x0/0xa001/-/-/-/0//-\n" + " 4567 0xb002/0xb003/P/-/-/4/RET/- 0x0/0xb001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xe005/0xe009/P/-/-/14/RET/- 0x0/0xe001/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xf002/0xf003/MN/-/-/8/COND/- 0x0/0xf001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n"; + + // ExpectedSamples contains the aggregated information about + // a branch {{From, To, TraceTo}, {TakenCount, MispredCount}}. + // Where + // - From: the source address of the sampled branch operation. + // - To: the target address of the sampled branch operation. + // - TraceTo could be either + // - 'Type = Trace::BR_ONLY', which means the trace contains only branch + // data, + // - or an address, when the trace contains information about the previous + // branch. + // + // When FEAT_SPE_PBT is present, Arm SPE emits two records per sample: + // - the current branch (Spe.From/Spe.To), and + // - the previous taken branch target (PBT) (PBT.From, PBT.To). + // + // Together they behave like a depth-1 branch stack where: + // - the PBT entry is always taken + // - the current branch entry may represent a taken branch or a fall-through + // - the destination (Spe.To) is the architecturally executed target + // + // Fall-throughs may need to be inferred between the PBT entry and the + // current branch's source (Spe.From), but never between the current + // branch's own endpoints (Spe.From/Spe.To). + // + // PBT records only the target address (PBT.To), meaning we have no + // information about the branch source (PBT.From=0x0), the branch type, or + // the prediction bit. + // + // Consider the trace pair: + // {{Spe.From, Spe.To, Type}, {TK, MP}}, + // {{PBT.From, PBT.To, TraceTo}, {TK, MP}} + // {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, {{0x0, 0xd123, 0xd456}, {2, 0}} + // + // The first entry is the Spe record, which represents a trace from 0xd456 + // (Spe.From) to 0xd789 (Spe.To). Type = Trace::BR_ONLY, as Bolt processes the + // current branch event first. At this point we have no information about the + // previous trace (PBT). This entry has TakenCount = 2, as we have two + // samples for (0xd456, 0xd789) in our input. It also has MispredCount = 2, + // as the 'M' misprediction flag appears in both cases. + // + // The second entry is the PBT record. TakenCount = 2 because the + // (PBT.From = 0x0, PBT.To = 0xd123) branch target appears twice in the input, + // and MispredCount = 0 because prediction data is absent. 
There is no branch + // source information, so the PBT.From field is zero (0x0). TraceTo = 0xd456 + // connects the flow from the previous taken branch at 0xd123 (PBT.To) to the + // current branch's source at 0xd456 (Spe.From), which then continues to 0xd789 + // (Spe.To). + std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = { + {{0xa002, 0xa003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xa001, 0xa002}, {1, 0}}, + {{0xb002, 0xb003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xb001, 0xb002}, {1, 0}}, + {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 0}}, + {{0x0, 0xc123, 0xc456}, {2, 0}}, + {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, + {{0x0, 0xd123, 0xd456}, {2, 0}}, + {{0xe005, 0xe009, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xe001, 0xe005}, {1, 0}}, + {{0xf002, 0xf003, Trace::BR_ONLY}, {1, 1}}, + {{0x0, 0xf001, 0xf002}, {1, 0}}}; + + parseAndCheckBrstackEvents(4567, ExpectedSamples); +} + #endif diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp index a89a896..e7d97b2 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp @@ -13,17 +13,10 @@ using namespace clang::ast_matchers; namespace clang::tidy::hicpp { -namespace { -AST_MATCHER(VarDecl, isAsm) { return Node.hasAttr<clang::AsmLabelAttr>(); } -const ast_matchers::internal::VariadicDynCastAllOfMatcher<Decl, - FileScopeAsmDecl> - fileScopeAsmDecl; // NOLINT(readability-identifier-*) preserve clang style -} // namespace - void NoAssemblerCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(asmStmt().bind("asm-stmt"), this); Finder->addMatcher(fileScopeAsmDecl().bind("asm-file-scope"), this); - Finder->addMatcher(varDecl(isAsm()).bind("asm-var"), this); + Finder->addMatcher(varDecl(hasAttr(attr::AsmLabel)).bind("asm-var"), this); } void NoAssemblerCheck::check(const MatchFinder::MatchResult &Result) { diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 5b2a96d..ac1abb4 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -825,6 +825,20 @@ fieldDecl() </pre></td></tr> +<tr><td>Matcher<<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>></td><td class="name" onclick="toggle('fileScopeAsmDecl0')"><a name="fileScopeAsmDecl0Anchor">fileScopeAsmDecl</a></td><td>Matcher<<a href="https://clang.llvm.org/doxygen/classclang_1_1FileScopeAsmDecl.html">FileScopeAsmDecl</a>>...</td></tr> +<tr><td colspan="4" class="doc" id="fileScopeAsmDecl0"><pre>Matches top-level asm declarations. + +Given + __asm("nop"); + void f() { + __asm("mov al, 2"); + } +fileScopeAsmDecl() + matches '__asm("nop")', + but not '__asm("mov al, 2")'. +</pre></td></tr> + + <tr><td>Matcher<<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>></td><td class="name" onclick="toggle('friendDecl0')"><a name="friendDecl0Anchor">friendDecl</a></td><td>Matcher<<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>>...</td></tr> <tr><td colspan="4" class="doc" id="friendDecl0"><pre>Matches friend declarations. 
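For reference, a minimal sketch (not part of this patch) of driving the new `fileScopeAsmDecl` matcher from a standalone `MatchFinder`, mirroring its use in NoAssemblerCheck above; the callback class and registration helper are illustrative names:

```cpp
// Sketch: report every file-scope asm declaration in a TU. Only
// fileScopeAsmDecl() comes from this patch; the rest is ordinary
// MatchFinder boilerplate.
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"

using namespace clang;
using namespace clang::ast_matchers;

namespace {
class FileScopeAsmReporter : public MatchFinder::MatchCallback {
  void run(const MatchFinder::MatchResult &Result) override {
    // Fires for '__asm("nop");' at file scope, but not for asm statements
    // inside function bodies (those are matched by asmStmt() instead).
    if (const auto *D =
            Result.Nodes.getNodeAs<FileScopeAsmDecl>("asm-file-scope"))
      D->dump();
  }
};
} // namespace

// Illustrative registration, mirroring NoAssemblerCheck::registerMatchers.
void registerFileScopeAsmMatcher(MatchFinder &Finder,
                                 FileScopeAsmReporter &CB) {
  Finder.addMatcher(fileScopeAsmDecl().bind("asm-file-scope"), &CB);
}
```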
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index 98e62de..bca2d84 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -2478,6 +2478,21 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt; /// matches '__asm("mov al, 2")' extern const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt; +/// Matches top-level asm declarations. +/// +/// Given +/// \code +/// __asm("nop"); +/// void f() { +/// __asm("mov al, 2"); +/// } +/// \endcode +/// fileScopeAsmDecl() +/// matches '__asm("nop")', +/// but not '__asm("mov al, 2")'. +extern const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl> + fileScopeAsmDecl; + /// Matches bool literals. /// /// Example matches true diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 749f531..1013bfc 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5017,6 +5017,10 @@ def HLSLUnparsedSemantic : HLSLAnnotationAttr { let Documentation = [InternalOnly]; } +def HLSLUserSemantic : HLSLSemanticAttr</* Indexable= */ 1> { + let Documentation = [InternalOnly]; +} + def HLSLSV_Position : HLSLSemanticAttr</* Indexable= */ 1> { let Documentation = [HLSLSV_PositionDocs]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index fa50953..f43707e 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13184,6 +13184,7 @@ def err_hlsl_semantic_indexing_not_supported : Error<"semantic %0 does not allow indexing">; def err_hlsl_init_priority_unsupported : Error< "initializer priorities are not supported in HLSL">; +def err_hlsl_semantic_index_overlap : Error<"semantic index overlap %0">; def warn_hlsl_user_defined_type_missing_member: Warning<"binding type '%select{t|u|b|s|c}0' only applies to types containing %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric types}0">, InGroup<LegacyConstantRegisterBinding>; def err_hlsl_binding_type_mismatch: Error<"binding type '%select{t|u|b|s|c}0' only applies to %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric variables in the global scope}0">; diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 6f9a69e..1625851 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4090,6 +4090,57 @@ def CIR_PrefetchOp : CIR_Op<"prefetch"> { } //===----------------------------------------------------------------------===// +// ObjSizeOp +//===----------------------------------------------------------------------===// + +def CIR_ObjSizeOp : CIR_Op<"objsize", [Pure]> { + let summary = "Implements the llvm.objectsize builtin"; + let description = [{ + The `cir.objsize` operation is designed to provide information to the + optimizer to determine a) whether an operation (like memcpy) will + overflow a buffer that corresponds to an object, or b) whether a runtime + check for overflow is unnecessary. An object in this context means an + allocation of a specific class, structure, array, or other object. + + When the `min` attribute is present, the operation returns the minimum + guaranteed accessible size. When absent (max mode), it returns the maximum + possible object size. 
Corresponds to `llvm.objectsize`'s `min` argument. + + The `dynamic` attribute determines if the value should be evaluated at + runtime. Corresponds to `llvm.objectsize`'s `dynamic` argument. + + The `nullunknown` attribute controls how null pointers are handled. When + present, null pointers are treated as having unknown size. When absent, + null pointers are treated as having 0 size (in min mode) or -1 size + (in max mode). Corresponds to `llvm.objectsize`'s `nullunknown` argument. + + Example: + + ```mlir + %size = cir.objsize min %ptr : !cir.ptr<i32> -> i64 + %dsize = cir.objsize max dynamic %ptr : !cir.ptr<i32> -> i64 + %nsize = cir.objsize min nullunknown %ptr : !cir.ptr<i32> -> i64 + ``` + }]; + + let arguments = (ins + CIR_PointerType:$ptr, + UnitAttr:$min, + UnitAttr:$nullunknown, + UnitAttr:$dynamic + ); + + let results = (outs CIR_AnyFundamentalIntType:$result); + + let assemblyFormat = [{ + (`min` $min^) : (`max`)? + (`nullunknown` $nullunknown^)? + (`dynamic` $dynamic^)? + $ptr `:` qualified(type($ptr)) `->` qualified(type($result)) attr-dict + }]; +} + +//===----------------------------------------------------------------------===// // PtrDiffOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 6f099a7..af1ffff 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -215,6 +215,7 @@ struct MissingFeatures { static bool builtinCallMathErrno() { return false; } static bool builtinCheckKind() { return false; } static bool cgCapturedStmtInfo() { return false; } + static bool countedBySize() { return false; } static bool cgFPOptionsRAII() { return false; } static bool checkBitfieldClipping() { return false; } static bool cirgenABIInfo() { return false; } diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 8c3b6ae..28b03ac 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -20,7 +20,9 @@ #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/SourceLocation.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" #include "llvm/TargetParser/Triple.h" #include <initializer_list> @@ -259,9 +261,11 @@ private: HLSLSemanticAttr *createSemantic(const SemanticInfo &Semantic, DeclaratorDecl *TargetDecl); bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *D, - SemanticInfo &ActiveSemantic); + SemanticInfo &ActiveSemantic, + llvm::StringSet<> &ActiveInputSemantics); bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, - SemanticInfo &ActiveSemantic); + SemanticInfo &ActiveSemantic, + llvm::StringSet<> &ActiveInputSemantics); void processExplicitBindingsOnDecl(VarDecl *D); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4e63400..84f7e62 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6007,6 +6007,8 @@ bool Compiler<Emitter>::visitSwitchStmt(const SwitchStmt *S) { CaseLabels[SC] = this->getLabel(); const Expr *Value = CS->getLHS(); + if (Value->isValueDependent()) + return false; PrimType ValueT = this->classifyPrim(Value->getType()); // Compare the case statement's value to the switch condition. 
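Both the bytecode-compiler guard above and the ExprConstant.cpp change below bail out when a case expression is still value-dependent at evaluation time. A hedged illustration (not taken from this patch's tests) of how a case label can remain value-dependent until instantiation:

```cpp
// Hypothetical reduction: inside the template, 'case N:' is value-dependent,
// so constant evaluation must not ask for its known constant value yet.
template <int N>
constexpr int pick(int x) {
  switch (x) {
  case N: // value-dependent until pick<N> is instantiated
    return 1;
  default:
    return 0;
  }
}
static_assert(pick<3>(3) == 1, "resolves once N is substituted");
```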
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 8fab6ef..193f87c 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -5452,10 +5452,13 @@ static EvalStmtResult EvaluateSwitch(StmtResult &Result, EvalInfo &Info, } const CaseStmt *CS = cast<CaseStmt>(SC); - APSInt LHS = CS->getLHS()->EvaluateKnownConstInt(Info.Ctx); - APSInt RHS = CS->getRHS() ? CS->getRHS()->EvaluateKnownConstInt(Info.Ctx) - : LHS; - if (LHS <= Value && Value <= RHS) { + const Expr *LHS = CS->getLHS(); + const Expr *RHS = CS->getRHS(); + if (LHS->isValueDependent() || (RHS && RHS->isValueDependent())) + return ESR_Failed; + APSInt LHSValue = LHS->EvaluateKnownConstInt(Info.Ctx); + APSInt RHSValue = RHS ? RHS->EvaluateKnownConstInt(Info.Ctx) : LHSValue; + if (LHSValue <= Value && Value <= RHSValue) { Found = SC; break; } diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp index 42f124b..0874b3d 100644 --- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp +++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp @@ -954,6 +954,8 @@ const internal::VariadicDynCastAllOfMatcher<Stmt, CXXTryStmt> cxxTryStmt; const internal::VariadicDynCastAllOfMatcher<Stmt, CXXThrowExpr> cxxThrowExpr; const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt; const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt; +const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl> + fileScopeAsmDecl; const internal::VariadicDynCastAllOfMatcher<Stmt, CXXBoolLiteralExpr> cxxBoolLiteral; const internal::VariadicDynCastAllOfMatcher<Stmt, StringLiteral> stringLiteral; diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp index 01c03f3..66848f7 100644 --- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp @@ -246,6 +246,7 @@ RegistryMaps::RegistryMaps() { REGISTER_MATCHER(expr); REGISTER_MATCHER(exprWithCleanups); REGISTER_MATCHER(fieldDecl); + REGISTER_MATCHER(fileScopeAsmDecl); REGISTER_MATCHER(fixedPointLiteral); REGISTER_MATCHER(floatLiteral); REGISTER_MATCHER(forCallable); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 0803910..4e6a5ee 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -481,6 +481,19 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return emitCall(e->getCallee()->getType(), CIRGenCallee::forDirect(fnOp), e, returnValue); } + case Builtin::BI__builtin_dynamic_object_size: + case Builtin::BI__builtin_object_size: { + unsigned type = + e->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue(); + auto resType = mlir::cast<cir::IntType>(convertType(e->getType())); + + // We pass this builtin onto the optimizer so that it can figure out the + // object size in more complex cases. 
+ bool isDynamic = builtinID == Builtin::BI__builtin_dynamic_object_size; + return RValue::get(emitBuiltinObjectSize(e->getArg(0), type, resType, + /*EmittedE=*/nullptr, isDynamic)); + } + case Builtin::BI__builtin_prefetch: { auto evaluateOperandAsInt = [&](const Expr *arg) { Expr::EvalResult res; @@ -663,3 +676,42 @@ mlir::Value CIRGenFunction::emitVAArg(VAArgExpr *ve) { mlir::Value vaList = emitVAListRef(ve->getSubExpr()).getPointer(); return cir::VAArgOp::create(builder, loc, type, vaList); } + +mlir::Value CIRGenFunction::emitBuiltinObjectSize(const Expr *e, unsigned type, + cir::IntType resType, + mlir::Value emittedE, + bool isDynamic) { + assert(!cir::MissingFeatures::opCallImplicitObjectSizeArgs()); + + // LLVM can't handle type=3 appropriately, and __builtin_object_size shouldn't + // evaluate e for side-effects. In either case, just like the original LLVM + // lowering, we shouldn't lower to `cir.objsize` but should emit a constant + // instead. + if (type == 3 || (!emittedE && e->HasSideEffects(getContext()))) + return builder.getConstInt(getLoc(e->getSourceRange()), resType, + (type & 2) ? 0 : -1); + + mlir::Value ptr = emittedE ? emittedE : emitScalarExpr(e); + assert(mlir::isa<cir::PointerType>(ptr.getType()) && + "Non-pointer passed to __builtin_object_size?"); + + assert(!cir::MissingFeatures::countedBySize()); + + // Extract the min/max mode from type. CIR only supports type 0 + // (max, whole object) and type 2 (min, whole object), not type 1 or 3 + // (closest subobject variants). + const bool min = ((type & 2) != 0); + // For GCC compatibility, __builtin_object_size treats NULL as unknown size. + auto op = + cir::ObjSizeOp::create(builder, getLoc(e->getSourceRange()), resType, ptr, + min, /*nullUnknown=*/true, isDynamic); + return op.getResult(); +} + +mlir::Value CIRGenFunction::evaluateOrEmitBuiltinObjectSize( + const Expr *e, unsigned type, cir::IntType resType, mlir::Value emittedE, + bool isDynamic) { + uint64_t objectSize; + if (!e->tryEvaluateObjectSize(objectSize, getContext(), type)) + return emitBuiltinObjectSize(e, type, resType, emittedE, isDynamic); + return builder.getConstInt(getLoc(e->getSourceRange()), resType, objectSize); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 1c52a78..f879e58 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1307,6 +1307,28 @@ public: RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, const clang::CallExpr *e, ReturnValueSlot returnValue); + /// Returns a Value corresponding to the size of the given expression by + /// emitting a `cir.objsize` operation. + /// + /// \param e The expression whose object size to compute + /// \param type Determines the semantics of the object size computation. + /// The type parameter is a 2-bit value where: + /// bit 0 (type & 1): 0 = whole object, 1 = closest subobject + /// bit 1 (type & 2): 0 = maximum size, 2 = minimum size + /// \param resType The result type for the size value + /// \param emittedE Optional pre-emitted pointer value. If non-null, we'll + /// call `cir.objsize` on this value rather than emitting e. 
+ /// \param isDynamic If true, allows runtime evaluation via dynamic mode + mlir::Value emitBuiltinObjectSize(const clang::Expr *e, unsigned type, + cir::IntType resType, mlir::Value emittedE, + bool isDynamic); + + mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e, + unsigned type, + cir::IntType resType, + mlir::Value emittedE, + bool isDynamic); + RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index ba967a4..b4afed7 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2832,6 +2832,29 @@ static void collectUnreachable(mlir::Operation *parent, } } +mlir::LogicalResult CIRToLLVMObjSizeOpLowering::matchAndRewrite( + cir::ObjSizeOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type llvmResTy = getTypeConverter()->convertType(op.getType()); + mlir::Location loc = op->getLoc(); + + mlir::IntegerType i1Ty = rewriter.getI1Type(); + + auto i1Val = [&rewriter, &loc, &i1Ty](bool val) { + return mlir::LLVM::ConstantOp::create(rewriter, loc, i1Ty, val); + }; + + replaceOpWithCallLLVMIntrinsicOp(rewriter, op, "llvm.objectsize", llvmResTy, + { + adaptor.getPtr(), + i1Val(op.getMin()), + i1Val(op.getNullunknown()), + i1Val(op.getDynamic()), + }); + + return mlir::LogicalResult::success(); +} + void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) { // Lower the module attributes to LLVM equivalents. if (mlir::Attribute tripleAttr = diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 945f9e2..e392a12 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -549,6 +549,16 @@ static void addSPIRVBuiltinDecoration(llvm::GlobalVariable *GV, GV->addMetadata("spirv.Decorations", *Decoration); } +static void addLocationDecoration(llvm::GlobalVariable *GV, unsigned Location) { + LLVMContext &Ctx = GV->getContext(); + IRBuilder<> B(GV->getContext()); + MDNode *Operands = + MDNode::get(Ctx, {ConstantAsMetadata::get(B.getInt32(/* Location */ 30)), + ConstantAsMetadata::get(B.getInt32(Location))}); + MDNode *Decoration = MDNode::get(Ctx, {Operands}); + GV->addMetadata("spirv.Decorations", *Decoration); +} + static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M, llvm::Type *Ty, const Twine &Name, unsigned BuiltInID) { @@ -562,6 +572,69 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M, return B.CreateLoad(Ty, GV); } +static llvm::Value *createSPIRVLocationLoad(IRBuilder<> &B, llvm::Module &M, + llvm::Type *Ty, unsigned Location, + StringRef Name) { + auto *GV = new llvm::GlobalVariable( + M, Ty, /* isConstant= */ true, llvm::GlobalValue::ExternalLinkage, + /* Initializer= */ nullptr, /* Name= */ Name, /* insertBefore= */ nullptr, + llvm::GlobalVariable::GeneralDynamicTLSModel, + /* AddressSpace */ 7, /* isExternallyInitialized= */ true); + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + addLocationDecoration(GV, Location); + return B.CreateLoad(Ty, GV); +} + +llvm::Value * +CGHLSLRuntime::emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, + HLSLSemanticAttr *Semantic, + std::optional<unsigned> Index) { + Twine BaseName = Twine(Semantic->getAttrName()->getName()); + Twine VariableName = 
BaseName.concat(Twine(Index.value_or(0))); + + unsigned Location = SPIRVLastAssignedInputSemanticLocation; + + // DXC completely ignores the semantic/index pair. Locations are assigned + // from the first semantic to the last. + llvm::ArrayType *AT = dyn_cast<llvm::ArrayType>(Type); + unsigned ElementCount = AT ? AT->getNumElements() : 1; + SPIRVLastAssignedInputSemanticLocation += ElementCount; + return createSPIRVLocationLoad(B, CGM.getModule(), Type, Location, + VariableName.str()); +} + +llvm::Value * +CGHLSLRuntime::emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, + HLSLSemanticAttr *Semantic, + std::optional<unsigned> Index) { + Twine BaseName = Twine(Semantic->getAttrName()->getName()); + Twine VariableName = BaseName.concat(Twine(Index.value_or(0))); + + // DXIL packing rules etc. shall be handled here. + // FIXME: generate proper sigpoint, index, col, row values. + // FIXME: also DXIL loads vectors element by element. + SmallVector<Value *> Args{B.getInt32(4), B.getInt32(0), B.getInt32(0), + B.getInt8(0), + llvm::PoisonValue::get(B.getInt32Ty())}; + + llvm::Intrinsic::ID IntrinsicID = llvm::Intrinsic::dx_load_input; + llvm::Value *Value = B.CreateIntrinsic(/*ReturnType=*/Type, IntrinsicID, Args, + nullptr, VariableName); + return Value; +} + +llvm::Value *CGHLSLRuntime::emitUserSemanticLoad( + IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, + HLSLSemanticAttr *Semantic, std::optional<unsigned> Index) { + if (CGM.getTarget().getTriple().isSPIRV()) + return emitSPIRVUserSemanticLoad(B, Type, Semantic, Index); + + if (CGM.getTarget().getTriple().isDXIL()) + return emitDXILUserSemanticLoad(B, Type, Semantic, Index); + + llvm_unreachable("Unsupported target for user-semantic load."); +} + llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad( IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, Attr *Semantic, std::optional<unsigned> Index) { @@ -626,6 +699,9 @@ CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, std::optional<unsigned> Index = std::nullopt; if (Semantic->isSemanticIndexExplicit()) Index = Semantic->getSemanticIndex(); + + if (isa<HLSLUserSemanticAttr>(Semantic)) + return emitUserSemanticLoad(B, Type, Decl, Semantic, Index); return emitSystemSemanticLoad(B, Type, Decl, Semantic, Index); } diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index d35df52..9d31714 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -200,9 +200,25 @@ private: llvm::GlobalVariable *BufGV); void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, llvm::GlobalVariable *GV); + void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, + llvm::GlobalVariable *GV, + HLSLResourceBindingAttr *RBA); + + llvm::Value *emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, + HLSLSemanticAttr *Semantic, + std::optional<unsigned> Index); + llvm::Value *emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, + HLSLSemanticAttr *Semantic, + std::optional<unsigned> Index); + llvm::Value *emitUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, + const clang::DeclaratorDecl *Decl, + HLSLSemanticAttr *Semantic, + std::optional<unsigned> Index); + llvm::Triple::ArchType getArch(); llvm::DenseMap<const clang::RecordType *, llvm::TargetExtType *> LayoutTypes; + unsigned SPIRVLastAssignedInputSemanticLocation = 0; }; } // namespace CodeGen diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index b9707f0..a06c57b 
100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -775,6 +775,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info, DeclaratorDecl *TargetDecl) { std::string SemanticName = Info.Semantic->getAttrName()->getName().upper(); + if (dyn_cast<HLSLUserSemanticAttr>(Info.Semantic)) + return createSemanticAttr<HLSLUserSemanticAttr>(*Info.Semantic, TargetDecl, + Info.Index); + if (SemanticName == "SV_DISPATCHTHREADID") { return createSemanticAttr<HLSLSV_DispatchThreadIDAttr>( *Info.Semantic, TargetDecl, Info.Index); @@ -797,9 +801,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info, return nullptr; } -bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, - DeclaratorDecl *D, - SemanticInfo &ActiveSemantic) { +bool SemaHLSL::determineActiveSemanticOnScalar( + FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, + llvm::StringSet<> &ActiveInputSemantics) { + if (ActiveSemantic.Semantic == nullptr) { ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>(); if (ActiveSemantic.Semantic && @@ -818,11 +823,31 @@ bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, checkSemanticAnnotation(FD, D, A); FD->addAttr(A); + + unsigned Location = ActiveSemantic.Index.value_or(0); + + const ConstantArrayType *AT = dyn_cast<ConstantArrayType>(D->getType()); + unsigned ElementCount = AT ? AT->getZExtSize() : 1; + ActiveSemantic.Index = Location + ElementCount; + + Twine BaseName = Twine(ActiveSemantic.Semantic->getAttrName()->getName()); + for (unsigned I = 0; I < ElementCount; ++I) { + Twine VariableName = BaseName.concat(Twine(Location + I)); + + auto [_, Inserted] = ActiveInputSemantics.insert(VariableName.str()); + if (!Inserted) { + Diag(D->getLocation(), diag::err_hlsl_semantic_index_overlap) + << VariableName.str(); + return false; + } + } + return true; } -bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, - SemanticInfo &ActiveSemantic) { +bool SemaHLSL::determineActiveSemantic( + FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, + llvm::StringSet<> &ActiveInputSemantics) { if (ActiveSemantic.Semantic == nullptr) { ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>(); if (ActiveSemantic.Semantic && @@ -833,12 +858,13 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, const Type *T = D->getType()->getUnqualifiedDesugaredType(); const RecordType *RT = dyn_cast<RecordType>(T); if (!RT) - return determineActiveSemanticOnScalar(FD, D, ActiveSemantic); + return determineActiveSemanticOnScalar(FD, D, ActiveSemantic, + ActiveInputSemantics); const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { SemanticInfo Info = ActiveSemantic; - if (!determineActiveSemantic(FD, Field, Info)) { + if (!determineActiveSemantic(FD, Field, Info, ActiveInputSemantics)) { Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field; return false; } @@ -911,12 +937,14 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { llvm_unreachable("Unhandled environment in triple"); } + llvm::StringSet<> ActiveInputSemantics; for (ParmVarDecl *Param : FD->parameters()) { SemanticInfo ActiveSemantic; ActiveSemantic.Semantic = nullptr; ActiveSemantic.Index = std::nullopt; - if (!determineActiveSemantic(FD, Param, ActiveSemantic)) { + if (!determineActiveSemantic(FD, Param, ActiveSemantic, + ActiveInputSemantics)) { Diag(Param->getLocation(), diag::note_previous_decl) << Param; FD->setInvalidDecl(); } @@ -947,6 +975,8 @@ void 
SemaHLSL::checkSemanticAnnotation(FunctionDecl *EntryPoint, return; DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel}); break; + case attr::HLSLUserSemantic: + return; default: llvm_unreachable("Unknown SemanticAttr"); } @@ -1766,7 +1796,7 @@ void SemaHLSL::handleSemanticAttr(Decl *D, const ParsedAttr &AL) { if (AL.getAttrName()->getName().starts_with_insensitive("SV_")) diagnoseSystemSemanticAttr(D, AL, Index); else - Diag(AL.getLoc(), diag::err_hlsl_unknown_semantic) << AL; + D->addAttr(createSemanticAttr<HLSLUserSemanticAttr>(AL, nullptr, Index)); } void SemaHLSL::handlePackOffsetAttr(Decl *D, const ParsedAttr &AL) { diff --git a/clang/test/CIR/CodeGen/object-size-flex-array.c b/clang/test/CIR/CodeGen/object-size-flex-array.c new file mode 100644 index 0000000..74229fd --- /dev/null +++ b/clang/test/CIR/CodeGen/object-size-flex-array.c @@ -0,0 +1,317 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR --check-prefix=CIR-NO-STRICT +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-llvm -disable-llvm-passes %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM --check-prefix=LLVM-NO-STRICT +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm -disable-llvm-passes %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG --check-prefix=OGCG-NO-STRICT + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-cir %s -o %t-strict-0.cir +// RUN: FileCheck --input-file=%t-strict-0.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-0 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-0.ll +// RUN: FileCheck --input-file=%t-cir-strict-0.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-0 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-strict-0.ll +// RUN: FileCheck --input-file=%t-strict-0.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-0 + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-cir %s -o %t-strict-1.cir +// RUN: FileCheck --input-file=%t-strict-1.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-1 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-1.ll +// RUN: FileCheck --input-file=%t-cir-strict-1.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-1 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-strict-1.ll +// RUN: FileCheck --input-file=%t-strict-1.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-1 + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-cir %s -o %t-strict-2.cir +// RUN: FileCheck --input-file=%t-strict-2.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-2.ll +// RUN: FileCheck --input-file=%t-cir-strict-2.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-strict-2.ll +// RUN: FileCheck --input-file=%t-strict-2.ll %s --check-prefix=OGCG 
--check-prefix=OGCG-STRICT-2 + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-cir %s -o %t-strict-3.cir +// RUN: FileCheck --input-file=%t-strict-3.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-3 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-3.ll +// RUN: FileCheck --input-file=%t-cir-strict-3.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-3 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-strict-3.ll +// RUN: FileCheck --input-file=%t-strict-3.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-3 + +#define OBJECT_SIZE_BUILTIN __builtin_object_size + +typedef struct { + float f; + double c[]; +} foo_t; + +typedef struct { + float f; + double c[0]; +} foo0_t; + +typedef struct { + float f; + double c[1]; +} foo1_t; + +typedef struct { + float f; + double c[2]; +} foo2_t; + +// CIR-LABEL: @bar +// LLVM-LABEL: @bar( +// OGCG-LABEL: @bar( +unsigned bar(foo_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-3: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @bar0 +// LLVM-LABEL: @bar0( +// OGCG-LABEL: @bar0( +unsigned bar0(foo0_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-3: cir.const #cir.int<0> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-3: store i32 0 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, 
i1 false) + // OGCG-STRICT-3: ret i32 0 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @bar1 +// LLVM-LABEL: @bar1( +// OGCG-LABEL: @bar1( +unsigned bar1(foo1_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.const #cir.int<8> + // CIR-STRICT-3: cir.const #cir.int<8> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-2: store i32 8 + // LLVM-STRICT-3: store i32 8 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-2: ret i32 8 + // OGCG-STRICT-3: ret i32 8 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @bar2 +// LLVM-LABEL: @bar2( +// OGCG-LABEL: @bar2( +unsigned bar2(foo2_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.const #cir.int<16> + // CIR-STRICT-2: cir.const #cir.int<16> + // CIR-STRICT-3: cir.const #cir.int<16> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // LLVM-STRICT-1: store i32 16 + // LLVM-STRICT-2: store i32 16 + // LLVM-STRICT-3: store i32 16 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false) + // OGCG-STRICT-1: ret i32 16 + // OGCG-STRICT-2: ret i32 16 + // OGCG-STRICT-3: ret i32 16 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +#define DYNAMIC_OBJECT_SIZE_BUILTIN __builtin_dynamic_object_size + +// CIR-LABEL: @dyn_bar +// LLVM-LABEL: @dyn_bar( +// OGCG-LABEL: @dyn_bar( +unsigned dyn_bar(foo_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-3: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr 
{{.*}}, i1 false, i1 true, i1 true) + return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @dyn_bar0 +// LLVM-LABEL: @dyn_bar0( +// OGCG-LABEL: @dyn_bar0( +unsigned dyn_bar0(foo0_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-3: cir.const #cir.int<0> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-3: store i32 0 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-3: ret i32 0 + return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @dyn_bar1 +// LLVM-LABEL: @dyn_bar1( +// OGCG-LABEL: @dyn_bar1( +unsigned dyn_bar1(foo1_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-2: cir.const #cir.int<8> + // CIR-STRICT-3: cir.const #cir.int<8> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-2: store i32 8 + // LLVM-STRICT-3: store i32 8 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-2: ret i32 8 + // OGCG-STRICT-3: ret i32 8 + return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @dyn_bar2 +// LLVM-LABEL: @dyn_bar2( +// OGCG-LABEL: @dyn_bar2( +unsigned dyn_bar2(foo2_t *f) { + // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr<!void> -> !u64i + // CIR-STRICT-1: cir.const #cir.int<16> + // CIR-STRICT-2: cir.const #cir.int<16> + // CIR-STRICT-3: cir.const #cir.int<16> + // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // LLVM-STRICT-1: store i32 16 + // LLVM-STRICT-2: store i32 16 + // LLVM-STRICT-3: store i32 16 + // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true) + // OGCG-STRICT-1: ret i32 16 + // OGCG-STRICT-2: ret i32 16 + // OGCG-STRICT-3: ret i32 16 + return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// Also checks for non-trailing flex-array 
like members + +typedef struct { + double c[0]; + float f; +} foofoo0_t; + +typedef struct { + double c[1]; + float f; +} foofoo1_t; + +typedef struct { + double c[2]; + float f; +} foofoo2_t; + +// CIR-LABEL: @babar0 +// LLVM-LABEL: @babar0( +// OGCG-LABEL: @babar0( +unsigned babar0(foofoo0_t *f) { + // CIR-NO-STRICT: cir.const #cir.int<0> + // CIR-STRICT-0: cir.const #cir.int<0> + // CIR-STRICT-1: cir.const #cir.int<0> + // CIR-STRICT-2: cir.const #cir.int<0> + // CIR-STRICT-3: cir.const #cir.int<0> + // LLVM-NO-STRICT: store i32 0 + // LLVM-STRICT-0: store i32 0 + // LLVM-STRICT-1: store i32 0 + // LLVM-STRICT-2: store i32 0 + // LLVM-STRICT-3: store i32 0 + // OGCG-NO-STRICT: ret i32 0 + // OGCG-STRICT-0: ret i32 0 + // OGCG-STRICT-1: ret i32 0 + // OGCG-STRICT-2: ret i32 0 + // OGCG-STRICT-3: ret i32 0 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @babar1 +// LLVM-LABEL: @babar1( +// OGCG-LABEL: @babar1( +unsigned babar1(foofoo1_t *f) { + // CIR-NO-STRICT: cir.const #cir.int<8> + // CIR-STRICT-0: cir.const #cir.int<8> + // CIR-STRICT-1: cir.const #cir.int<8> + // CIR-STRICT-2: cir.const #cir.int<8> + // CIR-STRICT-3: cir.const #cir.int<8> + // LLVM-NO-STRICT: store i32 8 + // LLVM-STRICT-0: store i32 8 + // LLVM-STRICT-1: store i32 8 + // LLVM-STRICT-2: store i32 8 + // LLVM-STRICT-3: store i32 8 + // OGCG-NO-STRICT: ret i32 8 + // OGCG-STRICT-0: ret i32 8 + // OGCG-STRICT-1: ret i32 8 + // OGCG-STRICT-2: ret i32 8 + // OGCG-STRICT-3: ret i32 8 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} + +// CIR-LABEL: @babar2 +// LLVM-LABEL: @babar2( +// OGCG-LABEL: @babar2( +unsigned babar2(foofoo2_t *f) { + // CIR-NO-STRICT: cir.const #cir.int<16> + // CIR-STRICT-0: cir.const #cir.int<16> + // CIR-STRICT-1: cir.const #cir.int<16> + // CIR-STRICT-2: cir.const #cir.int<16> + // CIR-STRICT-3: cir.const #cir.int<16> + // LLVM-NO-STRICT: store i32 16 + // LLVM-STRICT-0: store i32 16 + // LLVM-STRICT-1: store i32 16 + // LLVM-STRICT-2: store i32 16 + // LLVM-STRICT-3: store i32 16 + // OGCG-NO-STRICT: ret i32 16 + // OGCG-STRICT-0: ret i32 16 + // OGCG-STRICT-1: ret i32 16 + // OGCG-STRICT-2: ret i32 16 + // OGCG-STRICT-3: ret i32 16 + return OBJECT_SIZE_BUILTIN(f->c, 1); +} diff --git a/clang/test/CIR/CodeGen/object-size.c b/clang/test/CIR/CodeGen/object-size.c new file mode 100644 index 0000000..1b10fb8b --- /dev/null +++ b/clang/test/CIR/CodeGen/object-size.c @@ -0,0 +1,877 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +char gbuf[63]; +char *gp; +int gi, gj; + +// CIR-LABEL: @test1 +// LLVM-LABEL: define {{.*}} void @test1 +// OGCG-LABEL: define {{.*}} void @test1 +void test1(void) { + // CIR: cir.const #cir.int<59> + // LLVM: store i32 59 + // OGCG: store i32 59 + gi = __builtin_object_size(&gbuf[4], 1); +} + +// CIR-LABEL: @test2 +// LLVM-LABEL: define {{.*}} void @test2 +// OGCG-LABEL: define {{.*}} void @test2 +void test2(void) { + // CIR: cir.const #cir.int<63> + // LLVM: store i32 63 + // OGCG: store i32 63 + gi = __builtin_object_size(gbuf, 1); +} + +// CIR-LABEL: @test3 +// LLVM-LABEL: define {{.*}} void @test3 +// OGCG-LABEL: define {{.*}} void @test3 +void test3(void) { + // CIR: 
cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&gbuf[100], 1); +} + +// CIR-LABEL: @test4 +// LLVM-LABEL: define {{.*}} void @test4 +// OGCG-LABEL: define {{.*}} void @test4 +void test4(void) { + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)(void*)&gbuf[-1], 1); +} + +// CIR-LABEL: @test5 +// LLVM-LABEL: define {{.*}} void @test5 +// OGCG-LABEL: define {{.*}} void @test5 +void test5(void) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(gp, 0); +} + +// CIR-LABEL: @test6 +// LLVM-LABEL: define {{.*}} void @test6 +// OGCG-LABEL: define {{.*}} void @test6 +void test6(void) { + char buf[57]; + + // CIR: cir.const #cir.int<53> + // LLVM: store i32 53 + // OGCG: store i32 53 + gi = __builtin_object_size(&buf[4], 1); +} + +// CIR-LABEL: @test18 +// LLVM-LABEL: define {{.*}} i32 @test18 +// OGCG-LABEL: define {{.*}} i32 @test18 +unsigned test18(int cond) { + int a[4], b[4]; + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64 + // OGCG: call i64 @llvm.objectsize.i64 + return __builtin_object_size(cond ? a : b, 0); +} + +// CIR-LABEL: @test19 +// LLVM-LABEL: define {{.*}} void @test19 +// OGCG-LABEL: define {{.*}} void @test19 +void test19(void) { + struct { + int a, b; + } foo; + + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size(&foo.a, 0); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.a, 1); + + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size(&foo.a, 2); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.a, 3); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.b, 0); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.b, 1); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.b, 2); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&foo.b, 3); +} + +// CIR-LABEL: @test20 +// LLVM-LABEL: define {{.*}} void @test20 +// OGCG-LABEL: define {{.*}} void @test20 +void test20(void) { + struct { int t[10]; } t[10]; + + // CIR: cir.const #cir.int<380> + // LLVM: store i32 380 + // OGCG: store i32 380 + gi = __builtin_object_size(&t[0].t[5], 0); + + // CIR: cir.const #cir.int<20> + // LLVM: store i32 20 + // OGCG: store i32 20 + gi = __builtin_object_size(&t[0].t[5], 1); + + // CIR: cir.const #cir.int<380> + // LLVM: store i32 380 + // OGCG: store i32 380 + gi = __builtin_object_size(&t[0].t[5], 2); + + // CIR: cir.const #cir.int<20> + // LLVM: store i32 20 + // OGCG: store i32 20 + gi = __builtin_object_size(&t[0].t[5], 3); +} + +// CIR-LABEL: @test21 +// LLVM-LABEL: define {{.*}} void @test21 +// OGCG-LABEL: define {{.*}} void @test21 +void test21(void) { + struct { int t; } t; + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t + 1, 0); + + // CIR: cir.const #cir.int<0> + // LLVM: 
store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t + 1, 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t + 1, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t + 1, 3); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t.t + 1, 0); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t.t + 1, 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t.t + 1, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t.t + 1, 3); +} + +// CIR-LABEL: @test22 +// LLVM-LABEL: define {{.*}} void @test22 +// OGCG-LABEL: define {{.*}} void @test22 +void test22(void) { + struct { int t[10]; } t[10]; + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[10], 0); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[10], 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[10], 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[10], 3); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[9].t[10], 0); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[9].t[10], 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[9].t[10], 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[9].t[10], 3); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[0] + sizeof(t), 0); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[0] + sizeof(t), 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[0] + sizeof(t), 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[0] + sizeof(t), 3); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 0); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 1); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 3); +} + +struct Test23Ty { int a; int t[10]; }; + +// CIR-LABEL: @test23 +// LLVM-LABEL: define {{.*}} void @test23 +// OGCG-LABEL: define {{.*}} void @test23 +void test23(struct Test23Ty *p) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi 
= __builtin_object_size(p, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(p, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(p, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(p, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(&p->a, 0); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&p->a, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(&p->a, 2); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&p->a, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(&p->t[5], 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(&p->t[5], 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(&p->t[5], 2); + + // CIR: cir.const #cir.int<20> + // LLVM: store i32 20 + // OGCG: store i32 20 + gi = __builtin_object_size(&p->t[5], 3); +} + +// CIR-LABEL: @test24 +// LLVM-LABEL: define {{.*}} void @test24 +// OGCG-LABEL: define {{.*}} void @test24 +void test24(void) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size((void*)0, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size((void*)0, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size((void*)0, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((void*)0, 3); +} + +// CIR-LABEL: @test25 +// LLVM-LABEL: define {{.*}} void 
@test25 +// OGCG-LABEL: define {{.*}} void @test25 +void test25(void) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size((void*)0x1000, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size((void*)0x1000, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size((void*)0x1000, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size((void*)0x1000, 3); + + // Skipping (void*)0 + 0x1000 tests - void pointer arithmetic NYI in CIR +} + +// CIR-LABEL: @test26 +// LLVM-LABEL: define {{.*}} void @test26 +// OGCG-LABEL: define {{.*}} void @test26 +void test26(void) { + struct { int v[10]; } t[10]; + + // CIR: cir.const #cir.int<316> + // LLVM: store i32 316 + // OGCG: store i32 316 + gi = __builtin_object_size(&t[1].v[11], 0); + + // CIR: cir.const #cir.int<312> + // LLVM: store i32 312 + // OGCG: store i32 312 + gi = __builtin_object_size(&t[1].v[12], 1); + + // CIR: cir.const #cir.int<308> + // LLVM: store i32 308 + // OGCG: store i32 308 + gi = __builtin_object_size(&t[1].v[13], 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&t[1].v[14], 3); +} + +struct Test27IncompleteTy; + +// CIR-LABEL: @test27 +// LLVM-LABEL: define {{.*}} void @test27 +// OGCG-LABEL: define {{.*}} void @test27 +void test27(struct Test27IncompleteTy *t) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(t, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(t, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(t, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(t, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(&test27, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(&test27, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 
@llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(&test27, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(&test27, 3); +} + +// CIR-LABEL: @test28 +// LLVM-LABEL: define {{.*}} void @test28 +// OGCG-LABEL: define {{.*}} void @test28 +void test28(void) { + struct { int v[10]; } t[10]; + + // CIR: cir.const #cir.int<360> + // LLVM: store i32 360 + // OGCG: store i32 360 + gi = __builtin_object_size((char*)((short*)(&t[1])), 0); + + // CIR: cir.const #cir.int<360> + // LLVM: store i32 360 + // OGCG: store i32 360 + gi = __builtin_object_size((char*)((short*)(&t[1])), 1); + + // CIR: cir.const #cir.int<360> + // LLVM: store i32 360 + // OGCG: store i32 360 + gi = __builtin_object_size((char*)((short*)(&t[1])), 2); + + // CIR: cir.const #cir.int<360> + // LLVM: store i32 360 + // OGCG: store i32 360 + gi = __builtin_object_size((char*)((short*)(&t[1])), 3); + + // CIR: cir.const #cir.int<356> + // LLVM: store i32 356 + // OGCG: store i32 356 + gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 0); + + // CIR: cir.const #cir.int<36> + // LLVM: store i32 36 + // OGCG: store i32 36 + gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 1); + + // CIR: cir.const #cir.int<356> + // LLVM: store i32 356 + // OGCG: store i32 356 + gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 2); + + // CIR: cir.const #cir.int<36> + // LLVM: store i32 36 + // OGCG: store i32 36 + gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 3); +} + +struct DynStructVar { + char fst[16]; + char snd[]; +}; + +struct DynStruct0 { + char fst[16]; + char snd[0]; +}; + +struct DynStruct1 { + char fst[16]; + char snd[1]; +}; + +struct StaticStruct { + char fst[16]; + char snd[2]; +}; + +// CIR-LABEL: @test29 +// LLVM-LABEL: define {{.*}} void @test29 +// OGCG-LABEL: define {{.*}} void @test29 +void test29(struct DynStructVar *dv, struct DynStruct0 *d0, + struct DynStruct1 *d1, struct StaticStruct *ss) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(dv->snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(dv->snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(dv->snd, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(dv->snd, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(d0->snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = 
__builtin_object_size(d0->snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(d0->snd, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(d0->snd, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(d1->snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(d1->snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(d1->snd, 2); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(d1->snd, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(ss->snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(ss->snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(ss->snd, 2); + + // CIR: cir.const #cir.int<2> + // LLVM: store i32 2 + // OGCG: store i32 2 + gi = __builtin_object_size(ss->snd, 3); +} + +// CIR-LABEL: @test30 +// LLVM-LABEL: define {{.*}} void @test30 +// OGCG-LABEL: define {{.*}} void @test30 +void test30(void) { + struct { struct DynStruct1 fst, snd; } *nested; + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(nested->fst.snd, 0); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(nested->fst.snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(nested->fst.snd, 2); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(nested->fst.snd, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + 
gi = __builtin_object_size(nested->snd.snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(nested->snd.snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(nested->snd.snd, 2); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(nested->snd.snd, 3); + + union { struct DynStruct1 d1; char c[1]; } *u; + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(u->c, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(u->c, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(u->c, 2); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(u->c, 3); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(u->d1.snd, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(u->d1.snd, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(u->d1.snd, 2); + + // CIR: cir.const #cir.int<1> + // LLVM: store i32 1 + // OGCG: store i32 1 + gi = __builtin_object_size(u->d1.snd, 3); +} + +// CIR-LABEL: @test32 +// LLVM-LABEL: define {{.*}} i64 @test32 +// OGCG-LABEL: define {{.*}} i64 @test32 +static struct DynStructVar D32 = { + .fst = {}, + .snd = { 0, 1, 2, }, +}; +unsigned long test32(void) { + // CIR: cir.const #cir.int<19> + // LLVM: store i64 19 + // OGCG: ret i64 19 + return __builtin_object_size(&D32, 1); +} + +// CIR-LABEL: @test33 +// LLVM-LABEL: define {{.*}} i64 @test33 +// OGCG-LABEL: define {{.*}} i64 @test33 +static struct DynStructVar D33 = { + .fst = {}, + .snd = {}, +}; +unsigned long test33(void) { + // CIR: cir.const #cir.int<16> + // LLVM: store i64 16 + // OGCG: ret i64 16 + return __builtin_object_size(&D33, 1); +} + +// CIR-LABEL: @test34 +// LLVM-LABEL: define {{.*}} i64 @test34 +// OGCG-LABEL: define {{.*}} i64 @test34 +static struct DynStructVar D34 = { + .fst = {}, +}; +unsigned long test34(void) { + // CIR: cir.const 
#cir.int<16> + // LLVM: store i64 16 + // OGCG: ret i64 16 + return __builtin_object_size(&D34, 1); +} + +// CIR-LABEL: @test35 +// LLVM-LABEL: define {{.*}} i64 @test35 +// OGCG-LABEL: define {{.*}} i64 @test35 +unsigned long test35(void) { + // CIR: cir.const #cir.int<16> + // LLVM: store i64 16 + // OGCG: ret i64 16 + return __builtin_object_size(&(struct DynStructVar){}, 1); +} + +// CIR-LABEL: @test37 +// LLVM-LABEL: define {{.*}} i64 @test37 +// OGCG-LABEL: define {{.*}} i64 @test37 +struct Z { struct A { int x, y[]; } z; int a; int b[]; }; +static struct Z my_z = { .b = {1,2,3} }; +unsigned long test37(void) { + // CIR: cir.const #cir.int<4> + // LLVM: store i64 4 + // OGCG: ret i64 4 + return __builtin_object_size(&my_z.z, 1); +} + +// CIR-LABEL: @PR30346 +// LLVM-LABEL: define {{.*}} void @PR30346 +// OGCG-LABEL: define {{.*}} void @PR30346 +void PR30346(void) { + struct sa_family_t {}; + struct sockaddr { + struct sa_family_t sa_family; + char sa_data[14]; + }; + + struct sockaddr *sa; + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(sa->sa_data, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 + gi = __builtin_object_size(sa->sa_data, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 + gi = __builtin_object_size(sa->sa_data, 2); + + // CIR: cir.const #cir.int<14> + // LLVM: store i32 14 + // OGCG: store i32 14 + gi = __builtin_object_size(sa->sa_data, 3); +} + +extern char incomplete_char_array[]; + +// CIR-LABEL: @incomplete_and_function_types +// LLVM-LABEL: define {{.*}} void @incomplete_and_function_types +// OGCG-LABEL: define {{.*}} void @incomplete_and_function_types +void incomplete_and_function_types(void) { + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0 + // OGCG: call i64 @llvm.objectsize.i64.p0 + gi = __builtin_object_size(incomplete_char_array, 0); + + // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0 + // OGCG: call i64 @llvm.objectsize.i64.p0 + gi = __builtin_object_size(incomplete_char_array, 1); + + // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0 + // OGCG: call i64 @llvm.objectsize.i64.p0 + gi = __builtin_object_size(incomplete_char_array, 2); + + // CIR: cir.const #cir.int<0> + // LLVM: store i32 0 + // OGCG: store i32 0 + gi = __builtin_object_size(incomplete_char_array, 3); +} + +// CIR-LABEL: @deeply_nested +// LLVM-LABEL: define {{.*}} void @deeply_nested +// OGCG-LABEL: define {{.*}} void @deeply_nested +void deeply_nested(void) { + struct { + struct { + struct { + struct { + int e[2]; + char f; + } d[2]; + } c[2]; + } b[2]; + } *a; + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 1); + + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = 
__builtin_object_size(&a->b[1].c[1].d[1].e[1], 3); +} diff --git a/clang/test/CIR/CodeGen/object-size.cpp b/clang/test/CIR/CodeGen/object-size.cpp new file mode 100644 index 0000000..b60e245 --- /dev/null +++ b/clang/test/CIR/CodeGen/object-size.cpp @@ -0,0 +1,108 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +// C++-specific tests for __builtin_object_size + +int gi; + +// CIR-LABEL: @_Z5test1v +// LLVM-LABEL: define{{.*}} void @_Z5test1v() +// OGCG-LABEL: define{{.*}} void @_Z5test1v() +void test1() { + // Guaranteeing that our cast removal logic doesn't break more interesting + // cases. + struct A { int a; }; + struct B { int b; }; + struct C: public A, public B {}; + + C c; + + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size(&c, 0); + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size((A*)&c, 0); + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size((B*)&c, 0); + + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size((char*)&c, 0); + // CIR: cir.const #cir.int<8> + // LLVM: store i32 8 + // OGCG: store i32 8 + gi = __builtin_object_size((char*)(A*)&c, 0); + // CIR: cir.const #cir.int<4> + // LLVM: store i32 4 + // OGCG: store i32 4 + gi = __builtin_object_size((char*)(B*)&c, 0); +} + +// CIR-LABEL: @_Z5test2v() +// LLVM-LABEL: define{{.*}} void @_Z5test2v() +// OGCG-LABEL: define{{.*}} void @_Z5test2v() +void test2() { + struct A { char buf[16]; }; + struct B : A {}; + struct C { int i; B bs[1]; } *c; + + // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 0); + // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 1); + // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 2); + // CIR: cir.const #cir.int<16> + // LLVM: store i32 16 + // OGCG: store i32 16 + gi = __builtin_object_size(&c->bs[0], 3); + + // NYI: DerivedToBase cast + // gi = __builtin_object_size((A*)&c->bs[0], 0); + + // CIR: cir.const #cir.int<16> + // LLVM: store i32 16 + // OGCG: store i32 16 + gi = __builtin_object_size((A*)&c->bs[0], 1); + + // NYI: DerivedToBase cast + // gi = __builtin_object_size((A*)&c->bs[0], 2); + + // CIR: cir.const #cir.int<16> + // LLVM: store i32 16 + // OGCG: store i32 16 + gi = __builtin_object_size((A*)&c->bs[0], 3); + + // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr<!void> -> !u64i + // LLVM: call 
i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0].buf[0], 0); + // CIR: cir.const #cir.int<16> + // LLVM: store i32 16 + // OGCG: store i32 16 + gi = __builtin_object_size(&c->bs[0].buf[0], 1); + // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr<!void> -> !u64i + // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0].buf[0], 2); + // CIR: cir.const #cir.int<16> + // LLVM: store i32 16 + // OGCG: store i32 16 + gi = __builtin_object_size(&c->bs[0].buf[0], 3); +} diff --git a/clang/test/CIR/IR/objsize.cir b/clang/test/CIR/IR/objsize.cir new file mode 100644 index 0000000..bc24551 --- /dev/null +++ b/clang/test/CIR/IR/objsize.cir @@ -0,0 +1,89 @@ +// Test the cir.objsize operation can parse and print correctly (roundtrip) +// with all possible combinations of optional attributes + +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s + +!u64i = !cir.int<u, 64> +!void = !cir.void + +module { + cir.func @test_max(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize max %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_max_nullunknown(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize max nullunknown %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_max_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize max dynamic %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_min(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize min %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_min_nullunknown(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize min nullunknown %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_min_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize min dynamic %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } + + cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { + %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr<!void> -> !u64i + cir.return %0 : !u64i + } +} + +// CHECK: cir.func @test_max(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize max %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_max_nullunknown(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize max nullunknown %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_max_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize max dynamic %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_min(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize min %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_min_nullunknown(%arg0: !cir.ptr<!void>) -> !u64i { +// 
CHECK: %0 = cir.objsize min nullunknown %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_min_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize min dynamic %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } + +// CHECK: cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr<!void>) -> !u64i { +// CHECK: %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr<!void> -> !u64i +// CHECK: cir.return %0 : !u64i +// CHECK: } diff --git a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl index 7aeb877..b0abaed 100644 --- a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl +++ b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl @@ -24,4 +24,3 @@ void foo(uint Idx : SV_DispatchThreadID) {} [shader("compute")] [numthreads(8,8,1)] void bar(uint2 Idx : SV_DispatchThreadID) {} - diff --git a/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl new file mode 100644 index 0000000..96d5b99 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx + +// CHECK-SPIRV-DAG: @AAA0 = external hidden thread_local addrspace(7) externally_initialized constant float, !spirv.Decorations ![[#METADATA_0:]] +// CHECK-SPIRV-DAG: @B0 = external hidden thread_local addrspace(7) externally_initialized constant i32, !spirv.Decorations ![[#METADATA_2:]] +// CHECK-SPIRV-DAG: @CC0 = external hidden thread_local addrspace(7) externally_initialized constant <2 x float>, !spirv.Decorations ![[#METADATA_4:]] + + +// FIXME: replace `float2 c` with a matrix when available. 
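+// Each user semantic below gets its own externally initialized input
+// global (@AAA0, @B0, @CC0 above); the metadata checks at the end of this
+// file verify that each carries decoration 30 ("Location"), with locations
+// 0, 1, and 2 assigned in declaration order.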
+void main(float a : AAA, int b : B, float2 c : CC) {
+  float tmp = a + b + c.x + c.y;
+}
+// CHECK-SPIRV: define internal spir_func void @_Z4mainfiDv2_f(float noundef nofpclass(nan inf) %a, i32 noundef %b, <2 x float> noundef nofpclass(nan inf) %c) #0 {
+
+// CHECK: define void @main()
+
+// CHECK-DXIL: %AAA0 = call float @llvm.dx.load.input.f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
+// CHECK-DXIL: %B0 = call i32 @llvm.dx.load.input.i32(i32 4, i32 0, i32 0, i8 0, i32 poison)
+// CHECK-DXIL: %CC0 = call <2 x float> @llvm.dx.load.input.v2f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
+// CHECK-DXIL: call void @_Z4mainfiDv2_f(float %AAA0, i32 %B0, <2 x float> %CC0)
+
+// CHECK-SPIRV: %[[#AAA0:]] = load float, ptr addrspace(7) @AAA0, align 4
+// CHECK-SPIRV: %[[#B0:]] = load i32, ptr addrspace(7) @B0, align 4
+// CHECK-SPIRV: %[[#CC0:]] = load <2 x float>, ptr addrspace(7) @CC0, align 8
+// CHECK-SPIRV: call spir_func void @_Z4mainfiDv2_f(float %[[#AAA0]], i32 %[[#B0]], <2 x float> %[[#CC0]]) [ "convergencectrl"(token %0) ]
+
+
+// CHECK-SPIRV-DAG: ![[#METADATA_0]] = !{![[#METADATA_1:]]}
+// CHECK-SPIRV-DAG: ![[#METADATA_2]] = !{![[#METADATA_3:]]}
+// CHECK-SPIRV-DAG: ![[#METADATA_4]] = !{![[#METADATA_5:]]}
+
+// CHECK-SPIRV-DAG: ![[#METADATA_1]] = !{i32 30, i32 0}
+// CHECK-SPIRV-DAG: ![[#METADATA_3]] = !{i32 30, i32 1}
+// CHECK-SPIRV-DAG: ![[#METADATA_5]] = !{i32 30, i32 2}
+//                                       |       `- Location index
+//                                       `-> Decoration "Location"
diff --git a/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
new file mode 100644
index 0000000..b2cb3da
--- /dev/null
+++ b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
+
+struct S0 {
+  float4 position[2];
+  float4 color;
+};
+
+// CHECK: %struct.S0 = type { [2 x <4 x float>], <4 x float> }
+
+// CHECK-SPIRV: @A0 = external hidden thread_local addrspace(7) externally_initialized constant [2 x <4 x float>], !spirv.Decorations ![[#MD_0:]]
+// CHECK-SPIRV: @A2 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_2:]]
+
+// CHECK: define void @main0()
+// CHECK-DXIL: %A0 = call [2 x <4 x float>] @llvm.dx.load.input.a2v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
+// CHECK-DXIL: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %A0, 0
+// CHECK-DXIL: %A2 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
+// CHECK-DXIL: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %A2, 1
+
+// CHECK-SPIRV: %[[#A0:]] = load [2 x <4 x float>], ptr addrspace(7) @A0, align 16
+// CHECK-SPIRV: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %[[#A0]], 0
+// CHECK-SPIRV: %[[#A2:]] = load <4 x float>, ptr addrspace(7) @A2, align 16
+// CHECK-SPIRV: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %[[#A2]], 1
+
+// CHECK: %[[#ARG:]] = alloca %struct.S0, align 16
+// CHECK: store %struct.S0 %[[#TMP1]], ptr %[[#ARG]], align 16
+// CHECK-DXIL: call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
+// CHECK-SPIRV: call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
+[shader("pixel")]
+void main0(S0 p : A) {
+  float tmp = p.position[0] + p.position[1] + p.color;
+}
+
+// CHECK-SPIRV: ![[#MD_0]] = !{![[#MD_1:]]}
+// CHECK-SPIRV: ![[#MD_1]] = !{i32 30, i32 0}
+// CHECK-SPIRV: ![[#MD_2]] = !{![[#MD_3:]]}
+// CHECK-SPIRV: ![[#MD_3]] = !{i32 30, i32 2}
diff --git a/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
new file mode 100644
index 0000000..733cf3a
--- /dev/null
+++ b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
+
+struct S0 {
+  uint Idx : SV_DispatchThreadID;
+};
+
+// CHECK: define void @main0()
+// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
+// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
+// CHECK: %[[#TMP:]] = insertvalue %struct.S0 poison, i32 %[[#ID]], 0
+// CHECK: %[[#ARG:]] = alloca %struct.S0, align 8
+// CHECK: store %struct.S0 %[[#TMP]], ptr %[[#ARG]], align 4
+// CHECK-DXIL: call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
+// CHECK-SPIRV: call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void main0(S0 p) {}
+
+struct S1 {
+  uint2 a : SV_DispatchThreadID;
+  uint2 b : SV_GroupThreadID;
+};
+
+// CHECK: define void @main1()
+// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
+// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
+// CHECK: %[[#AX_:]] = insertelement <2 x i32> poison, i32 %[[#ID]], i64 0
+// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 1)
+// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 1)
+// CHECK: %[[#AXY:]] = insertelement <2 x i32> %[[#AX_]], i32 %[[#ID]], i64 1
+// CHECK: %[[#S1A_:]] = insertvalue %struct.S1 poison, <2 x i32> %[[#AXY]], 0
+// CHECK-DXIL: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
+// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
+// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK-DXIL: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
+// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
+// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK: %[[#S1AB:]] = insertvalue %struct.S1 %[[#S1A_]], <2 x i32> %[[#ID_XY]], 1
+// CHECK: %[[#ARG:]] = alloca %struct.S1, align 8
+// CHECK: store %struct.S1 %[[#S1AB]], ptr %[[#ARG]], align 8
+// CHECK-DXIL: call void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
+// CHECK-SPIRV: call spir_func void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void main1(S1 p) {}
+
+struct S2C {
+  uint2 b : SV_GroupThreadID;
+};
+
+struct S2 {
+  uint a : SV_DispatchThreadID;
+  S2C child;
+};
+
+// CHECK: define void @main2()
+// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
+// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
+// CHECK: %[[#S2A_:]] = insertvalue %struct.S2 poison, i32 %[[#ID]], 0
+
+// CHECK-DXIL: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
+// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
+// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK-DXIL: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
+// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
+// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK: %[[#S2C:]] = insertvalue %struct.S2C poison, <2 x i32> %[[#ID_XY]], 0
+
+// CHECK: %[[#S2AB:]] = insertvalue %struct.S2 %[[#S2A_]], %struct.S2C %[[#S2V:]], 1
+// CHECK: %[[#ARG:]] = alloca %struct.S2, align 8
+// CHECK: store %struct.S2 %[[#S2AB]], ptr %[[#ARG]], align 1
+// CHECK-DXIL: call void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
+// CHECK-SPIRV: call spir_func void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void main2(S2 p) {}
diff --git a/clang/test/ParserHLSL/semantic_parsing.hlsl b/clang/test/ParserHLSL/semantic_parsing.hlsl
index 726dead..bff7bd0 100644
--- a/clang/test/ParserHLSL/semantic_parsing.hlsl
+++ b/clang/test/ParserHLSL/semantic_parsing.hlsl
@@ -12,30 +12,33 @@ void Pony(int GI : SV_IWantAPony) { }
 // expected-note@+1 {{to match this '('}}
 void SuperPony(int GI : 0) { }
 
-// expected-error@+1 {{unknown HLSL semantic '_'}}
+// '_' is a valid C++ identifier.
 void MegaPony(int GI : _) { }
 
-// expected-error@+1 {{unknown HLSL semantic 'A0A'}}
+void GargantuanPony(int GI : _1) { }
+
 void CoolPony(int GI : A0A0) { }
 
-// expected-error@+1 {{unknown HLSL semantic 'A_'}}
 void NicePony(int GI : A_0) { }
 
-// expected-error@+1 {{unknown HLSL semantic 'A'}}
 void CutePony(int GI : A00) { }
 
-// expected-error@+3 {{unknown HLSL semantic 'A'}}
 // expected-error@+2 {{expected ')'}}
 // expected-note@+1 {{to match this '('}}
 void DoublePony(int GI : A00 B) { }
 
-// expected-error@+1 {{unknown HLSL semantic 'é'}}
-void BigPony(int GI : é) { }
+// Unicode can be used:
+// https://timsong-cpp.github.io/cppwp/n3337/charname.allowed
+void FrenchPony(int GI : garçon_de_café) { }
+void UnicodePony(int GI : ℮) { }
+
+// Since P1949, it seems emojis are not allowed, even if they are in the
+// range mentioned in N3337.
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html // expected-error@+2 {{unexpected character <U+1F60A>}} // expected-error@+1 {{expected HLSL Semantic identifier}} void UTFPony(int GI : 😊) { } -// expected-error@+2 {{character <U+1F60A> not allowed in an identifier}} -// expected-error@+1 {{unknown HLSL semantic 'PonyWithA😊'}} +// expected-error@+1 {{character <U+1F60A> not allowed in an identifier}} void SmilingPony(int GI : PonyWithA😊) { } diff --git a/clang/test/SemaCXX/dependent-switch-case.cpp b/clang/test/SemaCXX/dependent-switch-case.cpp new file mode 100644 index 0000000..bbeab3a --- /dev/null +++ b/clang/test/SemaCXX/dependent-switch-case.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -std=c++20 %s -verify +// RUN: %clang_cc1 -std=c++20 %s -verify -fexperimental-new-constant-interpreter + +constexpr bool e(int){switch(0)0=0:return t(;} // expected-error {{expression is not assignable}} \ + // expected-error {{expected 'case' keyword before expression}} \ + // expected-error {{expected expression}} diff --git a/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl new file mode 100644 index 0000000..fdba6f6 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -hlsl-entry main -verify %s + +typedef float t_f : SEMANTIC; // expected-warning{{'SEMANTIC' attribute only applies to parameters, non-static data members, and functions}} + +struct semantic_on_struct : SEMANTIC { // expected-error{{expected class name}} + float a; +}; + +struct s_fields_multiple_semantics { + float a : semantic_a : semantic_c; // expected-error{{use of undeclared identifier 'semantic_c'}} + float b : semantic_b; +}; + +[numthreads(1, 1, 1)] +void main() { + float a : SEM_A; // expected-warning{{'SEM_A' attribute only applies to parameters, non-static data members, and functions}} +} diff --git a/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl new file mode 100644 index 0000000..1e6bae4 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -hlsl-entry CSMain -x hlsl -finclude-default-header -ast-dump -o - %s | FileCheck %s + +struct s_fields { + float a : semantic_a; + float b : semantic_b; +// CHECK: |-CXXRecordDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-3]]:8 struct s_fields definition +// CHECK: | |-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 a 'float' +// CHECK: | | `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} <col:13> +// CHECK: | `-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 b 'float' +// CHECK: | `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} <col:13> +}; + +float fn_foo1(float a : a, float b : b) : sem_ret { return 1.0f; } +// CHECK: |-FunctionDecl {{.*}} <{{.*}}> col:7 fn_foo1 'float (float, float)' +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float' +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}> +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float' +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}> +// CHECK-NEXT: | |-CompoundStmt {{.*}} <{{.*}}> +// CHECK-NEXT: | | `-ReturnStmt {{.*}} <{{.*}}> +// CHECK-NEXT: | | `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00 +// CHECK-NEXT: | `-HLSLUserSemanticAttr {{.*}} <{{.*}}> +float fn_foo2(float a : a, float b : b) : sem_ret : also_ret { return 1.0f; } +// CHECK: `-FunctionDecl 
{{.*}} <{{.*}}> col:7 fn_foo2 'float (float, float)' +// CHECK-NEXT: |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float' +// CHECK-NEXT: | `-HLSLUserSemanticAttr {{.*}} <{{.*}}> +// CHECK-NEXT: |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float' +// CHECK-NEXT: | `-HLSLUserSemanticAttr {{.*}} <{{.*}}> +// CHECK-NEXT: |-CompoundStmt {{.*}} <{{.*}}> +// CHECK-NEXT: | `-ReturnStmt {{.*}} <{{.*}}> +// CHECK-NEXT: | `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00 +// CHECK-NEXT: |-HLSLUserSemanticAttr {{.*}} <{{.*}}> +// CHECK-NEXT: `-HLSLUserSemanticAttr {{.*}} <{{.*}}> diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp index 9692d6e..3fcb558 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp +++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp @@ -1179,6 +1179,12 @@ TEST_P(ASTMatchersTest, PredefinedExpr) { has(stringLiteral())))); } +TEST_P(ASTMatchersTest, FileScopeAsmDecl) { + EXPECT_TRUE(matches("__asm(\"nop\");", fileScopeAsmDecl())); + EXPECT_TRUE( + notMatches("void f() { __asm(\"mov al, 2\"); }", fileScopeAsmDecl())); +} + TEST_P(ASTMatchersTest, AsmStatement) { EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt())); } @@ -2442,7 +2448,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) { "int main() {" " int a;" " f(a);" - "}", matcher)); + "}", + matcher)); EXPECT_FALSE(matches("template <class ...T> void f(T &...args) {" " [...args = args] () mutable {" " }();" @@ -2450,7 +2457,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) { "int main() {" " int a;" " f(a);" - "}", matcher)); + "}", + matcher)); } TEST_P(ASTMatchersTest, IsDerivedFromRecursion) { @@ -2628,7 +2636,7 @@ TEST(ASTMatchersTestObjC, ObjCStringLiteral) { " [Test someFunction:@\"Ola!\"]; " "}\n" "@end "; - EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral())); + EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral())); } TEST(ASTMatchersTestObjC, ObjCDecls) { diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp index e544c89..3b18aa83 100644 --- a/clang/unittests/Support/TimeProfilerTest.cpp +++ b/clang/unittests/Support/TimeProfilerTest.cpp @@ -186,7 +186,8 @@ std::string buildTraceGraph(StringRef Json) { } // namespace -TEST(TimeProfilerTest, ConstantEvaluationCxx20) { +// FIXME: Flaky test. See https://github.com/llvm/llvm-project/pull/138613 +TEST(TimeProfilerTest, DISABLED_ConstantEvaluationCxx20) { std::string Code = R"( void print(double value); diff --git a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c index 08a04fc..fc83b21 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c +++ b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c @@ -3,12 +3,12 @@ // Default compiler instrumentation works with any shadow base (dynamic or fixed). // RUN: %clang_hwasan %s -o %t // RUN: %run %t -// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) // RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t // // If -hwasan-mapping-offset is set, then the fixed_shadow_base needs to match. 
// RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=263878495698944 -o %t -// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) // RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t // RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=4398046511104 -o %t @@ -26,6 +26,8 @@ // // UNSUPPORTED: android +// CHECK: FATAL: HWAddressSanitizer: Shadow range {{.*}} is not available + #include <assert.h> #include <sanitizer/allocator_interface.h> #include <sanitizer/hwasan_interface.h> diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h index 7ada223..dc5e62b 100644 --- a/libc/src/__support/CPP/type_traits/is_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_destructible.h @@ -15,6 +15,7 @@ #include "src/__support/CPP/type_traits/remove_all_extents.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/CPP/type_traits/type_identity.h" +#include "src/__support/CPP/utility/declval.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" diff --git a/libc/startup/baremetal/arm/start.cpp b/libc/startup/baremetal/arm/start.cpp index c089a14..4740067 100644 --- a/libc/startup/baremetal/arm/start.cpp +++ b/libc/startup/baremetal/arm/start.cpp @@ -131,6 +131,32 @@ namespace LIBC_NAMESPACE_DECL { __arm_wsr("CPSR_c", 0x13); // SVC #endif +#ifdef __ARM_FP +// Enable FPU +#if __ARM_ARCH_PROFILE == 'M' + // Based on + // https://developer.arm.com/documentation/dui0646/c/Cortex-M7-Peripherals/Floating-Point-Unit/Enabling-the-FPU + // Set CPACR cp10 and cp11 + auto cpacr = (volatile uint32_t *const)0xE000ED88; + *cpacr |= (0xF << 20); + __dsb(0xF); + __isb(0xF); +#elif __ARM_ARCH_PROFILE == 'A' || __ARM_ARCH_PROFILE == 'R' + // Based on + // https://developer.arm.com/documentation/dui0472/m/Compiler-Coding-Practices/Enabling-NEON-and-FPU-for-bare-metal + // Set CPACR cp10 and cp11 + uint32_t cpacr = __arm_rsr("p15:0:c1:c0:2"); + cpacr |= (0xF << 20); + __arm_wsr("p15:0:c1:c0:2", cpacr); + __isb(0xF); + // Set FPEXC.EN + uint32_t fpexc; + __asm__ __volatile__("vmrs %0, FPEXC" : "=r"(fpexc) : :); + fpexc |= (1 << 30); + __asm__ __volatile__("vmsr FPEXC, %0" : : "r"(fpexc) :); +#endif +#endif + // Perform the equivalent of scatterloading LIBC_NAMESPACE::memcpy(__data_start, __data_source, reinterpret_cast<uintptr_t>(__data_size)); diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 7499613..9ecd390 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -241,6 +241,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate. * - ``SPV_KHR_maximal_reconvergence`` - Adds execution mode and capability to enable maximal reconvergence. + * - ``SPV_ALTERA_blocking_pipes`` + - Adds new pipe read and write functions that have blocking semantics instead of the non-blocking semantics of the existing pipe read/write functions. 
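To ground what "blocking semantics" means for this extension, here is a minimal OpenCL C sketch (an editorial illustration, not part of this patch): with the standard non-blocking builtins a kernel must retry until the operation succeeds, and that retry loop is exactly what the extension's blocking read/write absorbs into a single call. The blocking builtin and instruction names themselves are defined by the SPV_ALTERA_blocking_pipes specification and are deliberately not guessed at here.

```c
// Standard OpenCL C 2.0 non-blocking pipe write: write_pipe returns 0 on
// success and a negative value when the pipe is full and nothing was written.
kernel void producer(global const int *src, write_only pipe int out_pipe) {
  int v = src[get_global_id(0)];
  // Busy-wait until space is available; a blocking write performs this
  // wait inside the pipe operation itself.
  while (write_pipe(out_pipe, &v) != 0)
    ;
}
```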
 SPIR-V representation in LLVM IR
================================
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 221d8f1..f585257 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1331,8 +1331,8 @@ public:
     bool SplitDst =
         TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
         TargetLowering::TypeSplitVector;
-    if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
-        DstVTy->getElementCount().isVector()) {
+    if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isKnownEven() &&
+        DstVTy->getElementCount().isKnownEven()) {
       Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
       Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
       const T *TTI = thisT();
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 268025e7..9d6038d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -297,6 +297,10 @@ private:
   /// \pre \p U is a call instruction.
   bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateIntrinsic(
+      const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+      const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+
   /// When an invoke or a cleanupret unwinds to the next EH pad, there are
   /// many places it could ultimately go. In the IR, we have a single unwind
   /// destination, but in the machine CFG, we enumerate all the possible blocks.
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 9924b90..d7db935 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -176,4 +176,10 @@ def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, l
 def int_dx_group_memory_barrier_with_group_sync
     : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>;
+
+def int_dx_load_input
+    : DefaultAttrsIntrinsic<[llvm_any_ty],
+                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i8_ty,
+                             llvm_i32_ty],
+                            [IntrConvergent]>;
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 4fd2204..be1b51f 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2821,20 +2821,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   if (translateKnownIntrinsic(CI, ID, MIRBuilder))
     return true;
 
+  TargetLowering::IntrinsicInfo Info;
+  bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID);
+
+  return translateIntrinsic(CI, ID, MIRBuilder,
+                            IsTgtMemIntrinsic ? &Info : nullptr);
+}
+
+/// Translate a call to an intrinsic.
+/// If TLI->getTgtMemIntrinsic() returned true for this call, then
+/// TgtMemIntrinsicInfo points to the IntrinsicInfo it populated; otherwise,
+/// this pointer is null.
+bool IRTranslator::translateIntrinsic(
+    const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+    const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
   ArrayRef<Register> ResultRegs;
-  if (!CI.getType()->isVoidTy())
-    ResultRegs = getOrCreateVRegs(CI);
+  if (!CB.getType()->isVoidTy())
+    ResultRegs = getOrCreateVRegs(CB);
 
   // Ignore the callsite attributes. Backend code is most likely not expecting
   // an intrinsic to sometimes have side effects and sometimes not.
MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs); - if (isa<FPMathOperator>(CI)) - MIB->copyIRFlags(CI); + if (isa<FPMathOperator>(CB)) + MIB->copyIRFlags(CB); - for (const auto &Arg : enumerate(CI.args())) { + for (const auto &Arg : enumerate(CB.args())) { // If this is required to be an immediate, don't materialize it in a // register. - if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { + if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) { // imm arguments are more convenient than cimm (and realistically // probably sufficient), so use them. @@ -2863,29 +2877,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. - TargetLowering::IntrinsicInfo Info; - // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { - Align Alignment = Info.align.value_or( - DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); - LLT MemTy = Info.memVT.isSimple() - ? getLLTForMVT(Info.memVT.getSimpleVT()) - : LLT::scalar(Info.memVT.getStoreSizeInBits()); + if (TgtMemIntrinsicInfo) { + const Function *F = CB.getCalledFunction(); + + Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign( + TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext()))); + LLT MemTy = + TgtMemIntrinsicInfo->memVT.isSimple() + ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT()) + : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits()); // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic // didn't yield anything useful. MachinePointerInfo MPI; - if (Info.ptrVal) - MPI = MachinePointerInfo(Info.ptrVal, Info.offset); - else if (Info.fallbackAddressSpace) - MPI = MachinePointerInfo(*Info.fallbackAddressSpace); + if (TgtMemIntrinsicInfo->ptrVal) { + MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal, + TgtMemIntrinsicInfo->offset); + } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) { + MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace); + } MIB.addMemOperand(MF->getMachineMemOperand( - MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(), - /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder)); + MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(), + /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid, + TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder)); } - if (CI.isConvergent()) { - if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) { + if (CB.isConvergent()) { + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { auto *Token = Bundle->Inputs[0].get(); Register TokenReg = getOrCreateVReg(*Token); MIB.addUse(TokenReg, RegState::Implicit); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index fa0c899..9961c98 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) { // Update successor info. 
addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne()); - for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) { - BasicBlock *Dest = I.getIndirectDest(i); + for (BasicBlock *Dest : I.getIndirectDests()) { MachineBasicBlock *Target = FuncInfo.getMBB(Dest); Target->setIsInlineAsmBrIndirectTarget(); // If we introduce a type of asm goto statement that is permitted to use an @@ -5313,18 +5312,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { DAG.setRoot(OutChain); } -/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC -/// node. -void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, - unsigned Intrinsic) { - // Ignore the callsite's attributes. A specific call site may be marked with - // readnone, but the lowering code will expect the chain based on the - // definition. +/// Check if this intrinsic call depends on the chain (1st return value) +/// and if it only *loads* memory. +/// Ignore the callsite's attributes. A specific call site may be marked with +/// readnone, but the lowering code will expect the chain based on the +/// definition. +std::pair<bool, bool> +SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) { const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow(); + return {HasChain, OnlyLoad}; +} + +SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Build the operand list. SmallVector<SDValue, 8> Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. @@ -5336,17 +5343,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - // Info is set by getTgtMemIntrinsic - TargetLowering::IntrinsicInfo Info; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, - DAG.getMachineFunction(), - Intrinsic); - // Add the intrinsic ID as an integer operand if it's not a target intrinsic. - if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || - Info.opc == ISD::INTRINSIC_W_CHAIN) - Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(), + if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID || + TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN) + Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Add all operands of the call to the operand list. 
@@ -5369,13 +5369,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } + if (std::optional<OperandBundleUse> Bundle = + I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + Value *Token = Bundle->Inputs[0].get(); + SDValue ConvControlToken = getValue(Token); + assert(Ops.back().getValueType() != MVT::Glue && + "Did not expect another glue node here."); + ConvControlToken = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); + Ops.push_back(ConvControlToken); + } + + return Ops; +} + +SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I, + bool HasChain) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); - SDVTList VTs = DAG.getVTList(ValueVTs); + return DAG.getVTList(ValueVTs); +} + +/// Get an INTRINSIC node for a target intrinsic which does not touch memory. +SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode( + const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops, + const SDVTList &VTs) { + if (!HasChain) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); + if (!IntrinsicVT.isVoidTy()) + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); + return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); +} + +/// Set root, convert return type if necessary and check alignment. +SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I, + bool HasChain, + bool OnlyLoad, + SDValue Result) { + if (HasChain) { + SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1); + if (OnlyLoad) + PendingLoads.push_back(Chain); + else + DAG.setRoot(Chain); + } + + if (I.getType()->isVoidTy()) + return Result; + + if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) { + // Insert `assertalign` node if there's an alignment. + Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); + } else if (!isa<VectorType>(I.getType())) { + Result = lowerRangeToAssertZExt(DAG, I, Result); + } + + return Result; +} + +/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC +/// node. +void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, + unsigned Intrinsic) { + auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I); + + // Info is set by getTgtMemIntrinsic + TargetLowering::IntrinsicInfo Info; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool IsTgtMemIntrinsic = + TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic); + + SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands( + I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr); + SDVTList VTs = getTargetIntrinsicVTList(I, HasChain); // Propagate fast-math-flags from IR to node(s). SDNodeFlags Flags; @@ -5386,19 +5458,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. SDValue Result; - if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - SDValue ConvControlToken = getValue(Token); - assert(Ops.back().getValueType() != MVT::Glue && - "Did not expected another glue node here."); - ConvControlToken = - DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); - Ops.push_back(ConvControlToken); - } - // In some cases, custom collection of operands from CallInst I may be needed. 
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); - if (IsTgtIntrinsic) { + if (IsTgtMemIntrinsic) { // This is target intrinsic that touches memory // // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic @@ -5418,34 +5480,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Info.ssid, Info.order, Info.failureOrder); Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO); - } else if (!HasChain) { - Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); - } else if (!I.getType()->isVoidTy()) { - Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { - Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); + Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs); } - if (HasChain) { - SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); - if (OnlyLoad) - PendingLoads.push_back(Chain); - else - DAG.setRoot(Chain); - } - - if (!I.getType()->isVoidTy()) { - if (!isa<VectorType>(I.getType())) - Result = lowerRangeToAssertZExt(DAG, I, Result); - - MaybeAlign Alignment = I.getRetAlign(); - - // Insert `assertalign` node if there's an alignment. - if (InsertAssertAlign && Alignment) { - Result = - DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); - } - } + Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result); setValue(&I, Result); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 47e19f7..ed63bee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -727,6 +727,17 @@ private: MCSymbol *&BeginLabel); SDValue lowerEndEH(SDValue Chain, const InvokeInst *II, const BasicBlock *EHPadBB, MCSymbol *BeginLabel); + + std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I); + SmallVector<SDValue, 8> getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr); + SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain); + SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain, + ArrayRef<SDValue> Ops, + const SDVTList &VTs); + SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain, + bool OnlyLoad, SDValue Result); }; /// This struct represents the registers (physical or virtual) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d08f9b9..40e6400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -104,7 +105,6 @@ #include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -1174,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); + setTargetDAGCombine(ISD::CTPOP); // In case of strict alignment, avoid an excessive number of byte wide stores. 
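  // The ISD::CTPOP combine registered above is implemented by
  // performCTPOPCombine later in this file: it rewrites
  // ctpop(zext(bitcast(<N x i1> M))) to neg(vecreduce_add(sext M)).
  // Worked example: M = <1,0,1,1> sign-extends to <-1,0,-1,-1>, the add
  // reduction gives -3, and negating recovers ctpop = 3. The sign extension
  // matches the all-ones masks AArch64 compares produce, so a cmgt result
  // can feed saddlv directly.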
MaxStoresPerMemsetOptSize = 8; @@ -11330,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, break; } + // Note: This lowering only overrides NEON for v1i64 and v2i64, where we + // prefer using SVE if available. if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); @@ -17554,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // udot instruction. if (SrcWidth * 4 <= DstWidth) { if (all_of(I->users(), [&](auto *U) { + using namespace llvm::PatternMatch; auto *SingleUser = cast<Instruction>(&*U); if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; @@ -17825,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // into shift / and masks. For the moment we do this just for uitofp (not // zext) to avoid issues with widening instructions. if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) { + using namespace llvm::PatternMatch; return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) && SI->getType()->getScalarSizeInBits() * 4 == SI->user_back()->getType()->getScalarSizeInBits(); @@ -27841,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } +static SDValue performCTPOPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + using namespace llvm::SDPatternMatch; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask)) + SDValue Mask; + if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT MaskVT = Mask.getValueType(); + + if (VT.isVector() || !MaskVT.isFixedLengthVector() || + MaskVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ReduceInVT = + EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount()); + + SDLoc DL(N); + // Sign extend to best fit ZeroOrNegativeOneBooleanContent. + SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask); + SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask); + return DAG.getNegative(NegPopCount, DL, VT); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -28186,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScalarToVectorCombine(N, DCI, DAG); case ISD::SHL: return performSHLCombine(N, DCI, DAG); + case ISD::CTPOP: + return performCTPOPCombine(N, DCI, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 636e31c..bf9de0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (!TII->isAddImmediate(*DeadMI, Reg)) continue; LIS->RemoveMachineInstrFromMaps(*DeadMI); + Register AddReg = DeadMI->getOperand(1).getReg(); DeadMI->eraseFromParent(); + if (AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } } @@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // Loop over the dead AVL values, and delete them now. 
This has // to be outside the above loop to avoid invalidating iterators. for (auto *MI : ToDelete) { + assert(MI->getOpcode() == RISCV::ADDI); + Register AddReg = MI->getOperand(1).getReg(); if (LIS) { LIS->removeInterval(MI->getOperand(0).getReg()); LIS->RemoveMachineInstrFromMaps(*MI); } MI->eraseFromParent(); + if (LIS && AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 56a38bb..b2cbdb2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call, return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); +} + static bool generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, @@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generatePipeInst(Call.get(), MIRBuilder, GR); case SPIRV::PredicatedLoadStore: return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR); + case SPIRV::BlockingPipes: + return generateBlockingPipesInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index c259cce..492a98e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def BlockingPipes : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. 
The information in the record @@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; + +//SPV_ALTERA_blocking_pipes +defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>; +defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>; defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 43b2869..f681b0d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -159,7 +159,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_KHR_maximal_reconvergence", SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, {"SPV_INTEL_kernel_attributes", - SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}, + {"SPV_ALTERA_blocking_pipes", + SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a61351e..03bd61b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr, "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">; def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops), "OpPredicatedStoreINTEL $ptr $object $predicate">; + +//SPV_ALTERA_blocking_pipes +def OpReadPipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; +def OpWritePipeBlockingALTERA :Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index e5ac76c4..af76016 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1885,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability( SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL); break; + case SPIRV::OpReadPipeBlockingALTERA: + case SPIRV::OpWritePipeBlockingALTERA: + if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) { + Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes); + Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA); + } + break; case SPIRV::OpCooperativeMatrixGetElementCoordINTEL: if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix)) report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the " diff --git
a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 4e4e6fb..be88f33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -56,6 +56,13 @@ public: } }; +static cl::list<std::string> SPVAllowUnknownIntrinsics( + "spv-allow-unknown-intrinsics", cl::CommaSeparated, + cl::desc("Emit unknown intrinsics as calls to external functions. A " + "comma-separated input list of intrinsic prefixes must be " + "provided, and only intrinsics carrying a listed prefix get " + "emitted as described."), + cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional); } // namespace char SPIRVPrepareFunctions::ID = 0; @@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { EraseFromParent); Changed = true; break; + default: + if (TM.getTargetTriple().getVendor() == Triple::AMD || + any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) { + if (Prefix.empty()) + return false; + return II->getCalledFunction()->getName().starts_with(Prefix); + })) + Changed |= lowerIntrinsicToFunction(II); + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 1b4b29b..65a8885 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; @@ -611,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>; defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4d44227b3..168e041 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53442,7 +53442,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, } SDValue NewStore = - DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), Align(), St->getMemOperand()->getFlags()); // If there are other uses of StoredVal, replace with a new load of the @@ -54639,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - 
DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8670822..3062e1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1163,10 +1163,10 @@ public: bool opcodeMayReadOrWriteFromMemory() const; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override; + bool usesFirstPartOnly(const VPValue *Op) const override; /// Returns true if this VPInstruction produces a scalar value from a vector, /// e.g. by performing a reduction or extracting a lane. @@ -1393,13 +1393,13 @@ public: return true; } - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1628,7 +1628,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1767,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getCond() && isInvariantCond(); @@ -1833,7 +1833,7 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getOperand(0)) @@ -1870,7 +1870,7 @@ public: void execute(VPTransformState &State) override; - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1884,7 +1884,7 @@ public: } /// Returns true if the recipe only uses the first part of operand \p Op. 
- bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -1922,14 +1922,14 @@ public: Type *getSourceElementType() const { return SourceElementTy; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -2110,7 +2110,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // The recipe creates its own wide start value, so it only requests the @@ -2325,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); @@ -2399,7 +2399,7 @@ public: bool isInLoop() const { return IsInLoop; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); @@ -2468,13 +2468,13 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. return all_of(users(), - [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); + [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } }; @@ -2562,7 +2562,7 @@ public: VPCostContext &Ctx) const override; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + bool usesFirstLaneOnly(const VPValue *Op) const override = 0; /// Returns the number of stored operands of this interleave group. Returns 0 /// for load interleave groups. @@ -2608,7 +2608,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); @@ -2656,7 +2656,7 @@ public: #endif /// The recipe only uses the first lane of the address, and EVL operand. 
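  /// (Stored values are excluded by the check below: all of their lanes are
  /// demanded, so only the shared address and the scalar EVL qualify.)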
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || @@ -2862,7 +2862,7 @@ public: VPValue *getEVL() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getEVL(); @@ -2924,7 +2924,7 @@ public: bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isSingleScalar(); @@ -3212,9 +3212,8 @@ protected: Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - assert(isa<VPVectorEndPointerRecipe>(getAddr()) || - !Reverse && - "Reversed acccess without VPVectorEndPointerRecipe address?"); + assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: @@ -3300,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive loads operations only demand the first lane of @@ -3341,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened loads only demand the first lane of EVL and consecutive loads @@ -3382,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive stores only demand the first lane of their address, @@ -3425,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getEVL()) { @@ -3509,14 +3508,14 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3591,7 +3590,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3701,7 +3700,7 @@ public: VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3766,7 +3765,7 @@ public: VPValue *getStepValue() const { return getOperand(1); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f792d0a..80cd112 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1276,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { } } -bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) return vputils::onlyFirstLaneUsed(this); @@ -1325,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { llvm_unreachable("switch should return"); } -bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) return vputils::onlyFirstPartUsed(this); @@ -1692,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (!VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); Args.push_back(Arg); } @@ -1761,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.TTI)) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), State.TTI)) TysForDecl.push_back(Arg->getType()); @@ -1843,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } -bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const 
VPValue *Op) const { +bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return all_of(enumerate(operands()), [this, &Op](const auto &X) { auto [Idx, V] = X; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 82bf79e..48bd697 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -204,7 +204,7 @@ static bool sinkScalarOperands(VPlan &Plan) { return cast<VPRecipeBase>(U)->getParent() != SinkTo; }); if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { - return !U->onlyFirstLaneUsed(SinkCandidate); + return !U->usesFirstLaneOnly(SinkCandidate); })) continue; bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d6a0028..d4b8b72b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Users that only demand the first lane can use the definition for lane /// 0. DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(DefR); + return U.usesFirstLaneOnly(DefR); }); // Update each build vector user that currently has DefR as its only diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6380d3..e22c5df 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); }); } bool vputils::onlyFirstPartUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); }); } bool vputils::onlyScalarValuesUsed(const VPValue *Def) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fca..5da7463 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -274,12 +274,12 @@ public: virtual bool usesScalars(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return onlyFirstLaneUsed(Op); + return usesFirstLaneOnly(Op); } /// Returns true if the VPUser only uses the first lane of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + virtual bool usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; @@ -287,7 +287,7 @@ public: /// Returns true if the VPUser only uses the first part of operand \p Op. /// Conservatively returns false. 
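  /// (vputils::onlyFirstPartUsed in VPlanUtils.cpp lifts this per-user query
  /// to a whole VPValue by checking it on each of the value's users.)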
- virtual bool onlyFirstPartUsed(const VPValue *Op) const { + virtual bool usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll new file mode 100644 index 0000000..e784ead --- /dev/null +++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll @@ -0,0 +1,315 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v1i64: 
+; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt 
<4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i32 @non_vmask_popcount_1(half %a) { +; CHECK-LABEL: non_vmask_popcount_1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %t1 = bitcast half %a to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @non_vmask_popcount_2(<8 x i16> %a) { +; CHECK-LABEL: non_vmask_popcount_2: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: and w8, w8, #0x3 +; CHECK-NEXT: bfi w8, w9, #2, #2 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: bfi w8, w10, #4, #2 +; CHECK-NEXT: umov w10, v0.b[4] +; CHECK-NEXT: bfi w8, w9, #6, #2 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: bfi w8, w10, #8, #2 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: bfi w8, w9, #10, #2 +; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: bfi w8, w10, #12, #2 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %mask = trunc <8 x i16> %a to <8 x i2> + %t1 = bitcast <8 x i2> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll new file mode 100644 index 0000000..6696f94 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE + +define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: smax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def 
$z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: smin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: umax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: umin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){ +; CHECK-LABEL: smax_v1i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt d2, d0, d1 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v1i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl1 +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <1 x i64> @llvm.smax.v2i64(<1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %0 +} + +; This is legal for Neon, so this should use the Neon smax. 
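+; (smax on v4i32 is already legal for NEON, so it never reaches the custom
+; LowerMinMax path; the OverrideNEON=true change earlier in this patch only
+; redirects the 64-bit-element cases above to SVE.)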
+define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: smax_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v4i32: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-SVE-NEXT: ret +entry: + %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 20034b6..b6e29cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -863,3 +863,19 @@ entry: i64 2) ret <vscale x 1 x double> %2 } + +; The two vsetvlis will be coalesced so the add will be made dead and +; removed. Make sure we shrink the live interval of %x. +define void @non_li_addi(i64 %x, ptr %p) { +; CHECK-LABEL: non_li_addi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: ret +entry: + %add = add i64 %x, 1 + %0 = tail call i64 @llvm.riscv.vsetvli(i64 %add, i64 3, i64 0) + %1 = call <vscale x 8 x i8> @llvm.riscv.vle(<vscale x 8 x i8> poison, ptr %p, i64 %0) + %2 = tail call i64 @llvm.riscv.vsetvli(i64 1, i64 3, i64 0) + %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir index fdd30c9..f9929c9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir @@ -104,6 +104,10 @@ ret void } + define void @non_li_addi() { + ret void + } + declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1 declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4 @@ -664,3 +668,23 @@ body: | bb.2: $x10 = COPY %vl PseudoRET implicit killed $x10 +... +--- +# The two vsetvlis will be coalesced so the ADDI will be made dead and removed. +# Make sure we shrink the live interval of %0. 
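+# (Erasing the ADDI removes the last use of %0, so RISCVInsertVSETVLI now
+# recomputes its live interval with shrinkToUses; otherwise the interval
+# would still cover the erased instruction's slot.)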
+name: non_li_addi +tracksRegLiveness: true +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: non_li_addi + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: dead [[PseudoVSETIVLI:%[0-9]+]]:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoRET + %0:gpr = COPY $x10 + %1:gprnox0 = ADDI %0, 1 + %2:gprnox0 = PseudoVSETVLI %1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype + %3:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype + PseudoRET diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll new file mode 100644 index 0000000..677291a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll @@ -0,0 +1,36 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; The test checks command-line option which allows to represent unknown +; intrinsics as external function calls in SPIR-V. 
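+; Prefix matching uses starts_with on the intrinsic name: "llvm." covers both
+; calls below, while "notllvm" or "llvm.some.custom" leaves
+; llvm.readcyclecounter unhandled and llc still fails to legalize it. Empty
+; prefixes are ignored, and the spirv64-amd-amdhsa triple enables this
+; lowering even without the flag.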
+ +; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo) + +; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter" +; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic" +; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import +; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import +; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64 +; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]] +; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]] +; CHECK: OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]] + +define spir_func void @foo() { +entry: +; TODO: if and when the SPIR-V learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic + %0 = call i64 @llvm.readcyclecounter() + %1 = call i64 @llvm.some.custom.intrinsic() + ret void +} + +declare i64 @llvm.readcyclecounter() +declare i64 @llvm.some.custom.intrinsic() diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll new file mode 100644 index 0000000..f6b6115 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll @@ -0,0 +1,98 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %} + +%opencl.pipe_ro_t = type opaque +%opencl.pipe_wo_t = type opaque + +; CHECK-SPIRV: OpCapability BlockingPipesALTERA +; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes" +; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly +; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly +; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]] + +define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { 
+entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) +; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) + +define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) { +entry: + %_Data.addr = alloca ptr addrspace(4), align 8 + %_WPipe = alloca target("spirv.Pipe", 1), align 8 + %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4) + %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)* + store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8 + %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr + %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8 + %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8 + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), 
ptr addrspace(4), i32, i32) +
\ No newline at end of file diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll new file mode 100644 index 0000000..21b25d8 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA + +; Ensure reloads are after narrowed i512 -> i32 store +define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { +; POSTRA-LABEL: PR166744: +; POSTRA: # %bb.0: +; POSTRA-NEXT: movl $1029, %eax # imm = 0x405 +; POSTRA-NEXT: shlxl %esi, %edx, %edx +; POSTRA-NEXT: bextrl %eax, %esi, %eax +; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx +; POSTRA-NEXT: btrl %esi, %ecx +; POSTRA-NEXT: orl %ecx, %edx +; POSTRA-NEXT: movl %edx, (%rdi,%rax,4) +; POSTRA-NEXT: movq 16(%rdi), %rax +; POSTRA-NEXT: movq (%rdi), %rcx +; POSTRA-NEXT: movq 24(%rdi), %rdx +; POSTRA-NEXT: movq 8(%rdi), %rsi +; POSTRA-NEXT: orq 56(%rdi), %rdx +; POSTRA-NEXT: orq 40(%rdi), %rsi +; POSTRA-NEXT: orq 48(%rdi), %rax +; POSTRA-NEXT: orq 32(%rdi), %rcx +; POSTRA-NEXT: orq %rdx, %rsi +; POSTRA-NEXT: orq %rax, %rcx +; POSTRA-NEXT: orq %rsi, %rcx +; POSTRA-NEXT: setne %al +; POSTRA-NEXT: retq +; +; NOPOSTRA-LABEL: PR166744: +; NOPOSTRA: # %bb.0: +; NOPOSTRA-NEXT: movl %esi, %eax +; NOPOSTRA-NEXT: shrl $3, %eax +; NOPOSTRA-NEXT: andl $60, %eax +; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx +; NOPOSTRA-NEXT: btrl %esi, %ecx +; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx +; NOPOSTRA-NEXT: orl %ecx, %edx +; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax) +; NOPOSTRA-NEXT: movq 16(%rdi), %rax +; NOPOSTRA-NEXT: movq (%rdi), %rcx +; NOPOSTRA-NEXT: movq 8(%rdi), %rdx +; NOPOSTRA-NEXT: movq 24(%rdi), %rsi +; NOPOSTRA-NEXT: orq 56(%rdi), %rsi +; NOPOSTRA-NEXT: orq 40(%rdi), %rdx +; NOPOSTRA-NEXT: orq 48(%rdi), %rax +; NOPOSTRA-NEXT: orq 32(%rdi), %rcx +; NOPOSTRA-NEXT: orq %rsi, %rdx +; NOPOSTRA-NEXT: orq %rax, %rcx +; NOPOSTRA-NEXT: orq %rdx, %rcx +; NOPOSTRA-NEXT: setne %al +; NOPOSTRA-NEXT: retq + %rem = and i64 %idx, 511 + %sh_prom = zext nneg i64 %rem to i512 + %shl = shl nuw i512 1, %sh_prom + %not = xor i512 %shl, -1 + %load = load i512, ptr %v, align 8 + %and = and i512 %load, %not + %conv2 = zext i1 %b to i512 + %shl4 = shl nuw i512 %conv2, %sh_prom + %or = or i512 %and, %shl4 + store i512 %or, ptr %v, align 8 + %cmp = icmp ne i512 %or, 0 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll new file mode 100644 index 0000000..921bcf0 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll @@ -0,0 +1,11 @@ +; RUN: opt -passes=vector-combine %s -S -o - | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define <vscale x 4 x i16> @interleave2_same_const_splat_nxv4i16() { +;CHECK-LABEL: @interleave2_same_const_splat_nxv4i16( +;CHECK: call <vscale x 4 x i16> @llvm.vector.interleave2 +;CHECK: ret <vscale x 4 x i16> %retval + %retval = call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3)) + ret <vscale x 4 x i16> %retval +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll 
b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000..2926371 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new file mode 100644 index 0000000..88cb03e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000..7167bcf --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected 
b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000..1ba920d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000..6fc57b5 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000..0f8aaa54 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,8 @@ +# REQUIRES: x86-registered-target +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. 
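+## For reference: in the x86_asm_mir_same_prefix.ll input above, both RUN lines +## invoke FileCheck with --check-prefix=CHECK (one on plain llc output, one on +## -stop-after=finalize-isel output), so an individual CHECK line cannot be +## attributed to a single output kind.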
+ +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 2dad16a..baa0377 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -605,6 +605,7 @@ TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$') TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)") MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") +STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py index 24bb8b3..01ee0e1 100644 --- a/llvm/utils/UpdateTestChecks/mir.py +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -163,13 +163,15 @@ def add_mir_checks_for_function( print_fixed_stack, first_check_is_next, at_the_function_name, + check_indent=None, ): printed_prefixes = set() for run in run_list: for prefix in run[0]: if prefix in printed_prefixes: break - if not func_dict[prefix][func_name]: + # func_info can be empty if there was a prefix conflict. + if not func_dict[prefix].get(func_name): continue if printed_prefixes: # Add some space between different check prefixes. @@ -185,6 +187,7 @@ def add_mir_checks_for_function( func_dict[prefix][func_name], print_fixed_stack, first_check_is_next, + check_indent, ) break else: @@ -204,6 +207,7 @@ def add_mir_check_lines( func_info, print_fixed_stack, first_check_is_next, + check_indent=None, ): func_body = str(func_info).splitlines() if single_bb: @@ -220,7 +224,10 @@ def add_mir_check_lines( first_line = func_body[0] indent = len(first_line) - len(first_line.lstrip(" ")) # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) + if check_indent is not None: + check = "{}; {}".format(check_indent, prefix) + else: + check = "{:>{}}; {}".format("", indent, prefix) output_lines.append("{}-LABEL: name: {}".format(check, func_name)) diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py index 8c57e75..98864be 100755 --- a/llvm/utils/update_llc_test_checks.py +++ b/llvm/utils/update_llc_test_checks.py @@ -15,7 +15,7 @@ import argparse import os # Used to advertise this file's name ("autogenerated_note"). import sys -from UpdateTestChecks import common +from UpdateTestChecks import common, mir # llc is the only llc-like in the LLVM tree but downstream forks can add # additional ones here if they have them. @@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo): break run_list = [] + mir_run_list = [] for l in ti.run_lines: if "|" not in l: common.warn("Skipping unparsable RUN line: " + l) @@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo): if m: march_in_cmd = m.groups()[0] + target_list = run_list m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd) if m and m.groups()[0] == "isel": from UpdateTestChecks import isel as output_type + elif not m and common.STOP_PASS_RE.search(llc_cmd): + # MIR output mode. If -debug-only is present assume + # the debug output is the main point of interest. 
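+ # For example, a RUN line such as + # llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + # (as in the x86_asm_mir_mixed.ll input above) matches STOP_PASS_RE, so its + # prefixes are collected in mir_run_list and emitted as MIR-LABEL/MIR-NEXT checks.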
+ target_list = mir_run_list else: from UpdateTestChecks import asm as output_type @@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo): # FIXME: We should use multiple check prefixes to common check lines. For # now, we just ignore all but the last. - run_list.append( + target_list.append( ( check_prefixes, llc_tool, @@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo): ginfo=ginfo, ) - for ( - prefixes, - llc_tool, - llc_args, - preprocess_cmd, - triple_in_cmd, - march_in_cmd, - ) in run_list: + # Dictionary to store MIR function bodies separately + mir_func_dict = {} + for run_tuple, is_mir in [(run, False) for run in run_list] + [ + (run, True) for run in mir_run_list + ]: + ( + prefixes, + llc_tool, + llc_args, + preprocess_cmd, + triple_in_cmd, + march_in_cmd, + ) = run_tuple + common.debug("Extracted LLC cmd:", llc_tool, llc_args) common.debug("Extracted FileCheck prefixes:", str(prefixes)) @@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo): if not triple: triple = common.get_triple_from_march(march_in_cmd) - scrubber, function_re = output_type.get_run_handler(triple) - if 0 == builder.process_run_line( - function_re, scrubber, raw_tool_output, prefixes - ): - common.warn( - "Couldn't match any function. Possibly the wrong target triple has been provided" + if is_mir: + # MIR output mode + common.debug("Detected MIR output mode for prefixes:", str(prefixes)) + for prefix in prefixes: + if prefix not in mir_func_dict: + mir_func_dict[prefix] = {} + + mir.build_function_info_dictionary( + ti.path, + raw_tool_output, + triple, + prefixes, + mir_func_dict, + ti.args.verbose, ) - builder.processed_prefixes(prefixes) + else: + # ASM output mode + scrubber, function_re = output_type.get_run_handler(triple) + if 0 == builder.process_run_line( + function_re, scrubber, raw_tool_output, prefixes + ): + common.warn( + "Couldn't match any function. Possibly the wrong target triple has been provided" + ) + builder.processed_prefixes(prefixes) func_dict = builder.finish_and_get_func_dict() + + # Check for conflicts: same prefix used for both ASM and MIR + conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys()) + if conflicting_prefixes: + common.warn( + "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format( + ", ".join(sorted(conflicting_prefixes)) + ), + test_file=ti.path, + ) + for prefix in conflicting_prefixes: + mir_func_dict[prefix] = {} + func_dict[prefix] = {} + global_vars_seen_dict = {} is_in_function = False is_in_function_start = False func_name = None prefix_set = set([prefix for p in run_list for prefix in p[0]]) + prefix_set.update([prefix for p in mir_run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) output_lines = [] @@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo): is_filtered=builder.is_filtered(), ) ) + + # Also add MIR checks if we have them for this function + if mir_run_list and func_name: + mir.add_mir_checks_for_function( + ti.path, + output_lines, + mir_run_list, + mir_func_dict, + func_name, + single_bb=False, # Don't skip basic block labels. + print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it). + first_check_is_next=False, # First check is LABEL, not NEXT. + at_the_function_name=False, # Use "name:" not "@name". + check_indent="", # No indentation for IR files (not MIR files). 
+ ) + is_in_function_start = False if is_in_function: diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 8728e66..70d424b 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -21,13 +21,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/IR/OpBase.td" include "mlir/IR/RegionKindInterface.td" -// This is roughly similar to OpFoldResult assuming the handle produces a single -// value in the payload IR. -def TransformAnyParamTypeOrAnyHandle : Type< - Or<[TransformHandleTypeInterface.predicate, - TransformParamTypeInterface.predicate]>, - "transform any param type or any handle type">; - //===----------------------------------------------------------------------===// // Apply...PatternsOp //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 48978eb..de07f50 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -33,22 +33,14 @@ namespace linalg { //===----------------------------------------------------------------------===// // Utilities for inferring various semantics properties of Linalg ops. //===----------------------------------------------------------------------===// -/// Shell function to compute the Destination Permutation of PackOp -/// This function uses the helper function `computePackUnPackPerm` to get -/// the permutation vector. Only major difference between UnPack and Pack is -/// that packOp uses destination rank whereas unpack Uses source rank. -SmallVector<int64_t> getPackInverseDestPerm(linalg::PackOp packOp); - -/// Shell function to compute the Source Permutation of unPackOp. -/// This function, like the getPackInverseDestPerm uses the helper function -/// computePackUnPackPerm` to get the permutation vector. -/// Only major difference between UnPack and Pack is that packOp uses -/// destination rank whereas unpack Uses source rank. -SmallVector<int64_t> getUnPackInverseSrcPerm(linalg::UnPackOp unpackOp); - -/// Shell function to compute the Source rank permutation for unpackOp -/// Unpack requires some packing metadata data information, so created -/// another function where this value is passed by reference. + +/// Compute inverse permutation for the destination tensor (i.e. in the packed +/// domain). +SmallVector<int64_t> getPackInverseDestPerm(linalg::PackOp packOp, + PackingMetadata &metadata); + +/// Compute inverse permutation for the source tensor (i.e. in the packed +/// domain). 
SmallVector<int64_t> getUnPackInverseSrcPerm(linalg::UnPackOp, PackingMetadata &metadata); diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 0e42d08..b628f1a 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -395,7 +395,7 @@ def SPV_INTEL_fpga_buffer_location : I32EnumAttrCase<"SPV_INTEL_fp def SPV_INTEL_arbitrary_precision_fixed_point : I32EnumAttrCase<"SPV_INTEL_arbitrary_precision_fixed_point", 4019>; def SPV_INTEL_usm_storage_classes : I32EnumAttrCase<"SPV_INTEL_usm_storage_classes", 4020>; def SPV_INTEL_io_pipes : I32EnumAttrCase<"SPV_INTEL_io_pipes", 4021>; -def SPV_INTEL_blocking_pipes : I32EnumAttrCase<"SPV_INTEL_blocking_pipes", 4022>; +def SPV_ALTERA_blocking_pipes : I32EnumAttrCase<"SPV_ALTERA_blocking_pipes", 4022>; def SPV_INTEL_fpga_reg : I32EnumAttrCase<"SPV_INTEL_fpga_reg", 4023>; def SPV_INTEL_long_constant_composite : I32EnumAttrCase<"SPV_INTEL_long_constant_composite", 4024>; def SPV_INTEL_optnone : I32EnumAttrCase<"SPV_INTEL_optnone", 4025>; @@ -465,7 +465,7 @@ def SPIRV_ExtensionAttr : SPV_INTEL_kernel_attributes, SPV_INTEL_fpga_memory_accesses, SPV_INTEL_fpga_cluster_attributes, SPV_INTEL_loop_fuse, SPV_INTEL_fpga_buffer_location, SPV_INTEL_arbitrary_precision_fixed_point, - SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_INTEL_blocking_pipes, + SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_ALTERA_blocking_pipes, SPV_INTEL_fpga_reg, SPV_INTEL_long_constant_composite, SPV_INTEL_optnone, SPV_INTEL_debug_module, SPV_INTEL_fp_fast_math_mode, SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier, @@ -807,9 +807,9 @@ def SPIRV_C_IOPipesINTEL : I32EnumAttrCase<"IOPip Extension<[SPV_INTEL_io_pipes]> ]; } -def SPIRV_C_BlockingPipesINTEL : I32EnumAttrCase<"BlockingPipesINTEL", 5945> { +def SPIRV_C_BlockingPipesALTERA : I32EnumAttrCase<"BlockingPipesALTERA", 5945> { list<Availability> availability = [ - Extension<[SPV_INTEL_blocking_pipes]> + Extension<[SPV_ALTERA_blocking_pipes]> ]; } def SPIRV_C_FPGARegINTEL : I32EnumAttrCase<"FPGARegINTEL", 5948> { @@ -1519,7 +1519,7 @@ def SPIRV_CapabilityAttr : SPIRV_C_FPGAMemoryAccessesINTEL, SPIRV_C_FPGAClusterAttributesINTEL, SPIRV_C_LoopFuseINTEL, SPIRV_C_MemoryAccessAliasingINTEL, SPIRV_C_FPGABufferLocationINTEL, SPIRV_C_ArbitraryPrecisionFixedPointINTEL, - SPIRV_C_USMStorageClassesINTEL, SPIRV_C_IOPipesINTEL, SPIRV_C_BlockingPipesINTEL, + SPIRV_C_USMStorageClassesINTEL, SPIRV_C_IOPipesINTEL, SPIRV_C_BlockingPipesALTERA, SPIRV_C_FPGARegINTEL, SPIRV_C_DotProductInputAll, SPIRV_C_DotProductInput4x8BitPacked, SPIRV_C_DotProduct, SPIRV_C_RayCullMaskKHR, SPIRV_C_CooperativeMatrixKHR, SPIRV_C_ReplicatedCompositesEXT, diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td index 2d9a26e..3e3fff4 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td @@ -103,4 +103,9 @@ def TransformAnyHandle : Type< TransformValueHandleTypeInterface.predicate]>, "transform operation or value handle">; +def TransformAnyParamTypeOrAnyHandle : Type< + Or<[TransformHandleTypeInterface.predicate, + TransformParamTypeInterface.predicate]>, + "transform any param type or any handle type">; + #endif // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMTYPES diff --git a/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt index 
9f57627..cb1e9d0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/TransformOps/CMakeLists.txt new file mode 100644 index 0000000..59246064 --- /dev/null +++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS XeGPUTransformOps.td) +mlir_tablegen(XeGPUTransformOps.h.inc -gen-op-decls) +mlir_tablegen(XeGPUTransformOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRXeGPUTransformOpsIncGen) + +add_mlir_doc(XeGPUTransformOps XeGPUTransformOps Dialects/ -gen-op-doc) diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h new file mode 100644 index 0000000..3e16d1e --- /dev/null +++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h @@ -0,0 +1,28 @@ +//===- XeGPUTransformOps.h - XeGPU transformation ops -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_XEGPU_TRANSFORMOPS_XEGPUTRANSFORMOPS_H +#define MLIR_DIALECT_XEGPU_TRANSFORMOPS_XEGPUTRANSFORMOPS_H + +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" + +#define GET_OP_CLASSES +#include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h.inc" + +namespace mlir { +class DialectRegistry; + +namespace xegpu { +void registerTransformDialectExtension(DialectRegistry &registry); +} // namespace xegpu +} // namespace mlir + +#endif // MLIR_DIALECT_XEGPU_TRANSFORMOPS_XEGPUTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td new file mode 100644 index 0000000..b985d54 --- /dev/null +++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td @@ -0,0 +1,81 @@ +//===- XeGPUTransformOps.td - XeGPU transformation ops -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef XEGPU_TRANSFORM_OPS +#define XEGPU_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformAttrs.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" +include "mlir/Dialect/Transform/IR/TransformTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" + +def SetDescLayoutOp : Op<Transform_Dialect, "xegpu.set_desc_layout", [ + AttrSizedOperandSegments, + DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, + TransformOpInterface +]> { + + let summary = "Set the xegpu.layout attribute on a xegpu.create_nd_desc op result."; + let description = [{ + Given an `xegpu.create_nd_desc` operation, this transform adds an `xegpu.layout` + attribute to the result tensor descriptor. The layout is defined by the + `sg_layout` and `sg_data` attributes, and the optional `inst_data` attribute. + Returns a handle to the transformed op. + }]; + + let arguments = (ins + TransformHandleTypeInterface : $target, + Variadic<TransformAnyParamTypeOrAnyHandle> : $sg_layout, + Variadic<TransformAnyParamTypeOrAnyHandle> : $sg_data, + Variadic<TransformAnyParamTypeOrAnyHandle> : $inst_data, + DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_layout, + DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_data, + DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_inst_data + ); + + let results = (outs TransformHandleTypeInterface : $transformed); + let builders = [ + OpBuilder<(ins "Value":$target, + "ArrayRef<OpFoldResult>":$mixedSgLayout, + "ArrayRef<OpFoldResult>":$mixedSgData, + "ArrayRef<OpFoldResult>":$mixedInstData + )>, + ]; + + let assemblyFormat = [{ + $target + `sg_layout` `=` custom<DynamicIndexList>($sg_layout, $static_sg_layout) + `sg_data` `=` custom<DynamicIndexList>($sg_data, $static_sg_data) + (`inst_data` `=` custom<DynamicIndexList>($inst_data, $static_inst_data)^)? + attr-dict `:` functional-type(operands, results) + }]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure apply( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::transform::TransformResults &transformResults, + ::mlir::transform::TransformState &state); + + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgLayout() { + Builder b(getContext()); + return getMixedValues(getStaticSgLayout(), getSgLayout(), b); + } + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgData() { + Builder b(getContext()); + return getMixedValues(getStaticSgData(), getSgData(), b); + } + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedInstData() { + Builder b(getContext()); + return getMixedValues(getStaticInstData(), getInstData(), b); + } + }]; +} + +#endif // XEGPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/Support/Timing.h b/mlir/include/mlir/Support/Timing.h index 3d61a0a..50ae847 100644 --- a/mlir/include/mlir/Support/Timing.h +++ b/mlir/include/mlir/Support/Timing.h @@ -473,6 +473,11 @@ void registerDefaultTimingManagerCLOptions(); /// 'registerDefaultTimingManagerOptions' to a `DefaultTimingManager`. void applyDefaultTimingManagerCLOptions(DefaultTimingManager &tm); +/// Create an output strategy for the specified format, to be passed to +/// DefaultTimingManager::setOutput().
+std::unique_ptr<OutputStrategy> +createOutputStrategy(DefaultTimingManager::OutputFormat fmt, raw_ostream &os); + } // namespace mlir #endif // MLIR_SUPPORT_TIMING_H diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index bd25e94..027268c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -232,10 +232,9 @@ FailureOr<LowerPackResult> linalg::lowerPack(RewriterBase &rewriter, // 2. Compute the permutation vector to shuffle packed shape into the shape // before any outer or inner permutations have been applied. - PackingMetadata packingMetadata = computePackingMetadata( - packedTensorType.getRank(), packOp.getInnerDimsPos()); + PackingMetadata packingMetadata; SmallVector<int64_t> packedToStripMinedShapePerm = - getPackInverseDestPerm(packOp); + getPackInverseDestPerm(packOp, packingMetadata); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index cb6199f..19d2d85 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1564,13 +1564,6 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, return success(); } -/// Given a linalg::PackOp, return the `dest` shape before any packing -/// permutations. -static SmallVector<int64_t> getTiledPackShape(linalg::PackOp packOp, - ArrayRef<int64_t> destShape) { - return applyPermutation(destShape, linalg::getPackInverseDestPerm(packOp)); -} - /// Determines whether a mask for xfer_write is trivially "all true" /// /// Given all the inputs required to generate a mask (mask sizes and shapes), @@ -1761,99 +1754,6 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vecToStore, return mlir::vector::maskOperation(builder, write, maskForWrite); } -/// Vectorize linalg::PackOp with (1) static inner_tiles (2) constant -/// padding value and (3) input vector sizes into: -/// -/// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds -/// -/// As in the following example: -/// %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2] -/// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> -/// -/// This pack would be vectorized to: -/// -/// %load = vector.mask %mask { -/// vector.transfer_read %arg0[%c0, %c0, %c0], %cst -/// {in_bounds = [true, true, true]} : -/// tensor<32x7x16xf32>, vector<32x8x16xf32> -/// } : vector<32x8x16xi1> -> vector<32x8x16xf32> -/// %shape_cast = vector.shape_cast %load : vector<32x8x16xf32> -/// to vector<32x4x2x1x16xf32> -/// %transpose = vector.transpose %shape_cast, [0, 1, 3, 4, 2] -/// : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> -/// %write = vector.transfer_write %transpose, -/// %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0] -/// {in_bounds = [true, true, true, true, true]} -/// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> -/// -/// If the (3) input vector sizes are not provided, the vector sizes are -/// determined by the result tensor shape and the `in_bounds` -/// attribute is used instead of masking to mark out-of-bounds accesses. -/// -/// NOTE: The input vector sizes specify the dimensions corresponding to the -/// outer dimensions of the output tensor. The remaining dimensions are -/// computed based on, e.g., the static inner tiles. 
-/// Supporting dynamic inner tiles will require the user to specify the -/// missing vector sizes. This is left as a TODO. -static LogicalResult -vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, - ArrayRef<int64_t> inputVectorSizes, - SmallVectorImpl<Value> &newResults) { - // TODO: Introduce a parent class that will handle the insertion point update. - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(packOp); - - Location loc = packOp.getLoc(); - std::optional<Value> padValue = packOp.getPaddingValue() - ? std::optional(packOp.getPaddingValue()) - : std::nullopt; - - // If the input vector sizes are not provided, then the vector sizes are - // determined by the result tensor shape. In case the vector sizes aren't - // provided, we update the inBounds attribute instead of masking. - bool useInBoundsInsteadOfMasking = false; - if (inputVectorSizes.empty()) { - ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape(); - inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank()); - useInBoundsInsteadOfMasking = true; - } - - // Create masked TransferReadOp. - SmallVector<int64_t> inputShape(inputVectorSizes); - auto innerTiles = packOp.getStaticInnerTiles(); - auto innerDimsPos = packOp.getInnerDimsPos(); - auto outerDimsPerm = packOp.getOuterDimsPerm(); - if (!outerDimsPerm.empty()) - applyPermutationToVector(inputShape, - invertPermutationVector(outerDimsPerm)); - for (auto [idx, size] : enumerate(innerTiles)) - inputShape[innerDimsPos[idx]] *= size; - auto maskedRead = vector::createReadOrMaskedRead( - rewriter, loc, packOp.getSource(), inputShape, padValue, - useInBoundsInsteadOfMasking, - /*inputScalableVecSizes=*/{}); - - // Create ShapeCastOp. - SmallVector<int64_t> destShape(inputVectorSizes); - destShape.append(innerTiles.begin(), innerTiles.end()); - auto tiledPackType = VectorType::get(getTiledPackShape(packOp, destShape), - packOp.getDestType().getElementType()); - auto shapeCastOp = - vector::ShapeCastOp::create(rewriter, loc, tiledPackType, maskedRead); - - // Create TransposeOp. - auto destPermutation = - invertPermutationVector(getPackInverseDestPerm(packOp)); - auto transposeOp = vector::TransposeOp::create( - rewriter, loc, shapeCastOp.getResult(), destPermutation); - - // Create TransferWriteOp. - Operation *write = createWriteOrMaskedWrite( - rewriter, loc, transposeOp.getResult(), packOp.getDest()); - newResults.push_back(write->getResult(0)); - return success(); -} - /// Given the re-associations, "collapses" the input Vector type /// /// This is similar to CollapseShapeOp::inferCollapsedType with two notable @@ -1901,12 +1801,121 @@ static VectorType getCollapsedVecType(VectorType type, return VectorType::get(newShape, type.getElementType(), newScalableFlags); } +/// Vectorize `linalg.pack` as: +/// * xfer_read -> shape_cast -> transpose -> xfer_write +/// +/// The input-vector-sizes specify the _write_ vector sizes (i.e. the vector +/// sizes for the xfer_write operation). This is sufficient to infer the other +/// vector sizes required here. +/// +/// If the vector sizes are not provided: +/// * the vector sizes are determined from the destination tensor static shape. +/// * the inBounds attribute is used instead of masking. 
+/// +/// EXAMPLE (no vector sizes): +/// ``` +/// %pack = tensor.pack %src +/// inner_dims_pos = [2, 1] +/// inner_tiles = [16, 2] +/// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> +/// ``` +/// is vectorized as: +/// ``` +/// %read = vector.transfer_read %src +/// : tensor<32x8x16xf32>, vector<32x8x16xf32> +/// %sc = vector.shape_cast %read +/// : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +/// %tr = vector.transpose %sc, [0, 1, 3, 4, 2] +/// : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +/// %write = vector.transfer_write %tr into %dst +/// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +/// ``` +static LogicalResult +vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, + ArrayRef<int64_t> inputVectorSizes, + SmallVectorImpl<Value> &newResults) { + if (!inputVectorSizes.empty()) { + assert(inputVectorSizes.size() == packOp.getDestRank() && + "Invalid number of input vector sizes!"); + } + + // TODO: Introduce a parent class that will handle the insertion point update. + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(packOp); + + Location loc = packOp.getLoc(); + std::optional<Value> padValue = packOp.getPaddingValue() + ? std::optional(packOp.getPaddingValue()) + : std::nullopt; + + SmallVector<int64_t> destShape = + SmallVector<int64_t>(packOp.getDestType().getShape()); + + // This is just a convenience alias to clearly communicate that the input + // vector sizes determine the _write_ sizes. + ArrayRef<int64_t> &writeVectorSizes = inputVectorSizes; + + // In the absence of input-vector-sizes, use the _static_ destination tensor + // shape. In addition, use the inBounds attribute instead of masking. + bool useInBoundsInsteadOfMasking = false; + if (writeVectorSizes.empty()) { + if (ShapedType::isDynamicShape(destShape)) + return rewriter.notifyMatchFailure(packOp, + "unable to infer vector sizes"); + + writeVectorSizes = destShape; + useInBoundsInsteadOfMasking = true; + } + + // Compute the pre-transpose write vector type, i.e. the write vector type + // _before_ the transposition (i.e. before dimension permutation). This is + // done by inverting the permutation/transposition that's part of the Pack + // operation. This type is required to: + // 1) compute the read vector type for the masked read below, and + // 2) generate the shape-cast Op below that expands the read vector type. + PackingMetadata packMetadata; + SmallVector<int64_t> preTransposeWriteVecSizes(writeVectorSizes); + auto destInvPermutation = getPackInverseDestPerm(packOp, packMetadata); + applyPermutationToVector(preTransposeWriteVecSizes, destInvPermutation); + auto preTransposeWriteVecType = VectorType::get( + preTransposeWriteVecSizes, packOp.getType().getElementType()); + + // Compute the vector type for the _read_ operation. This is simply the + // pre-transpose write vector type with the dimensions collapsed + // as per the Pack operation. + VectorType readVecType = getCollapsedVecType( + preTransposeWriteVecType, + getSymbolLessAffineMaps(convertReassociationIndicesToExprs( + rewriter.getContext(), packMetadata.reassociations))); + + // Create masked TransferReadOp. + auto maskedRead = vector::createReadOrMaskedRead( + rewriter, loc, packOp.getSource(), readVecType.getShape(), padValue, + useInBoundsInsteadOfMasking, + /*inputScalableVecSizes=*/{}); + + // Create ShapeCastOp. + auto shapeCastOp = vector::ShapeCastOp::create( + rewriter, loc, preTransposeWriteVecType, maskedRead); + + // Create TransposeOp.
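+ // Note: destPermutation below is the inverse of destInvPermutation, i.e. the + // Pack op's own permutation, so the transpose re-orders the expanded + // shape-cast result into the packed destination layout (see the EXAMPLE in + // the doc comment above).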
+ auto destPermutation = invertPermutationVector(destInvPermutation); + auto transposeOp = vector::TransposeOp::create( + rewriter, loc, shapeCastOp.getResult(), destPermutation); + + // Create TransferWriteOp. + Operation *write = createWriteOrMaskedWrite( + rewriter, loc, transposeOp.getResult(), packOp.getDest()); + newResults.push_back(write->getResult(0)); + return success(); +} + /// Vectorize `linalg.unpack` as: /// * xfer_read -> vector.transpose -> vector.shape_cast -> xfer_write /// -/// The input-vector-sizes specify the read vector sizes (i.e. the vector sizes -/// for the xfer_read operation). This is sufficient to infer the other vector -/// sizes required here. +/// The input-vector-sizes specify the _read_ vector sizes (i.e. the vector +/// sizes for the xfer_read operation). This is sufficient to infer the other +/// vector sizes required here. /// /// If the vector sizes are not provided: /// * the vector sizes are determined from the input tensor static shape. @@ -1960,7 +1969,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, // In the absence of input-vector-sizes, use the _static_ input tensor shape. if (inputVectorSizes.empty()) { if (ShapedType::isDynamicShape(sourceShape)) - return failure(); + return rewriter.notifyMatchFailure(unpackOp, + "unable to infer vector sizes"); readVectorSizes.assign(sourceShape.begin(), sourceShape.end()); useInBoundsInsteadOfMasking = true; @@ -2443,6 +2453,7 @@ vectorizePackOpPrecondition(linalg::PackOp packOp, ArrayRef<int64_t> inputVectorSizes) { auto padValue = packOp.getPaddingValue(); Attribute cstAttr; + // TODO: Relax this condition if (padValue && !matchPattern(padValue, m_Constant(&cstAttr))) { LDBG() << "pad value is not constant: " << packOp; return failure(); diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 24d3722..6eeb206 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -171,29 +171,24 @@ computePackUnPackPerm(int64_t rank, ArrayRef<int64_t> &innerDimsPos, namespace mlir { namespace linalg { -SmallVector<int64_t> getPackInverseDestPerm(PackOp packOp) { +SmallVector<int64_t> getPackInverseDestPerm(PackOp packOp, + PackingMetadata &metadata) { - PackingMetadata pMetadata; int64_t packedRank = packOp.getDestType().getRank(); ArrayRef<int64_t> innerDimPos = packOp.getInnerDimsPos(); ArrayRef<int64_t> outerPerm = packOp.getOuterDimsPerm(); SmallVector<int64_t> packInvDestPerm = - computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata); + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, metadata); return packInvDestPerm; } -SmallVector<int64_t> getUnPackInverseSrcPerm(UnPackOp unpackOp) { - PackingMetadata metadata; - return getUnPackInverseSrcPerm(unpackOp, metadata); -} - SmallVector<int64_t> getUnPackInverseSrcPerm(UnPackOp unpackOp, PackingMetadata &metadata) { - int64_t unpackRank = unpackOp.getSourceType().getRank(); + int64_t packedRank = unpackOp.getSourceType().getRank(); ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos(); ArrayRef<int64_t> outerPerm = unpackOp.getOuterDimsPerm(); SmallVector<int64_t> unpackInvSrcPerm = - computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata); + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, metadata); return unpackInvSrcPerm; } diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt index 31167e6..46b8251 100644 --- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt +++
b/mlir/lib/Dialect/XeGPU/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(IR) add_subdirectory(Transforms) add_subdirectory(Utils) +add_subdirectory(TransformOps) diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/TransformOps/CMakeLists.txt new file mode 100644 index 0000000..48fe841 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/TransformOps/CMakeLists.txt @@ -0,0 +1,17 @@ +add_mlir_dialect_library(MLIRXeGPUTransformOps + XeGPUTransformOps.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/mlir/Dialect/XeGPU/TransformOps/ + + DEPENDS + MLIRXeGPUTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRXeGPUDialect + MLIRXeGPUTransforms + MLIRIR + MLIRTransformDialect + MLIRFuncDialect + MLIRSCFDialect +) diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp new file mode 100644 index 0000000..8943ba0 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp @@ -0,0 +1,225 @@ +//===- XeGPUTransformOps.cpp - Implementation of XeGPU transformation ops -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" + +#include <optional> + +using namespace mlir; +using namespace mlir::transform; + +/// Assuming that `ofr` is an index attr or a param of index type +/// or a transform dialect handle mapped to exactly one op +/// with one index result, get that value and cast it to int type. +static DiagnosedSilenceableFailure convertMixedValuesToInt( + transform::TransformState &state, TransformOpInterface transformOp, + SmallVectorImpl<int32_t> &result, ArrayRef<OpFoldResult> ofrs) { + for (OpFoldResult ofr : ofrs) { + // Attribute case. + if (auto attr = dyn_cast<Attribute>(ofr)) { + if (auto intAttr = dyn_cast<IntegerAttr>(attr)) { + result.push_back(intAttr.getInt()); + continue; + } + return transformOp.emitDefiniteFailure() << "expected IntegerAttr"; + } + + // Transform param case. + Value transformValue = cast<Value>(ofr); + if (isa<TransformParamTypeInterface>(transformValue.getType())) { + ArrayRef<Attribute> params = state.getParams(transformValue); + if (params.size() != 1) + return transformOp.emitDefiniteFailure() + << "requires exactly one parameter associated"; + result.push_back( + cast<IntegerAttr>(params.front()).getValue().getSExtValue()); + continue; + } + + // Payload value case. 
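+ // Illustrative example (not from this patch): a handle mapped to a single + // constant-like op with one index result, e.g. `arith.constant 8 : index`; + // matchPattern below then folds that result to an IntegerAttr.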
+ auto payloadOps = state.getPayloadOps(transformValue); + if (!llvm::hasSingleElement(payloadOps)) { + DiagnosedSilenceableFailure diag = + transformOp.emitSilenceableError() + << "handle must be mapped to exactly one payload op"; + diag.attachNote(transformValue.getLoc()) + << "mapped to " << llvm::range_size(payloadOps) << " payload ops"; + return diag; + } + + Operation *op = *payloadOps.begin(); + if (op->getNumResults() != 1 || !op->getResult(0).getType().isIndex()) { + DiagnosedSilenceableFailure diag = + transformOp.emitSilenceableError() + << "payload op must have exactly one index result"; + diag.attachNote(op->getLoc()) + << "has " << op->getNumResults() << " results"; + return diag; + } + + IntegerAttr intAttr; + if (!matchPattern(op->getResult(0), m_Constant(&intAttr))) + return transformOp.emitSilenceableError() + << "requires param or handle to be the result of a constant-like " + "op"; + + result.push_back(intAttr.getInt()); + } + return DiagnosedSilenceableFailure::success(); +} + +/// Create a layout attribute from the given parameters. +static xegpu::LayoutAttr +createLayoutAttr(MLIRContext *ctx, ArrayRef<int32_t> sgLayout, + ArrayRef<int32_t> sgData, + std::optional<ArrayRef<int32_t>> instData) { + return xegpu::LayoutAttr::get( + ctx, DenseI32ArrayAttr::get(ctx, sgLayout), + DenseI32ArrayAttr::get(ctx, sgData), + instData ? DenseI32ArrayAttr::get(ctx, instData.value()) : nullptr, + /*lane_layout=*/nullptr, + /*lane_data=*/nullptr, + /*order=*/nullptr); +} + +/// Replace xegpu.create_nd_desc op with a new one with the given layout. +static xegpu::CreateNdDescOp +setDescLayout(transform::TransformRewriter &rewriter, + xegpu::CreateNdDescOp descOp, xegpu::LayoutAttr layout) { + assert(descOp.getMixedOffsets().size() == 0 && + "create desc op with offsets is not supported"); + auto oldTensorDesc = descOp.getType(); + auto descType = xegpu::TensorDescType::get( + oldTensorDesc.getShape(), oldTensorDesc.getElementType(), + /*array_length=*/oldTensorDesc.getArrayLength(), + /*boundary_check=*/oldTensorDesc.getBoundaryCheck(), + /*memory_space=*/oldTensorDesc.getMemorySpace(), + /*layout=*/layout); + + rewriter.setInsertionPointAfter(descOp); + auto newDescOp = rewriter.replaceOpWithNewOp<xegpu::CreateNdDescOp>( + descOp, descType, descOp.getSource(), descOp.getMixedSizes(), + descOp.getMixedStrides()); + return newDescOp; +} + +void transform::SetDescLayoutOp::build(OpBuilder &builder, + OperationState &result, Value target, + ArrayRef<OpFoldResult> mixedSgLayout, + ArrayRef<OpFoldResult> mixedSgData, + ArrayRef<OpFoldResult> mixedInstData) { + SmallVector<int64_t> staticSgLayout, staticSgData, staticInstData; + SmallVector<Value> dynamicSgLayout, dynamicSgData, dynamicInstData; + dispatchIndexOpFoldResults(mixedSgLayout, dynamicSgLayout, staticSgLayout); + dispatchIndexOpFoldResults(mixedSgData, dynamicSgData, staticSgData); + dispatchIndexOpFoldResults(mixedInstData, dynamicInstData, staticInstData); + build(builder, result, target.getType(), + /*target=*/target, + /*sg_layout=*/dynamicSgLayout, + /*sg_data=*/dynamicSgData, + /*inst_data=*/dynamicInstData, + /*static_sg_layout=*/staticSgLayout, + /*static_sg_data=*/staticSgData, + /*static_inst_data=*/staticInstData); +} + +DiagnosedSilenceableFailure +transform::SetDescLayoutOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + auto targetOps = state.getPayloadOps(getTarget()); + if (!llvm::hasSingleElement(targetOps)) { + return
emitDefiniteFailure() << "requires exactly one targetOp handle (got " + << llvm::range_size(targetOps) << ")"; + } + Operation *target = *targetOps.begin(); + + SmallVector<int32_t> sgLayout; + DiagnosedSilenceableFailure status = + convertMixedValuesToInt(state, (*this), sgLayout, getMixedSgLayout()); + if (!status.succeeded()) + return status; + + SmallVector<int32_t> sgData; + status = convertMixedValuesToInt(state, (*this), sgData, getMixedSgData()); + if (!status.succeeded()) + return status; + + SmallVector<int32_t> instData; + status = + convertMixedValuesToInt(state, (*this), instData, getMixedInstData()); + if (!status.succeeded()) + return status; + auto maybeInstData = instData.empty() + ? std::nullopt + : std::optional<ArrayRef<int32_t>>(instData); + + // For now only create_nd_desc op is supported. + auto descOp = dyn_cast<xegpu::CreateNdDescOp>(target); + if (!descOp) { + auto diag = emitSilenceableFailure(getLoc()) + << "Expected a xegpu.create_nd_desc op, but got: " + << target->getName(); + diag.attachNote(target->getLoc()) << "target op"; + return diag; + } + + // Set layout attr in desc op's return type. Replaces old desc op. + auto layoutAttr = + createLayoutAttr(rewriter.getContext(), sgLayout, sgData, maybeInstData); + auto newDescOp = setDescLayout(rewriter, descOp, layoutAttr); + + // Map result handles. + results.set(cast<OpResult>(getTransformed()), {newDescOp.getOperation()}); + + return DiagnosedSilenceableFailure::success(); +} + +void transform::SetDescLayoutOp::getEffects( + ::llvm::SmallVectorImpl<MemoryEffects::EffectInstance> &effects) { + consumesHandle(getTargetMutable(), effects); + onlyReadsHandle(getSgLayoutMutable(), effects); + onlyReadsHandle(getSgDataMutable(), effects); + onlyReadsHandle(getInstDataMutable(), effects); + producesHandle(getOperation()->getOpResults(), effects); + modifiesPayload(effects); +} + +namespace { +class XeGPUTransformDialectExtension + : public transform::TransformDialectExtension< + XeGPUTransformDialectExtension> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(XeGPUTransformDialectExtension) + + using Base::Base; + + void init(); +}; + +void XeGPUTransformDialectExtension::init() { + declareGeneratedDialect<scf::SCFDialect>(); + declareGeneratedDialect<arith::ArithDialect>(); + declareGeneratedDialect<xegpu::XeGPUDialect>(); + + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp.inc" + >(); +} +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp.inc" + +void mlir::xegpu::registerTransformDialectExtension(DialectRegistry &registry) { + registry.addExtensions<XeGPUTransformDialectExtension>(); +} diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index d2bafb7..a5bfde1 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -16,6 +16,7 @@ #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/APSInt.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLog.h" #define DEBUG_TYPE "value-bounds-op-interface" @@ -195,7 +196,7 @@ void ValueBoundsConstraintSet::addBound(BoundType type, int64_t pos, // Even without this bound, there may be enough information in the // constraint system to compute the requested bound. In case this bound is // actually needed, `computeBound` will return `failure`.
- LLVM_DEBUG(llvm::dbgs() << "Failed to add bound: " << expr << "\n"); + LDBG() << "Failed to add bound: " << expr << "\n"; } } @@ -271,11 +272,9 @@ int64_t ValueBoundsConstraintSet::insert(Value value, assert(!valueDimToPosition.contains(valueDim) && "already mapped"); int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); - LLVM_DEBUG(llvm::dbgs() << "Inserting constraint set column " << pos - << " for: " << value - << " (dim: " << dim.value_or(kIndexValue) - << ", owner: " << getOwnerOfValue(value)->getName() - << ")\n"); + LDBG() << "Inserting constraint set column " << pos << " for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() << ")"; positionToValueDim.insert(positionToValueDim.begin() + pos, valueDim); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -283,8 +282,8 @@ int64_t ValueBoundsConstraintSet::insert(Value value, valueDimToPosition[*positionToValueDim[i]] = i; if (addToWorklist) { - LLVM_DEBUG(llvm::dbgs() << "Push to worklist: " << value - << " (dim: " << dim.value_or(kIndexValue) << ")\n"); + LDBG() << "Push to worklist: " << value + << " (dim: " << dim.value_or(kIndexValue) << ")"; worklist.push(pos); } @@ -294,8 +293,7 @@ int64_t ValueBoundsConstraintSet::insert(Value value, int64_t ValueBoundsConstraintSet::insert(bool isSymbol) { int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); - LLVM_DEBUG(llvm::dbgs() << "Inserting anonymous constraint set column " << pos - << "\n"); + LDBG() << "Inserting anonymous constraint set column " << pos; positionToValueDim.insert(positionToValueDim.begin() + pos, std::nullopt); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -339,10 +337,9 @@ int64_t ValueBoundsConstraintSet::getPos(Value value, cast<BlockArgument>(value).getOwner()->isEntryBlock()) && "unstructured control flow is not supported"); #endif // NDEBUG - LLVM_DEBUG(llvm::dbgs() << "Getting pos for: " << value - << " (dim: " << dim.value_or(kIndexValue) - << ", owner: " << getOwnerOfValue(value)->getName() - << ")\n"); + LDBG() << "Getting pos for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() << ")"; auto it = valueDimToPosition.find(std::make_pair(value, dim.value_or(kIndexValue))); assert(it != valueDimToPosition.end() && "expected mapped entry"); @@ -364,7 +361,7 @@ bool ValueBoundsConstraintSet::isMapped(Value value, } void ValueBoundsConstraintSet::processWorklist() { - LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n"); + LDBG() << "Processing value bounds worklist..."; while (!worklist.empty()) { int64_t pos = worklist.front(); worklist.pop(); @@ -386,8 +383,8 @@ void ValueBoundsConstraintSet::processWorklist() { // Do not process any further if the stop condition is met. auto maybeDim = dim == kIndexValue ? std::nullopt : std::make_optional(dim); if (stopCondition(value, maybeDim, *this)) { - LLVM_DEBUG(llvm::dbgs() << "Stop condition met for: " << value - << " (dim: " << maybeDim << ")\n"); + LDBG() << "Stop condition met for: " << value << " (dim: " << maybeDim + << ")"; continue; } @@ -395,9 +392,8 @@ void ValueBoundsConstraintSet::processWorklist() { // the worklist. 
auto valueBoundsOp = dyn_cast<ValueBoundsOpInterface>(getOwnerOfValue(value)); - LLVM_DEBUG(llvm::dbgs() - << "Query value bounds for: " << value - << " (owner: " << getOwnerOfValue(value)->getName() << ")\n"); + LDBG() << "Query value bounds for: " << value + << " (owner: " << getOwnerOfValue(value)->getName() << ")"; if (valueBoundsOp) { if (dim == kIndexValue) { valueBoundsOp.populateBoundsForIndexValue(value, *this); } else { valueBoundsOp.populateBoundsForShapedValueDim(value, dim, *this); } continue; } - LLVM_DEBUG(llvm::dbgs() << "--> ValueBoundsOpInterface not implemented\n"); + LDBG() << "--> ValueBoundsOpInterface not implemented"; // If the op does not implement `ValueBoundsOpInterface`, check if it // implements the `DestinationStyleOpInterface`. OpResults of such ops are @@ -705,9 +701,7 @@ bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, // We cannot prove anything if the constraint set is already empty. if (cstr.isEmpty()) { - LLVM_DEBUG( - llvm::dbgs() - << "cannot compare value/dims: constraint system is already empty"); + LDBG() << "cannot compare value/dims: constraint system is already empty"; return false; } diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp index 3839172..c857c38 100644 --- a/mlir/lib/RegisterAllExtensions.cpp +++ b/mlir/lib/RegisterAllExtensions.cpp @@ -56,6 +56,7 @@ #include "mlir/Dialect/Transform/SMTExtension/SMTExtension.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h" #include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" +#include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h" #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" @@ -112,6 +113,7 @@ void mlir::registerAllExtensions(DialectRegistry &registry) { transform::registerSMTExtension(registry); transform::registerTuneExtension(registry); vector::registerTransformDialectExtension(registry); + xegpu::registerTransformDialectExtension(registry); arm_neon::registerTransformDialectExtension(registry); arm_sve::registerTransformDialectExtension(registry); diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp index 2e92d9c..b0ac379 100644 --- a/mlir/lib/Support/Timing.cpp +++ b/mlir/lib/Support/Timing.cpp @@ -619,11 +619,17 @@ void mlir::applyDefaultTimingManagerCLOptions(DefaultTimingManager &tm) { return; tm.setEnabled(options->timing); tm.setDisplayMode(options->displayMode); + tm.setOutput(createOutputStrategy(options->outputFormat, llvm::errs())); +} - std::unique_ptr<OutputStrategy> printer; - if (options->outputFormat == OutputFormat::Text) - printer = std::make_unique<OutputTextStrategy>(llvm::errs()); - else if (options->outputFormat == OutputFormat::Json) - printer = std::make_unique<OutputJsonStrategy>(llvm::errs()); - tm.setOutput(std::move(printer)); +std::unique_ptr<OutputStrategy> +mlir::createOutputStrategy(DefaultTimingManager::OutputFormat fmt, + raw_ostream &os) { + switch (fmt) { + case OutputFormat::Text: + return std::make_unique<OutputTextStrategy>(os); + case OutputFormat::Json: + return std::make_unique<OutputJsonStrategy>(os); + } + llvm_unreachable("Invalid output format"); } diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index 979b396..41f3f9d 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -742,25 +742,7 @@ static void
processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la, static void cleanUpDeadVals(RDVFinalCleanupList &list) { LDBG() << "Starting cleanup of dead values..."; - // 1. Blocks - LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists"; - for (auto &b : list.blocks) { - // blocks that are accessed via multiple codepaths processed once - if (b.b->getNumArguments() != b.nonLiveArgs.size()) - continue; - LDBG() << "Erasing " << b.nonLiveArgs.count() - << " non-live arguments from block: " << b.b; - // it iterates backwards because erase invalidates all successor indexes - for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) { - if (!b.nonLiveArgs[i]) - continue; - LDBG() << " Erasing block argument " << i << ": " << b.b->getArgument(i); - b.b->getArgument(i).dropAllUses(); - b.b->eraseArgument(i); - } - } - - // 2. Operations + // 1. Operations LDBG() << "Cleaning up " << list.operations.size() << " operations"; for (auto &op : list.operations) { LDBG() << "Erasing operation: " @@ -769,14 +751,14 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { op->erase(); } - // 3. Values + // 2. Values LDBG() << "Cleaning up " << list.values.size() << " values"; for (auto &v : list.values) { LDBG() << "Dropping all uses of value: " << v; v.dropAllUses(); } - // 4. Functions + // 3. Functions LDBG() << "Cleaning up " << list.functions.size() << " functions"; // Record which function arguments were erased so we can shrink call-site // argument segments for CallOpInterface operations (e.g. ops using @@ -798,7 +780,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { (void)f.funcOp.eraseResults(f.nonLiveRets); } - // 5. Operands + // 4. Operands LDBG() << "Cleaning up " << list.operands.size() << " operand lists"; for (OperationToCleanup &o : list.operands) { // Handle call-specific cleanup only when we have a cached callee reference. @@ -840,7 +822,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { } } - // 6. Results + // 5. Results LDBG() << "Cleaning up " << list.results.size() << " result lists"; for (auto &r : list.results) { LDBG() << "Erasing " << r.nonLive.count() @@ -849,6 +831,24 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { dropUsesAndEraseResults(r.op, r.nonLive); } + // 6. Blocks + LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists"; + for (auto &b : list.blocks) { + // Blocks reachable via multiple code paths are processed only once. + if (b.b->getNumArguments() != b.nonLiveArgs.size()) + continue; + LDBG() << "Erasing " << b.nonLiveArgs.count() + << " non-live arguments from block: " << b.b; + // Iterate backwards: erasing an argument invalidates later argument indices. + for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) { + if (!b.nonLiveArgs[i]) + continue; + LDBG() << " Erasing block argument " << i << ": " << b.b->getArgument(i); + b.b->getArgument(i).dropAllUses(); + b.b->eraseArgument(i); + } + } + // 7.
Successor Operands LDBG() << "Cleaning up " << list.successorOperands.size() << " successor operand lists"; diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index 20ed3ab..51c7576 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -322,6 +322,15 @@ declare_mlir_dialect_extension_python_bindings( "../../include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td" ) +declare_mlir_dialect_extension_python_bindings( + ADD_TO_PARENT MLIRPythonSources.Dialects + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + TD_FILE dialects/XeGPUTransformOps.td + SOURCES + dialects/transform/xegpu.py + DIALECT_NAME transform + EXTENSION_NAME xegpu_transform) + declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" diff --git a/mlir/python/mlir/dialects/XeGPUTransformOps.td b/mlir/python/mlir/dialects/XeGPUTransformOps.td new file mode 100644 index 0000000..5a5e7b9 --- /dev/null +++ b/mlir/python/mlir/dialects/XeGPUTransformOps.td @@ -0,0 +1,19 @@ +//===---- XeGPUTransformOps.td -----------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Entry point of the Python bindings generator for the XeGPU transform ops. +// +//===----------------------------------------------------------------------===// + + +#ifndef PYTHON_BINDINGS_XEGPU_TRANSFORM_OPS +#define PYTHON_BINDINGS_XEGPU_TRANSFORM_OPS + +include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td" + +#endif // PYTHON_BINDINGS_XEGPU_TRANSFORM_OPS diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py new file mode 100644 index 0000000..2918bf5 --- /dev/null +++ b/mlir/python/mlir/dialects/transform/xegpu.py @@ -0,0 +1,66 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from .._xegpu_transform_ops_gen import * +from .._xegpu_transform_ops_gen import _Dialect + +try: + from ...ir import * + from .._ods_common import _cext as _ods_cext + from .._ods_common import ( + MixedValues, + get_op_result_or_value as _get_op_result_or_value, + _dispatch_dynamic_index_list, + ) + +except ImportError as e: + raise RuntimeError("Error loading imports from extension module") from e + +from typing import Union, Optional + + +@_ods_cext.register_operation(_Dialect, replace=True) +class SetDescLayoutOp(SetDescLayoutOp): + """Specialization for SetDescLayoutOp class.""" + + def __init__( + self, + target: Union[Operation, Value], + sg_layout: MixedValues, + sg_data: MixedValues, + *, + inst_data: Optional[MixedValues] = None, + loc=None, + ip=None, + ): + target_handle = _get_op_result_or_value(target) + inst_data = [] if inst_data is None else inst_data + ( + dynamic_sg_layout, + static_sg_layout, + _, + ) = _dispatch_dynamic_index_list(sg_layout) + ( + dynamic_sg_data, + static_sg_data, + _, + ) = _dispatch_dynamic_index_list(sg_data) + ( + dynamic_inst_data, + static_inst_data, + _, + ) = _dispatch_dynamic_index_list(inst_data) + + super().__init__( + target_handle.type, + target_handle, + dynamic_sg_layout, + dynamic_sg_data, + dynamic_inst_data, + static_sg_layout=static_sg_layout, + static_sg_data=static_sg_data, + static_inst_data=static_inst_data, + loc=loc, + ip=ip, + ) diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index aa2c1da..9a14ab7 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -285,6 +285,8 @@ module attributes {transform.with_named_sequence} { ///---------------------------------------------------------------------------------------- /// Tests for linalg.pack +/// +/// TODO: Add similar tests for linalg.unpack ///---------------------------------------------------------------------------------------- // Note, see a similar test in: diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 1304a90..170bae6 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1335,7 +1335,7 @@ func.func @pack_no_padding(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %src : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 32, 16, 2] : !transform.any_op transform.yield } } @@ -1378,7 +1378,7 @@ func.func @pack_with_padding(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [32, 4, 1, 16, 2] : !transform.any_op transform.yield } } @@ -1424,8 +1424,13 @@ 
module attributes {transform.with_named_sequence} { // CHECK-LABEL: func @pack_with_dynamic_dims // CHECK-SAME: %[[SRC:.*]]: tensor<?x?xf32>, // CHECK-SAME: %[[DEST:.*]]: tensor<?x?x16x2xf32> -func.func @pack_with_dynamic_dims(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> { - %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32> +func.func @pack_with_dynamic_dims( + %src: tensor<?x?xf32>, + %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> { + %pack = linalg.pack %src + inner_dims_pos = [1, 0] + inner_tiles = [16, 2] + into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32> return %pack : tensor<?x?x16x2xf32> } @@ -1433,30 +1438,108 @@ func.func @pack_with_dynamic_dims(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2x // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index + +/// Compute mask for xfer_read // CHECK-DAG: %[[D0_0:.*]] = tensor.dim {{.*}} %[[C0_0]] : tensor<?x?xf32> // CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor<?x?xf32> // CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1> + +/// --= read =--- // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { // CHECK-SAME: vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]] // CHECK-SAME: {in_bounds = [true, true]} : tensor<?x?xf32>, vector<8x16xf32> // CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32> + +/// --= shape_cast =--- // CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32> + +/// --= transpose =--- // CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32> + +/// Compute mask for xfer_write // CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32> // CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32> // CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[C16]], %[[C2]] : vector<4x1x16x2xi1> + +/// --= write =--- // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] { // CHECK-SAME: vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]] // CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor<?x?x16x2xf32> + // CHECK: return %[[WRITE]] : tensor<?x?x16x2xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 16, 2] : !transform.any_op + transform.yield + } +} + +// ----- + +/// Similar to the test above, but one of the inner tile sizes is dynamic. As a +/// result, more output dims are dynamic (and, e.g., the output mask calculation +/// is a bit different: the dynamic inner-tile size comes from tensor.dim rather +/// than an arith.constant).
+ +// CHECK-LABEL: func @pack_with_dynamic_dims_and_dynamic_inner_tile +// CHECK-SAME: %[[SRC:.*]]: tensor<?x?xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<?x?x?x2xf32> +func.func @pack_with_dynamic_dims_and_dynamic_inner_tile( + %src: tensor<?x?xf32>, + %dest: tensor<?x?x?x2xf32>) -> tensor<?x?x?x2xf32> { + %c16 = arith.constant 16 : index + %pack = linalg.pack %src + inner_dims_pos = [1, 0] + inner_tiles = [%c16, 2] + into %dest : tensor<?x?xf32> -> tensor<?x?x?x2xf32> + return %pack : tensor<?x?x?x2xf32> +} + +// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index + +/// Compute mask for xfer_read +// CHECK-DAG: %[[D0_0:.*]] = tensor.dim {{.*}} %[[C0_0]] : tensor<?x?xf32> +// CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor<?x?xf32> +// CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1> + +/// --= read =--- +// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { +// CHECK-SAME: vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]] +// CHECK-SAME: {in_bounds = [true, true]} : tensor<?x?xf32>, vector<8x16xf32> +// CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32> + +/// --= shape_cast =--- +// CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32> + +/// --= transpose =--- +// CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32> + +/// Compute mask for xfer_write +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x?x2xf32> +// CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x?x2xf32> +// CHECK-DAG: %[[D4:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x?x2xf32> +// CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[D4]], %[[C2_2]] : vector<4x1x16x2xi1> + +/// --= write =--- +// CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] { +// CHECK-SAME: vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]] +// CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor<?x?x?x2xf32> + +// CHECK: return %[[WRITE]] : tensor<?x?x?x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 16, 2] : !transform.any_op transform.yield } } diff --git a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir new file mode 100644 index 0000000..3035845 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir @@ -0,0 +1,15 @@ +// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics + +func.func @set_desc_layout(%arg0: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index // expected-note {{target op}} + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Expected a xegpu.create_nd_desc 
op, but got: arith.constant}} + %1 = transform.xegpu.set_desc_layout %0 sg_layout = [8, 4] sg_data = [32, 32] : (!transform.any_op) -> !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir new file mode 100644 index 0000000..23e1cd9 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir @@ -0,0 +1,58 @@ +// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @set_desc_layout +func.func @set_desc_layout(%arg0: memref<4096x4096xf16>) { + // CHECK: %[[V0:.+]] = xegpu.create_nd_tdesc %arg0 + // CHECK-SAME: #xegpu.block_tdesc_attr<boundary_check = false> + // CHECK-SAME: #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [8, 16]>> + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16, #xegpu.block_tdesc_attr<boundary_check = false>> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.create_nd_tdesc"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_desc_layout %{{.*}} + %1 = transform.xegpu.set_desc_layout %0 sg_layout = [8, 4] sg_data = [32, 32] inst_data = [8, 16] : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_desc_layout_minimal +func.func @set_desc_layout_minimal(%arg0: memref<4096x4096xf16>) { + // CHECK: %[[V0:.+]] = xegpu.create_nd_tdesc %arg0 + // CHECK-SAME: #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>> + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.create_nd_tdesc"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_desc_layout %{{.*}} + %1 = transform.xegpu.set_desc_layout %0 sg_layout = [8, 4] sg_data = [32, 32] : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_desc_layout_param +func.func @set_desc_layout_param(%arg0: memref<4096x4096xf16>) { + // CHECK: %[[V0:.+]] = xegpu.create_nd_tdesc %arg0 + // CHECK-SAME: #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [8, 16]>> + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.create_nd_tdesc"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_desc_layout %{{.*}} + %layout0 = transform.param.constant 8 : i64 -> !transform.param<i64> + %1 = transform.xegpu.set_desc_layout %0 sg_layout = [%layout0, 4] sg_data = [32, 32] inst_data = [8, 16] : (!transform.any_op, !transform.param<i64>) -> !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir index 8b5ccdc..e730450 100644 --- a/mlir/test/Transforms/remove-dead-values.mlir +++ b/mlir/test/Transforms/remove-dead-values.mlir @@ -674,18 +674,3 @@ func.func @dead_value_loop_ivs_no_result(%lb: index, %ub: index, %step: 
index, % } return } - -// ----- - -// CHECK-LABEL: func @op_block_have_dead_arg -func.func @op_block_have_dead_arg(%arg0: index, %arg1: index, %arg2: index, %arg3: i1) { - scf.for %iv = %arg0 to %arg1 step %arg2 { - scf.execute_region { - cf.cond_br %arg3, ^bb1(%arg0 : index), ^bb1(%arg1 : index) - ^bb1(%0: index): - scf.yield - } - } -// CHECK-NEXT: return - return -} diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py new file mode 100644 index 0000000..1c8a2bc --- /dev/null +++ b/mlir/test/python/dialects/transform_xegpu_ext.py @@ -0,0 +1,51 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.ir import * +from mlir.dialects import transform +from mlir.dialects.transform import xegpu +from mlir.dialects.transform import structured + + +def run(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + print("\nTEST:", f.__name__) + f() + print(module) + return f + + +@run +def setDescLayoutMinimal(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("xegpu.create_nd_tdesc"), + ) + with InsertionPoint(sequence.body): + xegpu.SetDescLayoutOp(sequence.bodyTarget, sg_layout=[6, 4], sg_data=[32, 16]) + transform.YieldOp() + # CHECK-LABEL: TEST: setDescLayoutMinimal + # CHECK: %0 = transform.xegpu.set_desc_layout % + # CHECK: sg_layout = [6, 4] + # CHECK: sg_data = [32, 16] + + +@run +def setDescLayoutInstData(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("xegpu.create_nd_tdesc"), + ) + with InsertionPoint(sequence.body): + xegpu.SetDescLayoutOp( + sequence.bodyTarget, sg_layout=[6, 4], sg_data=[32, 16], inst_data=[8, 16] + ) + transform.YieldOp() + # CHECK-LABEL: TEST: setDescLayoutInstData + # CHECK: %0 = transform.xegpu.set_desc_layout % + # CHECK: sg_layout = [6, 4] + # CHECK: sg_data = [32, 16] + # CHECK: inst_data = [8, 16] diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index b65fe64..ecd11b9 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -5281,6 +5281,7 @@ libc_function( hdrs = ["src/stdlib/strfromf.h"], deps = [ ":__support_common", + ":printf_error_mapper", ":str_from_util", ], ) @@ -5291,6 +5292,7 @@ libc_function( hdrs = ["src/stdlib/strfromd.h"], deps = [ ":__support_common", + ":printf_error_mapper", ":str_from_util", ], ) @@ -5301,6 +5303,7 @@ libc_function( hdrs = ["src/stdlib/strfroml.h"], deps = [ ":__support_common", + ":printf_error_mapper", ":str_from_util", ], ) @@ -6514,12 +6517,34 @@ libc_support_library( ) libc_support_library( + name = "printf_error_mapper", + hdrs = [ + "src/stdio/printf_core/error_mapper.h", + ] + select({ + "@platforms//os:linux": [ + "src/stdio/printf_core/linux/error_mapper.h", + ], + "//conditions:default": [ + "src/stdio/printf_core/generic/error_mapper.h", + ], + }), + deps = [ + ":__support_cpp_type_traits", + ":__support_error_or", + ":__support_macros_properties_architectures", + ":hdr_errno_macros", + ":printf_core_structs", + ], +) + +libc_support_library( name = "printf_main", hdrs = ["src/stdio/printf_core/printf_main.h"], deps = [ ":__support_arg_list", ":printf_converter", ":printf_core_structs", + ":printf_error_mapper", ":printf_parser", ":printf_writer", ], diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel 
b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel index cbc6d13..e33199c 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel @@ -87,6 +87,8 @@ libc_test( name = "fprintf_test", srcs = ["fprintf_test.cpp"], deps = [ + "//libc:__support_cpp_limits", + "//libc:__support_macros_properties_architectures", "//libc:fprintf", ], )
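A usage note on the Timing.cpp hunk above: the patch lifts the Text/Json dispatch out of applyDefaultTimingManagerCLOptions into a public mlir::createOutputStrategy factory. Below is a minimal sketch of reusing that factory to capture a timing report in memory. The factory name and signature are taken from the hunk; the wrapper function, its name, and the string-stream plumbing are illustrative assumptions, including the assumption that the declaration is exposed via mlir/Support/Timing.h.

```
// Sketch only: assumes mlir::createOutputStrategy is declared in
// mlir/Support/Timing.h to match the definition added in Timing.cpp.
#include "mlir/Support/Timing.h"
#include "llvm/Support/raw_ostream.h"

#include <string>

// Hypothetical helper: collect the timing report as a JSON string instead of
// printing it to llvm::errs().
std::string collectTimingAsJson() {
  std::string json;
  // The stream must outlive the manager: the output strategy stores only a
  // reference to it.
  llvm::raw_string_ostream os(json);

  mlir::DefaultTimingManager tm;
  tm.setEnabled(true);
  tm.setOutput(mlir::createOutputStrategy(
      mlir::DefaultTimingManager::OutputFormat::Json, os));

  mlir::TimingScope root = tm.getRootScope();
  mlir::TimingScope phase = root.nest("my-phase");
  // ... timed work would go here ...
  phase.stop();
  root.stop();

  tm.print(); // Print (and clear) the collected timers via the strategy.
  return json;
}
```

One design note: dispatching with a covered switch followed by llvm_unreachable, rather than the previous if/else chain, means a future OutputFormat enumerator surfaces as a -Wswitch warning at the factory instead of silently producing a null printer.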
