43 files changed, 1404 insertions, 298 deletions
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index 30ce1fb..6588b53 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -1244,8 +1244,9 @@ bool FlatLinearValueConstraints::areVarsAlignedWithOther(
 
 /// Checks if the SSA values associated with `cst`'s variables in range
 /// [start, end) are unique.
-static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique(
-    const FlatLinearValueConstraints &cst, unsigned start, unsigned end) {
+[[maybe_unused]] static bool
+areVarsUnique(const FlatLinearValueConstraints &cst, unsigned start,
+              unsigned end) {
 
   assert(start <= cst.getNumDimAndSymbolVars() &&
          "Start position out of bounds");
@@ -1267,14 +1268,14 @@ static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique(
 }
 
 /// Checks if the SSA values associated with `cst`'s variables are unique.
-static bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] static bool
 areVarsUnique(const FlatLinearValueConstraints &cst) {
   return areVarsUnique(cst, 0, cst.getNumDimAndSymbolVars());
 }
 
 /// Checks if the SSA values associated with `cst`'s variables of kind `kind`
 /// are unique.
-static bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] static bool
 areVarsUnique(const FlatLinearValueConstraints &cst, VarKind kind) {
 
   if (kind == VarKind::SetDim)
diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp
index a1cbe29..547a4c2 100644
--- a/mlir/lib/Analysis/Presburger/Simplex.cpp
+++ b/mlir/lib/Analysis/Presburger/Simplex.cpp
@@ -34,7 +34,7 @@ using Direction = Simplex::Direction;
 const int nullIndex = std::numeric_limits<int>::max();
 
 // Return a + scale*b;
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
 static SmallVector<DynamicAPInt, 8>
 scaleAndAddForAssert(ArrayRef<DynamicAPInt> a, const DynamicAPInt &scale,
                      ArrayRef<DynamicAPInt> b) {
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 7b17106..06d0256 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2730,6 +2730,17 @@ public:
         operation->get(), toMlirStringRef(name)));
   }
 
+  static void
+  forEachAttr(MlirOperation op,
+              llvm::function_ref<void(MlirStringRef, MlirAttribute)> fn) {
+    intptr_t n = mlirOperationGetNumAttributes(op);
+    for (intptr_t i = 0; i < n; ++i) {
+      MlirNamedAttribute na = mlirOperationGetAttribute(op, i);
+      MlirStringRef name = mlirIdentifierStr(na.name);
+      fn(name, na.attribute);
+    }
+  }
+
   static void bind(nb::module_ &m) {
     nb::class_<PyOpAttributeMap>(m, "OpAttributeMap")
         .def("__contains__", &PyOpAttributeMap::dunderContains)
@@ -2737,7 +2748,50 @@ public:
         .def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed)
         .def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed)
         .def("__setitem__", &PyOpAttributeMap::dunderSetItem)
-        .def("__delitem__", &PyOpAttributeMap::dunderDelItem);
+        .def("__delitem__", &PyOpAttributeMap::dunderDelItem)
+        .def("__iter__",
+             [](PyOpAttributeMap &self) {
+               nb::list keys;
+               PyOpAttributeMap::forEachAttr(
+                   self.operation->get(),
+                   [&](MlirStringRef name, MlirAttribute) {
+                     keys.append(nb::str(name.data, name.length));
+                   });
+               return nb::iter(keys);
+             })
+        .def("keys",
+             [](PyOpAttributeMap &self) {
+               nb::list out;
+               PyOpAttributeMap::forEachAttr(
+                   self.operation->get(),
+                   [&](MlirStringRef name, MlirAttribute) {
+                     out.append(nb::str(name.data, name.length));
+                   });
+               return out;
+             })
+        .def("values",
+             [](PyOpAttributeMap &self) {
+               nb::list out;
+               PyOpAttributeMap::forEachAttr(
+                   self.operation->get(),
+                   [&](MlirStringRef, MlirAttribute attr) {
+                     out.append(PyAttribute(self.operation->getContext(), attr)
+                                    .maybeDownCast());
+                   });
+               return out;
+             })
+        .def("items", [](PyOpAttributeMap &self) {
+          nb::list out;
+          PyOpAttributeMap::forEachAttr(
+              self.operation->get(),
+              [&](MlirStringRef name, MlirAttribute attr) {
+                out.append(nb::make_tuple(
+                    nb::str(name.data, name.length),
+                    PyAttribute(self.operation->getContext(), attr)
+                        .maybeDownCast()));
+              });
+          return out;
+        });
   }
 
 private:
diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp
index 5ddb3fb..0f0ed22 100644
--- a/mlir/lib/Bindings/Python/Rewrite.cpp
+++ b/mlir/lib/Bindings/Python/Rewrite.cpp
@@ -205,7 +205,7 @@ public:
       nb::object res = f(opView, PyPatternRewriter(rewriter));
       return logicalResultFromObject(res);
     };
-    MlirRewritePattern pattern = mlirOpRewritePattenCreate(
+    MlirRewritePattern pattern = mlirOpRewritePatternCreate(
         rootName, benefit, ctx, callbacks, matchAndRewrite.ptr(),
         /* nGeneratedNames */ 0,
         /* generatedNames */ nullptr);
diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp
index 46c329d..41ceb15 100644
--- a/mlir/lib/CAPI/Transforms/Rewrite.cpp
+++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp
@@ -341,7 +341,7 @@ private:
 
 } // namespace mlir
 
-MlirRewritePattern mlirOpRewritePattenCreate(
+MlirRewritePattern mlirOpRewritePatternCreate(
     MlirStringRef rootName, unsigned benefit, MlirContext context,
     MlirRewritePatternCallbacks callbacks, void *userData,
     size_t nGeneratedNames, MlirStringRef *generatedNames) {
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index b215211..c03f3a5 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -484,5 +484,5 @@ void mlir::populateGpuToROCDLConversionPatterns(
                GPUSubgroupBroadcastOpToROCDL>(converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
-  populateMathToROCDLConversionPatterns(converter, patterns);
+  populateMathToROCDLConversionPatterns(converter, patterns, chipset);
 }
diff --git a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt
index 2771955a..8cc3fde 100644
--- a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRMathToROCDL
   Core
 
   LINK_LIBS PUBLIC
+  MLIRAMDGPUUtils
   MLIRDialectUtils
   MLIRFuncDialect
   MLIRGPUToGPURuntimeTransforms
diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
index df219f3..a2dfc12 100644
--- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
+++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
@@ -10,6 +10,8 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -19,6 +21,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/DebugLog.h"
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/OpToFuncCallLowering.h"
@@ -42,8 +45,46 @@ static void populateOpPatterns(const LLVMTypeConverter &converter,
                                            f32ApproxFunc, f16Func);
 }
 
+struct ClampFOpConversion final
+    : public ConvertOpToLLVMPattern<math::ClampFOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(math::ClampFOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only f16 and f32 types are supported by fmed3
+    Type opTy = op.getType();
+    Type resultType = getTypeConverter()->convertType(opTy);
+
+    if (auto vectorType = dyn_cast<VectorType>(opTy))
+      opTy = vectorType.getElementType();
+
+    if (!isa<Float16Type, Float32Type>(opTy))
+      return rewriter.notifyMatchFailure(
+          op, "fmed3 only supports f16 and f32 types");
+
+    // Handle multi-dimensional vectors (converted to LLVM arrays)
+    if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(resultType))
+      return LLVM::detail::handleMultidimensionalVectors(
+          op.getOperation(), adaptor.getOperands(), *getTypeConverter(),
+          [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
+            typename math::ClampFOp::Adaptor adaptor(operands);
+            return ROCDL::FMed3Op::create(rewriter, op.getLoc(), llvm1DVectorTy,
+                                          adaptor.getValue(), adaptor.getMin(),
+                                          adaptor.getMax());
+          },
+          rewriter);
+
+    // Handle 1D vectors and scalars directly
+    rewriter.replaceOpWithNewOp<ROCDL::FMed3Op>(op, op.getType(), op.getValue(),
+                                                op.getMin(), op.getMax());
+    return success();
+  }
+};
+
 void mlir::populateMathToROCDLConversionPatterns(
-    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    std::optional<amdgpu::Chipset> chipset) {
   // Handled by mathToLLVM: math::AbsIOp
   // Handled by mathToLLVM: math::AbsFOp
   // Handled by mathToLLVM: math::CopySignOp
@@ -118,15 +159,21 @@ void mlir::populateMathToROCDLConversionPatterns(
   // worth creating a separate pass for it.
   populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32",
                                     "__ocml_fmod_f64", "__ocml_fmod_f16");
+
+  if (chipset.has_value() && chipset->majorVersion >= 9) {
+    patterns.add<ClampFOpConversion>(converter);
+  } else {
+    LDBG() << "Chipset dependent patterns were not added";
+  }
 }
 
-namespace {
-struct ConvertMathToROCDLPass
-    : public impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> {
-  ConvertMathToROCDLPass() = default;
+struct ConvertMathToROCDLPass final
+    : impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> {
+  using impl::ConvertMathToROCDLBase<
+      ConvertMathToROCDLPass>::ConvertMathToROCDLBase;
+
   void runOnOperation() override;
 };
-} // namespace
 
 void ConvertMathToROCDLPass::runOnOperation() {
   auto m = getOperation();
@@ -135,10 +182,21 @@ void ConvertMathToROCDLPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
   LowerToLLVMOptions options(ctx, DataLayout(m));
   LLVMTypeConverter converter(ctx, options);
-  populateMathToROCDLConversionPatterns(converter, patterns);
+
+  FailureOr<amdgpu::Chipset> maybeChipset;
+  if (!chipset.empty()) {
+    maybeChipset = amdgpu::Chipset::parse(chipset);
+    if (failed(maybeChipset))
+      return signalPassFailure();
+  }
+  populateMathToROCDLConversionPatterns(
+      converter, patterns,
+      succeeded(maybeChipset) ? std::optional(*maybeChipset) : std::nullopt);
+
   ConversionTarget target(getContext());
-  target.addLegalDialect<BuiltinDialect, func::FuncDialect,
-                         vector::VectorDialect, LLVM::LLVMDialect>();
+  target
+      .addLegalDialect<BuiltinDialect, func::FuncDialect, vector::VectorDialect,
+                       LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
   target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                       LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                       LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
index f0d8b78..610ce1f 100644
--- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
+++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
@@ -407,11 +407,11 @@ struct PowFOpPattern final : public OpConversionPattern<math::PowFOp> {
     if (auto vectorType = dyn_cast<VectorType>(operandType))
       nanAttr = DenseElementsAttr::get(vectorType, nan);
 
-    Value NanValue =
+    Value nanValue =
         spirv::ConstantOp::create(rewriter, loc, operandType, nanAttr);
     Value lhs =
         spirv::SelectOp::create(rewriter, loc, cmpNegativeWithFractionalExp,
-                                NanValue, adaptor.getLhs());
+                                nanValue, adaptor.getLhs());
     Value abs = spirv::GLFAbsOp::create(rewriter, loc, lhs);
 
     // TODO: The following just forcefully casts y into an integer value in
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 5355909..41d8d53 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1723,17 +1723,18 @@ struct VectorBroadcastScalarToLowRankLowering
       return success();
     }
 
-    // For 1-d vector, we additionally do a `vectorshuffle`.
     auto v =
         LLVM::InsertElementOp::create(rewriter, broadcast.getLoc(), vectorType,
                                       poison, adaptor.getSource(), zero);
 
+    // For 1-d vector, we additionally do a `shufflevector`.
     int64_t width = cast<VectorType>(broadcast.getType()).getDimSize(0);
     SmallVector<int32_t> zeroValues(width, 0);
 
     // Shuffle the value across the desired number of elements.
-    rewriter.replaceOpWithNewOp<LLVM::ShuffleVectorOp>(broadcast, v, poison,
-                                                       zeroValues);
+    auto shuffle = rewriter.createOrFold<LLVM::ShuffleVectorOp>(
+        broadcast.getLoc(), v, poison, zeroValues);
+    rewriter.replaceOp(broadcast, shuffle);
     return success();
   }
 };
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
index 84b2580..dd9edc4 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
@@ -21,6 +21,7 @@ add_mlir_conversion_library(MLIRXeGPUToXeVM
   MLIRIndexDialect
   MLIRSCFDialect
   MLIRXeGPUDialect
+  MLIRXeGPUUtils
   MLIRPass
   MLIRTransforms
   MLIRSCFTransforms
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 71687b1..fcbf66d 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -20,7 +20,9 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
@@ -62,6 +64,7 @@ static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) {
   case xegpu::MemorySpace::SLM:
     return static_cast<int>(xevm::AddrSpace::SHARED);
   }
+  llvm_unreachable("Unknown XeGPU memory space");
 }
 
 // Get same bitwidth flat vector type of new element type.
@@ -185,6 +188,7 @@ class CreateNdDescToXeVMPattern
     int64_t rank = mixedSizes.size();
     if (rank != 2)
       return rewriter.notifyMatchFailure(op, "Expected 2D shape.");
+
     auto sourceTy = source.getType();
     auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
     // If source is a memref, we need to extract the aligned pointer as index.
@@ -363,10 +367,11 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
 
 // Add a builder that creates
 // offset * elemByteSize + baseAddr
-static Value addOffset(ConversionPatternRewriter &rewriter, Location loc,
-                       Value baseAddr, Value offset, int64_t elemByteSize) {
+static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter,
+                                 Location loc, Value baseAddr, Value offset,
+                                 int64_t elemByteSize) {
   Value byteSize = arith::ConstantIntOp::create(
-      rewriter, loc, rewriter.getI64Type(), elemByteSize);
+      rewriter, loc, baseAddr.getType(), elemByteSize);
   Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize);
   Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset);
   return newAddr;
@@ -390,7 +395,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
     // Load result or Store valye Type can be vector or scalar.
     Type valOrResTy;
     if constexpr (std::is_same_v<OpType, xegpu::LoadGatherOp>)
-      valOrResTy = op.getResult().getType();
+      valOrResTy =
+          this->getTypeConverter()->convertType(op.getResult().getType());
     else
       valOrResTy = adaptor.getValue().getType();
     VectorType valOrResVecTy = dyn_cast<VectorType>(valOrResTy);
@@ -441,7 +447,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
       // If offset is provided, we add them to the base pointer.
       // Offset is in number of elements, we need to multiply by
       // element byte size.
-      basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize);
+      basePtrI64 =
+          addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize);
     }
     // Convert base pointer (i64) to LLVM pointer type.
     Value basePtrLLVM =
@@ -504,6 +511,147 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
   }
 };
 
+// Lower xegpu::CreateMemDescOp to memref::ViewOp. Since SLM access instructions
+// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than
+// 32 bits will be converted to 32 bits.
+class CreateMemDescOpPattern final
+    : public OpConversionPattern<xegpu::CreateMemDescOp> {
+public:
+  using OpConversionPattern<xegpu::CreateMemDescOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    auto resTy = op.getMemDesc();
+
+    // Create the result MemRefType with the same shape, element type, and
+    // memory space
+    auto newResTy = getTypeConverter()->convertType<MemRefType>(resTy);
+
+    Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0);
+    auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy,
+                                         op.getSource(), zero, ValueRange());
+    rewriter.replaceOp(op, viewOp);
+    return success();
+  }
+};
+
+template <typename OpType,
+          typename = std::enable_if_t<llvm::is_one_of<
+              OpType, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>>
+class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
+  using OpConversionPattern<OpType>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    SmallVector<OpFoldResult> offsets = op.getMixedOffsets();
+    if (offsets.empty())
+      return rewriter.notifyMatchFailure(op, "Expected offset to be provided.");
+
+    auto loc = op.getLoc();
+    auto ctxt = rewriter.getContext();
+    Value basePtrStruct = adaptor.getMemDesc();
+    Value mdescVal = op.getMemDesc();
+    // Load result or Store value Type can be vector or scalar.
+    Value data;
+    if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>)
+      data = op.getResult();
+    else
+      data = adaptor.getData();
+    VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType());
+    if (!valOrResVecTy)
+      valOrResVecTy = VectorType::get(1, data.getType());
+
+    int64_t elemBitWidth =
+        valOrResVecTy.getElementType().getIntOrFloatBitWidth();
+    // Element type must be multiple of 8 bits.
+    if (elemBitWidth % 8 != 0)
+      return rewriter.notifyMatchFailure(
+          op, "Expected element type bit width to be multiple of 8.");
+    int64_t elemByteSize = elemBitWidth / 8;
+
+    // Default memory space is SLM.
+    LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get(
+        ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::SLM));
+
+    auto mdescTy = cast<xegpu::MemDescType>(mdescVal.getType());
+
+    Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create(
+        rewriter, loc, basePtrStruct);
+
+    // Convert base pointer (ptr) to i32
+    Value basePtrI32 = arith::IndexCastUIOp::create(
+        rewriter, loc, rewriter.getI32Type(), basePtrLLVM);
+
+    Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets);
+    linearOffset = arith::IndexCastUIOp::create(
+        rewriter, loc, rewriter.getI32Type(), linearOffset);
+    basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset,
+                                     elemByteSize);
+
+    // convert base pointer (i32) to LLVM pointer type
+    basePtrLLVM =
+        LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32);
+
+    if (op.getSubgroupBlockIoAttr()) {
+      // if the attribute 'subgroup_block_io' is set to true, it lowers to
+      // xevm.blockload
+
+      Type intElemTy = rewriter.getIntegerType(elemBitWidth);
+      VectorType intVecTy =
+          VectorType::get(valOrResVecTy.getShape(), intElemTy);
+
+      if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+        Value loadOp =
+            xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM);
+        if (intVecTy != valOrResVecTy) {
+          loadOp =
+              vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp);
+        }
+        rewriter.replaceOp(op, loadOp);
+      } else {
+        Value dataToStore = adaptor.getData();
+        if (valOrResVecTy != intVecTy) {
+          dataToStore =
+              vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore);
+        }
+        xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore,
+                                   nullptr);
+        rewriter.eraseOp(op);
+      }
+      return success();
+    }
+
+    if (valOrResVecTy.getNumElements() >= 1) {
+      auto chipOpt = xegpu::getChipStr(op);
+      if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) {
+        // the lowering for chunk load only works for pvc and bmg
+        return rewriter.notifyMatchFailure(
+            op, "The lowering is specific to pvc or bmg.");
+      }
+    }
+
+    if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+      // if the size of valOrResVecTy is 1, it lowers to a scalar load/store
+      // operation. LLVM load/store does not support vector of size 1, so we
+      // need to handle this case separately.
+      auto scalarTy = valOrResVecTy.getElementType();
+      LLVM::LoadOp loadOp;
+      if (valOrResVecTy.getNumElements() == 1)
+        loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM);
+      else
+        loadOp =
+            LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM);
+      rewriter.replaceOp(op, loadOp);
+    } else {
+      LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
+      rewriter.eraseOp(op);
+    }
+    return success();
+  }
+};
+
 class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -546,8 +694,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
                 op, "Expected element type bit width to be multiple of 8.");
           elemByteSize = elemBitWidth / 8;
         }
-        basePtrI64 =
-            addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize);
+        basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets,
+                                         elemByteSize);
       }
     }
     // Default memory space is global.
@@ -784,6 +932,13 @@ struct ConvertXeGPUToXeVMPass
       auto i32Type = IntegerType::get(&getContext(), 32);
       return VectorType::get(8, i32Type);
     });
+    // Convert MemDescType into flattened MemRefType for SLM
+    typeConverter.addConversion([&](xegpu::MemDescType type) -> Type {
+      Type elemTy = type.getElementType();
+      int numElems = type.getNumElements();
+      return MemRefType::get(numElems, elemTy, AffineMap(), 3);
+    });
+
     typeConverter.addConversion([&](MemRefType type) -> Type {
       // Convert MemRefType to i64 type.
       return IntegerType::get(&getContext(), 64);
@@ -878,10 +1033,30 @@ struct ConvertXeGPUToXeVMPass
       }
       return {};
     };
-    typeConverter.addSourceMaterialization(memrefMaterializationCast);
-    typeConverter.addSourceMaterialization(ui64MaterializationCast);
-    typeConverter.addSourceMaterialization(ui32MaterializationCast);
-    typeConverter.addSourceMaterialization(vectorMaterializationCast);
+
+    // If result type of original op is single element vector and lowered type
+    // is scalar. This materialization cast creates a single element vector by
+    // broadcasting the scalar value.
+    auto singleElementVectorMaterializationCast =
+        [](OpBuilder &builder, Type type, ValueRange inputs,
+           Location loc) -> Value {
+      if (inputs.size() != 1)
+        return {};
+      auto input = inputs.front();
+      if (input.getType().isIntOrIndexOrFloat()) {
+        // If the input is a scalar, and the target type is a vector of single
+        // element, create a single element vector by broadcasting.
+        if (auto vecTy = dyn_cast<VectorType>(type)) {
+          if (vecTy.getNumElements() == 1) {
+            return vector::BroadcastOp::create(builder, loc, vecTy, input)
+                .getResult();
+          }
+        }
+      }
+      return {};
+    };
+    typeConverter.addSourceMaterialization(
+        singleElementVectorMaterializationCast);
     typeConverter.addTargetMaterialization(memrefMaterializationCast);
     typeConverter.addTargetMaterialization(ui32MaterializationCast);
     typeConverter.addTargetMaterialization(ui64MaterializationCast);
@@ -918,6 +1093,9 @@ void mlir::populateXeGPUToXeVMConversionPatterns(
                LoadStoreToXeVMPattern<xegpu::LoadGatherOp>,
                LoadStoreToXeVMPattern<xegpu::StoreScatterOp>>(
       typeConverter, patterns.getContext());
+  patterns.add<LoadStoreMatrixToXeVMPattern<xegpu::LoadMatrixOp>,
+               LoadStoreMatrixToXeVMPattern<xegpu::StoreMatrixOp>,
+               CreateMemDescOpPattern>(typeConverter, patterns.getContext());
   patterns.add<FenceToXeVMPattern, DpasToXeVMPattern>(typeConverter,
                                                       patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index f405d0c..c798adb 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -757,13 +757,13 @@ struct PackScales final : OpRewritePattern<ScaledMFMAOp> {
         offset = numElements - 4l;
       }
       Type scaleSrcElemType = scaleSrcType.getElementType();
-      auto newSrcType = VectorType::get(SmallVector<int64_t>({numElements}),
-                                        scaleSrcElemType);
+      auto newSrcType =
+          VectorType::get(ArrayRef{numElements}, scaleSrcElemType);
       Value newScaleSrc =
           vector::ShapeCastOp::create(rewriter, loc, newSrcType, scaleSrc);
       auto extract = vector::ExtractStridedSliceOp::create(
-          rewriter, loc, newScaleSrc, ArrayRef<int64_t>{offset},
-          ArrayRef<int64_t>{size}, ArrayRef<int64_t>{1});
+          rewriter, loc, newScaleSrc, ArrayRef{offset}, ArrayRef{size},
+          ArrayRef{int64_t(1)});
       rewriter.modifyOpInPlace(op, [&] {
         op->setOperand(opIdx, extract);
         setOpsel(opIdx, opsel);
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 7e5ce26..749e2ba 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -125,9 +125,9 @@ static bool remainsLegalAfterInline(OpTy op, Region *src, Region *dest,
 //  Use "unused attribute" marker to silence clang-tidy warning stemming from
 //  the inability to see through "llvm::TypeSwitch".
 template <>
-bool LLVM_ATTRIBUTE_UNUSED remainsLegalAfterInline(AffineApplyOp op,
-                                                   Region *src, Region *dest,
-                                                   const IRMapping &mapping) {
+[[maybe_unused]] bool remainsLegalAfterInline(AffineApplyOp op, Region *src,
+                                              Region *dest,
+                                              const IRMapping &mapping) {
   // If it's a valid dimension, we need to check that it remains so.
   if (isValidDim(op.getResult(), src))
     return remainsLegalAfterInline(
@@ -1032,8 +1032,8 @@ static void simplifyMinOrMaxExprWithOperands(AffineMap &map,
 /// Simplify the map while exploiting information on the values in `operands`.
 //  Use "unused attribute" marker to silence warning stemming from the inability
 //  to see through the template expansion.
-static void LLVM_ATTRIBUTE_UNUSED
-simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
+[[maybe_unused]] static void simplifyMapWithOperands(AffineMap &map,
+                                                     ArrayRef<Value> operands) {
   assert(map.getNumInputs() == operands.size() && "invalid operands for map");
   SmallVector<AffineExpr> newResults;
   newResults.reserve(map.getNumResults());
@@ -1125,6 +1125,141 @@ static LogicalResult replaceAffineMinBoundingBoxExpression(AffineMinOp minOp,
   return success(*map != initialMap);
 }
 
+/// Recursively traverse `e`. If `e` or one of its sub-expressions has the form
+/// e1 + e2 + ... + eK, where the e_i are a super(multi)set of `exprsToRemove`,
+/// place a map between e and `newVal` + sum({e1, e2, .. eK} - exprsToRemove)
+/// into `replacementsMap`. If no entries were added to `replacementsMap`,
+/// nothing was found.
+static void shortenAddChainsContainingAll(
+    AffineExpr e, const llvm::SmallDenseSet<AffineExpr, 4> &exprsToRemove,
+    AffineExpr newVal, DenseMap<AffineExpr, AffineExpr> &replacementsMap) {
+  auto binOp = dyn_cast<AffineBinaryOpExpr>(e);
+  if (!binOp)
+    return;
+  AffineExpr lhs = binOp.getLHS();
+  AffineExpr rhs = binOp.getRHS();
+  if (binOp.getKind() != AffineExprKind::Add) {
+    shortenAddChainsContainingAll(lhs, exprsToRemove, newVal, replacementsMap);
+    shortenAddChainsContainingAll(rhs, exprsToRemove, newVal, replacementsMap);
+    return;
+  }
+  SmallVector<AffineExpr> toPreserve;
+  llvm::SmallDenseSet<AffineExpr, 4> ourTracker(exprsToRemove);
+  AffineExpr thisTerm = rhs;
+  AffineExpr nextTerm = lhs;
+
+  while (thisTerm) {
+    if (!ourTracker.erase(thisTerm)) {
+      toPreserve.push_back(thisTerm);
+      shortenAddChainsContainingAll(thisTerm, exprsToRemove, newVal,
+                                    replacementsMap);
+    }
+    auto nextBinOp = dyn_cast_if_present<AffineBinaryOpExpr>(nextTerm);
+    if (!nextBinOp || nextBinOp.getKind() != AffineExprKind::Add) {
+      thisTerm = nextTerm;
+      nextTerm = AffineExpr();
+    } else {
+      thisTerm = nextBinOp.getRHS();
+      nextTerm = nextBinOp.getLHS();
+    }
+  }
+  if (!ourTracker.empty())
+    return;
+  // We reverse the terms to be preserved here in order to preserve
+  // associativity between them.
+  AffineExpr newExpr = newVal;
+  for (AffineExpr preserved : llvm::reverse(toPreserve))
+    newExpr = newExpr + preserved;
+  replacementsMap.insert({e, newExpr});
+}
+
+/// If this map contains of the expression `x_1 + x_1 * C_1 + ... x_n * C_N +
+/// ...` (not necessarily in order) where the set of the `x_i` is the set of
+/// outputs of an `affine.delinearize_index` whos inverse is that expression,
+/// replace that expression with the input of that delinearize_index op.
+///
+/// `unitDimInput` is the input that was detected as the potential start to this
+/// replacement chain - if it isn't the rightmost result of the delinearization,
+/// this method fails. (This is intended to ensure we don't have redundant scans
+/// over the same expression).
+///
+/// While this currently only handles delinearizations with a constant basis,
+/// that isn't a fundamental limitation.
+///
+/// This is a utility function for `replaceDimOrSym` below.
+static LogicalResult replaceAffineDelinearizeIndexInverseExpression(
+    AffineDelinearizeIndexOp delinOp, Value resultToReplace, AffineMap *map,
+    SmallVectorImpl<Value> &dims, SmallVectorImpl<Value> &syms) {
+  if (!delinOp.getDynamicBasis().empty())
+    return failure();
+  if (resultToReplace != delinOp.getMultiIndex().back())
+    return failure();
+
+  MLIRContext *ctx = delinOp.getContext();
+  SmallVector<AffineExpr> resToExpr(delinOp.getNumResults(), AffineExpr());
+  for (auto [pos, dim] : llvm::enumerate(dims)) {
+    auto asResult = dyn_cast_if_present<OpResult>(dim);
+    if (!asResult)
+      continue;
+    if (asResult.getOwner() == delinOp.getOperation())
+      resToExpr[asResult.getResultNumber()] = getAffineDimExpr(pos, ctx);
+  }
+  for (auto [pos, sym] : llvm::enumerate(syms)) {
+    auto asResult = dyn_cast_if_present<OpResult>(sym);
+    if (!asResult)
+      continue;
+    if (asResult.getOwner() == delinOp.getOperation())
+      resToExpr[asResult.getResultNumber()] = getAffineSymbolExpr(pos, ctx);
+  }
+  if (llvm::is_contained(resToExpr, AffineExpr()))
+    return failure();
+
+  bool isDimReplacement = llvm::all_of(resToExpr, llvm::IsaPred<AffineDimExpr>);
+  int64_t stride = 1;
+  llvm::SmallDenseSet<AffineExpr, 4> expectedExprs;
+  // This isn't zip_equal since sometimes the delinearize basis is missing a
+  // size for the first result.
+  for (auto [binding, size] : llvm::zip(
+           llvm::reverse(resToExpr), llvm::reverse(delinOp.getStaticBasis()))) {
+    expectedExprs.insert(binding * getAffineConstantExpr(stride, ctx));
+    stride *= size;
+  }
+  if (resToExpr.size() != delinOp.getStaticBasis().size())
+    expectedExprs.insert(resToExpr[0] * stride);
+
+  DenseMap<AffineExpr, AffineExpr> replacements;
+  AffineExpr delinInExpr = isDimReplacement
+                               ? getAffineDimExpr(dims.size(), ctx)
+                               : getAffineSymbolExpr(syms.size(), ctx);
+
+  for (AffineExpr e : map->getResults())
+    shortenAddChainsContainingAll(e, expectedExprs, delinInExpr, replacements);
+  if (replacements.empty())
+    return failure();
+
+  AffineMap origMap = *map;
+  if (isDimReplacement)
+    dims.push_back(delinOp.getLinearIndex());
+  else
+    syms.push_back(delinOp.getLinearIndex());
+  *map = origMap.replace(replacements, dims.size(), syms.size());
+
+  // Blank out dead dimensions and symbols
+  for (AffineExpr e : resToExpr) {
+    if (auto d = dyn_cast<AffineDimExpr>(e)) {
+      unsigned pos = d.getPosition();
+      if (!map->isFunctionOfDim(pos))
+        dims[pos] = nullptr;
+    }
+    if (auto s = dyn_cast<AffineSymbolExpr>(e)) {
+      unsigned pos = s.getPosition();
+      if (!map->isFunctionOfSymbol(pos))
+        syms[pos] = nullptr;
+    }
+  }
+  return success();
+}
+
 /// Replace all occurrences of AffineExpr at position `pos` in `map` by the
 /// defining AffineApplyOp expression and operands.
 /// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
@@ -1157,6 +1292,11 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
                                                  syms);
   }
 
+  if (auto delinOp = v.getDefiningOp<affine::AffineDelinearizeIndexOp>()) {
+    return replaceAffineDelinearizeIndexInverseExpression(delinOp, v, map, dims,
+                                                          syms);
+  }
+
   auto affineApply = v.getDefiningOp<AffineApplyOp>();
   if (!affineApply)
     return failure();
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index cd216ef..4743941 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -1357,7 +1357,7 @@ bool mlir::affine::isValidLoopInterchangePermutation(
 
 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
 /// in it from outermost to innermost.
-bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] bool
 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
   assert(!loops.empty() && "no loops provided");
 
@@ -1920,8 +1920,7 @@ generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
   return copyNestRoot;
 }
 
-static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
-emitRemarkForBlock(Block &block) {
+[[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
   return block.getParentOp()->emitRemark();
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
index 70faa71..bc17990 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp
@@ -41,18 +41,37 @@ namespace bufferization {
 
 using namespace mlir;
 
-/// Return the unique ReturnOp that terminates `funcOp`.
-/// Return nullptr if there is no such unique ReturnOp.
-static func::ReturnOp getAssumedUniqueReturnOp(func::FuncOp funcOp) {
-  func::ReturnOp returnOp;
+/// Get all the ReturnOp in the funcOp.
+static SmallVector<func::ReturnOp> getReturnOps(func::FuncOp funcOp) {
+  SmallVector<func::ReturnOp> returnOps;
   for (Block &b : funcOp.getBody()) {
     if (auto candidateOp = dyn_cast<func::ReturnOp>(b.getTerminator())) {
-      if (returnOp)
-        return nullptr;
-      returnOp = candidateOp;
+      returnOps.push_back(candidateOp);
     }
   }
-  return returnOp;
+  return returnOps;
+}
+
+/// Get the operands at the specified position for all returnOps.
+static SmallVector<Value>
+getReturnOpsOperandInPos(ArrayRef<func::ReturnOp> returnOps, size_t pos) {
+  return llvm::map_to_vector(returnOps, [&](func::ReturnOp returnOp) {
+    return returnOp.getOperand(pos);
+  });
+}
+
+/// Check if all given values are the same buffer as the block argument (modulo
+/// cast ops).
+static bool operandsEqualFuncArgument(ArrayRef<Value> operands,
+                                      BlockArgument argument) {
+  for (Value val : operands) {
+    while (auto castOp = val.getDefiningOp<memref::CastOp>())
+      val = castOp.getSource();
+
+    if (val != argument)
+      return false;
+  }
+  return true;
 }
 
 LogicalResult
@@ -72,40 +91,45 @@ mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) {
   for (auto funcOp : module.getOps<func::FuncOp>()) {
     if (funcOp.isExternal() || funcOp.isPublic())
       continue;
-    func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp);
-    // TODO: Support functions with multiple blocks.
-    if (!returnOp)
+    SmallVector<func::ReturnOp> returnOps = getReturnOps(funcOp);
+    if (returnOps.empty())
       continue;
 
     // Compute erased results.
-    SmallVector<Value> newReturnValues;
-    BitVector erasedResultIndices(funcOp.getFunctionType().getNumResults());
+    size_t numReturnOps = returnOps.size();
+    size_t numReturnValues = funcOp.getFunctionType().getNumResults();
+    SmallVector<SmallVector<Value>> newReturnValues(numReturnOps);
+    BitVector erasedResultIndices(numReturnValues);
     DenseMap<int64_t, int64_t> resultToArgs;
-    for (const auto &it : llvm::enumerate(returnOp.getOperands())) {
+    for (size_t i = 0; i < numReturnValues; ++i) {
       bool erased = false;
+      SmallVector<Value> returnOperands =
+          getReturnOpsOperandInPos(returnOps, i);
       for (BlockArgument bbArg : funcOp.getArguments()) {
-        Value val = it.value();
-        while (auto castOp = val.getDefiningOp<memref::CastOp>())
-          val = castOp.getSource();
-
-        if (val == bbArg) {
-          resultToArgs[it.index()] = bbArg.getArgNumber();
+        if (operandsEqualFuncArgument(returnOperands, bbArg)) {
+          resultToArgs[i] = bbArg.getArgNumber();
           erased = true;
           break;
         }
       }
 
       if (erased) {
-        erasedResultIndices.set(it.index());
+        erasedResultIndices.set(i);
       } else {
-        newReturnValues.push_back(it.value());
+        for (auto [newReturnValue, operand] :
+             llvm::zip(newReturnValues, returnOperands)) {
+          newReturnValue.push_back(operand);
+        }
       }
     }
 
     // Update function.
     if (failed(funcOp.eraseResults(erasedResultIndices)))
       return failure();
-    returnOp.getOperandsMutable().assign(newReturnValues);
+
+    for (auto [returnOp, newReturnValue] :
+         llvm::zip(returnOps, newReturnValues))
+      returnOp.getOperandsMutable().assign(newReturnValue);
 
     // Update function calls.
     for (func::CallOp callOp : callerMap[funcOp]) {
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 7ca09d9..3eae67f 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -2826,6 +2826,20 @@ LogicalResult ShuffleVectorOp::verify() {
   return success();
 }
 
+// Folding for shufflevector op when v1 is single element 1D vector
+// and the mask is a single zero. OpFoldResult will be v1 in this case.
+OpFoldResult ShuffleVectorOp::fold(FoldAdaptor adaptor) {
+  // Check if operand 0 is a single element vector.
+  auto vecType = llvm::dyn_cast<VectorType>(getV1().getType());
+  if (!vecType || vecType.getRank() != 1 || vecType.getNumElements() != 1)
+    return {};
+  // Check if the mask is a single zero.
+  // Note: The mask is guaranteed to be non-empty.
+  if (getMask().size() != 1 || getMask()[0] != 0)
+    return {};
+  return getV1();
+}
+
 //===----------------------------------------------------------------------===//
 // Implementations for LLVM::LLVMFuncOp.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index 01a16ce..ac35eea 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -134,10 +134,10 @@ static void printExtTypeParams(AsmPrinter &p, ArrayRef<Type> typeParams,
 
 /// These are unused for now.
 /// TODO: Move over to these once more types have been migrated to TypeDef.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
 generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+                                                           AsmPrinter &printer);
 
 #include "mlir/Dialect/LLVMIR/LLVMTypeInterfaces.cpp.inc"
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 55fe0d9..2a8c330 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -798,6 +798,26 @@ LogicalResult MmaOp::verify() {
                          " attribute");
   }
 
+  // Validate layout combinations. According to the operation description, most
+  // MMA operations require layoutA=row and layoutB=col. Only m8n8k4 with f16
+  // can use other layout combinations.
+  bool isM8N8K4_F16 =
+      (mmaShape[0] == 8 && mmaShape[1] == 8 && mmaShape[2] == 4 &&
+       getMultiplicandAPtxType() == MMATypes::f16);
+
+  if (!isM8N8K4_F16) {
+    // For all other shapes/types, layoutA must be row and layoutB must be col
+    if (getLayoutA() != MMALayout::row || getLayoutB() != MMALayout::col) {
+      return emitOpError("requires layoutA = #nvvm.mma_layout<row> and "
+                         "layoutB = #nvvm.mma_layout<col> for shape <")
+             << mmaShape[0] << ", " << mmaShape[1] << ", " << mmaShape[2]
+             << "> with element types "
+             << stringifyEnum(*getMultiplicandAPtxType()) << " and "
+             << stringifyEnum(*getMultiplicandBPtxType())
+             << ". Only m8n8k4 with f16 supports other layouts.";
+    }
+  }
+
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index d8f983f..6192d79 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -3024,10 +3024,10 @@ ParseResult SplitOp::parse(OpAsmParser &parser, OperationState &result) {
     return failure();
   }
   if (dynamicPointParseResult.has_value()) {
-    Type ChunkSizesType;
+    Type chunkSizesType;
     if (failed(*dynamicPointParseResult) || parser.parseComma() ||
-        parser.parseType(ChunkSizesType) ||
-        parser.resolveOperand(dynamicChunkSizes, ChunkSizesType,
+        parser.parseType(chunkSizesType) ||
+        parser.resolveOperand(dynamicChunkSizes, chunkSizesType,
                               result.operands)) {
       return failure();
     }
@@ -3399,9 +3399,9 @@ void transform::ContinuousTileSizesOp::getEffects(
 }
 
 static void printContinuousTileSizeTypes(OpAsmPrinter &printer, Operation *op,
-                                         Type targetType, Type tile_sizes,
+                                         Type targetType, Type tileSizes,
                                          Type) {
-  printer.printFunctionalType(TypeRange{targetType}, TypeRange{tile_sizes});
+  printer.printFunctionalType(TypeRange{targetType}, TypeRange{tileSizes});
 }
 
 static ParseResult parseContinuousTileSizeTypes(OpAsmParser &parser,
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 507597b..94947b7 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2158,11 +2158,45 @@ public:
     return success();
   }
 };
+
+struct ReinterpretCastOpConstantFolder
+    : public OpRewritePattern<ReinterpretCastOp> {
+public:
+  using OpRewritePattern<ReinterpretCastOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ReinterpretCastOp op,
+                                PatternRewriter &rewriter) const override {
+    unsigned srcStaticCount = llvm::count_if(
+        llvm::concat<OpFoldResult>(op.getMixedOffsets(), op.getMixedSizes(),
+                                   op.getMixedStrides()),
+        [](OpFoldResult ofr) { return isa<Attribute>(ofr); });
+
+    SmallVector<OpFoldResult> offsets = {op.getConstifiedMixedOffset()};
+    SmallVector<OpFoldResult> sizes = op.getConstifiedMixedSizes();
+    SmallVector<OpFoldResult> strides = op.getConstifiedMixedStrides();
+
+    // TODO: Using counting comparison instead of direct comparison because
+    // getMixedValues (and therefore ReinterpretCastOp::getMixed...) returns
+    // IntegerAttrs, while constifyIndexValues (and therefore
+    // ReinterpretCastOp::getConstifiedMixed...) returns IndexAttrs.
+    if (srcStaticCount ==
+        llvm::count_if(llvm::concat<OpFoldResult>(offsets, sizes, strides),
+                       [](OpFoldResult ofr) { return isa<Attribute>(ofr); }))
+      return failure();
+
+    auto newReinterpretCast = ReinterpretCastOp::create(
+        rewriter, op->getLoc(), op.getSource(), offsets[0], sizes, strides);
+
+    rewriter.replaceOpWithNewOp<CastOp>(op, op.getType(), newReinterpretCast);
+    return success();
+  }
+};
 } // namespace
 
 void ReinterpretCastOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                     MLIRContext *context) {
-  results.add<ReinterpretCastOpExtractStridedMetadataFolder>(context);
+  results.add<ReinterpretCastOpExtractStridedMetadataFolder,
+              ReinterpretCastOpConstantFolder>(context);
 }
 
 FailureOr<std::optional<SmallVector<Value>>>
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
index 49b7162..6f815ae 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
@@ -121,7 +121,7 @@ struct EmulateWideIntPass final
         [&typeConverter](Operation *op) { return typeConverter.isLegal(op); });
 
     RewritePatternSet patterns(ctx);
-    // Add common pattenrs to support contants, functions, etc.
+    // Add common patterns to support contants, functions, etc.
     arith::populateArithWideIntEmulationPatterns(typeConverter, patterns);
 
     memref::populateMemRefWideIntEmulationPatterns(typeConverter, patterns);
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 642ced9..90cbbd8 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -40,6 +40,16 @@ static bool isScalarLikeType(Type type) {
   return type.isIntOrIndexOrFloat() || isa<ComplexType>(type);
 }
 
+/// Helper function to attach the `VarName` attribute to an operation
+/// if a variable name is provided.
+static void attachVarNameAttr(Operation *op, OpBuilder &builder,
+                              StringRef varName) {
+  if (!varName.empty()) {
+    auto varNameAttr = acc::VarNameAttr::get(builder.getContext(), varName);
+    op->setAttr(acc::getVarNameAttrName(), varNameAttr);
+  }
+}
+
 struct MemRefPointerLikeModel
     : public PointerLikeType::ExternalModel<MemRefPointerLikeModel,
                                             MemRefType> {
@@ -83,7 +93,9 @@ struct MemRefPointerLikeModel
     // then we can generate an alloca operation.
     if (memrefTy.hasStaticShape()) {
       needsFree = false; // alloca doesn't need deallocation
-      return memref::AllocaOp::create(builder, loc, memrefTy).getResult();
+      auto allocaOp = memref::AllocaOp::create(builder, loc, memrefTy);
+      attachVarNameAttr(allocaOp, builder, varName);
+      return allocaOp.getResult();
     }
 
     // For dynamic memrefs, extract sizes from the original variable if
@@ -103,8 +115,10 @@ struct MemRefPointerLikeModel
         // Static dimensions are handled automatically by AllocOp
       }
       needsFree = true; // alloc needs deallocation
-      return memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes)
-          .getResult();
+      auto allocOp =
+          memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes);
+      attachVarNameAttr(allocOp, builder, varName);
+      return allocOp.getResult();
     }
 
     // TODO: Unranked not yet supported.
diff --git a/mlir/lib/Dialect/Shard/IR/ShardOps.cpp b/mlir/lib/Dialect/Shard/IR/ShardOps.cpp
index 135c033..645cbff 100644
--- a/mlir/lib/Dialect/Shard/IR/ShardOps.cpp
+++ b/mlir/lib/Dialect/Shard/IR/ShardOps.cpp
@@ -158,7 +158,7 @@ static FailureOr<GridOp> getGridAndVerify(Operation *op,
 }
 
 template <typename It>
-bool isUnique(It begin, It end) {
+static bool isUnique(It begin, It end) {
   if (begin == end) {
     return true;
   }
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
index a1711a6..069191c 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
@@ -143,8 +143,8 @@ void VarInfo::setNum(Var::Num n) {
 
 /// Helper function for `assertUsageConsistency` to better handle SMLoc
 /// mismatches.
-LLVM_ATTRIBUTE_UNUSED static llvm::SMLoc
-minSMLoc(AsmParser &parser, llvm::SMLoc sm1, llvm::SMLoc sm2) {
+[[maybe_unused]] static llvm::SMLoc minSMLoc(AsmParser &parser, llvm::SMLoc sm1,
+                                             llvm::SMLoc sm2) {
   const auto loc1 = dyn_cast<FileLineColLoc>(parser.getEncodedSourceLoc(sm1));
   assert(loc1 && "Could not get `FileLineColLoc` for first `SMLoc`");
   const auto loc2 = dyn_cast<FileLineColLoc>(parser.getEncodedSourceLoc(sm2));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index f539502..684c088 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -43,8 +43,8 @@ using namespace mlir::sparse_tensor;
 //===----------------------------------------------------------------------===//
 
 #ifndef NDEBUG
-LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder,
-                                                  Location loc, Value memref) {
+[[maybe_unused]] static void dumpIndexMemRef(OpBuilder &builder, Location loc,
+                                             Value memref) {
   memref = memref::CastOp::create(
       builder, loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref);
   createFuncCall(builder, loc, "printMemrefInd", TypeRange{},
diff --git a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
index 5aad671..1cba1bb 100644
--- a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Tosa/IR/TargetEnv.h"
+#include "llvm/Support/FormatVariadic.h"
 
 namespace mlir {
 namespace tosa {
@@ -27,7 +28,7 @@ TargetEnvAttr lookupTargetEnv(Operation *op) {
 }
 
 TargetEnvAttr getDefaultTargetEnv(MLIRContext *context) {
-  return TargetEnvAttr::get(context, Level::eightK,
+  return TargetEnvAttr::get(context, SpecificationVersion::V_1_0, Level::eightK,
                             {Profile::pro_int, Profile::pro_fp}, {});
 }
 
@@ -38,5 +39,9 @@ TargetEnvAttr lookupTargetEnvOrDefault(Operation *op) {
   return getDefaultTargetEnv(op->getContext());
 }
 
+llvm::SmallString<4> stringifyVersion(TosaSpecificationVersion version) {
+  return llvm::formatv("{0}.{1}", version.getMajor(), version.getMinor());
+}
+
 } // namespace tosa
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
index bcb880a..a0661e4 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
@@ -61,8 +61,8 @@ public:
 
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
-    const auto targetEnvAttr =
-        TargetEnvAttr::get(ctx, level, selectedProfiles, selectedExtensions);
+    const auto targetEnvAttr = TargetEnvAttr::get(
+        ctx, specificationVersion, level, selectedProfiles, selectedExtensions);
     mod->setAttr(TargetEnvAttr::name, targetEnvAttr);
   }
 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
index 20f9333..f072e3e 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
@@ -335,16 +335,15 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
 //===----------------------------------------------------------------------===//
 
 template <typename T>
-FailureOr<SmallVector<T>>
-TosaProfileCompliance::getOperatorDefinition(Operation *op,
-                                             CheckCondition &condition) {
+FailureOr<OpComplianceInfo<T>>
+TosaProfileCompliance::getOperatorDefinition(Operation *op) {
   const std::string opName = op->getName().getStringRef().str();
   const auto complianceMap = getProfileComplianceMap<T>();
   const auto it = complianceMap.find(opName);
   if (it == complianceMap.end())
     return {};
 
-  return findMatchedProfile<T>(op, it->second, condition);
+  return findMatchedEntry<T>(op, it->second);
 }
 
 template <typename T>
@@ -356,22 +355,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
   if (specRequiredModeSet.size() == 0)
     return success();
 
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeOpRequiredMode = getOperatorDefinition<T>(op, condition);
-  if (failed(maybeOpRequiredMode)) {
+  const auto maybeOpDefinition = getOperatorDefinition<T>(op);
+  if (failed(maybeOpDefinition)) {
     // Operators such as control-flow and shape ops do not have an operand type
     // restriction. When the profile compliance information of operation is not
     // found, confirm if the target have enabled the profile required from the
     // specification.
-    int mode_count = 0;
+    int modeCount = 0;
     for (const auto &cands : specRequiredModeSet) {
       if (targetEnv.allowsAnyOf(cands))
         return success();
-      mode_count += cands.size();
+      modeCount += cands.size();
     }
 
     op->emitOpError() << "illegal: requires"
-                      << (mode_count > 1 ? " any of " : " ") << "["
+                      << (modeCount > 1 ? " any of " : " ") << "["
                       << llvm::join(stringifyProfile<T>(specRequiredModeSet),
                                     ", ")
                       << "] but not enabled in target\n";
@@ -381,7 +379,10 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
 
   // Find the required profiles or extensions according to the operand type
   // combination.
-  const auto opRequiredMode = maybeOpRequiredMode.value();
+  const auto opDefinition = maybeOpDefinition.value();
+  const SmallVector<T> opRequiredMode = opDefinition.mode;
+  const CheckCondition condition = opDefinition.condition;
+
   if (opRequiredMode.size() == 0) {
     // No matched restriction found.
     return success();
@@ -437,6 +438,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
     }
   }
 
+  // Ensure the matched op compliance version does not exceed the target
+  // specification version.
+  const VersionedTypeInfo versionedTypeInfo =
+      opDefinition.operandTypeInfoSet[0];
+  const TosaSpecificationVersion complianceVersion{versionedTypeInfo.second};
+  const TosaSpecificationVersion targetVersion{targetEnv.getSpecVersion()};
+  if (!targetVersion.isBackwardsCompatibleWith(complianceVersion)) {
+    op->emitOpError() << "illegal: the target specification version ("
+                      << stringifyVersion(targetVersion)
+                      << ") is not backwards compatible with the op compliance "
+                         "specification version ("
+                      << stringifyVersion(complianceVersion) << ")\n";
+    return failure();
+  }
+
   return success();
 }
 
@@ -461,14 +477,14 @@ TosaProfileCompliance::checkExtension(Operation *op,
 }
 
 LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeProfDef = getOperatorDefinition<Profile>(op, condition);
-  const auto maybeExtDef = getOperatorDefinition<Extension>(op, condition);
+  const auto maybeProfDef = getOperatorDefinition<Profile>(op);
+  const auto maybeExtDef = getOperatorDefinition<Extension>(op);
   if (failed(maybeProfDef) && failed(maybeExtDef))
     return success();
 
-  const bool hasEntry = (succeeded(maybeProfDef) && !maybeProfDef->empty()) ||
-                        (succeeded(maybeExtDef) && !maybeExtDef->empty());
+  const bool hasEntry =
+      (succeeded(maybeProfDef) && !maybeProfDef->mode.empty()) ||
+      (succeeded(maybeExtDef) && !maybeExtDef->mode.empty());
   if (!hasEntry) {
     std::string message;
     llvm::raw_string_ostream os(message);
@@ -488,7 +504,9 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
     SmallVector<TypeInfo> bestTypeInfo;
     const auto searchBestMatch = [&](auto map) {
       for (const auto &complianceInfos : map[opName]) {
-        for (const auto &typeInfos : complianceInfos.operandTypeInfoSet) {
+        for (const auto &versionedTypeInfos :
+             complianceInfos.operandTypeInfoSet) {
+          const SmallVector<TypeInfo> typeInfos = versionedTypeInfos.first;
           const int matches = llvm::count_if(
               llvm::zip_equal(current, typeInfos), [&](const auto zipType) {
                 return isSameTypeInfo(std::get<0>(zipType),
@@ -520,9 +538,8 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
 // Find the profiles or extensions requirement according to the signature of
 // type of the operand list.
 template <typename T>
-SmallVector<T> TosaProfileCompliance::findMatchedProfile(
-    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo,
-    CheckCondition &condition) {
+OpComplianceInfo<T> TosaProfileCompliance::findMatchedEntry(
+    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo) {
   assert(compInfo.size() != 0 &&
          "profile-based compliance information is empty");
 
@@ -533,27 +550,30 @@ SmallVector<T> TosaProfileCompliance::findMatchedProfile(
     return {};
 
   for (size_t i = 0; i < compInfo.size(); i++) {
-    SmallVector<SmallVector<TypeInfo>> sets = compInfo[i].operandTypeInfoSet;
-    for (SmallVector<TypeInfo> expected : sets) {
+    SmallVector<VersionedTypeInfo> sets = compInfo[i].operandTypeInfoSet;
+    for (const auto &set : sets) {
+      SmallVector<TypeInfo> expected = set.first;
       assert(present.size() == expected.size() &&
              "the entries for profile-based compliance do not match between "
              "the generated metadata and the type definition retrieved from "
              " the operation");
 
-      bool is_found = true;
+      bool isFound = true;
       // Compare the type signature between the given operation and the
       // compliance metadata.
       for (size_t j = 0; j < expected.size(); j++) {
         if (!isSameTypeInfo(present[j], expected[j])) {
           // Verify the next mode set from the list.
-          is_found = false;
+          isFound = false;
           break;
         }
       }
 
-      if (is_found == true) {
-        condition = compInfo[i].condition;
-        return compInfo[i].mode;
+      if (isFound == true) {
+        SmallVector<VersionedTypeInfo> typeInfoSet{set};
+        OpComplianceInfo<T> info{compInfo[i].mode, typeInfoSet,
+                                 compInfo[i].condition};
+        return info;
       }
     }
   }
diff --git a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
index 9a24c2b..a2cff6a 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
@@ -21,10 +21,10 @@ using namespace mlir;
 
 // These are automatically generated by ODS but are not used as the Transform
 // dialect uses a different dispatch mechanism to support dialect extensions.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
 generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+                                                           AsmPrinter &printer);
 
 #define GET_TYPEDEF_CLASSES
 #include "mlir/Dialect/Transform/IR/TransformTypes.cpp.inc"
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 025ee9a..c809c502 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -91,7 +91,7 @@ mlir::vector::isTranspose2DSlice(vector::TransposeOp op) {
 
   // Check whether the two source vector dimensions that are greater than one
   // must be transposed with each other so that we can apply one of the 2-D
-  // transpose pattens. Otherwise, these patterns are not applicable.
+  // transpose patterns. Otherwise, these patterns are not applicable.
   if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1],
                                   op.getPermutation()))
     return failure();
diff --git a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp
index 89b62a2..a514ea9 100644
--- a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp
+++ b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp
@@ -12,6 +12,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Region.h"
 #include "mlir/IR/SymbolTable.h"
@@ -39,28 +40,6 @@ void printElseRegion(OpAsmPrinter &opPrinter, Operation *op,
   opPrinter.printKeywordOrString("else ");
   opPrinter.printRegion(elseRegion);
 }
-
-ParseResult parseWasmVisibility(OpAsmParser &opParser, StringAttr &visibility) {
-  std::string keyword;
-  auto initLocation = opParser.getCurrentLocation();
-  std::ignore = opParser.parseOptionalKeywordOrString(&keyword);
-  if (keyword == "nested" or keyword == "") {
-    visibility = StringAttr::get(opParser.getContext(), "nested");
-    return ParseResult::success();
-  }
-
-  if (keyword == "public" || keyword == "private") {
-    visibility = StringAttr::get(opParser.getContext(), keyword);
-    return ParseResult::success();
-  }
-  opParser.emitError(initLocation, "expecting symbol visibility");
-  return ParseResult::failure();
-}
-
-void printWasmVisibility(OpAsmPrinter &opPrinter, Operation *op,
-                         Attribute visibility) {
-  opPrinter.printKeywordOrString(cast<StringAttr>(visibility).strref());
-}
 } // namespace
 
 #define GET_OP_CLASSES
@@ -167,10 +146,23 @@ Block *FuncOp::addEntryBlock() {
 
 void FuncOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                    StringRef symbol, FunctionType funcType) {
-  FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {}, "nested");
+  FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {});
 }
 
 ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
+  auto *ctx = parser.getContext();
+  std::string visibilityString;
+  auto loc = parser.getNameLoc();
+  ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString);
+  bool exported{false};
+  if (res.succeeded()) {
+    if (visibilityString != "exported")
+      return parser.emitError(
+                 loc, "expecting either `exported` or symbol name. got ")
+             << visibilityString;
+    exported = true;
+  }
+
   auto buildFuncType = [&parser](Builder &builder, ArrayRef<Type> argTypes,
                                  ArrayRef<Type> results,
                                  function_interface_impl::VariadicFlag,
@@ -191,11 +183,13 @@ ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
 
     return builder.getFunctionType(argTypesWithoutLocal, results);
   };
-
-  return function_interface_impl::parseFunctionOp(
+  auto funcParseRes = function_interface_impl::parseFunctionOp(
       parser, result, /*allowVariadic=*/false,
       getFunctionTypeAttrName(result.name), buildFuncType,
       getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name));
+  if (exported)
+    result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx));
+  return funcParseRes;
 }
 
 LogicalResult FuncOp::verifyBody() {
@@ -224,9 +218,18 @@ LogicalResult FuncOp::verifyBody() {
 }
 
 void FuncOp::print(OpAsmPrinter &p) {
+  /// If exported, print it before and mask it before printing
+  /// using generic interface.
+  auto exported = getExported();
+  if (exported) {
+    p << " exported";
+    removeExportedAttr();
+  }
   function_interface_impl::printFunctionOp(
       p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(),
       getArgAttrsAttrName(), getResAttrsAttrName());
+  if (exported)
+    setExported(true);
 }
 
 //===----------------------------------------------------------------------===//
@@ -237,38 +240,37 @@ void FuncImportOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                          StringRef symbol, StringRef moduleName,
                          StringRef importName, FunctionType type) {
   FuncImportOp::build(odsBuilder, odsState, symbol, moduleName, importName,
-                      type, {}, {}, odsBuilder.getStringAttr("nested"));
+                      type, {}, {});
 }
 
 //===----------------------------------------------------------------------===//
 // GlobalOp
 //===----------------------------------------------------------------------===//
-
-void GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                     StringRef symbol, Type type, bool isMutable) {
-  GlobalOp::build(odsBuilder, odsState, symbol, type, isMutable,
-                  odsBuilder.getStringAttr("nested"));
-}
-
 // Custom formats
 ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) {
   StringAttr symbolName;
   Type globalType;
   auto *ctx = parser.getContext();
-  ParseResult res = parser.parseSymbolName(
-      symbolName, SymbolTable::getSymbolAttrName(), result.attributes);
+  std::string visibilityString;
+  auto loc = parser.getNameLoc();
+  ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString);
+  if (res.succeeded()) {
+    if (visibilityString != "exported")
+      return parser.emitError(
+                 loc, "expecting either `exported` or symbol name. got ")
+             << visibilityString;
+    result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx));
+  }
 
+  res = parser.parseSymbolName(symbolName, SymbolTable::getSymbolAttrName(),
+                               result.attributes);
   res = parser.parseType(globalType);
   result.addAttribute(getTypeAttrName(result.name), TypeAttr::get(globalType));
   std::string mutableString;
   res = parser.parseOptionalKeywordOrString(&mutableString);
   if (res.succeeded() && mutableString == "mutable")
     result.addAttribute("isMutable", UnitAttr::get(ctx));
-  std::string visibilityString;
-  res = parser.parseOptionalKeywordOrString(&visibilityString);
-  if (res.succeeded())
-    result.addAttribute("sym_visibility",
-                        StringAttr::get(ctx, visibilityString));
+
   res = parser.parseColon();
   Region *globalInitRegion = result.addRegion();
   res = parser.parseRegion(*globalInitRegion);
@@ -276,11 +278,11 @@ ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) {
 }
 
 void GlobalOp::print(OpAsmPrinter &printer) {
+  if (getExported())
+    printer << " exported";
   printer << " @" << getSymName().str() << " " << getType();
   if (getIsMutable())
     printer << " mutable";
-  if (auto vis = getSymVisibility())
-    printer << " " << *vis;
   printer << " :";
   Region &body = getRegion();
   if (!body.empty()) {
@@ -319,13 +321,6 @@ GlobalGetOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
 // GlobalImportOp
 //===----------------------------------------------------------------------===//
 
-void GlobalImportOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                           StringRef symbol, StringRef moduleName,
-                           StringRef importName, Type type, bool isMutable) {
-  GlobalImportOp::build(odsBuilder, odsState, symbol, moduleName, importName,
-                        type, isMutable, odsBuilder.getStringAttr("nested"));
-}
-
 ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) {
   auto *ctx = parser.getContext();
   ParseResult res = parseImportOp(parser, result);
@@ -335,12 +330,8 @@ ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) {
   res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString);
   if (res.succeeded() && mutableOrSymVisString == "mutable") {
     result.addAttribute("isMutable", UnitAttr::get(ctx));
-    res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString);
   }
 
-  if (res.succeeded())
-    result.addAttribute("sym_visibility",
-                        StringAttr::get(ctx, mutableOrSymVisString));
   res = parser.parseColon();
 
   Type importedType;
@@ -356,8 +347,6 @@ void GlobalImportOp::print(OpAsmPrinter &printer) {
           << "\" as @" << getSymName();
   if (getIsMutable())
     printer << " mutable";
-  if (auto vis = getSymVisibility())
-    printer << " " << *vis;
   printer << " : " << getType();
 }
 
@@ -431,27 +420,6 @@ LogicalResult LocalTeeOp::verify() {
 Block *LoopOp::getLabelTarget() { return &getBody().front(); }
 
 //===----------------------------------------------------------------------===//
-// MemOp
-//===----------------------------------------------------------------------===//
-
-void MemOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                  StringRef symbol, LimitType limit) {
-  MemOp::build(odsBuilder, odsState, symbol, limit,
-               odsBuilder.getStringAttr("nested"));
-}
-
-//===----------------------------------------------------------------------===//
-// MemImportOp
-//===----------------------------------------------------------------------===//
-
-void MemImportOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                        StringRef symbol, StringRef moduleName,
-                        StringRef importName, LimitType limits) {
-  MemImportOp::build(odsBuilder, odsState, symbol, moduleName, importName,
-                     limits, odsBuilder.getStringAttr("nested"));
-}
-
-//===----------------------------------------------------------------------===//
 // ReinterpretOp
 //===----------------------------------------------------------------------===//
 
@@ -471,24 +439,3 @@ LogicalResult ReinterpretOp::verify() {
 //===----------------------------------------------------------------------===//
 
 void ReturnOp::build(OpBuilder &odsBuilder, OperationState &odsState) {}
-
-//===----------------------------------------------------------------------===//
-// TableOp
-//===----------------------------------------------------------------------===//
-
-void TableOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                    StringRef symbol, TableType type) {
-  TableOp::build(odsBuilder, odsState, symbol, type,
-                 odsBuilder.getStringAttr("nested"));
-}
-
-//===----------------------------------------------------------------------===//
-// TableImportOp
-//===----------------------------------------------------------------------===//
-
-void TableImportOp::build(OpBuilder &odsBuilder, OperationState &odsState,
-                          StringRef symbol, StringRef moduleName,
-                          StringRef importName, TableType type) {
-  TableImportOp::build(odsBuilder, odsState, symbol, moduleName, importName,
-                       type, odsBuilder.getStringAttr("nested"));
-}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 9beb22d..1599ae9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -727,6 +727,152 @@ void MemLayoutAttr::print(AsmPrinter &printer) const {
   }
   printer << ">";
 }
+// a helper utility to perform binary operation on OpFoldResult.
+// If both a and b are attributes, it will simply return the result.
+// Otherwise, the corresponding arith op will be generated, and an
+// contant op will be created if one of them is an attribute.
+template <typename ArithOp>
+OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc,
+                      OpBuilder &builder) {
+  auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a);
+  auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b);
+  return builder.create<ArithOp>(loc, aVal, bVal).getResult();
+}
+
+// a helper utility to perform division operation on OpFoldResult and int64_t.
+#define div(a, b)                                                              \
+  genBinOp<arith::DivSIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// a helper utility to perform reminder operation on OpFoldResult and int64_t.
+#define rem(a, b)                                                              \
+  genBinOp<arith::RemSIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// a helper utility to perform multiply operation on OpFoldResult and int64_t.
+#define mul(a, b)                                                              \
+  genBinOp<arith::MulIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// a helper utility to perform addition operation on two OpFoldResult.
+#define add(a, b) genBinOp<arith::AddIOp>(a, b, loc, builder)
+
+// block the given offsets according to the block shape
+// say the original offset is [y, x], and the block shape is [By, Bx],
+// then the blocked offset is [y/By, x/Bx, y%By, x%Bx]
+SmallVector<OpFoldResult> getBlockedOffsets(OpBuilder &builder, Location loc,
+                                            ArrayRef<OpFoldResult> offsets,
+                                            ArrayRef<int64_t> blockShape) {
+
+  assert(offsets.size() == blockShape.size() &&
+         "offsets and blockShape must have the same size");
+  SmallVector<OpFoldResult> blockedOffsets;
+  SmallVector<OpFoldResult> divs, rems;
+
+  for (auto [offset, block] : llvm::zip(offsets, blockShape)) {
+    divs.push_back(div(offset, block));
+    rems.push_back(rem(offset, block));
+  }
+  blockedOffsets.append(divs.begin(), divs.end());
+  blockedOffsets.append(rems.begin(), rems.end());
+
+  return blockedOffsets;
+}
+
+// Get strides as vector of integer for MemDesc.
+SmallVector<int64_t> MemDescType::getStrideShape() {
+
+  SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end());
+
+  ArrayAttr strideAttr = getStrideAttr();
+  SmallVector<int64_t> strides;
+  for (Attribute attr : strideAttr.getValue()) {
+    strides.push_back(cast<IntegerAttr>(attr).getInt());
+  }
+
+  SmallVector<int64_t> innerBlkShape = getBlockShape();
+
+  // get perm from FCD to LCD
+  // perm[i] = the dim with i-th smallest stride
+  SmallVector<int, 4> perm =
+      llvm::to_vector<4>(llvm::seq<int>(0, strides.size()));
+  llvm::sort(perm, [&](int a, int b) { return strides[a] < strides[b]; });
+
+  assert(strides[perm[0]] == 1 && "inner most dim must have stride 1");
+
+  SmallVector<int64_t> innerBlkStride(innerBlkShape.size());
+  innerBlkStride[perm[0]] = 1;
+  for (size_t i = 1; i < perm.size(); ++i)
+    innerBlkStride[perm[i]] =
+        innerBlkStride[perm[i - 1]] * innerBlkShape[perm[i - 1]];
+
+  // compute the original matrix shape using the stride info
+  // and compute the number of blocks in each dimension
+  // The shape of highest dim can't be derived from stride info,
+  // but doesn't impact the stride computation for blocked layout.
+  SmallVector<int64_t> matrixShapeOrig(matrixShape.size());
+  SmallVector<int64_t> BlkShapeOrig(matrixShape.size());
+  for (size_t i = 0; i < perm.size() - 1; ++i) {
+    matrixShapeOrig[perm[i]] = strides[perm[i + 1]] / strides[perm[i]];
+    BlkShapeOrig[perm[i]] = matrixShapeOrig[perm[i]] / innerBlkShape[perm[i]];
+  }
+
+  int64_t innerBlkSize = 1;
+  for (auto s : innerBlkShape)
+    innerBlkSize *= s;
+
+  SmallVector<int64_t> outerBlkStride(matrixShape.size());
+  outerBlkStride[perm[0]] = innerBlkSize;
+  for (size_t i = 0; i < perm.size() - 1; ++i) {
+    outerBlkStride[perm[i + 1]] =
+        outerBlkStride[perm[i]] * BlkShapeOrig[perm[i]];
+  }
+
+  // combine the inner and outer strides
+  SmallVector<int64_t> blockedStrides;
+  blockedStrides.append(outerBlkStride.begin(), outerBlkStride.end());
+  blockedStrides.append(innerBlkStride.begin(), innerBlkStride.end());
+
+  return blockedStrides;
+}
+
+// Calculate the linear offset using the blocked offsets and stride
+Value MemDescType::getLinearOffsets(OpBuilder &builder, Location loc,
+                                    ArrayRef<OpFoldResult> offsets) {
+
+  SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end());
+  SmallVector<int64_t> blockShape = getBlockShape();
+  SmallVector<int64_t> strides = getStrideShape();
+  SmallVector<OpFoldResult> blockedOffsets;
+
+  // blockshape equal to matrixshape means no blocking
+  if (llvm::equal(blockShape, matrixShape)) {
+    // remove the outer dims from strides
+    strides.erase(strides.begin(), strides.begin() + matrixShape.size());
+  } else {
+    assert(offsets.size() == blockShape.size() &&
+           "offsets and blockShape must have the same size");
+    // say the original offset is [y, x], and the block shape is [By, Bx],
+    // then the blocked offset is [y/By, x/Bx, y%By, x%Bx]
+
+    SmallVector<OpFoldResult> divs, rems;
+
+    for (auto [offset, block] : llvm::zip(offsets, blockShape)) {
+      divs.push_back(div(offset, block));
+      rems.push_back(rem(offset, block));
+    }
+    blockedOffsets.append(divs.begin(), divs.end());
+    blockedOffsets.append(rems.begin(), rems.end());
+    offsets = blockedOffsets;
+  }
+
+  // Start with initial value as matrix descriptor's base offset.
+  Value linearOffset = arith::ConstantIndexOp::create(builder, loc, 0);
+  for (size_t i = 0; i < offsets.size(); ++i) {
+    OpFoldResult mulResult = mul(offsets[i], strides[i]);
+    Value mulVal = getValueOrCreateConstantIndexOp(builder, loc, mulResult);
+    linearOffset = arith::AddIOp::create(builder, loc, mulVal, linearOffset);
+  }
+
+  return linearOffset;
+}
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 81b5788..abd12e2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -20,8 +20,8 @@
 
 #define DEBUG_TYPE "xegpu"
 
-namespace mlir {
-namespace xegpu {
+using namespace mlir;
+using namespace mlir::xegpu;
 
 static bool isSharedMemory(const MemRefType &memrefTy) {
   Attribute attr = memrefTy.getMemorySpace();
@@ -173,6 +173,49 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy,
   return success();
 }
 
+LogicalResult
+IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy,
+                      UnitAttr subgroup_block_io,
+                      function_ref<InFlightDiagnostic()> emitError) {
+
+  if (!dataTy) {
+    if (subgroup_block_io)
+      return emitError() << "subgroup_block_io "
+                            "are only allowed when result is a 1D VectorType.";
+    else
+      return success();
+  }
+
+  if (mdescTy.getRank() != 2)
+    return emitError() << "mem_desc must be 2D.";
+
+  ArrayRef<int64_t> dataShape = dataTy.getShape();
+  ArrayRef<int64_t> mdescShape = mdescTy.getShape();
+
+  if (dataShape.size() == 2) {
+    if (subgroup_block_io)
+      return emitError() << "subgroup_block_io "
+                            "are only allowed when result is a 1D VectorType.";
+    if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
+                     [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+      return emitError() << "data shape must not exceed mem_desc shape.";
+  } else {
+    SmallVector<int64_t> blockShape = mdescTy.getBlockShape();
+    // if the subgroup_block_io attribute is set,  mdescTy must have block
+    // attribute
+    if (subgroup_block_io && !blockShape.size())
+      return emitError() << "mem_desc must have block attribute when "
+                            "subgroup_block_io is set.";
+    // if the subgroup_block_io attribute is set, the memdesc should be row
+    // major
+    if (subgroup_block_io && mdescTy.isColMajor())
+      return emitError() << "mem_desc should be row major when "
+                            "subgroup_block_io is set.";
+  }
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_CreateNdDescOp
 //===----------------------------------------------------------------------===//
@@ -1049,23 +1092,20 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+  // Call the generated builder with all parameters (including optional ones as
+  // nullptr/empty)
   build(builder, state, res, memDesc, dynamicOffsets, staticOffsetsAttr,
-        layout);
+        /*subgroup_block_io=*/nullptr, layout);
 }
 
 LogicalResult LoadMatrixOp::verify() {
-  VectorType resTy = getRes().getType();
-  MemDescType mdescTy = getMemDesc().getType();
 
-  if (mdescTy.getRank() != 2)
-    return emitOpError("mem_desc must be 2D.");
+  auto resTy = dyn_cast<VectorType>(getRes().getType());
+  UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
+  MemDescType mdescTy = getMemDesc().getType();
 
-  ArrayRef<int64_t> valueShape = resTy.getShape();
-  ArrayRef<int64_t> mdescShape = mdescTy.getShape();
-  if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
-                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
-    return emitOpError("result shape must not exceed mem_desc shape.");
-  return success();
+  return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
+                               [&]() { return emitError(); });
 }
 
 //===----------------------------------------------------------------------===//
@@ -1080,62 +1120,18 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
   build(builder, state, data, memDesc, dynamicOffsets, staticOffsetsAttr,
-        layout);
+        /*subgroup_block_io=*/nullptr, layout);
 }
 
 LogicalResult StoreMatrixOp::verify() {
-  VectorType dataTy = getData().getType();
-  MemDescType mdescTy = getMemDesc().getType();
-
-  if (mdescTy.getRank() != 2)
-    return emitOpError("mem_desc must be 2D.");
-
-  ArrayRef<int64_t> dataShape = dataTy.getShape();
-  ArrayRef<int64_t> mdescShape = mdescTy.getShape();
-  if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
-                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
-    return emitOpError("data shape must not exceed mem_desc shape.");
-
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// XeGPU_MemDescSubviewOp
-//===----------------------------------------------------------------------===//
-
-void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state,
-                             Type resTy, Value src,
-                             llvm::ArrayRef<OpFoldResult> offsets) {
-  llvm::SmallVector<Value> dynamicOffsets;
-  llvm::SmallVector<int64_t> staticOffsets;
-  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr);
-}
-
-LogicalResult MemDescSubviewOp::verify() {
-  MemDescType srcTy = getSrc().getType();
-  MemDescType resTy = getRes().getType();
-  ArrayRef<int64_t> srcShape = srcTy.getShape();
-  ArrayRef<int64_t> resShape = resTy.getShape();
-
-  if (srcTy.getRank() < resTy.getRank())
-    return emitOpError("result rank must not exceed source rank.");
 
-  if (llvm::any_of(
-          llvm::zip_equal(resShape, srcShape.take_back(resShape.size())),
-          [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
-    return emitOpError("result shape must not exceed source shape.");
-
-  if (srcTy.getStrides() != resTy.getStrides())
-    return emitOpError("result must inherit the source strides.");
-
-  return success();
+  auto dataTy = dyn_cast<VectorType>(getData().getType());
+  UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
+  MemDescType mdescTy = getMemDesc().getType();
+  return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
+                               [&]() { return emitError(); });
 }
 
-} // namespace xegpu
-} // namespace mlir
-
 namespace mlir {
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index a178d0f..aafa1b7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -941,7 +941,9 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
   LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
-    VectorType valueTy = op.getType();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType());
+    assert(valueTy && "the value type must be vector type!");
+
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
       return failure();
@@ -984,7 +986,8 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
       return failure();
 
     Location loc = op.getLoc();
-    VectorType valueTy = op.getData().getType();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType());
+    assert(valueTy && "the value type must be vector type!");
     ArrayRef<int64_t> shape = valueTy.getShape();
     auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index c28d2fc..31a967d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -991,7 +991,8 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> {
       return failure();
 
     ArrayRef<int64_t> wgShape = op.getDataShape();
-    VectorType valueTy = op.getRes().getType();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getRes().getType());
+    assert(valueTy && "the value type must be vector type!");
     Type elemTy = valueTy.getElementType();
 
     xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 89b81cf..5f63fe6 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -1204,7 +1204,7 @@ AffineMap AffineMap::getImpl(unsigned dimCount, unsigned symbolCount,
 /// present in result expressions is less than `dimCount` and the highest index
 /// of symbolic identifier present in result expressions is less than
 /// `symbolCount`.
-LLVM_ATTRIBUTE_UNUSED static bool
+[[maybe_unused]] static bool
 willBeValidAffineMap(unsigned dimCount, unsigned symbolCount,
                      ArrayRef<AffineExpr> results) {
   int64_t maxDimPosition = -1;
diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp
index cb90ef8..d52d5e7 100644
--- a/mlir/lib/TableGen/CodeGenHelpers.cpp
+++ b/mlir/lib/TableGen/CodeGenHelpers.cpp
@@ -49,9 +49,7 @@ StaticVerifierFunctionEmitter::StaticVerifierFunctionEmitter(
     raw_ostream &os, const RecordKeeper &records, StringRef tag)
     : os(os), uniqueOutputLabel(getUniqueOutputLabel(records, tag)) {}
 
-void StaticVerifierFunctionEmitter::emitOpConstraints(
-    ArrayRef<const Record *> opDefs) {
-  NamespaceEmitter namespaceEmitter(os, Operator(*opDefs[0]).getCppNamespace());
+void StaticVerifierFunctionEmitter::emitOpConstraints() {
   emitTypeConstraints();
   emitAttrConstraints();
   emitPropConstraints();
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 9603813..857e31b 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -2604,6 +2604,7 @@ static constexpr std::array kExplicitLLVMFuncOpAttributes{
     StringLiteral("denormal-fp-math-f32"),
     StringLiteral("fp-contract"),
     StringLiteral("frame-pointer"),
+    StringLiteral("inlinehint"),
     StringLiteral("instrument-function-entry"),
     StringLiteral("instrument-function-exit"),
     StringLiteral("memory"),
@@ -2643,6 +2644,8 @@ void ModuleImport::processFunctionAttributes(llvm::Function *func,
     funcOp.setNoInline(true);
   if (func->hasFnAttribute(llvm::Attribute::AlwaysInline))
     funcOp.setAlwaysInline(true);
+  if (func->hasFnAttribute(llvm::Attribute::InlineHint))
+    funcOp.setInlineHint(true);
   if (func->hasFnAttribute(llvm::Attribute::OptimizeNone))
     funcOp.setOptimizeNone(true);
   if (func->hasFnAttribute(llvm::Attribute::Convergent))
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 845a14f..147613f 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1652,6 +1652,8 @@ static void convertFunctionAttributes(LLVMFuncOp func,
     llvmFunc->addFnAttr(llvm::Attribute::NoInline);
   if (func.getAlwaysInlineAttr())
     llvmFunc->addFnAttr(llvm::Attribute::AlwaysInline);
+  if (func.getInlineHintAttr())
+    llvmFunc->addFnAttr(llvm::Attribute::InlineHint);
   if (func.getOptimizeNoneAttr())
     llvmFunc->addFnAttr(llvm::Attribute::OptimizeNone);
   if (func.getConvergentAttr())
diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index 0c3e87a..d9ad8fb 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -2619,6 +2619,11 @@ LogicalResult ControlFlowStructurizer::structurize() {
   // region. We cannot handle such cases given that once a value is sinked into
   // the SelectionOp/LoopOp's region, there is no escape for it.
   for (auto *block : constructBlocks) {
+    if (!block->use_empty())
+      return emitError(block->getParent()->getLoc(),
+                       "failed control flow structurization: "
+                       "block has uses outside of the "
+                       "enclosing selection/loop construct");
     for (Operation &op : *block)
       if (!op.use_empty())
         return op.emitOpError("failed control flow structurization: value has "
diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
index 51c6077..366ba8f 100644
--- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
+++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
@@ -14,6 +14,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Location.h"
 #include "mlir/Support/LLVM.h"
@@ -138,6 +139,10 @@ using ImportDesc =
 
 using parsed_inst_t = FailureOr<SmallVector<Value>>;
 
+struct EmptyBlockMarker {};
+using BlockTypeParseResult =
+    std::variant<EmptyBlockMarker, TypeIdxRecord, Type>;
+
 struct WasmModuleSymbolTables {
   SmallVector<FunctionSymbolRefContainer> funcSymbols;
   SmallVector<GlobalSymbolRefContainer> globalSymbols;
@@ -175,6 +180,9 @@ class ParserHead;
 /// Wrapper around SmallVector to only allow access as push and pop on the
 /// stack. Makes sure that there are no "free accesses" on the stack to preserve
 /// its state.
+/// This class also keep tracks of the Wasm labels defined by different ops,
+/// which can be targeted by control flow ops. This can be modeled as part of
+/// the Value Stack as Wasm control flow ops can only target enclosing labels.
 class ValueStack {
 private:
   struct LabelLevel {
@@ -206,6 +214,16 @@ public:
   ///   if an error occurs.
   LogicalResult pushResults(ValueRange results, Location *opLoc);
 
+  void addLabelLevel(LabelLevelOpInterface levelOp) {
+    labelLevel.push_back({values.size(), levelOp});
+    LDBG() << "Adding a new frame context to ValueStack";
+  }
+
+  void dropLabelLevel() {
+    assert(!labelLevel.empty() && "Trying to drop a frame from empty context");
+    auto newSize = labelLevel.pop_back_val().stackIdx;
+    values.truncate(newSize);
+  }
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// A simple dump function for debugging.
   /// Writes output to llvm::dbgs().
@@ -214,6 +232,7 @@ public:
 
 private:
   SmallVector<Value> values;
+  SmallVector<LabelLevel> labelLevel;
 };
 
 using local_val_t = TypedValue<wasmssa::LocalRefType>;
@@ -248,6 +267,19 @@ private:
   buildNumericOp(OpBuilder &builder,
                  std::enable_if_t<std::is_arithmetic_v<valueType>> * = nullptr);
 
+  /// Construct a conversion operation of type \p opType that takes a value from
+  /// type \p inputType on the stack and will produce a value of type
+  /// \p outputType.
+  ///
+  /// \p opType - The WASM dialect operation to build.
+  /// \p inputType - The operand type for the built instruction.
+  /// \p outputType - The result type for the built instruction.
+  ///
+  /// \returns The parsed instruction result, or failure.
+  template <typename opType, typename inputType, typename outputType,
+            typename... extraArgsT>
+  inline parsed_inst_t buildConvertOp(OpBuilder &builder, extraArgsT...);
+
   /// This function generates a dispatch tree to associate an opcode with a
   /// parser. Parsers are registered by specialising the
   /// `parseSpecificInstruction` function for the op code to handle.
@@ -280,11 +312,105 @@ private:
     }
   }
 
+  ///
+  /// RAII guard class for creating a nesting level
+  ///
+  struct NestingContextGuard {
+    NestingContextGuard(ExpressionParser &parser, LabelLevelOpInterface levelOp)
+        : parser{parser} {
+      parser.addNestingContextLevel(levelOp);
+    }
+    NestingContextGuard(NestingContextGuard &&other) : parser{other.parser} {
+      other.shouldDropOnDestruct = false;
+    }
+    NestingContextGuard(NestingContextGuard const &) = delete;
+    ~NestingContextGuard() {
+      if (shouldDropOnDestruct)
+        parser.dropNestingContextLevel();
+    }
+    ExpressionParser &parser;
+    bool shouldDropOnDestruct = true;
+  };
+
+  void addNestingContextLevel(LabelLevelOpInterface levelOp) {
+    valueStack.addLabelLevel(levelOp);
+  }
+
+  void dropNestingContextLevel() {
+    // Should always succeed as we are droping the frame that was previously
+    // created.
+    valueStack.dropLabelLevel();
+  }
+
+  llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder,
+                                               EmptyBlockMarker) {
+    return builder.getFunctionType({}, {});
+  }
+
+  llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder,
+                                               TypeIdxRecord type) {
+    if (type.id >= symbols.moduleFuncTypes.size())
+      return emitError(*currentOpLoc,
+                       "type index references nonexistent type (")
+             << type.id << "). Only " << symbols.moduleFuncTypes.size()
+             << " types are registered";
+    return symbols.moduleFuncTypes[type.id];
+  }
+
+  llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder,
+                                               Type valType) {
+    return builder.getFunctionType({}, {valType});
+  }
+
+  llvm::FailureOr<FunctionType>
+  getFuncTypeFor(OpBuilder &builder, BlockTypeParseResult parseResult) {
+    return std::visit(
+        [this, &builder](auto value) { return getFuncTypeFor(builder, value); },
+        parseResult);
+  }
+
+  llvm::FailureOr<FunctionType>
+  getFuncTypeFor(OpBuilder &builder,
+                 llvm::FailureOr<BlockTypeParseResult> parseResult) {
+    if (llvm::failed(parseResult))
+      return failure();
+    return getFuncTypeFor(builder, *parseResult);
+  }
+
+  llvm::FailureOr<FunctionType> parseBlockFuncType(OpBuilder &builder);
+
   struct ParseResultWithInfo {
     SmallVector<Value> opResults;
     std::byte endingByte;
   };
 
+  template <typename FilterT = ByteSequence<WasmBinaryEncoding::endByte>>
+  /// @param blockToFill: the block which content will be populated
+  /// @param resType: the type that this block is supposed to return
+  llvm::FailureOr<std::byte>
+  parseBlockContent(OpBuilder &builder, Block *blockToFill, TypeRange resTypes,
+                    Location opLoc, LabelLevelOpInterface levelOp,
+                    FilterT parseEndBytes = {}) {
+    OpBuilder::InsertionGuard guard{builder};
+    builder.setInsertionPointToStart(blockToFill);
+    LDBG() << "parsing a block of type "
+           << builder.getFunctionType(blockToFill->getArgumentTypes(),
+                                      resTypes);
+    auto nC = addNesting(levelOp);
+
+    if (failed(pushResults(blockToFill->getArguments())))
+      return failure();
+    auto bodyParsingRes = parse(builder, parseEndBytes);
+    if (failed(bodyParsingRes))
+      return failure();
+    auto returnOperands = popOperands(resTypes);
+    if (failed(returnOperands))
+      return failure();
+    builder.create<BlockReturnOp>(opLoc, *returnOperands);
+    LDBG() << "end of parsing of a block";
+    return bodyParsingRes->endingByte;
+  }
+
 public:
   template <std::byte ParseEndByte = WasmBinaryEncoding::endByte>
   parsed_inst_t parse(OpBuilder &builder, UniqueByte<ParseEndByte> = {});
@@ -294,7 +420,11 @@ public:
   parse(OpBuilder &builder,
         ByteSequence<ExpressionParseEnd...> parsingEndFilters);
 
-  FailureOr<SmallVector<Value>> popOperands(TypeRange operandTypes) {
+  NestingContextGuard addNesting(LabelLevelOpInterface levelOp) {
+    return NestingContextGuard{*this, levelOp};
+  }
+
+  FailureOr<llvm::SmallVector<Value>> popOperands(TypeRange operandTypes) {
     return valueStack.popOperands(operandTypes, &currentOpLoc.value());
   }
 
@@ -308,6 +438,12 @@ public:
   template <typename OpToCreate>
   parsed_inst_t parseSetOrTee(OpBuilder &);
 
+  /// Blocks and Loops have a similar format and differ only in how their exit
+  /// is handled which doesn´t matter at parsing time. Factorizes in one
+  /// function.
+  template <typename OpToCreate>
+  parsed_inst_t parseBlockLikeOp(OpBuilder &);
+
 private:
   std::optional<Location> currentOpLoc;
   ParserHead &parser;
@@ -586,6 +722,29 @@ public:
     return success();
   }
 
+  llvm::FailureOr<BlockTypeParseResult> parseBlockType(MLIRContext *ctx) {
+    auto loc = getLocation();
+    auto blockIndicator = peek();
+    if (failed(blockIndicator))
+      return failure();
+    if (*blockIndicator == WasmBinaryEncoding::Type::emptyBlockType) {
+      offset += 1;
+      return {EmptyBlockMarker{}};
+    }
+    if (isValueOneOf(*blockIndicator, valueTypesEncodings))
+      return parseValueType(ctx);
+    /// Block type idx is a 32 bit positive integer encoded as a 33 bit signed
+    /// value
+    auto typeIdx = parseI64();
+    if (failed(typeIdx))
+      return failure();
+    if (*typeIdx < 0 || *typeIdx > std::numeric_limits<uint32_t>::max())
+      return emitError(loc, "type ID should be representable with an unsigned "
+                            "32 bits integer. Got ")
+             << *typeIdx;
+    return {TypeIdxRecord{static_cast<uint32_t>(*typeIdx)}};
+  }
+
   bool end() const { return curHead().empty(); }
 
   ParserHead copy() const { return *this; }
@@ -701,17 +860,41 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction(OpBuilder &) {
 void ValueStack::dump() const {
   llvm::dbgs() << "================= Wasm ValueStack =======================\n";
   llvm::dbgs() << "size: " << size() << "\n";
+  llvm::dbgs() << "nbFrames: " << labelLevel.size() << '\n';
   llvm::dbgs() << "<Top>"
                << "\n";
   // Stack is pushed to via push_back. Therefore the top of the stack is the
   // end of the vector. Iterate in reverse so that the first thing we print
   // is the top of the stack.
+  auto indexGetter = [this]() {
+    size_t idx = labelLevel.size();
+    return [this, idx]() mutable -> std::optional<std::pair<size_t, size_t>> {
+      llvm::dbgs() << "IDX: " << idx << '\n';
+      if (idx == 0)
+        return std::nullopt;
+      auto frameId = idx - 1;
+      auto frameLimit = labelLevel[frameId].stackIdx;
+      idx -= 1;
+      return {{frameId, frameLimit}};
+    };
+  };
+  auto getNextFrameIndex = indexGetter();
+  auto nextFrameIdx = getNextFrameIndex();
   size_t stackSize = size();
-  for (size_t idx = 0; idx < stackSize; idx++) {
+  for (size_t idx = 0; idx < stackSize; ++idx) {
     size_t actualIdx = stackSize - 1 - idx;
+    while (nextFrameIdx && (nextFrameIdx->second > actualIdx)) {
+      llvm::dbgs() << "  --------------- Frame (" << nextFrameIdx->first
+                   << ")\n";
+      nextFrameIdx = getNextFrameIndex();
+    }
     llvm::dbgs() << "  ";
     values[actualIdx].dump();
   }
+  while (nextFrameIdx) {
+    llvm::dbgs() << "  --------------- Frame (" << nextFrameIdx->first << ")\n";
+    nextFrameIdx = getNextFrameIndex();
+  }
   llvm::dbgs() << "<Bottom>"
                << "\n";
   llvm::dbgs() << "=========================================================\n";
@@ -726,7 +909,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) {
     return emitError(*opLoc,
                      "stack doesn't contain enough values. trying to get ")
            << operandTypes.size() << " operands on a stack containing only "
-           << values.size() << " values.";
+           << values.size() << " values";
   size_t stackIdxOffset = values.size() - operandTypes.size();
   SmallVector<Value> res{};
   res.reserve(operandTypes.size());
@@ -735,8 +918,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) {
     Type stackType = operand.getType();
     if (stackType != operandTypes[i])
       return emitError(*opLoc, "invalid operand type on stack. expecting ")
-             << operandTypes[i] << ", value on stack is of type " << stackType
-             << ".";
+             << operandTypes[i] << ", value on stack is of type " << stackType;
     LDBG() << "    POP: " << operand;
     res.push_back(operand);
   }
@@ -792,6 +974,151 @@ ExpressionParser::parse(OpBuilder &builder,
   }
 }
 
+llvm::FailureOr<FunctionType>
+ExpressionParser::parseBlockFuncType(OpBuilder &builder) {
+  return getFuncTypeFor(builder, parser.parseBlockType(builder.getContext()));
+}
+
+template <typename OpToCreate>
+parsed_inst_t ExpressionParser::parseBlockLikeOp(OpBuilder &builder) {
+  auto opLoc = currentOpLoc;
+  auto funcType = parseBlockFuncType(builder);
+  if (failed(funcType))
+    return failure();
+
+  auto inputTypes = funcType->getInputs();
+  auto inputOps = popOperands(inputTypes);
+  if (failed(inputOps))
+    return failure();
+
+  Block *curBlock = builder.getBlock();
+  Region *curRegion = curBlock->getParent();
+  auto resTypes = funcType->getResults();
+  llvm::SmallVector<Location> locations{};
+  locations.resize(resTypes.size(), *currentOpLoc);
+  auto *successor =
+      builder.createBlock(curRegion, curRegion->end(), resTypes, locations);
+  builder.setInsertionPointToEnd(curBlock);
+  auto blockOp =
+      builder.create<OpToCreate>(*currentOpLoc, *inputOps, successor);
+  auto *blockBody = blockOp.createBlock();
+  if (failed(parseBlockContent(builder, blockBody, resTypes, *opLoc, blockOp)))
+    return failure();
+  builder.setInsertionPointToStart(successor);
+  return {ValueRange{successor->getArguments()}};
+}
+
+template <>
+inline parsed_inst_t
+ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::block>(
+    OpBuilder &builder) {
+  return parseBlockLikeOp<BlockOp>(builder);
+}
+
+template <>
+inline parsed_inst_t
+ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::loop>(
+    OpBuilder &builder) {
+  return parseBlockLikeOp<LoopOp>(builder);
+}
+
+template <>
+inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
+    WasmBinaryEncoding::OpCode::ifOpCode>(OpBuilder &builder) {
+  auto opLoc = currentOpLoc;
+  auto funcType = parseBlockFuncType(builder);
+  if (failed(funcType))
+    return failure();
+
+  LDBG() << "Parsing an if instruction of type " << *funcType;
+  auto inputTypes = funcType->getInputs();
+  auto conditionValue = popOperands(builder.getI32Type());
+  if (failed(conditionValue))
+    return failure();
+  auto inputOps = popOperands(inputTypes);
+  if (failed(inputOps))
+    return failure();
+
+  Block *curBlock = builder.getBlock();
+  Region *curRegion = curBlock->getParent();
+  auto resTypes = funcType->getResults();
+  llvm::SmallVector<Location> locations{};
+  locations.resize(resTypes.size(), *currentOpLoc);
+  auto *successor =
+      builder.createBlock(curRegion, curRegion->end(), resTypes, locations);
+  builder.setInsertionPointToEnd(curBlock);
+  auto ifOp = builder.create<IfOp>(*currentOpLoc, conditionValue->front(),
+                                   *inputOps, successor);
+  auto *ifEntryBlock = ifOp.createIfBlock();
+  constexpr auto ifElseFilter =
+      ByteSequence<WasmBinaryEncoding::endByte,
+                   WasmBinaryEncoding::OpCode::elseOpCode>{};
+  auto parseIfRes = parseBlockContent(builder, ifEntryBlock, resTypes, *opLoc,
+                                      ifOp, ifElseFilter);
+  if (failed(parseIfRes))
+    return failure();
+  if (*parseIfRes == WasmBinaryEncoding::OpCode::elseOpCode) {
+    LDBG() << "  else block is present.";
+    Block *elseEntryBlock = ifOp.createElseBlock();
+    auto parseElseRes =
+        parseBlockContent(builder, elseEntryBlock, resTypes, *opLoc, ifOp);
+    if (failed(parseElseRes))
+      return failure();
+  }
+  builder.setInsertionPointToStart(successor);
+  return {ValueRange{successor->getArguments()}};
+}
+
+template <>
+inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
+    WasmBinaryEncoding::OpCode::branchIf>(OpBuilder &builder) {
+  auto level = parser.parseLiteral<uint32_t>();
+  if (failed(level))
+    return failure();
+  Block *curBlock = builder.getBlock();
+  Region *curRegion = curBlock->getParent();
+  auto sip = builder.saveInsertionPoint();
+  Block *elseBlock = builder.createBlock(curRegion, curRegion->end());
+  auto condition = popOperands(builder.getI32Type());
+  if (failed(condition))
+    return failure();
+  builder.restoreInsertionPoint(sip);
+  auto targetOp =
+      LabelBranchingOpInterface::getTargetOpFromBlock(curBlock, *level);
+  if (failed(targetOp))
+    return failure();
+  auto inputTypes = targetOp->getLabelTarget()->getArgumentTypes();
+  auto branchArgs = popOperands(inputTypes);
+  if (failed(branchArgs))
+    return failure();
+  builder.create<BranchIfOp>(*currentOpLoc, condition->front(),
+                             builder.getUI32IntegerAttr(*level), *branchArgs,
+                             elseBlock);
+  builder.setInsertionPointToStart(elseBlock);
+  return {*branchArgs};
+}
+
+template <>
+inline parsed_inst_t
+ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::call>(
+    OpBuilder &builder) {
+  auto loc = *currentOpLoc;
+  auto funcIdx = parser.parseLiteral<uint32_t>();
+  if (failed(funcIdx))
+    return failure();
+  if (*funcIdx >= symbols.funcSymbols.size())
+    return emitError(loc, "Invalid function index: ") << *funcIdx;
+  auto callee = symbols.funcSymbols[*funcIdx];
+  llvm::ArrayRef<Type> inTypes = callee.functionType.getInputs();
+  llvm::ArrayRef<Type> resTypes = callee.functionType.getResults();
+  parsed_inst_t inOperands = popOperands(inTypes);
+  if (failed(inOperands))
+    return failure();
+  auto callOp =
+      builder.create<FuncCallOp>(loc, resTypes, callee.symbol, *inOperands);
+  return {callOp.getResults()};
+}
+
 template <>
 inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
     WasmBinaryEncoding::OpCode::localGet>(OpBuilder &builder) {
@@ -834,7 +1161,7 @@ parsed_inst_t ExpressionParser::parseSetOrTee(OpBuilder &builder) {
   if (valueStack.empty())
     return emitError(
         *currentOpLoc,
-        "invalid stack access, trying to access a value on an empty stack.");
+        "invalid stack access, trying to access a value on an empty stack");
 
   parsed_inst_t poppedOp = popOperands(locals[*id].getType().getElementType());
   if (failed(poppedOp))
@@ -1000,11 +1327,23 @@ inline parsed_inst_t ExpressionParser::buildNumericOp(
 
 BUILD_NUMERIC_BINOP_FP(CopySignOp, copysign)
 BUILD_NUMERIC_BINOP_FP(DivOp, div)
+BUILD_NUMERIC_BINOP_FP(GeOp, ge)
+BUILD_NUMERIC_BINOP_FP(GtOp, gt)
+BUILD_NUMERIC_BINOP_FP(LeOp, le)
+BUILD_NUMERIC_BINOP_FP(LtOp, lt)
 BUILD_NUMERIC_BINOP_FP(MaxOp, max)
 BUILD_NUMERIC_BINOP_FP(MinOp, min)
 BUILD_NUMERIC_BINOP_INT(AndOp, and)
 BUILD_NUMERIC_BINOP_INT(DivSIOp, divS)
 BUILD_NUMERIC_BINOP_INT(DivUIOp, divU)
+BUILD_NUMERIC_BINOP_INT(GeSIOp, geS)
+BUILD_NUMERIC_BINOP_INT(GeUIOp, geU)
+BUILD_NUMERIC_BINOP_INT(GtSIOp, gtS)
+BUILD_NUMERIC_BINOP_INT(GtUIOp, gtU)
+BUILD_NUMERIC_BINOP_INT(LeSIOp, leS)
+BUILD_NUMERIC_BINOP_INT(LeUIOp, leU)
+BUILD_NUMERIC_BINOP_INT(LtSIOp, ltS)
+BUILD_NUMERIC_BINOP_INT(LtUIOp, ltU)
 BUILD_NUMERIC_BINOP_INT(OrOp, or)
 BUILD_NUMERIC_BINOP_INT(RemSIOp, remS)
 BUILD_NUMERIC_BINOP_INT(RemUIOp, remU)
@@ -1015,7 +1354,9 @@ BUILD_NUMERIC_BINOP_INT(ShRSOp, shrS)
 BUILD_NUMERIC_BINOP_INT(ShRUOp, shrU)
 BUILD_NUMERIC_BINOP_INT(XOrOp, xor)
 BUILD_NUMERIC_BINOP_INTFP(AddOp, add)
+BUILD_NUMERIC_BINOP_INTFP(EqOp, eq)
 BUILD_NUMERIC_BINOP_INTFP(MulOp, mul)
+BUILD_NUMERIC_BINOP_INTFP(NeOp, ne)
 BUILD_NUMERIC_BINOP_INTFP(SubOp, sub)
 BUILD_NUMERIC_UNARY_OP_FP(AbsOp, abs)
 BUILD_NUMERIC_UNARY_OP_FP(CeilOp, ceil)
@@ -1025,6 +1366,7 @@ BUILD_NUMERIC_UNARY_OP_FP(SqrtOp, sqrt)
 BUILD_NUMERIC_UNARY_OP_FP(TruncOp, trunc)
 BUILD_NUMERIC_UNARY_OP_INT(ClzOp, clz)
 BUILD_NUMERIC_UNARY_OP_INT(CtzOp, ctz)
+BUILD_NUMERIC_UNARY_OP_INT(EqzOp, eqz)
 BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt)
 
 // Don't need these anymore so let's undef them.
@@ -1036,6 +1378,105 @@ BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt)
 #undef BUILD_NUMERIC_OP
 #undef BUILD_NUMERIC_CAST_OP
 
+template <typename opType, typename inputType, typename outputType,
+          typename... extraArgsT>
+inline parsed_inst_t ExpressionParser::buildConvertOp(OpBuilder &builder,
+                                                      extraArgsT... extraArgs) {
+  static_assert(std::is_arithmetic_v<inputType>,
+                "InputType should be an arithmetic type");
+  static_assert(std::is_arithmetic_v<outputType>,
+                "OutputType should be an arithmetic type");
+  auto intype = buildLiteralType<inputType>(builder);
+  auto outType = buildLiteralType<outputType>(builder);
+  auto operand = popOperands(intype);
+  if (failed(operand))
+    return failure();
+  auto op = builder.create<opType>(*currentOpLoc, outType, operand->front(),
+                                   extraArgs...);
+  LDBG() << "Built operation: " << op;
+  return {{op.getResult()}};
+}
+
+template <>
+inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
+    WasmBinaryEncoding::OpCode::demoteF64ToF32>(OpBuilder &builder) {
+  return buildConvertOp<DemoteOp, double, float>(builder);
+}
+
+template <>
+inline parsed_inst_t
+ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::wrap>(
+    OpBuilder &builder) {
+  return buildConvertOp<WrapOp, int64_t, int32_t>(builder);
+}
+
+#define BUILD_CONVERSION_OP(IN_T, OUT_T, SOURCE_OP, TARGET_OP)                 \
+  template <>                                                                  \
+  inline parsed_inst_t ExpressionParser::parseSpecificInstruction<             \
+      WasmBinaryEncoding::OpCode::SOURCE_OP>(OpBuilder & builder) {            \
+    return buildConvertOp<TARGET_OP, IN_T, OUT_T>(builder);                    \
+  }
+
+#define BUILD_CONVERT_OP_FOR(DEST_T, WIDTH)                                    \
+  BUILD_CONVERSION_OP(uint32_t, DEST_T, convertUI32F##WIDTH, ConvertUOp)       \
+  BUILD_CONVERSION_OP(int32_t, DEST_T, convertSI32F##WIDTH, ConvertSOp)        \
+  BUILD_CONVERSION_OP(uint64_t, DEST_T, convertUI64F##WIDTH, ConvertUOp)       \
+  BUILD_CONVERSION_OP(int64_t, DEST_T, convertSI64F##WIDTH, ConvertSOp)
+
+BUILD_CONVERT_OP_FOR(float, 32)
+BUILD_CONVERT_OP_FOR(double, 64)
+
+#undef BUILD_CONVERT_OP_FOR
+
+BUILD_CONVERSION_OP(int32_t, int64_t, extendS, ExtendSI32Op)
+BUILD_CONVERSION_OP(int32_t, int64_t, extendU, ExtendUI32Op)
+
+#undef BUILD_CONVERSION_OP
+
+#define BUILD_SLICE_EXTEND_PARSER(IT_WIDTH, EXTRACT_WIDTH)                     \
+  template <>                                                                  \
+  parsed_inst_t ExpressionParser::parseSpecificInstruction<                    \
+      WasmBinaryEncoding::OpCode::extendI##IT_WIDTH##EXTRACT_WIDTH##S>(        \
+      OpBuilder & builder) {                                                   \
+    using inout_t = int##IT_WIDTH##_t;                                         \
+    auto attr = builder.getUI32IntegerAttr(EXTRACT_WIDTH);                     \
+    return buildConvertOp<ExtendLowBitsSOp, inout_t, inout_t>(builder, attr);  \
+  }
+
+BUILD_SLICE_EXTEND_PARSER(32, 8)
+BUILD_SLICE_EXTEND_PARSER(32, 16)
+BUILD_SLICE_EXTEND_PARSER(64, 8)
+BUILD_SLICE_EXTEND_PARSER(64, 16)
+BUILD_SLICE_EXTEND_PARSER(64, 32)
+
+#undef BUILD_SLICE_EXTEND_PARSER
+
+template <>
+inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
+    WasmBinaryEncoding::OpCode::promoteF32ToF64>(OpBuilder &builder) {
+  return buildConvertOp<PromoteOp, float, double>(builder);
+}
+
+#define BUILD_REINTERPRET_PARSER(WIDTH, FP_TYPE)                               \
+  template <>                                                                  \
+  inline parsed_inst_t ExpressionParser::parseSpecificInstruction<             \
+      WasmBinaryEncoding::OpCode::reinterpretF##WIDTH##AsI##WIDTH>(OpBuilder & \
+                                                                   builder) {  \
+    return buildConvertOp<ReinterpretOp, FP_TYPE, int##WIDTH##_t>(builder);    \
+  }                                                                            \
+                                                                               \
+  template <>                                                                  \
+  inline parsed_inst_t ExpressionParser::parseSpecificInstruction<             \
+      WasmBinaryEncoding::OpCode::reinterpretI##WIDTH##AsF##WIDTH>(OpBuilder & \
+                                                                   builder) {  \
+    return buildConvertOp<ReinterpretOp, int##WIDTH##_t, FP_TYPE>(builder);    \
+  }
+
+BUILD_REINTERPRET_PARSER(32, float)
+BUILD_REINTERPRET_PARSER(64, double)
+
+#undef BUILD_REINTERPRET_PARSER
+
 class WasmBinaryParser {
 private:
   struct SectionRegistry {
@@ -1153,7 +1594,7 @@ private:
     if (tid.id >= symbols.moduleFuncTypes.size())
       return emitError(loc, "invalid type id: ")
              << tid.id << ". Only " << symbols.moduleFuncTypes.size()
-             << " type registration.";
+             << " type registrations";
     FunctionType type = symbols.moduleFuncTypes[tid.id];
     std::string symbol = symbols.getNewFuncSymbolName();
     auto funcOp = FuncImportOp::create(builder, loc, symbol, moduleName,
@@ -1221,7 +1662,7 @@ public:
     FileLineColLoc magicLoc = parser.getLocation();
     FailureOr<StringRef> magic = parser.consumeNBytes(wasmHeader.size());
     if (failed(magic) || magic->compare(wasmHeader)) {
-      emitError(magicLoc, "source file does not contain valid Wasm header.");
+      emitError(magicLoc, "source file does not contain valid Wasm header");
       return;
     }
     auto const expectedVersionString = StringRef{"\1\0\0\0", 4};
@@ -1391,7 +1832,7 @@ WasmBinaryParser::parseSectionItem<WasmSectionType::EXPORT>(ParserHead &ph,
     return failure();
 
   Operation *op = SymbolTable::lookupSymbolIn(mOp, *currentSymbol);
-  SymbolTable::setSymbolVisibility(op, SymbolTable::Visibility::Public);
+  op->setAttr("exported", UnitAttr::get(op->getContext()));
   StringAttr symName = SymbolTable::getSymbolName(op);
   return SymbolTable{mOp}.rename(symName, *exportName);
 }
diff --git a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
index 9670285..3fda5a7 100644
--- a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
+++ b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
@@ -93,7 +93,7 @@ void CodeGen::generate(const ast::Module &astModule, ModuleOp module) {
 
   // Emit function to add the generated matchers to the pattern list.
   os << "template <typename... ConfigsT>\n"
-        "static void LLVM_ATTRIBUTE_UNUSED populateGeneratedPDLLPatterns("
+        "[[maybe_unused]] static void populateGeneratedPDLLPatterns("
         "::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {\n";
   for (const auto &name : patternNames)
     os << "  patterns.add<" << name