Diffstat (limited to 'mlir/lib')
29 files changed, 1498 insertions, 480 deletions
diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 5ddb3fb..0f0ed22 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -205,7 +205,7 @@ public: nb::object res = f(opView, PyPatternRewriter(rewriter)); return logicalResultFromObject(res); }; - MlirRewritePattern pattern = mlirOpRewritePattenCreate( + MlirRewritePattern pattern = mlirOpRewritePatternCreate( rootName, benefit, ctx, callbacks, matchAndRewrite.ptr(), /* nGeneratedNames */ 0, /* generatedNames */ nullptr); diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 46c329d..41ceb15 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -341,7 +341,7 @@ private: } // namespace mlir -MlirRewritePattern mlirOpRewritePattenCreate( +MlirRewritePattern mlirOpRewritePatternCreate( MlirStringRef rootName, unsigned benefit, MlirContext context, MlirRewritePatternCallbacks callbacks, void *userData, size_t nGeneratedNames, MlirStringRef *generatedNames) { diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index b215211..c03f3a5 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -484,5 +484,5 @@ void mlir::populateGpuToROCDLConversionPatterns( GPUSubgroupBroadcastOpToROCDL>(converter); patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset); - populateMathToROCDLConversionPatterns(converter, patterns); + populateMathToROCDLConversionPatterns(converter, patterns, chipset); } diff --git a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt index 2771955a..8cc3fde 100644 --- a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRMathToROCDL Core LINK_LIBS PUBLIC + MLIRAMDGPUUtils MLIRDialectUtils MLIRFuncDialect MLIRGPUToGPURuntimeTransforms diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp index df219f3..a2dfc12 100644 --- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -10,6 +10,8 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -19,6 +21,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/DebugLog.h" #include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" @@ -42,8 +45,46 @@ static void populateOpPatterns(const LLVMTypeConverter &converter, f32ApproxFunc, f16Func); } +struct ClampFOpConversion final + : public ConvertOpToLLVMPattern<math::ClampFOp> { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(math::ClampFOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Only f16 and f32 types are supported by fmed3 + Type opTy = op.getType(); + Type resultType = getTypeConverter()->convertType(opTy); + + if (auto vectorType = 
dyn_cast<VectorType>(opTy)) + opTy = vectorType.getElementType(); + + if (!isa<Float16Type, Float32Type>(opTy)) + return rewriter.notifyMatchFailure( + op, "fmed3 only supports f16 and f32 types"); + + // Handle multi-dimensional vectors (converted to LLVM arrays) + if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(resultType)) + return LLVM::detail::handleMultidimensionalVectors( + op.getOperation(), adaptor.getOperands(), *getTypeConverter(), + [&](Type llvm1DVectorTy, ValueRange operands) -> Value { + typename math::ClampFOp::Adaptor adaptor(operands); + return ROCDL::FMed3Op::create(rewriter, op.getLoc(), llvm1DVectorTy, + adaptor.getValue(), adaptor.getMin(), + adaptor.getMax()); + }, + rewriter); + + // Handle 1D vectors and scalars directly + rewriter.replaceOpWithNewOp<ROCDL::FMed3Op>(op, op.getType(), op.getValue(), + op.getMin(), op.getMax()); + return success(); + } +}; + void mlir::populateMathToROCDLConversionPatterns( - const LLVMTypeConverter &converter, RewritePatternSet &patterns) { + const LLVMTypeConverter &converter, RewritePatternSet &patterns, + std::optional<amdgpu::Chipset> chipset) { // Handled by mathToLLVM: math::AbsIOp // Handled by mathToLLVM: math::AbsFOp // Handled by mathToLLVM: math::CopySignOp @@ -118,15 +159,21 @@ void mlir::populateMathToROCDLConversionPatterns( // worth creating a separate pass for it. populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32", "__ocml_fmod_f64", "__ocml_fmod_f16"); + + if (chipset.has_value() && chipset->majorVersion >= 9) { + patterns.add<ClampFOpConversion>(converter); + } else { + LDBG() << "Chipset dependent patterns were not added"; + } } -namespace { -struct ConvertMathToROCDLPass - : public impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> { - ConvertMathToROCDLPass() = default; +struct ConvertMathToROCDLPass final + : impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> { + using impl::ConvertMathToROCDLBase< + ConvertMathToROCDLPass>::ConvertMathToROCDLBase; + void runOnOperation() override; }; -} // namespace void ConvertMathToROCDLPass::runOnOperation() { auto m = getOperation(); @@ -135,10 +182,21 @@ void ConvertMathToROCDLPass::runOnOperation() { RewritePatternSet patterns(&getContext()); LowerToLLVMOptions options(ctx, DataLayout(m)); LLVMTypeConverter converter(ctx, options); - populateMathToROCDLConversionPatterns(converter, patterns); + + FailureOr<amdgpu::Chipset> maybeChipset; + if (!chipset.empty()) { + maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) + return signalPassFailure(); + } + populateMathToROCDLConversionPatterns( + converter, patterns, + succeeded(maybeChipset) ? 
std::optional(*maybeChipset) : std::nullopt); + ConversionTarget target(getContext()); - target.addLegalDialect<BuiltinDialect, func::FuncDialect, - vector::VectorDialect, LLVM::LLVMDialect>(); + target + .addLegalDialect<BuiltinDialect, func::FuncDialect, vector::VectorDialect, + LLVM::LLVMDialect, ROCDL::ROCDLDialect>(); target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, diff --git a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt index 84b2580..dd9edc4 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt +++ b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt @@ -21,6 +21,7 @@ add_mlir_conversion_library(MLIRXeGPUToXeVM MLIRIndexDialect MLIRSCFDialect MLIRXeGPUDialect + MLIRXeGPUUtils MLIRPass MLIRTransforms MLIRSCFTransforms diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index ddcbc44..fcbf66d 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -22,6 +22,7 @@ #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/STLExtras.h" @@ -63,6 +64,7 @@ static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) { case xegpu::MemorySpace::SLM: return static_cast<int>(xevm::AddrSpace::SHARED); } + llvm_unreachable("Unknown XeGPU memory space"); } // Get same bitwidth flat vector type of new element type. @@ -186,6 +188,7 @@ class CreateNdDescToXeVMPattern int64_t rank = mixedSizes.size(); if (rank != 2) return rewriter.notifyMatchFailure(op, "Expected 2D shape."); + auto sourceTy = source.getType(); auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy); // If source is a memref, we need to extract the aligned pointer as index. @@ -364,10 +367,11 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> { // Add a builder that creates // offset * elemByteSize + baseAddr -static Value addOffset(ConversionPatternRewriter &rewriter, Location loc, - Value baseAddr, Value offset, int64_t elemByteSize) { +static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter, + Location loc, Value baseAddr, Value offset, + int64_t elemByteSize) { Value byteSize = arith::ConstantIntOp::create( - rewriter, loc, rewriter.getI64Type(), elemByteSize); + rewriter, loc, baseAddr.getType(), elemByteSize); Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize); Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset); return newAddr; @@ -443,7 +447,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> { // If offset is provided, we add them to the base pointer. // Offset is in number of elements, we need to multiply by // element byte size. - basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize); + basePtrI64 = + addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize); } // Convert base pointer (i64) to LLVM pointer type. Value basePtrLLVM = @@ -506,6 +511,147 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> { } }; +// Lower xegpu::CreateMemDescOp to memref::ViewOp. 
Since SLM access instructions +// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than +// 32 bits will be converted to 32 bits. +class CreateMemDescOpPattern final + : public OpConversionPattern<xegpu::CreateMemDescOp> { +public: + using OpConversionPattern<xegpu::CreateMemDescOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto resTy = op.getMemDesc(); + + // Create the result MemRefType with the same shape, element type, and + // memory space + auto newResTy = getTypeConverter()->convertType<MemRefType>(resTy); + + Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0); + auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy, + op.getSource(), zero, ValueRange()); + rewriter.replaceOp(op, viewOp); + return success(); + } +}; + +template <typename OpType, + typename = std::enable_if_t<llvm::is_one_of< + OpType, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>> +class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> { + using OpConversionPattern<OpType>::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + SmallVector<OpFoldResult> offsets = op.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(op, "Expected offset to be provided."); + + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + Value basePtrStruct = adaptor.getMemDesc(); + Value mdescVal = op.getMemDesc(); + // Load result or Store value Type can be vector or scalar. + Value data; + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) + data = op.getResult(); + else + data = adaptor.getData(); + VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType()); + if (!valOrResVecTy) + valOrResVecTy = VectorType::get(1, data.getType()); + + int64_t elemBitWidth = + valOrResVecTy.getElementType().getIntOrFloatBitWidth(); + // Element type must be multiple of 8 bits. + if (elemBitWidth % 8 != 0) + return rewriter.notifyMatchFailure( + op, "Expected element type bit width to be multiple of 8."); + int64_t elemByteSize = elemBitWidth / 8; + + // Default memory space is SLM. 
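// (Note: per getNumericXeVMAddrSpace above, xegpu::MemorySpace::SLM maps to
// xevm::AddrSpace::SHARED, so this pointer lives in the shared address space.)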
+ LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get( + ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::SLM)); + + auto mdescTy = cast<xegpu::MemDescType>(mdescVal.getType()); + + Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, loc, basePtrStruct); + + // Convert base pointer (ptr) to i32 + Value basePtrI32 = arith::IndexCastUIOp::create( + rewriter, loc, rewriter.getI32Type(), basePtrLLVM); + + Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets); + linearOffset = arith::IndexCastUIOp::create( + rewriter, loc, rewriter.getI32Type(), linearOffset); + basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset, + elemByteSize); + + // convert base pointer (i32) to LLVM pointer type + basePtrLLVM = + LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32); + + if (op.getSubgroupBlockIoAttr()) { + // if the attribute 'subgroup_block_io' is set to true, it lowers to + // xevm.blockload + + Type intElemTy = rewriter.getIntegerType(elemBitWidth); + VectorType intVecTy = + VectorType::get(valOrResVecTy.getShape(), intElemTy); + + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) { + Value loadOp = + xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM); + if (intVecTy != valOrResVecTy) { + loadOp = + vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp); + } + rewriter.replaceOp(op, loadOp); + } else { + Value dataToStore = adaptor.getData(); + if (valOrResVecTy != intVecTy) { + dataToStore = + vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore); + } + xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore, + nullptr); + rewriter.eraseOp(op); + } + return success(); + } + + if (valOrResVecTy.getNumElements() >= 1) { + auto chipOpt = xegpu::getChipStr(op); + if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) { + // the lowering for chunk load only works for pvc and bmg + return rewriter.notifyMatchFailure( + op, "The lowering is specific to pvc or bmg."); + } + } + + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) { + // if the size of valOrResVecTy is 1, it lowers to a scalar load/store + // operation. LLVM load/store does not support vector of size 1, so we + // need to handle this case separately. + auto scalarTy = valOrResVecTy.getElementType(); + LLVM::LoadOp loadOp; + if (valOrResVecTy.getNumElements() == 1) + loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM); + else + loadOp = + LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM); + rewriter.replaceOp(op, loadOp); + } else { + LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM); + rewriter.eraseOp(op); + } + return success(); + } +}; + class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> { using OpConversionPattern::OpConversionPattern; LogicalResult @@ -548,8 +694,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> { op, "Expected element type bit width to be multiple of 8."); elemByteSize = elemBitWidth / 8; } - basePtrI64 = - addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize); + basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets, + elemByteSize); } } // Default memory space is global. 
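A minimal standalone sketch of the address arithmetic the matrix load/store lowering above emits, assuming a hypothetical row-major linearization (the actual offset comes from MemDescType::getLinearOffsets, and the byte-size constant is now created in the same type as the base pointer, i32 for SLM and i64 for the global paths):

#include <cstdint>

// base + linearOffset * elemByteSize, as built by addOffsetToBaseAddr.
static uint32_t slmElementAddress(uint32_t base, uint32_t row, uint32_t col,
                                  uint32_t numCols, uint32_t elemByteSize) {
  uint32_t linearOffset = row * numCols + col; // hypothetical row-major layout
  return base + linearOffset * elemByteSize;
}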
@@ -786,6 +932,13 @@ struct ConvertXeGPUToXeVMPass auto i32Type = IntegerType::get(&getContext(), 32); return VectorType::get(8, i32Type); }); + // Convert MemDescType into flattened MemRefType for SLM + typeConverter.addConversion([&](xegpu::MemDescType type) -> Type { + Type elemTy = type.getElementType(); + int numElems = type.getNumElements(); + return MemRefType::get(numElems, elemTy, AffineMap(), 3); + }); + typeConverter.addConversion([&](MemRefType type) -> Type { // Convert MemRefType to i64 type. return IntegerType::get(&getContext(), 64); @@ -940,6 +1093,9 @@ void mlir::populateXeGPUToXeVMConversionPatterns( LoadStoreToXeVMPattern<xegpu::LoadGatherOp>, LoadStoreToXeVMPattern<xegpu::StoreScatterOp>>( typeConverter, patterns.getContext()); + patterns.add<LoadStoreMatrixToXeVMPattern<xegpu::LoadMatrixOp>, + LoadStoreMatrixToXeVMPattern<xegpu::StoreMatrixOp>, + CreateMemDescOpPattern>(typeConverter, patterns.getContext()); patterns.add<FenceToXeVMPattern, DpasToXeVMPattern>(typeConverter, patterns.getContext()); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index c798adb..61166db 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -339,6 +339,25 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns( } //===----------------------------------------------------------------------===// +// ScaledExtPacked816Op +//===----------------------------------------------------------------------===// +LogicalResult ScaledExtPacked816Op::verify() { + int blockSize = getBlockSize(); + assert((blockSize == 16 || blockSize == 32) && "invalid block size"); + int firstScaleByte = getFirstScaleByte(); + if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError( + "blockSize of 16 can only have firstScaleByte be 0 or 1."); + } + if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError( + "blockSize of 32 can only have firstScaleByte be 0 or 2."); + } + + return success(); +} + +//===----------------------------------------------------------------------===// // WMMAOp //===----------------------------------------------------------------------===// LogicalResult WMMAOp::verify() { diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 749e2ba..e0a53cd 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2600,6 +2600,65 @@ static LogicalResult foldLoopBounds(AffineForOp forOp) { return success(folded); } +/// Returns constant trip count in trivial cases. +static std::optional<uint64_t> getTrivialConstantTripCount(AffineForOp forOp) { + int64_t step = forOp.getStepAsInt(); + if (!forOp.hasConstantBounds() || step <= 0) + return std::nullopt; + int64_t lb = forOp.getConstantLowerBound(); + int64_t ub = forOp.getConstantUpperBound(); + return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; +} + +/// Fold the empty loop. +static SmallVector<OpFoldResult> AffineForEmptyLoopFolder(AffineForOp forOp) { + if (!llvm::hasSingleElement(*forOp.getBody())) + return {}; + if (forOp.getNumResults() == 0) + return {}; + std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp); + if (tripCount == 0) { + // The initial values of the iteration arguments would be the op's + // results. 
+ return forOp.getInits(); + } + SmallVector<Value, 4> replacements; + auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->getTerminator()); + auto iterArgs = forOp.getRegionIterArgs(); + bool hasValDefinedOutsideLoop = false; + bool iterArgsNotInOrder = false; + for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { + Value val = yieldOp.getOperand(i); + BlockArgument *iterArgIt = llvm::find(iterArgs, val); + // TODO: It should be possible to perform a replacement by computing the + // last value of the IV based on the bounds and the step. + if (val == forOp.getInductionVar()) + return {}; + if (iterArgIt == iterArgs.end()) { + // `val` is defined outside of the loop. + assert(forOp.isDefinedOutsideOfLoop(val) && + "must be defined outside of the loop"); + hasValDefinedOutsideLoop = true; + replacements.push_back(val); + } else { + unsigned pos = std::distance(iterArgs.begin(), iterArgIt); + if (pos != i) + iterArgsNotInOrder = true; + replacements.push_back(forOp.getInits()[pos]); + } + } + // Bail out when the trip count is unknown and the loop returns any value + // defined outside of the loop or any iterArg out of order. + if (!tripCount.has_value() && + (hasValDefinedOutsideLoop || iterArgsNotInOrder)) + return {}; + // Bail out when the loop iterates more than once and it returns any iterArg + // out of order. + if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) + return {}; + return llvm::to_vector_of<OpFoldResult>(replacements); +} + /// Canonicalize the bounds of the given loop. static LogicalResult canonicalizeLoopBounds(AffineForOp forOp) { SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands()); @@ -2631,79 +2690,30 @@ static LogicalResult canonicalizeLoopBounds(AffineForOp forOp) { return success(); } -namespace { -/// Returns constant trip count in trivial cases. -static std::optional<uint64_t> getTrivialConstantTripCount(AffineForOp forOp) { - int64_t step = forOp.getStepAsInt(); - if (!forOp.hasConstantBounds() || step <= 0) - return std::nullopt; - int64_t lb = forOp.getConstantLowerBound(); - int64_t ub = forOp.getConstantUpperBound(); - return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; +/// Returns true if the affine.for has zero iterations in trivial cases. +static bool hasTrivialZeroTripCount(AffineForOp op) { + return getTrivialConstantTripCount(op) == 0; } -/// This is a pattern to fold trivially empty loop bodies. -/// TODO: This should be moved into the folding hook. -struct AffineForEmptyLoopFolder : public OpRewritePattern<AffineForOp> { - using OpRewritePattern<AffineForOp>::OpRewritePattern; - - LogicalResult matchAndRewrite(AffineForOp forOp, - PatternRewriter &rewriter) const override { - // Check that the body only contains a yield. - if (!llvm::hasSingleElement(*forOp.getBody())) - return failure(); - if (forOp.getNumResults() == 0) - return success(); - std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp); - if (tripCount == 0) { - // The initial values of the iteration arguments would be the op's - // results. 
- rewriter.replaceOp(forOp, forOp.getInits()); - return success(); - } - SmallVector<Value, 4> replacements; - auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->getTerminator()); - auto iterArgs = forOp.getRegionIterArgs(); - bool hasValDefinedOutsideLoop = false; - bool iterArgsNotInOrder = false; - for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { - Value val = yieldOp.getOperand(i); - auto *iterArgIt = llvm::find(iterArgs, val); - // TODO: It should be possible to perform a replacement by computing the - // last value of the IV based on the bounds and the step. - if (val == forOp.getInductionVar()) - return failure(); - if (iterArgIt == iterArgs.end()) { - // `val` is defined outside of the loop. - assert(forOp.isDefinedOutsideOfLoop(val) && - "must be defined outside of the loop"); - hasValDefinedOutsideLoop = true; - replacements.push_back(val); - } else { - unsigned pos = std::distance(iterArgs.begin(), iterArgIt); - if (pos != i) - iterArgsNotInOrder = true; - replacements.push_back(forOp.getInits()[pos]); - } - } - // Bail out when the trip count is unknown and the loop returns any value - // defined outside of the loop or any iterArg out of order. - if (!tripCount.has_value() && - (hasValDefinedOutsideLoop || iterArgsNotInOrder)) - return failure(); - // Bail out when the loop iterates more than once and it returns any iterArg - // out of order. - if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) - return failure(); - rewriter.replaceOp(forOp, replacements); - return success(); +LogicalResult AffineForOp::fold(FoldAdaptor adaptor, + SmallVectorImpl<OpFoldResult> &results) { + bool folded = succeeded(foldLoopBounds(*this)); + folded |= succeeded(canonicalizeLoopBounds(*this)); + if (hasTrivialZeroTripCount(*this) && getNumResults() != 0) { + // The initial values of the loop-carried variables (iter_args) are the + // results of the op. But this must be avoided for an affine.for op that + // does not return any results. Since ops that do not return results cannot + // be folded away, we would enter an infinite loop of folds on the same + // affine.for op. + results.assign(getInits().begin(), getInits().end()); + folded = true; } -}; -} // namespace - -void AffineForOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add<AffineForEmptyLoopFolder>(context); + SmallVector<OpFoldResult> foldResults = AffineForEmptyLoopFolder(*this); + if (!foldResults.empty()) { + results.assign(foldResults); + folded = true; + } + return success(folded); } OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { @@ -2746,27 +2756,6 @@ void AffineForOp::getSuccessorRegions( regions.push_back(RegionSuccessor(getResults())); } -/// Returns true if the affine.for has zero iterations in trivial cases. -static bool hasTrivialZeroTripCount(AffineForOp op) { - return getTrivialConstantTripCount(op) == 0; -} - -LogicalResult AffineForOp::fold(FoldAdaptor adaptor, - SmallVectorImpl<OpFoldResult> &results) { - bool folded = succeeded(foldLoopBounds(*this)); - folded |= succeeded(canonicalizeLoopBounds(*this)); - if (hasTrivialZeroTripCount(*this) && getNumResults() != 0) { - // The initial values of the loop-carried variables (iter_args) are the - // results of the op. But this must be avoided for an affine.for op that - // does not return any results. Since ops that do not return results cannot - // be folded away, we would enter an infinite loop of folds on the same - // affine.for op. 
- results.assign(getInits().begin(), getInits().end()); - folded = true; - } - return success(folded); -} - AffineBound AffineForOp::getLowerBound() { return AffineBound(*this, getLowerBoundOperands(), getLowerBoundMap()); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp index 70faa71..bc17990 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp @@ -41,18 +41,37 @@ namespace bufferization { using namespace mlir; -/// Return the unique ReturnOp that terminates `funcOp`. -/// Return nullptr if there is no such unique ReturnOp. -static func::ReturnOp getAssumedUniqueReturnOp(func::FuncOp funcOp) { - func::ReturnOp returnOp; +/// Get all the ReturnOp in the funcOp. +static SmallVector<func::ReturnOp> getReturnOps(func::FuncOp funcOp) { + SmallVector<func::ReturnOp> returnOps; for (Block &b : funcOp.getBody()) { if (auto candidateOp = dyn_cast<func::ReturnOp>(b.getTerminator())) { - if (returnOp) - return nullptr; - returnOp = candidateOp; + returnOps.push_back(candidateOp); } } - return returnOp; + return returnOps; +} + +/// Get the operands at the specified position for all returnOps. +static SmallVector<Value> +getReturnOpsOperandInPos(ArrayRef<func::ReturnOp> returnOps, size_t pos) { + return llvm::map_to_vector(returnOps, [&](func::ReturnOp returnOp) { + return returnOp.getOperand(pos); + }); +} + +/// Check if all given values are the same buffer as the block argument (modulo +/// cast ops). +static bool operandsEqualFuncArgument(ArrayRef<Value> operands, + BlockArgument argument) { + for (Value val : operands) { + while (auto castOp = val.getDefiningOp<memref::CastOp>()) + val = castOp.getSource(); + + if (val != argument) + return false; + } + return true; } LogicalResult @@ -72,40 +91,45 @@ mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) { for (auto funcOp : module.getOps<func::FuncOp>()) { if (funcOp.isExternal() || funcOp.isPublic()) continue; - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); - // TODO: Support functions with multiple blocks. - if (!returnOp) + SmallVector<func::ReturnOp> returnOps = getReturnOps(funcOp); + if (returnOps.empty()) continue; // Compute erased results. 
- SmallVector<Value> newReturnValues; - BitVector erasedResultIndices(funcOp.getFunctionType().getNumResults()); + size_t numReturnOps = returnOps.size(); + size_t numReturnValues = funcOp.getFunctionType().getNumResults(); + SmallVector<SmallVector<Value>> newReturnValues(numReturnOps); + BitVector erasedResultIndices(numReturnValues); DenseMap<int64_t, int64_t> resultToArgs; - for (const auto &it : llvm::enumerate(returnOp.getOperands())) { + for (size_t i = 0; i < numReturnValues; ++i) { bool erased = false; + SmallVector<Value> returnOperands = + getReturnOpsOperandInPos(returnOps, i); for (BlockArgument bbArg : funcOp.getArguments()) { - Value val = it.value(); - while (auto castOp = val.getDefiningOp<memref::CastOp>()) - val = castOp.getSource(); - - if (val == bbArg) { - resultToArgs[it.index()] = bbArg.getArgNumber(); + if (operandsEqualFuncArgument(returnOperands, bbArg)) { + resultToArgs[i] = bbArg.getArgNumber(); erased = true; break; } } if (erased) { - erasedResultIndices.set(it.index()); + erasedResultIndices.set(i); } else { - newReturnValues.push_back(it.value()); + for (auto [newReturnValue, operand] : + llvm::zip(newReturnValues, returnOperands)) { + newReturnValue.push_back(operand); + } } } // Update function. if (failed(funcOp.eraseResults(erasedResultIndices))) return failure(); - returnOp.getOperandsMutable().assign(newReturnValues); + + for (auto [returnOp, newReturnValue] : + llvm::zip(returnOps, newReturnValues)) + returnOp.getOperandsMutable().assign(newReturnValue); // Update function calls. for (func::CallOp callOp : callerMap[funcOp]) { diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt index 70a9c77..ec68acf 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRGPUPipelines GPUToNVVMPipeline.cpp + GPUToXeVMPipeline.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU @@ -11,12 +12,17 @@ add_mlir_dialect_library(MLIRGPUPipelines MLIRTransforms MLIRLinalgTransforms MLIRAffineToStandard + MLIRGPUToLLVMSPV MLIRGPUToNVVMTransforms MLIRIndexToLLVM MLIRMathToLLVM + MLIRMathToXeVM MLIRNVGPUToNVVM MLIRNVVMToLLVM MLIRReconcileUnrealizedCasts MLIRSCFToControlFlow MLIRVectorToSCF + MLIRXeGPUTransforms + MLIRXeGPUToXeVM + MLIRXeVMToLLVM ) diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp new file mode 100644 index 0000000..1a1485b --- /dev/null +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp @@ -0,0 +1,139 @@ +//===- GPUToXeVMPipeline.cpp - Lowering pipeline to XeVM/LLVM -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass for testing the lowering to XeVM as a generally +// usable sink pass. If XeGPU ops are used, it expects the MLIR code to have +// XeGPU ops already embedded in gpu code. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/MathToXeVM/MathToXeVM.h" +#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Conversion/XeGPUToXeVM/XeGPUToXeVM.h" +#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Pipelines/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Target/LLVM/XeVM/Target.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { +//===----------------------------------------------------------------------===// +// Pre-GPU common pipeline for both Host and GPU. +//===----------------------------------------------------------------------===// +void buildPreGPUCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToXeVMPipelineOptions &options) { + // builtin.module scope passes. + pm.addPass(createCSEPass()); + pm.addPass(createConvertVectorToSCFPass()); + { + GpuXeVMAttachTargetOptions xevmTargetOptions; + xevmTargetOptions.moduleMatcher = options.xevmModuleMatcher; + xevmTargetOptions.triple = options.zebinTriple; + xevmTargetOptions.chip = options.zebinChip; + xevmTargetOptions.optLevel = options.optLevel; + xevmTargetOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuXeVMAttachTarget(xevmTargetOptions)); + } + pm.addPass(createLowerAffinePass()); + pm.addNestedPass<func::FuncOp>(createGpuAsyncRegionPass()); +} + +//===----------------------------------------------------------------------===// +// GPUModule-specific stuff. 
+//===----------------------------------------------------------------------===// +void buildGPUPassPipeline(OpPassManager &pm, + const mlir::gpu::GPUToXeVMPipelineOptions &options) { + if (options.xegpuOpLevel == "workgroup") { + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUWgToSgDistribute()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUBlocking()); + pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + } + if (options.xegpuOpLevel == "subgroup" || + options.xegpuOpLevel == "workgroup") { + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPropagateLayout()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute()); + pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUVectorLinearize()); + } + pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToXeVM()); + pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeGPUToXeVMPass()); + { + ConvertGpuOpsToLLVMSPVOpsOptions gpuToLLVMSPVOptions; + gpuToLLVMSPVOptions.use64bitIndex = options.use64bitIndex; + pm.addNestedPass<gpu::GPUModuleOp>( + createConvertGpuOpsToLLVMSPVOps(gpuToLLVMSPVOptions)); + } + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass()); +} + +//===----------------------------------------------------------------------===// +// Post-GPU pipeline for both Host and GPU. +//===----------------------------------------------------------------------===// +void buildPostGPUCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToXeVMPipelineOptions &options) { + // builtin.module scope passes. + pm.addPass(createSCFToControlFlowPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + { + GpuToLLVMConversionPassOptions gpuToLLVMOptions; + gpuToLLVMOptions.hostBarePtrCallConv = options.hostBarePtrCallConv; + gpuToLLVMOptions.kernelBarePtrCallConv = options.kernelBarePtrCallConv; + pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMOptions)); + } + pm.addPass(createLowerAffinePass()); + pm.addPass(createConvertToLLVMPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); + // gpu-module-to-binary + { + GpuModuleToBinaryPassOptions gpuToModuleBinOptions; + gpuToModuleBinOptions.compilationTarget = options.binaryFormat; + gpuToModuleBinOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuModuleToBinaryPass(gpuToModuleBinOptions)); + } +} +} // namespace + +void mlir::gpu::buildLowerToXeVMPassPipeline( + OpPassManager &pm, const GPUToXeVMPipelineOptions &options) { + // Pre-GPU common pipelines. + buildPreGPUCommonPassPipeline(pm, options); + + // GPUModule-specific stuff. + buildGPUPassPipeline(pm, options); + + // Post-GPU pipeline for both Host and GPU. + buildPostGPUCommonPassPipeline(pm, options); +} + +void mlir::gpu::registerGPUToXeVMPipeline() { + PassPipelineRegistration<GPUToXeVMPipelineOptions>( + "gpu-lower-to-xevm-pipeline", + "The default GPU to XeVM lowering pipeline. 
It starts by lowering GPU " + "code to the " + "specified compilation target (default is fatbin) then lowers the host " + "code.", + buildLowerToXeVMPassPipeline); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 6192d79..9a8a63e 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2457,26 +2457,24 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter, } // Set options. - TilingInterface paddedOp; PadTilingInterfaceOptions options; options.setPaddingValues(paddingValues) .setPaddingSizes(getMixedPaddingSizes()) .setPadToMultipleOf(getPadToMultipleOf()); - // Apply padding. - SmallVector<tensor::PadOp> newPadOps; - FailureOr<TilingInterface> maybePaddedOp = rewriteAsPaddedOp( - rewriter, cast<TilingInterface>(targetOp.getOperation()), options, - newPadOps); - if (failed(maybePaddedOp)) { + auto maybePadOps = rewriteAsPaddedOp( + rewriter, cast<TilingInterface>(targetOp.getOperation()), options); + if (failed(maybePadOps)) { auto diag = emitSilenceableError() << "failed to pad op"; diag.attachNote(target->getLoc()) << "target op"; return diag; } + const auto &[paddedOperands, paddedOp, slicedResults] = maybePadOps.value(); // Set transform results. - paddedOps.push_back(cast<TilingInterface>(maybePaddedOp->getOperation())); - padOps.append(newPadOps.begin(), newPadOps.end()); + paddedOps.push_back(paddedOp); + padOps.append(paddedOperands.begin(), paddedOperands.end()); + rewriter.replaceOp(targetOp.getOperation(), slicedResults); } results.set(cast<OpResult>(getPadded()), paddedOps); diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp index 0956c5d..3e787a2 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp @@ -95,10 +95,11 @@ static int64_t extractConstantMultiplier(AffineExpr expr) { /// - affine_map<(d0, d1) -> (d0 * 3 + d1)> /// In the future, more general interfaces can be devised to encode similar /// shape evolutions and map between an op and its operands. -SmallVector<OpFoldResult> linalg::computePaddedShape( - RewriterBase &rewriter, TypedValue<RankedTensorType> v, - AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes, - const PadTilingInterfaceOptions &options) { +SmallVector<OpFoldResult> +linalg::computePaddedShape(OpBuilder &builder, TypedValue<RankedTensorType> v, + AffineMap indexingMap, + ArrayRef<OpFoldResult> indexingSizes, + const PadTilingInterfaceOptions &options) { Location loc = v.getLoc(); SmallVector<OpFoldResult> paddedShape; auto tensorType = cast<RankedTensorType>(v.getType()); @@ -109,7 +110,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( // "Full-rank" padding specification. SmallVector<OpFoldResult> paddingSizes = - getFullRankPaddingSizes(rewriter, indexingSizes, options); + getFullRankPaddingSizes(builder, indexingSizes, options); // For each dimension in the operand's shape, iterate over indexingSizes and // add the various term contributions. 
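The next hunk's padToMultipleOf path builds ceilMap as d0.ceilDiv(s0) * s0, the usual round-up-to-a-multiple computation. A plain-integer sketch (assuming a non-negative size and a positive multiple):

#include <cstdint>

// Mirrors AffineMap::get(1, 1, d0.ceilDiv(s0) * s0).
static int64_t roundUpToMultiple(int64_t size, int64_t multiple) {
  return ((size + multiple - 1) / multiple) * multiple;
}

// E.g. with a multiple of 16 and an identity indexing map (constant
// multiplier 1), an indexing size of 20 rounds up to 32, the maximum
// accessed index term is 32 - 1 = 31, and the final padded size is
// 31 + 1 = 32.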
@@ -147,28 +148,27 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( OpFoldResult paddingDimOfr; if (options.padToMultipleOf) { AffineExpr d0, s0; - bindDims(rewriter.getContext(), d0); - bindSymbols(rewriter.getContext(), s0); + bindDims(builder.getContext(), d0); + bindSymbols(builder.getContext(), s0); AffineMap ceilMap = AffineMap::get(1, 1, d0.ceilDiv(s0) * s0); AffineMap composedMap = projectedMap.compose(ceilMap); paddingDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, composedMap, - {indexingSizes[paddingDim], paddingSize}, + builder, loc, composedMap, {indexingSizes[paddingDim], paddingSize}, /*composeAffineMin=*/true); } else { // Otherwise just set to paddingSize. paddingDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, projectedMap, paddingSize); + builder, loc, projectedMap, paddingSize); } // Adjust for the maximum accessed index, which is (paddingSize - 1) * // multiplier. AffineExpr d0; - bindDims(rewriter.getContext(), d0); + bindDims(builder.getContext(), d0); int64_t multiplier = extractConstantMultiplier(projectedMap.getResult(0)); AffineMap subtractMap = AffineMap::get(1, 0, d0 - multiplier); OpFoldResult maxAccessIdx = affine::makeComposedFoldedAffineApply( - rewriter, loc, subtractMap, {paddingDimOfr}); + builder, loc, subtractMap, {paddingDimOfr}); terms.push_back(maxAccessIdx); LLVM_DEBUG(DBGS() << "------new term: " << terms.back() << "\n"); @@ -177,19 +177,19 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( // If there are no terms, just return the dim. if (terms.empty()) { paddedShape[resultIndex] = - createFoldedDimOp(rewriter, loc, v, resultIndex); + createFoldedDimOp(builder, loc, v, resultIndex); continue; } // Sum individual terms' contributions. SmallVector<AffineExpr> dims(terms.size()); - bindDimsList(rewriter.getContext(), MutableArrayRef{dims}); + bindDimsList(builder.getContext(), MutableArrayRef{dims}); AffineExpr sumExpr = dims.front(); for (unsigned i = 1; i < dims.size(); ++i) sumExpr = sumExpr + dims[i]; // Add 1 to the maximum accessed index and get the final padded size. 
- OpFoldResult paddedDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, sumExpr + 1, terms); + OpFoldResult paddedDimOfr = + affine::makeComposedFoldedAffineApply(builder, loc, sumExpr + 1, terms); paddedShape[resultIndex] = paddedDimOfr; } @@ -198,7 +198,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( FailureOr<SmallVector<OpFoldResult>> linalg::computeIndexingMapOpInterfacePaddedShape( - RewriterBase &rewriter, OpOperand &operandToPad, + OpBuilder &builder, OpOperand &operandToPad, ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options) { auto transferOp = llvm::dyn_cast<IndexingMapOpInterface>(operandToPad.getOwner()); @@ -206,9 +206,9 @@ linalg::computeIndexingMapOpInterfacePaddedShape( return failure(); // clang-format off - assert(llvm::all_of(iterationDomain, [&rewriter](Range r) { - return r.offset == OpFoldResult(rewriter.getIndexAttr(0)) && - r.stride == OpFoldResult(rewriter.getIndexAttr(1)); + assert(llvm::all_of(iterationDomain, [&builder](Range r) { + return r.offset == OpFoldResult(builder.getIndexAttr(0)) && + r.stride == OpFoldResult(builder.getIndexAttr(1)); }) && "expected 0-offset 1-stride loop ranges"); // clang-format on SmallVector<OpFoldResult> loopUpperBounds; @@ -218,13 +218,13 @@ linalg::computeIndexingMapOpInterfacePaddedShape( AffineMap indexingMap = transferOp.getMatchingIndexingMap(&operandToPad); return computePaddedShape( - rewriter, cast<TypedValue<RankedTensorType>>(operandToPad.get()), + builder, cast<TypedValue<RankedTensorType>>(operandToPad.get()), indexingMap, loopUpperBounds, options); } /// Pad a single operand to `paddedShape` using `paddingValueAttr` as padding /// Value. -static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, +static Value padOperand(OpBuilder &builder, TilingInterface opToPad, TypedValue<RankedTensorType> v, ArrayRef<OpFoldResult> paddedShape, Attribute paddingValueAttr) { @@ -232,15 +232,15 @@ static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, if (auto complexTy = dyn_cast<ComplexType>(getElementTypeOrSelf(v.getType()))) { if (auto complexAttr = dyn_cast<ArrayAttr>(paddingValueAttr)) { - paddingValue = complex::ConstantOp::create(rewriter, opToPad.getLoc(), + paddingValue = complex::ConstantOp::create(builder, opToPad.getLoc(), complexTy, complexAttr); } } else if (isa<ub::PoisonAttr>(paddingValueAttr)) { - paddingValue = ub::PoisonOp::create(rewriter, opToPad.getLoc(), + paddingValue = ub::PoisonOp::create(builder, opToPad.getLoc(), getElementTypeOrSelf(v.getType())); } else if (auto typedAttr = dyn_cast<TypedAttr>(paddingValueAttr)) { paddingValue = - arith::ConstantOp::create(rewriter, opToPad.getLoc(), typedAttr); + arith::ConstantOp::create(builder, opToPad.getLoc(), typedAttr); } assert(paddingValue && "failed to create value from padding attribute"); @@ -259,49 +259,48 @@ static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, RankedTensorType::get(tensorShape, getElementTypeOrSelf(v)); LLVM_DEBUG(DBGS() << "--SUCCESS, makeComposedPadHighOp with type: " << paddedTensorType); - return makeComposedPadHighOp(rewriter, opToPad.getLoc(), paddedTensorType, v, + return makeComposedPadHighOp(builder, opToPad.getLoc(), paddedTensorType, v, paddingValue, /*nofold=*/false, dynDims); } -FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( - RewriterBase &rewriter, TilingInterface opToPad, - const PadTilingInterfaceOptions &constOptions, - SmallVector<tensor::PadOp> &padOps, +FailureOr<PadTilingInterfaceResult> 
linalg::rewriteAsPaddedOp( + OpBuilder &builder, TilingInterface toPad, + PadTilingInterfaceOptions options, const PadSizeComputationFunction &computePaddingSizeFun) { - LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << opToPad << "\n"); + LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << toPad << "\n"); + SmallVector<tensor::PadOp> padOps; + Location loc = toPad.getLoc(); - Location loc = opToPad.getLoc(); - PadTilingInterfaceOptions options(constOptions); // Allow inference of pad values if they are not explicitly specified. // TODO: be mindful about the value depending on the actual operation. if (options.paddingValues.empty()) { - SmallVector<Type> types(opToPad->getOperandTypes()); - llvm::append_range(types, opToPad->getResultTypes()); + SmallVector<Type> types(toPad->getOperandTypes()); + llvm::append_range(types, toPad->getResultTypes()); for (Type t : types) { options.paddingValues.push_back( - rewriter.getZeroAttr(getElementTypeOrSelf(t))); + builder.getZeroAttr(getElementTypeOrSelf(t))); } } - if (llvm::any_of(opToPad->getOperands(), + if (llvm::any_of(toPad->getOperands(), [](Value v) { return isa<MemRefType>(v.getType()); })) { - return rewriter.notifyMatchFailure(opToPad, - "expected operation on tensors"); + LLVM_DEBUG(DBGS() << "Not an operation on tensors: FAIL\n"); + return failure(); } - OpBuilder::InsertionGuard g(rewriter); - // Set IP after opToPad because we also take the dims of opToPad's output. - rewriter.setInsertionPointAfter(opToPad); + OpBuilder::InsertionGuard g(builder); + // Set IP after toPad because we also take the dims of toPad's output. + builder.setInsertionPointAfter(toPad); // 1. Get the loopUpperBounds from the TilingInterface. - SmallVector<Range> iterationDomain = opToPad.getIterationDomain(rewriter); + SmallVector<Range> iterationDomain = toPad.getIterationDomain(builder); // 2. For each operand. SmallVector<Value> newOperands; - newOperands.reserve(opToPad->getNumOperands()); - for (OpOperand &opOperand : opToPad->getOpOperands()) { + newOperands.reserve(toPad->getNumOperands()); + for (OpOperand &opOperand : toPad->getOpOperands()) { Value operand = opOperand.get(); - LLVM_DEBUG(DBGS() << "--start padding oprd: " << operand << "\n"); + LLVM_DEBUG(DBGS() << "--start padding operand: " << operand << "\n"); // 2.a. Skip scalar-like operands. Type operandType = operand.getType(); @@ -311,30 +310,31 @@ FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( newOperands.push_back(operand); continue; } + // 2.a. Compute padded shape. FailureOr<SmallVector<OpFoldResult>> maybePaddedShape = - computePaddingSizeFun(rewriter, opOperand, iterationDomain, options); + computePaddingSizeFun(builder, opOperand, iterationDomain, options); if (failed(maybePaddedShape)) { - return rewriter.notifyMatchFailure(opToPad, "could not pad op"); + LLVM_DEBUG(DBGS() << "Could not get padded shape of operand: FAIL\n"); + return failure(); } // 2.b. Expect proper `paddingValues`. // TODO: we may want to allow garbage padding in the future, in which case // we would just not assert. if (opOperand.getOperandNumber() >= options.paddingValues.size()) { - return rewriter.notifyMatchFailure(opToPad, - "--no padding value specified"); + LLVM_DEBUG(DBGS() << "Too few padding values specified: FAIL\n"); + return failure(); } Attribute paddingValueAttr = options.paddingValues[opOperand.getOperandNumber()]; // 2.c. Perform actual padding. 
- Value paddedOperand = padOperand( - rewriter, opToPad, cast<TypedValue<RankedTensorType>>(operand), - *maybePaddedShape, paddingValueAttr); + Value paddedOperand = + padOperand(builder, toPad, cast<TypedValue<RankedTensorType>>(operand), + *maybePaddedShape, paddingValueAttr); LLVM_DEBUG(DBGS() << "--done padding operand: " << paddedOperand << "\n"); - // 2.d. Perform actual padding. newOperands.push_back(paddedOperand); if (auto padOp = paddedOperand.getDefiningOp<tensor::PadOp>()) padOps.push_back(padOp); @@ -342,38 +342,34 @@ FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( // 3. Form the resulting tensor::ExtractSliceOp. ReifiedRankedShapedTypeDims reifiedResultShapes; - if (failed(reifyResultShapes(rewriter, opToPad, reifiedResultShapes))) { - LLVM_DEBUG(DBGS() << "--failed to reify result shapes -> FAIL\n"); - return rewriter.notifyMatchFailure(opToPad, - "failed to reify result shapes"); + if (failed(reifyResultShapes(builder, toPad, reifiedResultShapes))) { + LLVM_DEBUG(DBGS() << "Failed to reify result shapes: FAIL\n"); + return failure(); } - assert(reifiedResultShapes.size() == opToPad->getNumResults() && + assert(reifiedResultShapes.size() == toPad->getNumResults() && "expected same number of results"); - // Clone `opToPad` to operate on the statically padded shapes. + // Clone `toPad` to operate on the statically padded shapes. auto resultTensorTypes = - ValueRange(newOperands).take_back(opToPad->getNumResults()).getTypes(); - // clone **should** properly notify the rewriter. + ValueRange(newOperands).take_back(toPad->getNumResults()).getTypes(); + // clone **should** properly notify the builder. TilingInterface paddedOp = - clone(rewriter, opToPad, resultTensorTypes, newOperands); + clone(builder, toPad, resultTensorTypes, newOperands); LLVM_DEBUG(DBGS() << "--cloned padded op: " << paddedOp << "\n"); - // Recover the slice out of the new static results. This keeps the original - // opToPad around because it uses the dims of the original results. + // Recover the slice out of the new static results. 
SmallVector<Value> paddedSubtensorResults; - paddedSubtensorResults.reserve(opToPad->getNumResults()); + paddedSubtensorResults.reserve(toPad->getNumResults()); for (const auto &en : llvm::enumerate(paddedOp->getResults())) { Value paddedResult = en.value(); int64_t resultNumber = en.index(); int64_t rank = cast<RankedTensorType>(paddedResult.getType()).getRank(); - SmallVector<OpFoldResult> offsets(rank, rewriter.getIndexAttr(0)); - SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1)); + SmallVector<OpFoldResult> offsets(rank, builder.getIndexAttr(0)); + SmallVector<OpFoldResult> strides(rank, builder.getIndexAttr(1)); paddedSubtensorResults.push_back(tensor::ExtractSliceOp::create( - rewriter, loc, paddedResult, offsets, reifiedResultShapes[resultNumber], + builder, loc, paddedResult, offsets, reifiedResultShapes[resultNumber], strides)); } - rewriter.replaceOp(opToPad, paddedSubtensorResults); - - return paddedOp; + return PadTilingInterfaceResult{padOps, paddedOp, paddedSubtensorResults}; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 507597b..94947b7 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2158,11 +2158,45 @@ public: return success(); } }; + +struct ReinterpretCastOpConstantFolder + : public OpRewritePattern<ReinterpretCastOp> { +public: + using OpRewritePattern<ReinterpretCastOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(ReinterpretCastOp op, + PatternRewriter &rewriter) const override { + unsigned srcStaticCount = llvm::count_if( + llvm::concat<OpFoldResult>(op.getMixedOffsets(), op.getMixedSizes(), + op.getMixedStrides()), + [](OpFoldResult ofr) { return isa<Attribute>(ofr); }); + + SmallVector<OpFoldResult> offsets = {op.getConstifiedMixedOffset()}; + SmallVector<OpFoldResult> sizes = op.getConstifiedMixedSizes(); + SmallVector<OpFoldResult> strides = op.getConstifiedMixedStrides(); + + // TODO: Using counting comparison instead of direct comparison because + // getMixedValues (and therefore ReinterpretCastOp::getMixed...) returns + // IntegerAttrs, while constifyIndexValues (and therefore + // ReinterpretCastOp::getConstifiedMixed...) returns IndexAttrs. + if (srcStaticCount == + llvm::count_if(llvm::concat<OpFoldResult>(offsets, sizes, strides), + [](OpFoldResult ofr) { return isa<Attribute>(ofr); })) + return failure(); + + auto newReinterpretCast = ReinterpretCastOp::create( + rewriter, op->getLoc(), op.getSource(), offsets[0], sizes, strides); + + rewriter.replaceOpWithNewOp<CastOp>(op, op.getType(), newReinterpretCast); + return success(); + } +}; } // namespace void ReinterpretCastOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add<ReinterpretCastOpExtractStridedMetadataFolder>(context); + results.add<ReinterpretCastOpExtractStridedMetadataFolder, + ReinterpretCastOpConstantFolder>(context); } FailureOr<std::optional<SmallVector<Value>>> diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp index 49b7162..6f815ae 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp @@ -121,7 +121,7 @@ struct EmulateWideIntPass final [&typeConverter](Operation *op) { return typeConverter.isLegal(op); }); RewritePatternSet patterns(ctx); - // Add common pattenrs to support contants, functions, etc. + // Add common patterns to support contants, functions, etc. 
arith::populateArithWideIntEmulationPatterns(typeConverter, patterns); memref::populateMemRefWideIntEmulationPatterns(typeConverter, patterns); diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 642ced9..dcfe2c7 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -40,6 +40,16 @@ static bool isScalarLikeType(Type type) { return type.isIntOrIndexOrFloat() || isa<ComplexType>(type); } +/// Helper function to attach the `VarName` attribute to an operation +/// if a variable name is provided. +static void attachVarNameAttr(Operation *op, OpBuilder &builder, + StringRef varName) { + if (!varName.empty()) { + auto varNameAttr = acc::VarNameAttr::get(builder.getContext(), varName); + op->setAttr(acc::getVarNameAttrName(), varNameAttr); + } +} + struct MemRefPointerLikeModel : public PointerLikeType::ExternalModel<MemRefPointerLikeModel, MemRefType> { @@ -83,7 +93,9 @@ struct MemRefPointerLikeModel // then we can generate an alloca operation. if (memrefTy.hasStaticShape()) { needsFree = false; // alloca doesn't need deallocation - return memref::AllocaOp::create(builder, loc, memrefTy).getResult(); + auto allocaOp = memref::AllocaOp::create(builder, loc, memrefTy); + attachVarNameAttr(allocaOp, builder, varName); + return allocaOp.getResult(); } // For dynamic memrefs, extract sizes from the original variable if @@ -103,8 +115,10 @@ struct MemRefPointerLikeModel // Static dimensions are handled automatically by AllocOp } needsFree = true; // alloc needs deallocation - return memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes) - .getResult(); + auto allocOp = + memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes); + attachVarNameAttr(allocOp, builder, varName); + return allocOp.getResult(); } // TODO: Unranked not yet supported. @@ -1016,12 +1030,12 @@ struct RemoveConstantIfConditionWithRegion : public OpRewritePattern<OpTy> { //===----------------------------------------------------------------------===// /// Create and populate an init region for privatization recipes. -/// Returns the init block on success, or nullptr on failure. +/// Returns success if the region is populated, failure otherwise. /// Sets needsFree to indicate if the allocated memory requires deallocation. 
-static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, - Type varType, StringRef varName, - ValueRange bounds, - bool &needsFree) { +static LogicalResult createInitRegion(OpBuilder &builder, Location loc, + Region &initRegion, Type varType, + StringRef varName, ValueRange bounds, + bool &needsFree) { // Create init block with arguments: original value + bounds SmallVector<Type> argTypes{varType}; SmallVector<Location> argLocs{loc}; @@ -1030,9 +1044,9 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, argLocs.push_back(loc); } - auto initBlock = std::make_unique<Block>(); + Block *initBlock = builder.createBlock(&initRegion); initBlock->addArguments(argTypes, argLocs); - builder.setInsertionPointToStart(initBlock.get()); + builder.setInsertionPointToStart(initBlock); Value privatizedValue; @@ -1046,7 +1060,7 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, privatizedValue = mappableTy.generatePrivateInit( builder, loc, typedVar, varName, bounds, {}, needsFree); if (!privatizedValue) - return nullptr; + return failure(); } else { assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); auto pointerLikeTy = cast<PointerLikeType>(varType); @@ -1054,21 +1068,21 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, privatizedValue = pointerLikeTy.genAllocate(builder, loc, varName, varType, blockArgVar, needsFree); if (!privatizedValue) - return nullptr; + return failure(); } // Add yield operation to init block acc::YieldOp::create(builder, loc, privatizedValue); - return initBlock; + return success(); } /// Create and populate a copy region for firstprivate recipes. -/// Returns the copy block on success, or nullptr on failure. +/// Returns success if the region is populated, failure otherwise. /// TODO: Handle MappableType - it does not yet have a copy API. -static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, - Type varType, - ValueRange bounds) { +static LogicalResult createCopyRegion(OpBuilder &builder, Location loc, + Region ©Region, Type varType, + ValueRange bounds) { // Create copy block with arguments: original value + privatized value + // bounds SmallVector<Type> copyArgTypes{varType, varType}; @@ -1078,16 +1092,16 @@ static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, copyArgLocs.push_back(loc); } - auto copyBlock = std::make_unique<Block>(); + Block *copyBlock = builder.createBlock(©Region); copyBlock->addArguments(copyArgTypes, copyArgLocs); - builder.setInsertionPointToStart(copyBlock.get()); + builder.setInsertionPointToStart(copyBlock); bool isMappable = isa<MappableType>(varType); bool isPointerLike = isa<PointerLikeType>(varType); // TODO: Handle MappableType - it does not yet have a copy API. // Otherwise, for now just fallback to pointer-like behavior. if (isMappable && !isPointerLike) - return nullptr; + return failure(); // Generate copy region body based on variable type if (isPointerLike) { @@ -1099,21 +1113,20 @@ static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, if (!pointerLikeTy.genCopy( builder, loc, cast<TypedValue<PointerLikeType>>(privatizedArg), cast<TypedValue<PointerLikeType>>(originalArg), varType)) - return nullptr; + return failure(); } // Add terminator to copy block acc::TerminatorOp::create(builder, loc); - return copyBlock; + return success(); } /// Create and populate a destroy region for privatization recipes. 
-/// Returns the destroy block on success, or nullptr if not needed. -static std::unique_ptr<Block> createDestroyRegion(OpBuilder &builder, - Location loc, Type varType, - Value allocRes, - ValueRange bounds) { +/// Returns success if the region is populated, failure otherwise. +static LogicalResult createDestroyRegion(OpBuilder &builder, Location loc, + Region &destroyRegion, Type varType, + Value allocRes, ValueRange bounds) { // Create destroy block with arguments: original value + privatized value + // bounds SmallVector<Type> destroyArgTypes{varType, varType}; @@ -1123,28 +1136,25 @@ static std::unique_ptr<Block> createDestroyRegion(OpBuilder &builder, destroyArgLocs.push_back(loc); } - auto destroyBlock = std::make_unique<Block>(); + Block *destroyBlock = builder.createBlock(&destroyRegion); destroyBlock->addArguments(destroyArgTypes, destroyArgLocs); - builder.setInsertionPointToStart(destroyBlock.get()); + builder.setInsertionPointToStart(destroyBlock); - bool isMappable = isa<MappableType>(varType); - bool isPointerLike = isa<PointerLikeType>(varType); - // TODO: Handle MappableType - it does not yet have a deallocation API. - // Otherwise, for now just fallback to pointer-like behavior. - if (isMappable && !isPointerLike) - return nullptr; - - assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); - auto pointerLikeTy = cast<PointerLikeType>(varType); - auto privatizedArg = + auto varToFree = cast<TypedValue<PointerLikeType>>(destroyBlock->getArgument(1)); - // Pass allocRes to help determine the allocation type - if (!pointerLikeTy.genFree(builder, loc, privatizedArg, allocRes, varType)) - return nullptr; + if (isa<MappableType>(varType)) { + auto mappableTy = cast<MappableType>(varType); + if (!mappableTy.generatePrivateDestroy(builder, loc, varToFree)) + return failure(); + } else { + assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); + auto pointerLikeTy = cast<PointerLikeType>(varType); + if (!pointerLikeTy.genFree(builder, loc, varToFree, allocRes, varType)) + return failure(); + } acc::TerminatorOp::create(builder, loc); - - return destroyBlock; + return success(); } } // namespace @@ -1206,40 +1216,33 @@ PrivateRecipeOp::createAndPopulate(OpBuilder &builder, Location loc, if (!isMappable && !isPointerLike) return std::nullopt; - // Create init and destroy blocks using shared helpers OpBuilder::InsertionGuard guard(builder); - // Save the original insertion point for creating the recipe operation later - auto originalInsertionPoint = builder.saveInsertionPoint(); + // Create the recipe operation first so regions have proper parent context + auto recipe = PrivateRecipeOp::create(builder, loc, recipeName, varType); + // Populate the init region bool needsFree = false; - auto initBlock = - createInitRegion(builder, loc, varType, varName, bounds, needsFree); - if (!initBlock) + if (failed(createInitRegion(builder, loc, recipe.getInitRegion(), varType, + varName, bounds, needsFree))) { + recipe.erase(); return std::nullopt; + } // Only create destroy region if the allocation needs deallocation - std::unique_ptr<Block> destroyBlock; if (needsFree) { // Extract the allocated value from the init block's yield operation - auto yieldOp = cast<acc::YieldOp>(initBlock->getTerminator()); + auto yieldOp = + cast<acc::YieldOp>(recipe.getInitRegion().front().getTerminator()); Value allocRes = yieldOp.getOperand(0); - destroyBlock = createDestroyRegion(builder, loc, varType, allocRes, bounds); - if (!destroyBlock) + if (failed(createDestroyRegion(builder, 
loc, recipe.getDestroyRegion(), + varType, allocRes, bounds))) { + recipe.erase(); return std::nullopt; + } } - // Now create the recipe operation at the original insertion point and attach - // the blocks - builder.restoreInsertionPoint(originalInsertionPoint); - auto recipe = PrivateRecipeOp::create(builder, loc, recipeName, varType); - - // Move the blocks into the recipe's regions - recipe.getInitRegion().push_back(initBlock.release()); - if (destroyBlock) - recipe.getDestroyRegion().push_back(destroyBlock.release()); - return recipe; } @@ -1285,45 +1288,40 @@ FirstprivateRecipeOp::createAndPopulate(OpBuilder &builder, Location loc, if (!isMappable && !isPointerLike) return std::nullopt; - // Create init, copy, and destroy blocks using shared helpers OpBuilder::InsertionGuard guard(builder); - // Save the original insertion point for creating the recipe operation later - auto originalInsertionPoint = builder.saveInsertionPoint(); + // Create the recipe operation first so regions have proper parent context + auto recipe = FirstprivateRecipeOp::create(builder, loc, recipeName, varType); + // Populate the init region bool needsFree = false; - auto initBlock = - createInitRegion(builder, loc, varType, varName, bounds, needsFree); - if (!initBlock) + if (failed(createInitRegion(builder, loc, recipe.getInitRegion(), varType, + varName, bounds, needsFree))) { + recipe.erase(); return std::nullopt; + } - auto copyBlock = createCopyRegion(builder, loc, varType, bounds); - if (!copyBlock) + // Populate the copy region + if (failed(createCopyRegion(builder, loc, recipe.getCopyRegion(), varType, + bounds))) { + recipe.erase(); return std::nullopt; + } // Only create destroy region if the allocation needs deallocation - std::unique_ptr<Block> destroyBlock; if (needsFree) { // Extract the allocated value from the init block's yield operation - auto yieldOp = cast<acc::YieldOp>(initBlock->getTerminator()); + auto yieldOp = + cast<acc::YieldOp>(recipe.getInitRegion().front().getTerminator()); Value allocRes = yieldOp.getOperand(0); - destroyBlock = createDestroyRegion(builder, loc, varType, allocRes, bounds); - if (!destroyBlock) + if (failed(createDestroyRegion(builder, loc, recipe.getDestroyRegion(), + varType, allocRes, bounds))) { + recipe.erase(); return std::nullopt; + } } - // Now create the recipe operation at the original insertion point and attach - // the blocks - builder.restoreInsertionPoint(originalInsertionPoint); - auto recipe = FirstprivateRecipeOp::create(builder, loc, recipeName, varType); - - // Move the blocks into the recipe's regions - recipe.getInitRegion().push_back(initBlock.release()); - recipe.getCopyRegion().push_back(copyBlock.release()); - if (destroyBlock) - recipe.getDestroyRegion().push_back(destroyBlock.release()); - return recipe; } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 12e6475..7c019e7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -2032,11 +2032,19 @@ struct WarpOpScfForOp : public WarpDistributionPattern { } // Newly created `WarpOp` will yield values in following order: - // 1. All init args of the `ForOp`. - // 2. All escaping values. - // 3. All non-`ForOp` yielded values. + // 1. Loop bounds. + // 2. All init args of the `ForOp`. + // 3. All escaping values. + // 4. All non-`ForOp` yielded values. 
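+    // Editor's illustration (hypothetical values, not from the patch): with
+    // init args %a0, %a1, one escaping value %e0 and one extra yielded value
+    // %v0, the new WarpOp yields [%lb, %ub, %step, %a0, %a1, %e0, %v0], so
+    // the three loop bounds always occupy result indices 0-2.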
SmallVector<Value> newWarpOpYieldValues; SmallVector<Type> newWarpOpDistTypes; + newWarpOpYieldValues.insert( + newWarpOpYieldValues.end(), + {forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()}); + newWarpOpDistTypes.insert(newWarpOpDistTypes.end(), + {forOp.getLowerBound().getType(), + forOp.getUpperBound().getType(), + forOp.getStep().getType()}); for (auto [i, initArg] : llvm::enumerate(forOp.getInitArgs())) { newWarpOpYieldValues.push_back(initArg); // Compute the distributed type for this init arg. @@ -2072,20 +2080,24 @@ struct WarpOpScfForOp : public WarpDistributionPattern { // Next, we create a new `ForOp` with the init args yielded by the new // `WarpOp`. + const unsigned initArgsStartIdx = 3; // After loop bounds. const unsigned escapingValuesStartIdx = + initArgsStartIdx + forOp.getInitArgs().size(); // `ForOp` init args are positioned before // escaping values in the new `WarpOp`. SmallVector<Value> newForOpOperands; - for (size_t i = 0; i < escapingValuesStartIdx; ++i) + for (size_t i = initArgsStartIdx; i < escapingValuesStartIdx; ++i) newForOpOperands.push_back(newWarpOp.getResult(newIndices[i])); // Create a new `ForOp` outside the new `WarpOp` region. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointAfter(newWarpOp); auto newForOp = scf::ForOp::create( - rewriter, forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), - forOp.getStep(), newForOpOperands, /*bodyBuilder=*/nullptr, - forOp.getUnsignedCmp()); + rewriter, forOp.getLoc(), + /*lowerBound=*/newWarpOp.getResult(newIndices[0]), + /*upperBound=*/newWarpOp.getResult(newIndices[1]), + /*step=*/newWarpOp.getResult(newIndices[2]), newForOpOperands, + /*bodyBuilder=*/nullptr, forOp.getUnsignedCmp()); // Next, we insert a new `WarpOp` (called inner `WarpOp`) inside the // newly created `ForOp`. This `WarpOp` will contain all ops that were // contained within the original `ForOp` body. diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 1b656d8..ea93085 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -817,6 +817,50 @@ struct LinearizeVectorToElements final } }; +/// Convert broadcasts from scalars or 1-element vectors, such as +/// +/// ```mlir +/// vector.broadcast %value : f32 to vector<4x4xf32> +/// ``` +/// +/// to broadcasts to rank-1 vectors, with shape_casts before/after as needed.
+/// The above becomes, +/// +/// ```mlir +/// %out_1d = vector.broadcast %value : f32 to vector<16xf32> +/// %out_nd = vector.shape_cast %out_1d : vector<16xf32> to vector<4x4xf32> +/// ``` +struct LinearizeVectorBroadcast final + : public OpConversionPattern<vector::BroadcastOp> { + using Base::Base; + + LinearizeVectorBroadcast(const TypeConverter &typeConverter, + MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit) {} + + LogicalResult + matchAndRewrite(vector::BroadcastOp broadcastOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + int numElements = 1; + Type sourceType = broadcastOp.getSourceType(); + if (auto vecType = dyn_cast<VectorType>(sourceType)) { + numElements = vecType.getNumElements(); + } + + if (numElements != 1) { + return rewriter.notifyMatchFailure( + broadcastOp, "only broadcasts of single elements can be linearized."); + } + + auto dstTy = getTypeConverter()->convertType(broadcastOp.getType()); + rewriter.replaceOpWithNewOp<vector::BroadcastOp>(broadcastOp, dstTy, + adaptor.getSource()); + + return success(); + } +}; + } // namespace /// This method defines the set of operations that are linearizable, and hence @@ -909,8 +953,8 @@ void mlir::vector::populateVectorLinearizeBasePatterns( patterns .add<LinearizeConstantLike, LinearizeVectorizable, LinearizeVectorBitCast, LinearizeVectorCreateMask, LinearizeVectorLoad, LinearizeVectorStore, - LinearizeVectorFromElements, LinearizeVectorToElements>( - typeConverter, patterns.getContext()); + LinearizeVectorBroadcast, LinearizeVectorFromElements, + LinearizeVectorToElements>(typeConverter, patterns.getContext()); } void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns( diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 025ee9a..c809c502 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -91,7 +91,7 @@ mlir::vector::isTranspose2DSlice(vector::TransposeOp op) { // Check whether the two source vector dimensions that are greater than one // must be transposed with each other so that we can apply one of the 2-D - // transpose pattens. Otherwise, these patterns are not applicable. + // transpose patterns. Otherwise, these patterns are not applicable. 
if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1], op.getPermutation())) return failure(); diff --git a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp index 89b62a2..a514ea9 100644 --- a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp +++ b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp @@ -12,6 +12,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/Region.h" #include "mlir/IR/SymbolTable.h" @@ -39,28 +40,6 @@ void printElseRegion(OpAsmPrinter &opPrinter, Operation *op, opPrinter.printKeywordOrString("else "); opPrinter.printRegion(elseRegion); } - -ParseResult parseWasmVisibility(OpAsmParser &opParser, StringAttr &visibility) { - std::string keyword; - auto initLocation = opParser.getCurrentLocation(); - std::ignore = opParser.parseOptionalKeywordOrString(&keyword); - if (keyword == "nested" or keyword == "") { - visibility = StringAttr::get(opParser.getContext(), "nested"); - return ParseResult::success(); - } - - if (keyword == "public" || keyword == "private") { - visibility = StringAttr::get(opParser.getContext(), keyword); - return ParseResult::success(); - } - opParser.emitError(initLocation, "expecting symbol visibility"); - return ParseResult::failure(); -} - -void printWasmVisibility(OpAsmPrinter &opPrinter, Operation *op, - Attribute visibility) { - opPrinter.printKeywordOrString(cast<StringAttr>(visibility).strref()); -} } // namespace #define GET_OP_CLASSES @@ -167,10 +146,23 @@ Block *FuncOp::addEntryBlock() { void FuncOp::build(OpBuilder &odsBuilder, OperationState &odsState, StringRef symbol, FunctionType funcType) { - FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {}, "nested"); + FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {}); } ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) { + auto *ctx = parser.getContext(); + std::string visibilityString; + auto loc = parser.getNameLoc(); + ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString); + bool exported{false}; + if (res.succeeded()) { + if (visibilityString != "exported") + return parser.emitError( + loc, "expecting either `exported` or symbol name, got ") + << visibilityString; + exported = true; + } + auto buildFuncType = [&parser](Builder &builder, ArrayRef<Type> argTypes, ArrayRef<Type> results, function_interface_impl::VariadicFlag, @@ -191,11 +183,13 @@ ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) { return builder.getFunctionType(argTypesWithoutLocal, results); }; - - return function_interface_impl::parseFunctionOp( + auto funcParseRes = function_interface_impl::parseFunctionOp( parser, result, /*allowVariadic=*/false, getFunctionTypeAttrName(result.name), buildFuncType, getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); + if (exported) + result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx)); + return funcParseRes; } LogicalResult FuncOp::verifyBody() { @@ -224,9 +218,18 @@ LogicalResult FuncOp::verifyBody() { } void FuncOp::print(OpAsmPrinter &p) { + /// If exported, print the `exported` keyword first and mask the attribute /// while printing through the generic function interface.
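+  /// Editor's illustration (hypothetical assembly): an exported function now
+  /// round-trips as `wasmssa.func exported @my_func ...` instead of carrying
+  /// a `sym_visibility` string attribute as before.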
+ auto exported = getExported(); + if (exported) { + p << " exported"; + removeExportedAttr(); + } function_interface_impl::printFunctionOp( p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), getArgAttrsAttrName(), getResAttrsAttrName()); + if (exported) + setExported(true); } //===----------------------------------------------------------------------===// @@ -237,38 +240,37 @@ void FuncImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, StringRef symbol, StringRef moduleName, StringRef importName, FunctionType type) { FuncImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, {}, {}, odsBuilder.getStringAttr("nested")); + type, {}, {}); } //===----------------------------------------------------------------------===// // GlobalOp //===----------------------------------------------------------------------===// - -void GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, Type type, bool isMutable) { - GlobalOp::build(odsBuilder, odsState, symbol, type, isMutable, - odsBuilder.getStringAttr("nested")); -} - // Custom formats ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) { StringAttr symbolName; Type globalType; auto *ctx = parser.getContext(); - ParseResult res = parser.parseSymbolName( - symbolName, SymbolTable::getSymbolAttrName(), result.attributes); + std::string visibilityString; + auto loc = parser.getNameLoc(); + ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString); + if (res.succeeded()) { + if (visibilityString != "exported") + return parser.emitError( + loc, "expecting either `exported` or symbol name, got ") + << visibilityString; + result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx)); + } + res = parser.parseSymbolName(symbolName, SymbolTable::getSymbolAttrName(), + result.attributes); res = parser.parseType(globalType); result.addAttribute(getTypeAttrName(result.name), TypeAttr::get(globalType)); std::string mutableString; res = parser.parseOptionalKeywordOrString(&mutableString); if (res.succeeded() && mutableString == "mutable") result.addAttribute("isMutable", UnitAttr::get(ctx)); - std::string visibilityString; - res = parser.parseOptionalKeywordOrString(&visibilityString); - if (res.succeeded()) - result.addAttribute("sym_visibility", - StringAttr::get(ctx, visibilityString)); + res = parser.parseColon(); Region *globalInitRegion = result.addRegion(); res = parser.parseRegion(*globalInitRegion); @@ -276,11 +278,11 @@ ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) { } void GlobalOp::print(OpAsmPrinter &printer) { + if (getExported()) + printer << " exported"; printer << " @" << getSymName().str() << " " << getType(); if (getIsMutable()) printer << " mutable"; - if (auto vis = getSymVisibility()) - printer << " " << *vis; printer << " :"; Region &body = getRegion(); if (!body.empty()) { @@ -319,13 +321,6 @@ GlobalGetOp::verifySymbolUses(SymbolTableCollection &symbolTable) { // GlobalImportOp //===----------------------------------------------------------------------===// -void GlobalImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, Type type, bool isMutable) { - GlobalImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, isMutable, odsBuilder.getStringAttr("nested")); -} - ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) { auto *ctx = parser.getContext(); ParseResult res =
parseImportOp(parser, result); @@ -335,12 +330,8 @@ ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) { res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString); if (res.succeeded() && mutableOrSymVisString == "mutable") { result.addAttribute("isMutable", UnitAttr::get(ctx)); - res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString); } - if (res.succeeded()) - result.addAttribute("sym_visibility", - StringAttr::get(ctx, mutableOrSymVisString)); res = parser.parseColon(); Type importedType; @@ -356,8 +347,6 @@ void GlobalImportOp::print(OpAsmPrinter &printer) { << "\" as @" << getSymName(); if (getIsMutable()) printer << " mutable"; - if (auto vis = getSymVisibility()) - printer << " " << *vis; printer << " : " << getType(); } @@ -431,27 +420,6 @@ LogicalResult LocalTeeOp::verify() { Block *LoopOp::getLabelTarget() { return &getBody().front(); } //===----------------------------------------------------------------------===// -// MemOp -//===----------------------------------------------------------------------===// - -void MemOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, LimitType limit) { - MemOp::build(odsBuilder, odsState, symbol, limit, - odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// -// MemImportOp -//===----------------------------------------------------------------------===// - -void MemImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, LimitType limits) { - MemImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - limits, odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// // ReinterpretOp //===----------------------------------------------------------------------===// @@ -471,24 +439,3 @@ LogicalResult ReinterpretOp::verify() { //===----------------------------------------------------------------------===// void ReturnOp::build(OpBuilder &odsBuilder, OperationState &odsState) {} - -//===----------------------------------------------------------------------===// -// TableOp -//===----------------------------------------------------------------------===// - -void TableOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, TableType type) { - TableOp::build(odsBuilder, odsState, symbol, type, - odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// -// TableImportOp -//===----------------------------------------------------------------------===// - -void TableImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, TableType type) { - TableImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, odsBuilder.getStringAttr("nested")); -} diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 9beb22d..1599ae9 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -727,6 +727,152 @@ void MemLayoutAttr::print(AsmPrinter &printer) const { } printer << ">"; } +// a helper utility to perform binary operation on OpFoldResult. +// If both a and b are attributes, it will simply return the result. 
+// Otherwise, the corresponding arith op will be generated, and a +// constant op will be created if one of them is an attribute. +template <typename ArithOp> +OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc, + OpBuilder &builder) { + auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a); + auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b); + return builder.create<ArithOp>(loc, aVal, bVal).getResult(); +} + +// a helper utility to perform a division operation on OpFoldResult and int64_t. +#define div(a, b) \ + genBinOp<arith::DivSIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform a remainder operation on OpFoldResult and int64_t. +#define rem(a, b) \ + genBinOp<arith::RemSIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform a multiplication operation on OpFoldResult and int64_t. +#define mul(a, b) \ + genBinOp<arith::MulIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform an addition operation on two OpFoldResults. +#define add(a, b) genBinOp<arith::AddIOp>(a, b, loc, builder) + +// block the given offsets according to the block shape +// say the original offset is [y, x], and the block shape is [By, Bx], +// then the blocked offset is [y/By, x/Bx, y%By, x%Bx] +SmallVector<OpFoldResult> getBlockedOffsets(OpBuilder &builder, Location loc, + ArrayRef<OpFoldResult> offsets, + ArrayRef<int64_t> blockShape) { + + assert(offsets.size() == blockShape.size() && + "offsets and blockShape must have the same size"); + SmallVector<OpFoldResult> blockedOffsets; + SmallVector<OpFoldResult> divs, rems; + + for (auto [offset, block] : llvm::zip(offsets, blockShape)) { + divs.push_back(div(offset, block)); + rems.push_back(rem(offset, block)); + } + blockedOffsets.append(divs.begin(), divs.end()); + blockedOffsets.append(rems.begin(), rems.end()); + + return blockedOffsets; +} + +// Get strides as a vector of integers for MemDesc. +SmallVector<int64_t> MemDescType::getStrideShape() { + + SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end()); + + ArrayAttr strideAttr = getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + + SmallVector<int64_t> innerBlkShape = getBlockShape(); + + // get perm from FCD to LCD + // perm[i] = the dim with i-th smallest stride + SmallVector<int, 4> perm = + llvm::to_vector<4>(llvm::seq<int>(0, strides.size())); + llvm::sort(perm, [&](int a, int b) { return strides[a] < strides[b]; }); + + assert(strides[perm[0]] == 1 && "innermost dim must have stride 1"); + + SmallVector<int64_t> innerBlkStride(innerBlkShape.size()); + innerBlkStride[perm[0]] = 1; + for (size_t i = 1; i < perm.size(); ++i) + innerBlkStride[perm[i]] = + innerBlkStride[perm[i - 1]] * innerBlkShape[perm[i - 1]]; + + // compute the original matrix shape using the stride info + // and compute the number of blocks in each dimension + // The shape of the highest dim can't be derived from stride info, + // but doesn't impact the stride computation for blocked layout.
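+  // Worked example (editor's illustration, values invented): a 32x64
+  // mem_desc with row-major strides [64, 1] and block shape [8, 16] gives
+  // perm = [1, 0] and innerBlkStride = [16, 1]; dim 1 holds 4 blocks of
+  // 128 elements each, so outerBlkStride = [512, 128] and the returned
+  // blocked strides are [512, 128, 16, 1].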
+ SmallVector<int64_t> matrixShapeOrig(matrixShape.size()); + SmallVector<int64_t> BlkShapeOrig(matrixShape.size()); + for (size_t i = 0; i < perm.size() - 1; ++i) { + matrixShapeOrig[perm[i]] = strides[perm[i + 1]] / strides[perm[i]]; + BlkShapeOrig[perm[i]] = matrixShapeOrig[perm[i]] / innerBlkShape[perm[i]]; + } + + int64_t innerBlkSize = 1; + for (auto s : innerBlkShape) + innerBlkSize *= s; + + SmallVector<int64_t> outerBlkStride(matrixShape.size()); + outerBlkStride[perm[0]] = innerBlkSize; + for (size_t i = 0; i < perm.size() - 1; ++i) { + outerBlkStride[perm[i + 1]] = + outerBlkStride[perm[i]] * BlkShapeOrig[perm[i]]; + } + + // combine the inner and outer strides + SmallVector<int64_t> blockedStrides; + blockedStrides.append(outerBlkStride.begin(), outerBlkStride.end()); + blockedStrides.append(innerBlkStride.begin(), innerBlkStride.end()); + + return blockedStrides; +} + +// Calculate the linear offset using the blocked offsets and stride +Value MemDescType::getLinearOffsets(OpBuilder &builder, Location loc, + ArrayRef<OpFoldResult> offsets) { + + SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end()); + SmallVector<int64_t> blockShape = getBlockShape(); + SmallVector<int64_t> strides = getStrideShape(); + SmallVector<OpFoldResult> blockedOffsets; + + // blockshape equal to matrixshape means no blocking + if (llvm::equal(blockShape, matrixShape)) { + // remove the outer dims from strides + strides.erase(strides.begin(), strides.begin() + matrixShape.size()); + } else { + assert(offsets.size() == blockShape.size() && + "offsets and blockShape must have the same size"); + // say the original offset is [y, x], and the block shape is [By, Bx], + // then the blocked offset is [y/By, x/Bx, y%By, x%Bx] + + SmallVector<OpFoldResult> divs, rems; + + for (auto [offset, block] : llvm::zip(offsets, blockShape)) { + divs.push_back(div(offset, block)); + rems.push_back(rem(offset, block)); + } + blockedOffsets.append(divs.begin(), divs.end()); + blockedOffsets.append(rems.begin(), rems.end()); + offsets = blockedOffsets; + } + + // Start with initial value as matrix descriptor's base offset. 
+ Value linearOffset = arith::ConstantIndexOp::create(builder, loc, 0); + for (size_t i = 0; i < offsets.size(); ++i) { + OpFoldResult mulResult = mul(offsets[i], strides[i]); + Value mulVal = getValueOrCreateConstantIndexOp(builder, loc, mulResult); + linearOffset = arith::AddIOp::create(builder, loc, mulVal, linearOffset); + } + + return linearOffset; +} } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index e0a8ac4..abd12e2 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -173,6 +173,49 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, return success(); } +LogicalResult +isValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, + UnitAttr subgroup_block_io, + function_ref<InFlightDiagnostic()> emitError) { + + if (!dataTy) { + if (subgroup_block_io) + return emitError() << "subgroup_block_io " + "is only allowed when the result is a 1D VectorType."; + else + return success(); + } + + if (mdescTy.getRank() != 2) + return emitError() << "mem_desc must be 2D."; + + ArrayRef<int64_t> dataShape = dataTy.getShape(); + ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + + if (dataShape.size() == 2) { + if (subgroup_block_io) + return emitError() << "subgroup_block_io " + "is only allowed when the result is a 1D VectorType."; + if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), + [](auto p) { return std::get<0>(p) > std::get<1>(p); })) + return emitError() << "data shape must not exceed mem_desc shape."; + } else { + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + // if the subgroup_block_io attribute is set, mdescTy must have a block + // attribute + if (subgroup_block_io && !blockShape.size()) + return emitError() << "mem_desc must have a block attribute when " + "subgroup_block_io is set."; + // if the subgroup_block_io attribute is set, the memdesc should be row + // major + if (subgroup_block_io && mdescTy.isColMajor()) + return emitError() << "mem_desc should be row major when " + "subgroup_block_io is set."; + } + + return success(); +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -1049,23 +1092,20 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res, llvm::SmallVector<int64_t> staticOffsets; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + // Call the generated builder with all parameters (including optional ones as + // nullptr/empty) build(builder, state, res, memDesc, dynamicOffsets, staticOffsetsAttr, - layout); + /*subgroup_block_io=*/nullptr, layout); } LogicalResult LoadMatrixOp::verify() { - VectorType resTy = getRes().getType(); - MemDescType mdescTy = getMemDesc().getType(); - if (mdescTy.getRank() != 2) - return emitOpError("mem_desc must be 2D."); + auto resTy = dyn_cast<VectorType>(getRes().getType()); + UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); + MemDescType mdescTy = getMemDesc().getType(); - ArrayRef<int64_t> valueShape = resTy.getShape(); - ArrayRef<int64_t> mdescShape = mdescTy.getShape(); - if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("result shape must not exceed mem_desc shape."); - return success(); + return isValidMatrixOpParams(resTy,
mdescTy, subgroup_block_io, + [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1080,57 +1120,16 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data, dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); build(builder, state, data, memDesc, dynamicOffsets, staticOffsetsAttr, - layout); + /*subgroup_block_io=*/nullptr, layout); } LogicalResult StoreMatrixOp::verify() { - VectorType dataTy = getData().getType(); - MemDescType mdescTy = getMemDesc().getType(); - - if (mdescTy.getRank() != 2) - return emitOpError("mem_desc must be 2D."); - - ArrayRef<int64_t> dataShape = dataTy.getShape(); - ArrayRef<int64_t> mdescShape = mdescTy.getShape(); - if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("data shape must not exceed mem_desc shape."); - - return success(); -} - -//===----------------------------------------------------------------------===// -// XeGPU_MemDescSubviewOp -//===----------------------------------------------------------------------===// - -void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state, - Type resTy, Value src, - llvm::ArrayRef<OpFoldResult> offsets) { - llvm::SmallVector<Value> dynamicOffsets; - llvm::SmallVector<int64_t> staticOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); - build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr); -} - -LogicalResult MemDescSubviewOp::verify() { - MemDescType srcTy = getSrc().getType(); - MemDescType resTy = getRes().getType(); - ArrayRef<int64_t> srcShape = srcTy.getShape(); - ArrayRef<int64_t> resShape = resTy.getShape(); - if (srcTy.getRank() < resTy.getRank()) - return emitOpError("result rank must not exceed source rank."); - - if (llvm::any_of( - llvm::zip_equal(resShape, srcShape.take_back(resShape.size())), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("result shape must not exceed source shape."); - - if (srcTy.getStrides() != resTy.getStrides()) - return emitOpError("result must inherit the source strides."); - - return success(); + auto dataTy = dyn_cast<VectorType>(getData().getType()); + UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); + MemDescType mdescTy = getMemDesc().getType(); + return isValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, + [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index a178d0f..aafa1b7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -941,7 +941,9 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> { LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); - VectorType valueTy = op.getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType()); + assert(valueTy && "the value type must be a vector type!"); + std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op); if (!targetShape || targetShape->size() != (size_t)valueTy.getRank()) return failure(); @@ -984,7 +986,8 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> { return
failure(); Location loc = op.getLoc(); - VectorType valueTy = op.getData().getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType()); + assert(valueTy && "the value type must be a vector type!"); ArrayRef<int64_t> shape = valueTy.getShape(); auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index c28d2fc..31a967d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -991,7 +991,8 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> { return failure(); ArrayRef<int64_t> wgShape = op.getDataShape(); - VectorType valueTy = op.getRes().getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getRes().getType()); + assert(valueTy && "the value type must be a vector type!"); Type elemTy = valueTy.getElementType(); xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index c67b242..dd413d2de 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -98,4 +98,5 @@ void mlir::registerAllPasses() { sparse_tensor::registerSparseTensorPipelines(); tosa::registerTosaToLinalgPipelines(); gpu::registerGPUToNVVMPipeline(); + gpu::registerGPUToXeVMPipeline(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 1e2099d..8de49dd 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -246,7 +246,7 @@ public: // Rewrite all uses of the original variable in `BBName` // with the linear variable in-place - void rewriteInPlace(llvm::IRBuilderBase &builder, std::string BBName, + void rewriteInPlace(llvm::IRBuilderBase &builder, const std::string &BBName, size_t varIndex) { llvm::SmallVector<llvm::User *> users; for (llvm::User *user : linearOrigVal[varIndex]->users()) diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 0c3e87a..d9ad8fb 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -2619,6 +2619,11 @@ LogicalResult ControlFlowStructurizer::structurize() { // region. We cannot handle such cases given that once a value is sinked into // the SelectionOp/LoopOp's region, there is no escape for it.
for (auto *block : constructBlocks) { + if (!block->use_empty()) + return emitError(block->getParent()->getLoc(), + "failed control flow structurization: " + "block has uses outside of the " + "enclosing selection/loop construct"); for (Operation &op : *block) if (!op.use_empty()) return op.emitOpError("failed control flow structurization: value has " diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp index 51c6077..366ba8f 100644 --- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp +++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp @@ -14,6 +14,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Location.h" #include "mlir/Support/LLVM.h" @@ -138,6 +139,10 @@ using ImportDesc = using parsed_inst_t = FailureOr<SmallVector<Value>>; +struct EmptyBlockMarker {}; +using BlockTypeParseResult = + std::variant<EmptyBlockMarker, TypeIdxRecord, Type>; + struct WasmModuleSymbolTables { SmallVector<FunctionSymbolRefContainer> funcSymbols; SmallVector<GlobalSymbolRefContainer> globalSymbols; @@ -175,6 +180,9 @@ class ParserHead; /// Wrapper around SmallVector to only allow access as push and pop on the /// stack. Makes sure that there are no "free accesses" on the stack to preserve /// its state. +/// This class also keeps track of the Wasm labels defined by different ops, +/// which can be targeted by control flow ops. This can be modeled as part of +/// the Value Stack as Wasm control flow ops can only target enclosing labels. class ValueStack { private: struct LabelLevel { @@ -206,6 +214,16 @@ public: /// if an error occurs. LogicalResult pushResults(ValueRange results, Location *opLoc); + void addLabelLevel(LabelLevelOpInterface levelOp) { + labelLevel.push_back({values.size(), levelOp}); + LDBG() << "Adding a new frame context to ValueStack"; + } + + void dropLabelLevel() { + assert(!labelLevel.empty() && "Trying to drop a frame from empty context"); + auto newSize = labelLevel.pop_back_val().stackIdx; + values.truncate(newSize); + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// A simple dump function for debugging. /// Writes output to llvm::dbgs(). @@ -214,6 +232,7 @@ public: private: SmallVector<Value> values; + SmallVector<LabelLevel> labelLevel; }; using local_val_t = TypedValue<wasmssa::LocalRefType>; @@ -248,6 +267,19 @@ private: buildNumericOp(OpBuilder &builder, std::enable_if_t<std::is_arithmetic_v<valueType>> * = nullptr); + /// Construct a conversion operation of type \p opType that takes a value of + /// type \p inputType from the stack and will produce a value of type + /// \p outputType. + /// + /// \p opType - The WASM dialect operation to build. + /// \p inputType - The operand type for the built instruction. + /// \p outputType - The result type for the built instruction. + /// + /// \returns The parsed instruction result, or failure. + template <typename opType, typename inputType, typename outputType, + typename... extraArgsT> + inline parsed_inst_t buildConvertOp(OpBuilder &builder, extraArgsT...); + /// This function generates a dispatch tree to associate an opcode with a /// parser. Parsers are registered by specialising the /// `parseSpecificInstruction` function for the op code to handle.
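[Editor's note] A standalone sketch (not part of the patch) of the specialization-plus-dispatch idiom the comment above describes. Opcode 0x41 really is Wasm's i32.const, but the simplified signatures and names here are invented for illustration:

#include <cstddef>
#include <cstdio>

// Primary template: opcodes without a registered parser report an error.
template <std::byte OpCode>
int parseSpecificInstruction() {
  std::printf("unhandled opcode 0x%02x\n", static_cast<unsigned>(OpCode));
  return -1;
}

// Registering a parser amounts to specializing for the opcode.
template <>
int parseSpecificInstruction<std::byte{0x41}>() { // i32.const
  std::printf("parsed i32.const\n");
  return 0;
}

// A dispatch tree over a runtime opcode byte, generated at compile time.
template <unsigned Candidate = 0>
int dispatch(std::byte opcode) {
  if (opcode == std::byte{Candidate})
    return parseSpecificInstruction<std::byte{Candidate}>();
  if constexpr (Candidate < 255)
    return dispatch<Candidate + 1>(opcode);
  return -1;
}

int main() { return dispatch(std::byte{0x41}); } // prints "parsed i32.const"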
@@ -280,11 +312,105 @@ private: } } + /// + /// RAII guard class for creating a nesting level + /// + struct NestingContextGuard { + NestingContextGuard(ExpressionParser &parser, LabelLevelOpInterface levelOp) + : parser{parser} { + parser.addNestingContextLevel(levelOp); + } + NestingContextGuard(NestingContextGuard &&other) : parser{other.parser} { + other.shouldDropOnDestruct = false; + } + NestingContextGuard(NestingContextGuard const &) = delete; + ~NestingContextGuard() { + if (shouldDropOnDestruct) + parser.dropNestingContextLevel(); + } + ExpressionParser &parser; + bool shouldDropOnDestruct = true; + }; + + void addNestingContextLevel(LabelLevelOpInterface levelOp) { + valueStack.addLabelLevel(levelOp); + } + + void dropNestingContextLevel() { + // Should always succeed as we are dropping the frame that was previously + // created. + valueStack.dropLabelLevel(); + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + EmptyBlockMarker) { + return builder.getFunctionType({}, {}); + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + TypeIdxRecord type) { + if (type.id >= symbols.moduleFuncTypes.size()) + return emitError(*currentOpLoc, + "type index references nonexistent type (") + << type.id << "). Only " << symbols.moduleFuncTypes.size() + << " types are registered"; + return symbols.moduleFuncTypes[type.id]; + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + Type valType) { + return builder.getFunctionType({}, {valType}); + } + + llvm::FailureOr<FunctionType> + getFuncTypeFor(OpBuilder &builder, BlockTypeParseResult parseResult) { + return std::visit( + [this, &builder](auto value) { return getFuncTypeFor(builder, value); }, + parseResult); + } + + llvm::FailureOr<FunctionType> + getFuncTypeFor(OpBuilder &builder, + llvm::FailureOr<BlockTypeParseResult> parseResult) { + if (llvm::failed(parseResult)) + return failure(); + return getFuncTypeFor(builder, *parseResult); + } + + llvm::FailureOr<FunctionType> parseBlockFuncType(OpBuilder &builder); + struct ParseResultWithInfo { SmallVector<Value> opResults; std::byte endingByte; }; + template <typename FilterT = ByteSequence<WasmBinaryEncoding::endByte>> + /// @param blockToFill: the block whose content will be populated + /// @param resTypes: the types that this block is supposed to return + llvm::FailureOr<std::byte> + parseBlockContent(OpBuilder &builder, Block *blockToFill, TypeRange resTypes, + Location opLoc, LabelLevelOpInterface levelOp, + FilterT parseEndBytes = {}) { + OpBuilder::InsertionGuard guard{builder}; + builder.setInsertionPointToStart(blockToFill); + LDBG() << "parsing a block of type " + << builder.getFunctionType(blockToFill->getArgumentTypes(), + resTypes); + auto nC = addNesting(levelOp); + + if (failed(pushResults(blockToFill->getArguments()))) + return failure(); + auto bodyParsingRes = parse(builder, parseEndBytes); + if (failed(bodyParsingRes)) + return failure(); + auto returnOperands = popOperands(resTypes); + if (failed(returnOperands)) + return failure(); + builder.create<BlockReturnOp>(opLoc, *returnOperands); + LDBG() << "end of parsing of a block"; + return bodyParsingRes->endingByte; + } + public: template <std::byte ParseEndByte = WasmBinaryEncoding::endByte> parsed_inst_t parse(OpBuilder &builder, UniqueByte<ParseEndByte> = {}); @@ -294,7 +420,11 @@ public: parse(OpBuilder &builder, ByteSequence<ExpressionParseEnd...> parsingEndFilters); - FailureOr<SmallVector<Value>> popOperands(TypeRange operandTypes) { + NestingContextGuard
addNesting(LabelLevelOpInterface levelOp) { + return NestingContextGuard{*this, levelOp}; + } + + FailureOr<llvm::SmallVector<Value>> popOperands(TypeRange operandTypes) { return valueStack.popOperands(operandTypes, &currentOpLoc.value()); } @@ -308,6 +438,12 @@ public: template <typename OpToCreate> parsed_inst_t parseSetOrTee(OpBuilder &); + /// Blocks and Loops have a similar format and differ only in how their exit + /// is handled, which doesn't matter at parsing time. They are factored into + /// one function. + template <typename OpToCreate> + parsed_inst_t parseBlockLikeOp(OpBuilder &); + private: std::optional<Location> currentOpLoc; ParserHead &parser; @@ -586,6 +722,29 @@ public: return success(); } + llvm::FailureOr<BlockTypeParseResult> parseBlockType(MLIRContext *ctx) { + auto loc = getLocation(); + auto blockIndicator = peek(); + if (failed(blockIndicator)) + return failure(); + if (*blockIndicator == WasmBinaryEncoding::Type::emptyBlockType) { + offset += 1; + return {EmptyBlockMarker{}}; + } + if (isValueOneOf(*blockIndicator, valueTypesEncodings)) + return parseValueType(ctx); + /// Block type idx is a 32-bit positive integer encoded as a 33-bit signed + /// value + auto typeIdx = parseI64(); + if (failed(typeIdx)) + return failure(); + if (*typeIdx < 0 || *typeIdx > std::numeric_limits<uint32_t>::max()) + return emitError(loc, "type ID should be representable with an unsigned " + "32-bit integer, got ") + << *typeIdx; + return {TypeIdxRecord{static_cast<uint32_t>(*typeIdx)}}; + } + bool end() const { return curHead().empty(); } ParserHead copy() const { return *this; } @@ -701,17 +860,41 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction(OpBuilder &) { void ValueStack::dump() const { llvm::dbgs() << "================= Wasm ValueStack =======================\n"; llvm::dbgs() << "size: " << size() << "\n"; + llvm::dbgs() << "nbFrames: " << labelLevel.size() << '\n'; llvm::dbgs() << "<Top>" << "\n"; // Stack is pushed to via push_back. Therefore the top of the stack is the // end of the vector. Iterate in reverse so that the first thing we print // is the top of the stack. + auto indexGetter = [this]() { + size_t idx = labelLevel.size(); + return [this, idx]() mutable -> std::optional<std::pair<size_t, size_t>> { + llvm::dbgs() << "IDX: " << idx << '\n'; + if (idx == 0) + return std::nullopt; + auto frameId = idx - 1; + auto frameLimit = labelLevel[frameId].stackIdx; + idx -= 1; + return {{frameId, frameLimit}}; + }; + }; + auto getNextFrameIndex = indexGetter(); + auto nextFrameIdx = getNextFrameIndex(); size_t stackSize = size(); - for (size_t idx = 0; idx < stackSize; idx++) { + for (size_t idx = 0; idx < stackSize; ++idx) { size_t actualIdx = stackSize - 1 - idx; + while (nextFrameIdx && (nextFrameIdx->second > actualIdx)) { + llvm::dbgs() << " --------------- Frame (" << nextFrameIdx->first + << ")\n"; + nextFrameIdx = getNextFrameIndex(); + } llvm::dbgs() << " "; values[actualIdx].dump(); } + while (nextFrameIdx) { + llvm::dbgs() << " --------------- Frame (" << nextFrameIdx->first << ")\n"; + nextFrameIdx = getNextFrameIndex(); + } llvm::dbgs() << "<Bottom>" << "\n"; llvm::dbgs() << "=========================================================\n"; @@ -726,7 +909,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) { return emitError(*opLoc, "stack doesn't contain enough values.
trying to get ") << operandTypes.size() << " operands on a stack containing only " - << values.size() << " values."; + << values.size() << " values"; size_t stackIdxOffset = values.size() - operandTypes.size(); SmallVector<Value> res{}; res.reserve(operandTypes.size()); @@ -735,8 +918,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) { Type stackType = operand.getType(); if (stackType != operandTypes[i]) return emitError(*opLoc, "invalid operand type on stack. expecting ") - << operandTypes[i] << ", value on stack is of type " << stackType - << "."; + << operandTypes[i] << ", value on stack is of type " << stackType; LDBG() << " POP: " << operand; res.push_back(operand); } @@ -792,6 +974,151 @@ ExpressionParser::parse(OpBuilder &builder, } } +llvm::FailureOr<FunctionType> +ExpressionParser::parseBlockFuncType(OpBuilder &builder) { + return getFuncTypeFor(builder, parser.parseBlockType(builder.getContext())); +} + +template <typename OpToCreate> +parsed_inst_t ExpressionParser::parseBlockLikeOp(OpBuilder &builder) { + auto opLoc = currentOpLoc; + auto funcType = parseBlockFuncType(builder); + if (failed(funcType)) + return failure(); + + auto inputTypes = funcType->getInputs(); + auto inputOps = popOperands(inputTypes); + if (failed(inputOps)) + return failure(); + + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto resTypes = funcType->getResults(); + llvm::SmallVector<Location> locations{}; + locations.resize(resTypes.size(), *currentOpLoc); + auto *successor = + builder.createBlock(curRegion, curRegion->end(), resTypes, locations); + builder.setInsertionPointToEnd(curBlock); + auto blockOp = + builder.create<OpToCreate>(*currentOpLoc, *inputOps, successor); + auto *blockBody = blockOp.createBlock(); + if (failed(parseBlockContent(builder, blockBody, resTypes, *opLoc, blockOp))) + return failure(); + builder.setInsertionPointToStart(successor); + return {ValueRange{successor->getArguments()}}; +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::block>( + OpBuilder &builder) { + return parseBlockLikeOp<BlockOp>(builder); +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::loop>( + OpBuilder &builder) { + return parseBlockLikeOp<LoopOp>(builder); +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::ifOpCode>(OpBuilder &builder) { + auto opLoc = currentOpLoc; + auto funcType = parseBlockFuncType(builder); + if (failed(funcType)) + return failure(); + + LDBG() << "Parsing an if instruction of type " << *funcType; + auto inputTypes = funcType->getInputs(); + auto conditionValue = popOperands(builder.getI32Type()); + if (failed(conditionValue)) + return failure(); + auto inputOps = popOperands(inputTypes); + if (failed(inputOps)) + return failure(); + + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto resTypes = funcType->getResults(); + llvm::SmallVector<Location> locations{}; + locations.resize(resTypes.size(), *currentOpLoc); + auto *successor = + builder.createBlock(curRegion, curRegion->end(), resTypes, locations); + builder.setInsertionPointToEnd(curBlock); + auto ifOp = builder.create<IfOp>(*currentOpLoc, conditionValue->front(), + *inputOps, successor); + auto *ifEntryBlock = ifOp.createIfBlock(); + constexpr auto ifElseFilter = + ByteSequence<WasmBinaryEncoding::endByte, + 
WasmBinaryEncoding::OpCode::elseOpCode>{}; + auto parseIfRes = parseBlockContent(builder, ifEntryBlock, resTypes, *opLoc, + ifOp, ifElseFilter); + if (failed(parseIfRes)) + return failure(); + if (*parseIfRes == WasmBinaryEncoding::OpCode::elseOpCode) { + LDBG() << " else block is present."; + Block *elseEntryBlock = ifOp.createElseBlock(); + auto parseElseRes = + parseBlockContent(builder, elseEntryBlock, resTypes, *opLoc, ifOp); + if (failed(parseElseRes)) + return failure(); + } + builder.setInsertionPointToStart(successor); + return {ValueRange{successor->getArguments()}}; +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::branchIf>(OpBuilder &builder) { + auto level = parser.parseLiteral<uint32_t>(); + if (failed(level)) + return failure(); + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto sip = builder.saveInsertionPoint(); + Block *elseBlock = builder.createBlock(curRegion, curRegion->end()); + auto condition = popOperands(builder.getI32Type()); + if (failed(condition)) + return failure(); + builder.restoreInsertionPoint(sip); + auto targetOp = + LabelBranchingOpInterface::getTargetOpFromBlock(curBlock, *level); + if (failed(targetOp)) + return failure(); + auto inputTypes = targetOp->getLabelTarget()->getArgumentTypes(); + auto branchArgs = popOperands(inputTypes); + if (failed(branchArgs)) + return failure(); + builder.create<BranchIfOp>(*currentOpLoc, condition->front(), + builder.getUI32IntegerAttr(*level), *branchArgs, + elseBlock); + builder.setInsertionPointToStart(elseBlock); + return {*branchArgs}; +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::call>( + OpBuilder &builder) { + auto loc = *currentOpLoc; + auto funcIdx = parser.parseLiteral<uint32_t>(); + if (failed(funcIdx)) + return failure(); + if (*funcIdx >= symbols.funcSymbols.size()) + return emitError(loc, "invalid function index: ") << *funcIdx; + auto callee = symbols.funcSymbols[*funcIdx]; + llvm::ArrayRef<Type> inTypes = callee.functionType.getInputs(); + llvm::ArrayRef<Type> resTypes = callee.functionType.getResults(); + parsed_inst_t inOperands = popOperands(inTypes); + if (failed(inOperands)) + return failure(); + auto callOp = + builder.create<FuncCallOp>(loc, resTypes, callee.symbol, *inOperands); + return {callOp.getResults()}; +} + template <> inline parsed_inst_t ExpressionParser::parseSpecificInstruction< WasmBinaryEncoding::OpCode::localGet>(OpBuilder &builder) { @@ -834,7 +1161,7 @@ parsed_inst_t ExpressionParser::parseSetOrTee(OpBuilder &builder) { if (valueStack.empty()) return emitError( *currentOpLoc, - "invalid stack access, trying to access a value on an empty stack."); + "invalid stack access, trying to access a value on an empty stack"); parsed_inst_t poppedOp = popOperands(locals[*id].getType().getElementType()); if (failed(poppedOp)) @@ -1000,11 +1327,23 @@ inline parsed_inst_t ExpressionParser::buildNumericOp( BUILD_NUMERIC_BINOP_FP(CopySignOp, copysign) BUILD_NUMERIC_BINOP_FP(DivOp, div) +BUILD_NUMERIC_BINOP_FP(GeOp, ge) +BUILD_NUMERIC_BINOP_FP(GtOp, gt) +BUILD_NUMERIC_BINOP_FP(LeOp, le) +BUILD_NUMERIC_BINOP_FP(LtOp, lt) BUILD_NUMERIC_BINOP_FP(MaxOp, max) BUILD_NUMERIC_BINOP_FP(MinOp, min) BUILD_NUMERIC_BINOP_INT(AndOp, and) BUILD_NUMERIC_BINOP_INT(DivSIOp, divS) BUILD_NUMERIC_BINOP_INT(DivUIOp, divU) +BUILD_NUMERIC_BINOP_INT(GeSIOp, geS) +BUILD_NUMERIC_BINOP_INT(GeUIOp, geU) +BUILD_NUMERIC_BINOP_INT(GtSIOp, gtS)
+BUILD_NUMERIC_BINOP_INT(GtUIOp, gtU) +BUILD_NUMERIC_BINOP_INT(LeSIOp, leS) +BUILD_NUMERIC_BINOP_INT(LeUIOp, leU) +BUILD_NUMERIC_BINOP_INT(LtSIOp, ltS) +BUILD_NUMERIC_BINOP_INT(LtUIOp, ltU) BUILD_NUMERIC_BINOP_INT(OrOp, or) BUILD_NUMERIC_BINOP_INT(RemSIOp, remS) BUILD_NUMERIC_BINOP_INT(RemUIOp, remU) @@ -1015,7 +1354,9 @@ BUILD_NUMERIC_BINOP_INT(ShRSOp, shrS) BUILD_NUMERIC_BINOP_INT(ShRUOp, shrU) BUILD_NUMERIC_BINOP_INT(XOrOp, xor) BUILD_NUMERIC_BINOP_INTFP(AddOp, add) +BUILD_NUMERIC_BINOP_INTFP(EqOp, eq) BUILD_NUMERIC_BINOP_INTFP(MulOp, mul) +BUILD_NUMERIC_BINOP_INTFP(NeOp, ne) BUILD_NUMERIC_BINOP_INTFP(SubOp, sub) BUILD_NUMERIC_UNARY_OP_FP(AbsOp, abs) BUILD_NUMERIC_UNARY_OP_FP(CeilOp, ceil) @@ -1025,6 +1366,7 @@ BUILD_NUMERIC_UNARY_OP_FP(SqrtOp, sqrt) BUILD_NUMERIC_UNARY_OP_FP(TruncOp, trunc) BUILD_NUMERIC_UNARY_OP_INT(ClzOp, clz) BUILD_NUMERIC_UNARY_OP_INT(CtzOp, ctz) +BUILD_NUMERIC_UNARY_OP_INT(EqzOp, eqz) BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt) // Don't need these anymore so let's undef them. @@ -1036,6 +1378,105 @@ BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt) #undef BUILD_NUMERIC_OP #undef BUILD_NUMERIC_CAST_OP +template <typename opType, typename inputType, typename outputType, + typename... extraArgsT> +inline parsed_inst_t ExpressionParser::buildConvertOp(OpBuilder &builder, + extraArgsT... extraArgs) { + static_assert(std::is_arithmetic_v<inputType>, + "InputType should be an arithmetic type"); + static_assert(std::is_arithmetic_v<outputType>, + "OutputType should be an arithmetic type"); + auto intype = buildLiteralType<inputType>(builder); + auto outType = buildLiteralType<outputType>(builder); + auto operand = popOperands(intype); + if (failed(operand)) + return failure(); + auto op = builder.create<opType>(*currentOpLoc, outType, operand->front(), + extraArgs...); + LDBG() << "Built operation: " << op; + return {{op.getResult()}}; +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::demoteF64ToF32>(OpBuilder &builder) { + return buildConvertOp<DemoteOp, double, float>(builder); +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::wrap>( + OpBuilder &builder) { + return buildConvertOp<WrapOp, int64_t, int32_t>(builder); +} + +#define BUILD_CONVERSION_OP(IN_T, OUT_T, SOURCE_OP, TARGET_OP) \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::SOURCE_OP>(OpBuilder & builder) { \ + return buildConvertOp<TARGET_OP, IN_T, OUT_T>(builder); \ + } + +#define BUILD_CONVERT_OP_FOR(DEST_T, WIDTH) \ + BUILD_CONVERSION_OP(uint32_t, DEST_T, convertUI32F##WIDTH, ConvertUOp) \ + BUILD_CONVERSION_OP(int32_t, DEST_T, convertSI32F##WIDTH, ConvertSOp) \ + BUILD_CONVERSION_OP(uint64_t, DEST_T, convertUI64F##WIDTH, ConvertUOp) \ + BUILD_CONVERSION_OP(int64_t, DEST_T, convertSI64F##WIDTH, ConvertSOp) + +BUILD_CONVERT_OP_FOR(float, 32) +BUILD_CONVERT_OP_FOR(double, 64) + +#undef BUILD_CONVERT_OP_FOR + +BUILD_CONVERSION_OP(int32_t, int64_t, extendS, ExtendSI32Op) +BUILD_CONVERSION_OP(int32_t, int64_t, extendU, ExtendUI32Op) + +#undef BUILD_CONVERSION_OP + +#define BUILD_SLICE_EXTEND_PARSER(IT_WIDTH, EXTRACT_WIDTH) \ + template <> \ + parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::extendI##IT_WIDTH##EXTRACT_WIDTH##S>( \ + OpBuilder & builder) { \ + using inout_t = int##IT_WIDTH##_t; \ + auto attr = builder.getUI32IntegerAttr(EXTRACT_WIDTH); \ + return 
buildConvertOp<ExtendLowBitsSOp, inout_t, inout_t>(builder, attr); \ } + +BUILD_SLICE_EXTEND_PARSER(32, 8) +BUILD_SLICE_EXTEND_PARSER(32, 16) +BUILD_SLICE_EXTEND_PARSER(64, 8) +BUILD_SLICE_EXTEND_PARSER(64, 16) +BUILD_SLICE_EXTEND_PARSER(64, 32) + +#undef BUILD_SLICE_EXTEND_PARSER + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::promoteF32ToF64>(OpBuilder &builder) { + return buildConvertOp<PromoteOp, float, double>(builder); +} + +#define BUILD_REINTERPRET_PARSER(WIDTH, FP_TYPE) \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::reinterpretF##WIDTH##AsI##WIDTH>(OpBuilder & \ + builder) { \ + return buildConvertOp<ReinterpretOp, FP_TYPE, int##WIDTH##_t>(builder); \ + } \ + \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::reinterpretI##WIDTH##AsF##WIDTH>(OpBuilder & \ + builder) { \ + return buildConvertOp<ReinterpretOp, int##WIDTH##_t, FP_TYPE>(builder); \ + } + +BUILD_REINTERPRET_PARSER(32, float) +BUILD_REINTERPRET_PARSER(64, double) + +#undef BUILD_REINTERPRET_PARSER + class WasmBinaryParser { private: struct SectionRegistry { @@ -1153,7 +1594,7 @@ private: if (tid.id >= symbols.moduleFuncTypes.size()) return emitError(loc, "invalid type id: ") << tid.id << ". Only " << symbols.moduleFuncTypes.size() - << " type registration."; + << " type registrations"; FunctionType type = symbols.moduleFuncTypes[tid.id]; std::string symbol = symbols.getNewFuncSymbolName(); auto funcOp = FuncImportOp::create(builder, loc, symbol, moduleName, @@ -1221,7 +1662,7 @@ public: FileLineColLoc magicLoc = parser.getLocation(); FailureOr<StringRef> magic = parser.consumeNBytes(wasmHeader.size()); if (failed(magic) || magic->compare(wasmHeader)) { - emitError(magicLoc, "source file does not contain valid Wasm header."); + emitError(magicLoc, "source file does not contain valid Wasm header"); return; } auto const expectedVersionString = StringRef{"\1\0\0\0", 4}; @@ -1391,7 +1832,7 @@ WasmBinaryParser::parseSectionItem<WasmSectionType::EXPORT>(ParserHead &ph, return failure(); Operation *op = SymbolTable::lookupSymbolIn(mOp, *currentSymbol); - SymbolTable::setSymbolVisibility(op, SymbolTable::Visibility::Public); + op->setAttr("exported", UnitAttr::get(op->getContext())); StringAttr symName = SymbolTable::getSymbolName(op); return SymbolTable{mOp}.rename(symName, *exportName); }
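[Editor's note] A standalone sketch (not part of the patch) of the XeGPU blocked-layout arithmetic introduced in XeGPUDialect.cpp above. The 32x64 matrix, [8, 16] block, and blocked strides [512, 128, 16, 1] are the same invented values as in the worked-example comment: getBlockedOffsets splits [y, x] into [y/8, x/16, y%8, x%16], and getLinearOffsets dots that with the strides:

#include <cstdint>
#include <cstdio>
#include <vector>

// Blocked offsets for a 32x64 matrix with an 8x16 inner block, dotted with
// the blocked strides computed as in MemDescType::getStrideShape().
int64_t linearOffset(int64_t y, int64_t x) {
  const std::vector<int64_t> offsets = {y / 8, x / 16, y % 8, x % 16};
  const std::vector<int64_t> strides = {512, 128, 16, 1};
  int64_t linear = 0;
  for (size_t i = 0; i < offsets.size(); ++i)
    linear += offsets[i] * strides[i];
  return linear;
}

int main() {
  // Element (9, 17) sits in block (1, 1) at in-block position (1, 1):
  // 1*512 + 1*128 + 1*16 + 1*1 = 657.
  std::printf("%lld\n", static_cast<long long>(linearOffset(9, 17)));
}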