Diffstat (limited to 'mlir/test')
-rw-r--r--  mlir/test/Conversion/SCFToEmitC/while.mlir | 293
-rw-r--r--  mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir | 84
-rw-r--r--  mlir/test/Dialect/EmitC/invalid_ops.mlir | 149
-rw-r--r--  mlir/test/Dialect/EmitC/ops.mlir | 20
-rw-r--r--  mlir/test/Dialect/Linalg/decompose-pack.mlir | 21
-rw-r--r--  mlir/test/Dialect/Linalg/runtime-verification.mlir | 4
-rw-r--r--  mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir | 116
-rw-r--r--  mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir | 24
-rw-r--r--  mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir | 23
-rw-r--r--  mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir | 31
-rw-r--r--  mlir/test/Dialect/Vector/canonicalize.mlir | 40
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir | 8
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir | 6
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir | 16
-rw-r--r--  mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir | 6
-rw-r--r--  mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir | 16
-rw-r--r--  mlir/test/Target/Cpp/do.mlir | 168
-rw-r--r--  mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir | 8
-rw-r--r--  mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir | 44
-rw-r--r--  mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 42
-rw-r--r--  mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp | 72
-rw-r--r--  mlir/test/lib/Dialect/CMakeLists.txt | 1
-rw-r--r--  mlir/test/lib/Dialect/OpenACC/CMakeLists.txt | 16
-rw-r--r--  mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp | 23
-rw-r--r--  mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp | 305
-rw-r--r--  mlir/test/python/pass_manager.py | 20
33 files changed, 1490 insertions, 82 deletions
diff --git a/mlir/test/Conversion/SCFToEmitC/while.mlir b/mlir/test/Conversion/SCFToEmitC/while.mlir
new file mode 100644
index 0000000..28524a0
--- /dev/null
+++ b/mlir/test/Conversion/SCFToEmitC/while.mlir
@@ -0,0 +1,293 @@
+// RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -convert-to-emitc="filter-dialects=scf" %s | FileCheck %s
+
+emitc.func @payload_one_result(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+func.func @one_result() -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+
+ %res = scf.while (%arg1 = %init) : (i32) -> i32 {
+ %sum = emitc.add %arg1, %var : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ %next = emitc.add %arg1, %arg1 : (i32, i32) -> i32
+ scf.condition(%condition) %next : i32
+ } do {
+ ^bb0(%arg2: i32):
+ %next_arg1 = emitc.call @payload_one_result(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+
+ return %res : i32
+}
+// CHECK-LABEL: emitc.func @payload_one_result(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @one_result() -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_6:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_7:.*]] = add %[[VAL_6]], %[[VAL_1]] : (i32, i32) -> i32
+// CHECK: %[[VAL_8:.*]] = cmp lt, %[[VAL_7]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_6]], %[[VAL_6]] : (i32, i32) -> i32
+// CHECK: assign %[[VAL_9]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_8]] : i1 to %[[VAL_5]] : <i1>
+// CHECK: if %[[VAL_8]] {
+// CHECK: %[[VAL_10:.*]] = call @payload_one_result(%[[VAL_9]]) : (i32) -> i32
+// CHECK: assign %[[VAL_10]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_11:.*]] = expression %[[VAL_5]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_12:.*]] = load %[[VAL_5]] : <i1>
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_11]] : i1
+// CHECK: }
+// CHECK: %[[VAL_13:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_13]] : i32
+// CHECK: }
+
+emitc.func @payload_two_results(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+func.func @two_results() -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+
+ %res1, %res2 = scf.while (%arg1_1 = %init, %arg1_2 = %init) : (i32, i32) -> (i32, i32) {
+ %sum = emitc.add %arg1_1, %arg1_2 : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ scf.condition(%condition) %init, %arg1_2 : i32, i32
+ } do {
+ ^bb0(%arg2_1 : i32, %arg2_2 : i32):
+ %next1 = emitc.call @payload_two_results(%arg2_1) : (i32) -> i32
+ %next2 = emitc.call @payload_two_results(%arg2_2) : (i32) -> i32
+ scf.yield %next1, %next2 : i32, i32
+ }
+
+ return %res1 : i32
+}
+// CHECK-LABEL: emitc.func @payload_two_results(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @two_results() -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_7:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_8:.*]] = load %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_7]], %[[VAL_8]] : (i32, i32) -> i32
+// CHECK: %[[VAL_10:.*]] = cmp lt, %[[VAL_9]], %[[VAL_1]] : (i32, i32) -> i1
+// CHECK: assign %[[VAL_0]] : i32 to %[[VAL_2]] : <i32>
+// CHECK: assign %[[VAL_8]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_10]] : i1 to %[[VAL_6]] : <i1>
+// CHECK: if %[[VAL_10]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_two_results(%[[VAL_0]]) : (i32) -> i32
+// CHECK: %[[VAL_12:.*]] = call @payload_two_results(%[[VAL_8]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: assign %[[VAL_12]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_13:.*]] = expression %[[VAL_6]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_14:.*]] = load %[[VAL_6]] : <i1>
+// CHECK: yield %[[VAL_14]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: %[[VAL_15:.*]] = emitc.load %[[VAL_2]] : <i32>
+// CHECK: %[[VAL_16:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_15]] : i32
+// CHECK: }
+
+emitc.func @payload_double_use(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+emitc.func @foo_with_side_effect(%arg: i32, %p : !emitc.ptr<i32>) -> i32 {
+ %sum = add %arg, %arg : (i32, i32) -> i32
+ emitc.verbatim "{}[0] = {};" args %p, %sum : !emitc.ptr<i32>, i32
+ return %sum : i32
+}
+
+func.func @double_use(%p : !emitc.ptr<i32>) -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+ %res = scf.while (%arg1 = %init) : (i32) -> i32 {
+ %used_twice = emitc.call @foo_with_side_effect(%arg1, %p) : (i32, !emitc.ptr<i32>) -> i32
+ %prod = emitc.add %used_twice, %used_twice : (i32, i32) -> i32
+ %sum = emitc.add %arg1, %prod : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ scf.condition(%condition) %arg1 : i32
+ } do {
+ ^bb0(%arg2: i32):
+ %next_arg1 = emitc.call @payload_double_use(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+ return %res : i32
+}
+// CHECK-LABEL: emitc.func @payload_double_use(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: emitc.func @foo_with_side_effect(
+// CHECK-SAME: %[[ARG0:.*]]: i32,
+// CHECK-SAME: %[[ARG1:.*]]: !emitc.ptr<i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: verbatim "{}[0] = {};" args %[[ARG1]], %[[VAL_0]] : !emitc.ptr<i32>, i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @double_use(
+// CHECK-SAME: %[[ARG0:.*]]: !emitc.ptr<i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_6:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_7:.*]] = call @foo_with_side_effect(%[[VAL_6]], %[[ARG0]]) : (i32, !emitc.ptr<i32>) -> i32
+// CHECK: %[[VAL_8:.*]] = add %[[VAL_7]], %[[VAL_7]] : (i32, i32) -> i32
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_6]], %[[VAL_8]] : (i32, i32) -> i32
+// CHECK: %[[VAL_10:.*]] = cmp lt, %[[VAL_9]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: assign %[[VAL_6]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_10]] : i1 to %[[VAL_5]] : <i1>
+// CHECK: if %[[VAL_10]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_double_use(%[[VAL_6]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_12:.*]] = expression %[[VAL_5]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_5]] : <i1>
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: %[[VAL_14:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_14]] : i32
+// CHECK: }
+
+emitc.func @payload_empty_after_region() -> i1 {
+ %true = emitc.literal "true" : i1
+ return %true : i1
+}
+
+func.func @empty_after_region() {
+ scf.while () : () -> () {
+ %condition = emitc.call @payload_empty_after_region() : () -> i1
+ scf.condition(%condition)
+ } do {
+ ^bb0():
+ scf.yield
+ }
+ return
+}
+// CHECK-LABEL: emitc.func @payload_empty_after_region() -> i1 {
+// CHECK: %[[VAL_0:.*]] = literal "true" : i1
+// CHECK: return %[[VAL_0]] : i1
+// CHECK: }
+
+// CHECK-LABEL: func.func @empty_after_region() {
+// CHECK: %[[VAL_0:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_1:.*]] = call @payload_empty_after_region() : () -> i1
+// CHECK: assign %[[VAL_1]] : i1 to %[[VAL_0]] : <i1>
+// CHECK: } while {
+// CHECK: %[[VAL_2:.*]] = expression %[[VAL_0]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_3:.*]] = load %[[VAL_0]] : <i1>
+// CHECK: yield %[[VAL_3]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_2]] : i1
+// CHECK: }
+// CHECK: return
+// CHECK: }
+
+emitc.func @payload_different_number_of_vars(%arg0: i32) -> i32 {
+ %0 = add %arg0, %arg0 : (i32, i32) -> i32
+ return %0 : i32
+}
+func.func @different_number_of_vars() -> (i32, i32) {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "7.0" : i32
+ %exit = emitc.literal "10.0" : i32
+ %res, %res2 = scf.while (%arg1 = %init) : (i32) -> (i32, i32) {
+ %sum = emitc.add %arg1, %var : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ %next = emitc.add %arg1, %arg1 : (i32, i32) -> i32
+ scf.condition(%condition) %next, %sum : i32, i32
+ } do {
+ ^bb0(%arg2: i32, %arg3 : i32):
+ %next_arg1 = emitc.call @payload_different_number_of_vars(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+ return %res, %res2 : i32, i32
+}
+// CHECK-LABEL: emitc.func @payload_different_number_of_vars(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @different_number_of_vars() -> (i32, i32) {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "7.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_7:.*]] = load %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_8:.*]] = add %[[VAL_7]], %[[VAL_1]] : (i32, i32) -> i32
+// CHECK: %[[VAL_9:.*]] = cmp lt, %[[VAL_8]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: %[[VAL_10:.*]] = add %[[VAL_7]], %[[VAL_7]] : (i32, i32) -> i32
+// CHECK: assign %[[VAL_10]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_8]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: assign %[[VAL_9]] : i1 to %[[VAL_6]] : <i1>
+// CHECK: if %[[VAL_9]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_different_number_of_vars(%[[VAL_10]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_12:.*]] = expression %[[VAL_6]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]] : <i1>
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: %[[VAL_14:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: %[[VAL_15:.*]] = emitc.load %[[VAL_4]] : <i32>
+// CHECK: return %[[VAL_14]], %[[VAL_15]] : i32, i32
+// CHECK: }
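
The shape the new scf.while lowering produces is easiest to read as C. A hand-written sketch of @one_result after further translation through the Cpp emitter might look like the following (variable names are invented, and the test's opaque "1.0"/"10.0" literals are replaced with plain integers):

#include <stdbool.h>
#include <stdint.h>

int32_t payload_one_result(int32_t v) { return v + v; }

int32_t one_result(void) {
  int32_t result;       // result variable (VAL_3 above)
  int32_t iter = 1;     // loop-carried variable (VAL_4), seeded from %init
  bool cond;            // condition variable (VAL_5)
  do {
    int32_t sum = iter + 1;        // %sum = emitc.add %arg1, %var
    bool c = sum < 10;             // %condition = emitc.cmp lt, %sum, %exit
    int32_t next = iter + iter;    // value forwarded by scf.condition
    result = next;                 // persist the while result
    cond = c;                      // persist the continuation flag
    if (c)                         // the scf.while "do" region runs only when continuing
      iter = payload_one_result(next);
  } while (cond);
  return result;
}

The before-region of scf.while always executes at least once per iteration, which is why the conversion targets a do-while rather than a while loop.
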
diff --git a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
index 8f60a07..b31a973 100644
--- a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
+++ b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
@@ -261,3 +261,87 @@ llvm.func @llvm.store(%a: !llvm.ptr<1>, %val: i32) {
llvm.store %val, %a {cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>} : i32, !llvm.ptr<1>
llvm.return
}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z30intel_sub_group_block_read_us8PU3AS1t
+// CHECK: llvm.func @blockload_as1(%[[ARG0:.*]]: !llvm.ptr<1>)
+llvm.func @blockload_as1(%ptr: !llvm.ptr<1>) -> vector<8xi16> {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z30intel_sub_group_block_read_us8PU3AS1t(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<vector<8xi16> (ptr<1>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z30intel_sub_group_block_read_us8PU3AS1t",
+ // CHECK-SAME: visibility_ = 0 : i64, will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<1>) -> vector<8xi16>
+ llvm.return %loaded_a : vector<8xi16>
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_read_uc16PU3AS3h(!llvm.ptr<3>)
+// CHECK: llvm.func @blockload_as3(%[[ARG0:.*]]: !llvm.ptr<3>)
+llvm.func @blockload_as3(%ptr: !llvm.ptr<3>) -> vector<16xi8> {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z31intel_sub_group_block_read_uc16PU3AS3h(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<vector<16xi8> (ptr<3>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_read_uc16PU3AS3h", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<3>) -> vector<16xi8>
+ llvm.return %loaded_a : vector<16xi8>
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z29intel_sub_group_block_read_ucPU3AS3h(!llvm.ptr<3>)
+// CHECK: llvm.func @blockload_scalar(%[[ARG0:.*]]: !llvm.ptr<3>)
+llvm.func @blockload_scalar(%ptr: !llvm.ptr<3>) -> i8 {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z29intel_sub_group_block_read_ucPU3AS3h(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<i8 (ptr<3>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z29intel_sub_group_block_read_ucPU3AS3h", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<3>) -> i8
+ llvm.return %loaded_a : i8
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j
+// CHECK: llvm.func @blockstore_as1(%[[ARG0:.*]]: !llvm.ptr<1>, %[[ARG1:.*]]: vector<8xi32>) {
+llvm.func @blockstore_as1(%ptr: !llvm.ptr<1>, %data: vector<8xi32>) {
+ // CHECK: llvm.call spir_funccc @_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<1>, vector<8xi32>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<1>, vector<8xi32>)
+ llvm.return
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m
+// CHECK: llvm.func @blockstore_as3(%[[ARG0:.*]]: !llvm.ptr<3>, %[[ARG1:.*]]: vector<2xi64>) {
+llvm.func @blockstore_as3(%ptr: !llvm.ptr<3>, %data: vector<2xi64>) {
+ // CHECK: llvm.call spir_funccc @_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<3>, vector<2xi64>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<3>, vector<2xi64>)
+ llvm.return
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z30intel_sub_group_block_write_ulPU3AS3mm
+// CHECK: llvm.func @blockstore_scalar(%[[ARG0:.*]]: !llvm.ptr<3>, %[[ARG1:.*]]: i64) {
+llvm.func @blockstore_scalar(%ptr: !llvm.ptr<3>, %data: i64) {
+ // CHECK: llvm.call spir_funccc @_Z30intel_sub_group_block_write_ulPU3AS3mm(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<3>, i64)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z30intel_sub_group_block_write_ulPU3AS3mm", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<3>, i64)
+ llvm.return
+}
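
The spir_funccc callees above are the cl_intel_subgroups block read/write builtins under Itanium-mangled names; a hand-decoded reading of the first one, for reference when scanning the CHECK lines (this is not output from the test):

/* _Z30intel_sub_group_block_read_us8PU3AS1t
 *   _Z      Itanium mangling prefix
 *   30      length of the identifier that follows
 *   intel_sub_group_block_read_us8
 *   P       pointer to ...
 *   U3AS1   ... with vendor qualifier "AS1" (address space 1, i.e. __global)
 *   t       unsigned short
 * Other codes used in this file: 'h' unsigned char, 'j' unsigned int,
 * 'm' unsigned long, 'Dv8_j' vector of 8 unsigned ints, AS3 = __local.
 * OpenCL view of the same signature:
 *   ushort8 intel_sub_group_block_read_us8(const __global ushort *p);
 */
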
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index f4c15f5..5f594fb 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -252,7 +252,7 @@ func.func @sub_pointer_pointer(%arg0: !emitc.ptr<f32>, %arg1: !emitc.ptr<f32>) {
// -----
func.func @test_misplaced_yield() {
- // expected-error @+1 {{'emitc.yield' op expects parent op to be one of 'emitc.expression, emitc.if, emitc.for, emitc.switch'}}
+ // expected-error @+1 {{'emitc.yield' op expects parent op to be one of 'emitc.do, emitc.expression, emitc.for, emitc.if, emitc.switch'}}
emitc.yield
return
}
@@ -729,3 +729,150 @@ emitc.class @testClass {
return
}
}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op condition region must contain exactly two operations: 'emitc.expression' followed by 'emitc.yield', but found 3 operations}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ %3 = emitc.literal "3" : i32
+ emitc.yield %r : i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ // expected-error @+1 {{'emitc.do' op expected first op in condition region to be 'emitc.expression', but got emitc.literal}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %true = emitc.literal "true" : i1
+ emitc.yield %true : i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op emitc.expression in condition region must return 'i1', but returns 'i32'}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i32 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ emitc.yield %add : i32
+ }
+
+ emitc.yield %r : i32
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op expected last op in condition region to be 'emitc.yield', but got emitc.expression}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r1 = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ %r2 = emitc.expression %1, %2 : (i32, i32) -> i32 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ emitc.yield %add : i32
+ }
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op expected condition region to return 1 value, but it returns 0 values}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ %true = emitc.literal "true" : i1
+
+ // expected-error @+1 {{'emitc.yield' must return result of 'emitc.expression' from this condition region}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %true: i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op body region must not contain terminator}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ emitc.yield
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %r: i1
+ }
+
+ return
+}
diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir
index 84c9b65..1259748 100644
--- a/mlir/test/Dialect/EmitC/ops.mlir
+++ b/mlir/test/Dialect/EmitC/ops.mlir
@@ -335,3 +335,23 @@ emitc.class final @finalClass {
return
}
}
+
+func.func @do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+ %3 = emitc.literal "3" : i32
+
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2, %3 : (i32, i32, i32) -> i1 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ %cmp = emitc.cmp eq, %add, %3 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %r : i1
+ }
+
+ return
+}
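
For orientation, the valid form above corresponds roughly to the following C once run through the Cpp emitter (a sketch with invented names; mlir/test/Target/Cpp/do.mlir in the diffstat checks the real output):

#include <stdint.h>
#include <stdio.h>

void do_loop(int32_t *p) {
  int32_t v1 = 1, v2 = 2, v3 = 3;
  do {
    printf("%d", *p);        // body region: the emitc.verbatim op
  } while (v1 + v2 == v3);   // condition region: the emitc.expression, inlined
}

The condition region's required emitc.expression + emitc.yield pair is what lets the emitter inline the condition directly into the while clause.
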
diff --git a/mlir/test/Dialect/Linalg/decompose-pack.mlir b/mlir/test/Dialect/Linalg/decompose-pack.mlir
index 17e6c29..18a09f4 100644
--- a/mlir/test/Dialect/Linalg/decompose-pack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-pack.mlir
@@ -274,3 +274,24 @@ func.func @pack_with_adjacent_trailing_dimensions_inner_dims_pos_and_unit_outer(
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
// CHECK-SAME: [0, 0, 0, 0, 0] [1, 1, 1, 4, 1] [1, 1, 1, 1, 1] : tensor<1x4x1xf32> into tensor<1x1x1x4x1xf32>
// CHECK: return %[[INSERT]]
+
+// -----
+
+// The following example shows a pack operation where the inner dims
+// positions are non-adjacent and non-permuted.
+func.func @pack_with_non_adjacent_and_non_permuted_inner_dims(%arg0: tensor<8x1x1x1xf32>, %arg1:tensor<1x1x1x1x8x1xf32>) -> tensor<1x1x1x1x8x1xf32> {
+ %pack = linalg.pack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [0, 3] inner_tiles = [8, 1] into %arg1: tensor<8x1x1x1xf32> -> tensor<1x1x1x1x8x1xf32>
+ return %pack : tensor<1x1x1x1x8x1xf32>
+}
+
+// CHECK-LABEL: func.func @pack_with_non_adjacent_and_non_permuted_inner_dims
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]]
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x8x1xf32>
+// CHECK: %[[TRANSP:.+]] = linalg.transpose
+// CHECK-SAME: ins(%[[SRC]] : tensor<8x1x1x1xf32>)
+// CHECK-SAME: outs(%[[EMPTY]] : tensor<1x1x8x1xf32>)
+// CHECK-SAME: permutation = [1, 2, 0, 3]
+// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
+// CHECK-SAME: [0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 8, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<1x1x1x1x8x1xf32>
+// CHECK: return %[[INSERT]]
diff --git a/mlir/test/Dialect/Linalg/runtime-verification.mlir b/mlir/test/Dialect/Linalg/runtime-verification.mlir
index 07e96c8..287f0e0 100644
--- a/mlir/test/Dialect/Linalg/runtime-verification.mlir
+++ b/mlir/test/Dialect/Linalg/runtime-verification.mlir
@@ -12,7 +12,9 @@ func.func @static_dims(%arg0: tensor<5xf32>, %arg1: tensor<5xf32>) -> (tensor<5x
// CHECK: cf.assert %[[TRUE]]
// VERBOSE0: %[[TRUE:.*]] = index.bool.constant true
// VERBOSE0: cf.assert %[[TRUE]]
- // VERBOSE0-SAME: ERROR: Runtime op verification failed\0A^\0ALocation: loc(
+ // VERBOSE0-SAME: ERROR: Runtime op verification failed\0A^ unexpected negative result on dimension #0
+ // VERBOSE0-SAME: Location
+ // VERBOSE0-SAME: 19:10
%result = tensor.empty() : tensor<5xf32>
%0 = linalg.generic {
indexing_maps = [#identity, #identity, #identity],
diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir
index c7b0bd5..8465e55 100644
--- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir
+++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir
@@ -127,3 +127,119 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg
// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S6]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32>
// CHECK-NEXT: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32>
// CHECK-NEXT: }
+
+// -----
+
+func.func @conv2d_type_promotion(%arg0: tensor<2x6x6x5xf16>, %arg1: tensor<2x3x3x5xf16>, %arg2: tensor<1xf32>, %arg3: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = tensor.empty() : tensor<6x6x5x2xf16>
+ %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x3x5xf16>) outs(%0 : tensor<6x6x5x2xf16>) -> tensor<6x6x5x2xf16> // no-crash
+ %2 = tensor.empty() : tensor<6x6x1x1x2x5xf16>
+ %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x6x6x5xf16>) outs(%2 : tensor<6x6x1x1x2x5xf16>) -> tensor<6x6x1x1x2x5xf16> // no-crash
+ %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf16> into tensor<36x5x2xf16>
+ %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf16> into tensor<36x2x5xf16>
+ %4 = tensor.empty() : tensor<36x2x2xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+ %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%5 : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+ %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32>
+ %7 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x6x1x1x2x2xf32>) outs(%arg3 : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32>
+ return %7 : tensor<2x4x4x2xf32>
+}
+
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0) -> (d0 * 4)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1) -> ()>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: func.func @conv2d_type_promotion(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<2x6x6x5xf16>,
+// CHECK-SAME: %[[ARG1:.*]]: tensor<2x3x3x5xf16>,
+// CHECK-SAME: %[[ARG2:.*]]: tensor<1xf32>,
+// CHECK-SAME: %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1.024000e+03 : f32
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<{{\[\[}}1.250000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00], [2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01], [2.500000e-01, 2.500000e-01, 2.500000e-01, 2.500000e-01], [1.250000e-01, -2.500000e-01, 5.000000e-01, -1.000000e+00], [1.250000e-01, 2.500000e-01, 5.000000e-01, 1.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 5.000000e-01]]> : tensor<6x4xf32>
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<{{\[\[}}1.250000e-01, 2.500000e-01, 2.500000e-01, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01, 0.000000e+00], [0.000000e+00, 2.500000e-01, 2.500000e-01, 5.000000e-01, 5.000000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -1.000000e+00, 1.000000e+00, 5.000000e-01]]> : tensor<4x6xf32>
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<{{\[\[}}2.500000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00], [0.000000e+00, 2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01], [-3.125000e-01, -2.500000e-01, -2.500000e-01, -1.250000e-01, -1.250000e-01, 0.000000e+00], [0.000000e+00, -6.250000e-02, 6.250000e-02, -2.500000e-01, 2.500000e-01, -3.125000e-01], [6.250000e-02, 6.250000e-02, 6.250000e-02, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 6.250000e-02]]> : tensor<6x6xf16>
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<{{\[\[}}2.500000e-01, 0.000000e+00, -3.125000e-01, 0.000000e+00, 6.250000e-02, 0.000000e+00], [0.000000e+00, 2.500000e-01, -2.500000e-01, -6.250000e-02, 6.250000e-02, 0.000000e+00], [0.000000e+00, -2.500000e-01, -2.500000e-01, 6.250000e-02, 6.250000e-02, 0.000000e+00], [0.000000e+00, 2.500000e-01, -1.250000e-01, -2.500000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, -1.250000e-01, 2.500000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, 2.500000e-01, 0.000000e+00, -3.125000e-01, 0.000000e+00, 6.250000e-02]]> : tensor<6x6xf16>
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<{{\[\[}}1.000000e+00, -3.332520e-01, -3.332520e-01, 8.331300e-02, 8.331300e-02, 0.000000e+00], [0.000000e+00, 3.332520e-01, -3.332520e-01, -1.666260e-01, 1.666260e-01, 0.000000e+00], [0.000000e+00, -3.332520e-01, -3.332520e-01, 3.332520e-01, 3.332520e-01, 1.000000e+00]]> : tensor<3x6xf16>
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<{{\[\[}}1.000000e+00, 0.000000e+00, 0.000000e+00], [-3.332520e-01, 3.332520e-01, -3.332520e-01], [-3.332520e-01, -3.332520e-01, -3.332520e-01], [8.331300e-02, -1.666260e-01, 3.332520e-01], [8.331300e-02, 1.666260e-01, 3.332520e-01], [0.000000e+00, 0.000000e+00, 1.000000e+00]]> : tensor<6x3xf16>
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f16
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_12:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_13:.*]] = tensor.empty() : tensor<6x6x5x2xf16>
+// CHECK-NEXT: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_8]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (tensor<6x6x5x2xf16>) {
+// CHECK-NEXT: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_11]] to %[[VAL_9]] step %[[VAL_8]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (tensor<6x6x5x2xf16>) {
+// CHECK-NEXT: %[[VAL_20:.*]] = tensor.extract_slice %[[ARG1]]{{\[}}%[[VAL_15]], %[[VAL_11]], %[[VAL_11]], %[[VAL_18]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf16> to tensor<3x3xf16>
+// CHECK-NEXT: %[[VAL_21:.*]] = tensor.empty() : tensor<6x3xf16>
+// CHECK-NEXT: %[[VAL_22:.*]] = linalg.fill ins(%[[VAL_7]] : f16) outs(%[[VAL_21]] : tensor<6x3xf16>) -> tensor<6x3xf16>
+// CHECK-NEXT: %[[VAL_23:.*]] = linalg.matmul ins(%[[VAL_6]], %[[VAL_20]] : tensor<6x3xf16>, tensor<3x3xf16>) outs(%[[VAL_22]] : tensor<6x3xf16>) -> tensor<6x3xf16>
+// CHECK-NEXT: %[[VAL_24:.*]] = tensor.empty() : tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_25:.*]] = linalg.fill ins(%[[VAL_7]] : f16) outs(%[[VAL_24]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_26:.*]] = linalg.matmul ins(%[[VAL_23]], %[[VAL_5]] : tensor<6x3xf16>, tensor<3x6xf16>) outs(%[[VAL_25]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_27:.*]] = tensor.insert_slice %[[VAL_26]] into %[[VAL_19]]{{\[}}%[[VAL_11]], %[[VAL_11]], %[[VAL_18]], %[[VAL_15]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf16> into tensor<6x6x5x2xf16>
+// CHECK-NEXT: scf.yield %[[VAL_27]] : tensor<6x6x5x2xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_17]] : tensor<6x6x5x2xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[VAL_28:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: %[[VAL_29:.*]] = scf.for %[[VAL_30:.*]] = %[[VAL_11]] to %[[VAL_8]] step %[[VAL_8]] iter_args(%[[VAL_31:.*]] = %[[VAL_28]]) -> (tensor<6x6x1x1x2x5xf16>) {
+// CHECK-NEXT: %[[VAL_32:.*]] = scf.for %[[VAL_33:.*]] = %[[VAL_11]] to %[[VAL_8]] step %[[VAL_8]] iter_args(%[[VAL_34:.*]] = %[[VAL_31]]) -> (tensor<6x6x1x1x2x5xf16>) {
+// CHECK-NEXT: %[[VAL_35:.*]] = scf.for %[[VAL_36:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_8]] iter_args(%[[VAL_37:.*]] = %[[VAL_34]]) -> (tensor<6x6x1x1x2x5xf16>) {
+// CHECK-NEXT: %[[VAL_38:.*]] = scf.for %[[VAL_39:.*]] = %[[VAL_11]] to %[[VAL_9]] step %[[VAL_8]] iter_args(%[[VAL_40:.*]] = %[[VAL_37]]) -> (tensor<6x6x1x1x2x5xf16>) {
+// CHECK-NEXT: %[[VAL_41:.*]] = affine.apply #[[$ATTR_0]](%[[VAL_30]])
+// CHECK-NEXT: %[[VAL_42:.*]] = affine.apply #[[$ATTR_0]](%[[VAL_33]])
+// CHECK-NEXT: %[[VAL_43:.*]] = tensor.extract_slice %[[ARG0]]{{\[}}%[[VAL_36]], %[[VAL_41]], %[[VAL_42]], %[[VAL_39]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x6x6x5xf16> to tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_44:.*]] = tensor.empty() : tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_45:.*]] = linalg.fill ins(%[[VAL_7]] : f16) outs(%[[VAL_44]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_46:.*]] = linalg.matmul ins(%[[VAL_4]], %[[VAL_43]] : tensor<6x6xf16>, tensor<6x6xf16>) outs(%[[VAL_45]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_47:.*]] = tensor.empty() : tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_48:.*]] = linalg.fill ins(%[[VAL_7]] : f16) outs(%[[VAL_47]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_49:.*]] = linalg.matmul ins(%[[VAL_46]], %[[VAL_3]] : tensor<6x6xf16>, tensor<6x6xf16>) outs(%[[VAL_48]] : tensor<6x6xf16>) -> tensor<6x6xf16>
+// CHECK-NEXT: %[[VAL_50:.*]] = tensor.insert_slice %[[VAL_49]] into %[[VAL_40]][0, 0, %[[VAL_30]], %[[VAL_33]], %[[VAL_36]], %[[VAL_39]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf16> into tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: scf.yield %[[VAL_50]] : tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_38]] : tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_35]] : tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_32]] : tensor<6x6x1x1x2x5xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[VAL_51:.*]] = tensor.collapse_shape %[[VAL_14]] {{\[\[}}0, 1], [2], [3]] : tensor<6x6x5x2xf16> into tensor<36x5x2xf16>
+// CHECK-NEXT: %[[VAL_52:.*]] = tensor.collapse_shape %[[VAL_29]] {{\[\[}}0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf16> into tensor<36x2x5xf16>
+// CHECK-NEXT: %[[VAL_53:.*]] = tensor.empty() : tensor<36x2x2xf32>
+// CHECK-NEXT: %[[VAL_54:.*]] = linalg.fill ins(%[[VAL_12]] : f32) outs(%[[VAL_53]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+// CHECK-NEXT: %[[VAL_55:.*]] = linalg.batch_matmul ins(%[[VAL_52]], %[[VAL_51]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[VAL_54]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+// CHECK-NEXT: %[[VAL_56:.*]] = tensor.expand_shape %[[VAL_55]] {{\[\[}}0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32>
+// CHECK-NEXT: %[[VAL_57:.*]] = scf.for %[[VAL_58:.*]] = %[[VAL_11]] to %[[VAL_8]] step %[[VAL_8]] iter_args(%[[VAL_59:.*]] = %[[ARG3]]) -> (tensor<2x4x4x2xf32>) {
+// CHECK-NEXT: %[[VAL_60:.*]] = scf.for %[[VAL_61:.*]] = %[[VAL_11]] to %[[VAL_8]] step %[[VAL_8]] iter_args(%[[VAL_62:.*]] = %[[VAL_59]]) -> (tensor<2x4x4x2xf32>) {
+// CHECK-NEXT: %[[VAL_63:.*]] = scf.for %[[VAL_64:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_8]] iter_args(%[[VAL_65:.*]] = %[[VAL_62]]) -> (tensor<2x4x4x2xf32>) {
+// CHECK-NEXT: %[[VAL_66:.*]] = scf.for %[[VAL_67:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_8]] iter_args(%[[VAL_68:.*]] = %[[VAL_65]]) -> (tensor<2x4x4x2xf32>) {
+// CHECK-NEXT: %[[VAL_69:.*]] = tensor.extract_slice %[[VAL_56]][0, 0, %[[VAL_58]], %[[VAL_61]], %[[VAL_64]], %[[VAL_67]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x2xf32> to tensor<6x6xf32>
+// CHECK-NEXT: %[[VAL_70:.*]] = affine.apply #[[$ATTR_0]](%[[VAL_58]])
+// CHECK-NEXT: %[[VAL_71:.*]] = affine.apply #[[$ATTR_0]](%[[VAL_61]])
+// CHECK-NEXT: %[[VAL_72:.*]] = tensor.extract_slice %[[VAL_68]]{{\[}}%[[VAL_64]], %[[VAL_70]], %[[VAL_71]], %[[VAL_67]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<2x4x4x2xf32> to tensor<4x4xf32>
+// CHECK-NEXT: %[[VAL_73:.*]] = tensor.empty() : tensor<4x6xf32>
+// CHECK-NEXT: %[[VAL_74:.*]] = linalg.fill ins(%[[VAL_12]] : f32) outs(%[[VAL_73]] : tensor<4x6xf32>) -> tensor<4x6xf32>
+// CHECK-NEXT: %[[VAL_75:.*]] = linalg.matmul ins(%[[VAL_2]], %[[VAL_69]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[VAL_74]] : tensor<4x6xf32>) -> tensor<4x6xf32>
+// CHECK-NEXT: %[[VAL_76:.*]] = tensor.empty() : tensor<4x4xf32>
+// CHECK-NEXT: %[[VAL_77:.*]] = linalg.fill ins(%[[VAL_12]] : f32) outs(%[[VAL_76]] : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK-NEXT: %[[VAL_78:.*]] = linalg.matmul ins(%[[VAL_75]], %[[VAL_1]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[VAL_77]] : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK-NEXT: %[[VAL_79:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_2]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_0]], %[[VAL_78]] : f32, tensor<4x4xf32>) outs(%[[VAL_72]] : tensor<4x4xf32>) {
+// CHECK-NEXT: ^bb0(%[[VAL_80:.*]]: f32, %[[VAL_81:.*]]: f32, %[[VAL_82:.*]]: f32):
+// CHECK-NEXT: %[[VAL_83:.*]] = arith.mulf %[[VAL_80]], %[[VAL_81]] : f32
+// CHECK-NEXT: %[[VAL_84:.*]] = arith.addf %[[VAL_83]], %[[VAL_82]] : f32
+// CHECK-NEXT: linalg.yield %[[VAL_84]] : f32
+// CHECK-NEXT: } -> tensor<4x4xf32>
+// CHECK-NEXT: %[[VAL_85:.*]] = tensor.insert_slice %[[VAL_79]] into %[[VAL_68]]{{\[}}%[[VAL_64]], %[[VAL_70]], %[[VAL_71]], %[[VAL_67]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x4x4x2xf32>
+// CHECK-NEXT: scf.yield %[[VAL_85]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_66]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_63]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[VAL_60]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return %[[VAL_57]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: }
\ No newline at end of file
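
The constant matrices checked above are the F(4,3) transforms of the standard 2D Winograd identity; in the usual notation (a summary for orientation, not text from the test):

$$ Y = A^{\mathsf{T}}\,\bigl[(G\,g\,G^{\mathsf{T}}) \odot (B^{\mathsf{T}}\,d\,B)\bigr]\,A $$

where g is the 3x3 filter tile, d the 6x6 input tile, \odot elementwise multiplication, and G (6x3), B (6x6), A (6x4) the filter, input, and output transforms. The matrices here carry extra scaling (entries like 2.5e-01 and 6.25e-02), which the scalar 1.024e+03 = 1024 fed into the trailing linalg.generic multiplies back out.
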
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
new file mode 100644
index 0000000..603ace8
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=alloc}))" 2>&1 | FileCheck %s
+
+func.func @test_static_memref_alloc() {
+ %0 = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Successfully generated alloc for operation: %[[ORIG:.*]] = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloca() : memref<10x20xf32>
+ return
+}
+
+// -----
+
+func.func @test_dynamic_memref_alloc() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %orig = memref.alloc(%c10, %c20) {test.ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated alloc for operation: %[[ORIG:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.ptr} : memref<?x?xf32>
+ // CHECK: Generated: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: Generated: %[[DIM0:.*]] = memref.dim %[[ORIG]], %[[C0]] : memref<?x?xf32>
+ // CHECK: Generated: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: Generated: %[[DIM1:.*]] = memref.dim %[[ORIG]], %[[C1]] : memref<?x?xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir
new file mode 100644
index 0000000..9220d84
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=copy}))" 2>&1 | FileCheck %s
+
+func.func @test_copy_static() {
+ %src = memref.alloca() {test.src_ptr} : memref<10x20xf32>
+ %dest = memref.alloca() {test.dest_ptr} : memref<10x20xf32>
+
+ // CHECK: Successfully generated copy from source: %[[SRC:.*]] = memref.alloca() {test.src_ptr} : memref<10x20xf32> to destination: %[[DEST:.*]] = memref.alloca() {test.dest_ptr} : memref<10x20xf32>
+ // CHECK: Generated: memref.copy %[[SRC]], %[[DEST]] : memref<10x20xf32> to memref<10x20xf32>
+ return
+}
+
+// -----
+
+func.func @test_copy_dynamic() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %src = memref.alloc(%c10, %c20) {test.src_ptr} : memref<?x?xf32>
+ %dest = memref.alloc(%c10, %c20) {test.dest_ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated copy from source: %[[SRC:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.src_ptr} : memref<?x?xf32> to destination: %[[DEST:.*]] = memref.alloc(%[[C10]], %[[C20]]) {test.dest_ptr} : memref<?x?xf32>
+ // CHECK: Generated: memref.copy %[[SRC]], %[[DEST]] : memref<?x?xf32> to memref<?x?xf32>
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir
new file mode 100644
index 0000000..ecf4f75
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=free}))" 2>&1 | FileCheck %s
+
+func.func @test_static_memref_free() {
+ %0 = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Successfully generated free for operation: %[[ORIG:.*]] = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK-NOT: Generated
+ return
+}
+
+// -----
+
+func.func @test_dynamic_memref_free() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %orig = memref.alloc(%c10, %c20) {test.ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated free for operation: %[[ORIG:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.ptr} : memref<?x?xf32>
+ // CHECK: Generated: memref.dealloc %[[ORIG]] : memref<?x?xf32>
+ return
+}
+
+// -----
+
+func.func @test_cast_walking_free() {
+ %0 = memref.alloca() : memref<10x20xf32>
+ %1 = memref.cast %0 {test.ptr} : memref<10x20xf32> to memref<?x?xf32>
+
+ // CHECK: Successfully generated free for operation: %[[CAST:.*]] = memref.cast %[[ALLOCA:.*]] {test.ptr} : memref<10x20xf32> to memref<?x?xf32>
+ // CHECK-NOT: Generated
+ return
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index c07edac..eb369c0 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -3322,6 +3322,46 @@ func.func @from_elements_to_elements_shuffle(%a: vector<4x2xf32>) -> vector<4x2x
// -----
+// CHECK-LABEL: func @to_elements_of_scalar_broadcast_folds
+// CHECK-SAME: (%[[S:.*]]: f32) -> (f32, f32, f32, f32)
+func.func @to_elements_of_scalar_broadcast_folds(%s: f32) -> (f32, f32, f32, f32) {
+ %v = vector.broadcast %s : f32 to vector<4xf32>
+ %e:4 = vector.to_elements %v : vector<4xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK-NOT: vector.to_elements
+ // CHECK: return %[[S]], %[[S]], %[[S]], %[[S]]
+ return %e#0, %e#1, %e#2, %e#3 : f32, f32, f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @to_elements_of_vector_broadcast
+// CHECK-SAME: (%[[VEC:.*]]: vector<2xf32>) -> (f32, f32, f32, f32, f32, f32)
+func.func @to_elements_of_vector_broadcast(%vec: vector<2xf32>) -> (f32, f32, f32, f32, f32, f32) {
+ %v = vector.broadcast %vec : vector<2xf32> to vector<3x2xf32>
+ %e:6 = vector.to_elements %v : vector<3x2xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK: %[[SRC_ELEMS:.*]]:2 = vector.to_elements %[[VEC]]
+ // CHECK: return %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1, %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1, %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1
+ return %e#0, %e#1, %e#2, %e#3, %e#4, %e#5 : f32, f32, f32, f32, f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @to_elements_of_vector_broadcast_inner_dim
+// CHECK-SAME: (%[[V:.*]]: vector<2x1x2xf32>) -> (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)
+func.func @to_elements_of_vector_broadcast_inner_dim(%v: vector<2x1x2xf32>) -> (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) {
+ %b = vector.broadcast %v : vector<2x1x2xf32> to vector<2x3x2xf32>
+ %e:12 = vector.to_elements %b : vector<2x3x2xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK: %[[SRC:.*]]:4 = vector.to_elements %[[V]] : vector<2x1x2xf32>
+ // CHECK: return %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#2, %[[SRC]]#3, %[[SRC]]#2, %[[SRC]]#3, %[[SRC]]#2, %[[SRC]]#3
+ return %e#0, %e#1, %e#2, %e#3, %e#4, %e#5, %e#6, %e#7, %e#8, %e#9, %e#10, %e#11 :
+ f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32
+}
+
+// -----
+
// +---------------------------------------------------------------------------
// Tests for foldFromElementsToConstant
// +---------------------------------------------------------------------------
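
All three folds above apply the same index rule: each result of to_elements(broadcast(v)) is an element of v, found by clamping every broadcast dimension (size 1, or a leading dimension v lacks) to zero. A small hand-written C model of that mapping, checked against the 2x1x2 -> 2x3x2 case above:

#include <stdio.h>

// Map a result coordinate to the linearized source element index.
// Size-1 source dims are broadcast (stretched), so their coordinate is
// clamped to 0; leading result dims the source lacks are simply dropped.
static int sourceIndex(const int *resCoord, const int *srcShape,
                       int resRank, int srcRank) {
  int idx = 0;
  for (int d = 0; d < srcRank; ++d) {
    int c = resCoord[resRank - srcRank + d]; // align trailing dims
    idx = idx * srcShape[d] + (srcShape[d] == 1 ? 0 : c);
  }
  return idx;
}

int main(void) {
  int srcShape[3] = {2, 1, 2};
  int resCoord[3] = {1, 2, 0}; // result element (1,2,0) of vector<2x3x2xf32>
  printf("%d\n", sourceIndex(resCoord, srcShape, 3, 3)); // prints 2 -> SRC#2
  return 0;
}
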
diff --git a/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
index 01a826a..ae46de1 100644
--- a/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
@@ -38,7 +38,7 @@ func.func @main() {
%buffer = builtin.unrealized_conversion_cast %10 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<1xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: %[[ASSUME:.*]] = "memref.assume_alignment"(%{{.*}}) <{alignment = 4 : i32}> : (memref<1xf32>)
+ // CHECK-NEXT: %[[ASSUME:.*]] = memref.assume_alignment %{{.*}}, 4 : memref<1xf32>
// CHECK-NEXT: ^ memref is not aligned to 4
// CHECK-NEXT: Location: loc({{.*}})
%assume = memref.assume_alignment %buffer, 4 : memref<1xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
index 1144a7c..6a7984c 100644
--- a/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
@@ -41,7 +41,7 @@ func.func @main() {
%cast = memref.cast %buffer : memref<5xf32> to memref<?xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.atomic_rmw"(%{{.*}}, %{{.*}}, %{{.*}}) <{kind = 0 : i64}> : (f32, memref<?xf32>, index) -> f32
+ // CHECK-NEXT: memref.atomic_rmw addf %{{.*}}, %{{.*}} : (f32, memref<?xf32>) -> f32
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
%c9 = arith.constant 9 : index
diff --git a/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
index 1ac1030..b605c77 100644
--- a/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
@@ -43,26 +43,26 @@ func.func @main() {
%alloc = memref.alloc() : memref<5xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32>) -> memref<10xf32>
+ // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32> to memref<10xf32>
// CHECK-NEXT: ^ size mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%1 = memref.cast %alloc : memref<5xf32> to memref<?xf32>
func.call @cast_to_static_dim(%1) : (memref<?xf32>) -> (memref<10xf32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<*xf32>) -> memref<f32>
+ // CHECK-NEXT: memref.cast %{{.*}} : memref<*xf32> to memref<f32>
// CHECK-NEXT: ^ rank mismatch
// CHECK-NEXT: Location: loc({{.*}})
%3 = memref.cast %alloc : memref<5xf32> to memref<*xf32>
func.call @cast_to_ranked(%3) : (memref<*xf32>) -> (memref<f32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>>
// CHECK-NEXT: ^ offset mismatch
// CHECK-NEXT: Location: loc({{.*}})
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>>
// CHECK-NEXT: ^ stride mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%4 = memref.cast %alloc
diff --git a/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
index be9417b..413cb19 100644
--- a/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
@@ -28,7 +28,7 @@ func.func @main() {
%cast2 = memref.cast %alloca2 : memref<5xf32> to memref<?xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.copy"(%{{.*}}, %{{.*}}) : (memref<?xf32>, memref<?xf32>) -> ()
+ // CHECK-NEXT: memref.copy %{{.*}}, %{{.*}} : memref<?xf32> to memref<?xf32>
// CHECK-NEXT: ^ size of 0-th source/target dim does not match
// CHECK-NEXT: Location: loc({{.*}})
call @memcpy_helper(%cast1, %cast2) : (memref<?xf32>, memref<?xf32>) -> ()
diff --git a/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
index ef4af62..8f5a2c7 100644
--- a/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
@@ -20,7 +20,7 @@ func.func @main() {
%alloca = memref.alloca() : memref<1xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.dim"(%{{.*}}, %{{.*}}) : (memref<1xf32>, index) -> index
+ // CHECK-NEXT: memref.dim %{{.*}}, %{{.*}} : memref<1xf32>
// CHECK-NEXT: ^ index is out of bounds
// CHECK-NEXT: Location: loc({{.*}})
%dim = memref.dim %alloca, %c4 : memref<1xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
index 2e42648..364880c 100644
--- a/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
@@ -40,19 +40,19 @@ func.func @main() {
%alloc_2x2x2 = memref.alloc(%2, %2, %2) : memref<?x?x?xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<1xf32>, index) -> f32
+ // CHECK-NEXT: memref.load %{{.*}}[%{{.*}}] : memref<1xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @load(%alloca_1, %1) : (memref<1xf32>, index) -> ()
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?xf32>, index) -> f32
+ // CHECK-NEXT: memref.load %{{.*}}[%{{.*}}] : memref<?xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @load_dynamic(%alloc_1, %1) : (memref<?xf32>, index) -> ()
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?x?x?xf32>, index, index, index) -> f32
+ // CHECK-NEXT: memref.load %{{.*}}[%{{.*}}] : memref<?x?x?xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @load_nd_dynamic(%alloc_2x2x2, %1, %n1, %0) : (memref<?x?x?xf32>, index, index, index) -> ()
diff --git a/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
index 82e6380..760f2a7 100644
--- a/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
@@ -41,7 +41,7 @@ func.func @main() {
%cast = memref.cast %buffer : memref<5xf32> to memref<?xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.store"(%{{.*}}, %{{.*}}, %{{.*}}) : (f32, memref<?xf32>, index) -> ()
+ // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<?xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
%c9 = arith.constant 9 : index
diff --git a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
index 9fbe5bc..71e813c 100644
--- a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
@@ -51,47 +51,47 @@ func.func @main() {
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: -9223372036854775808, 1>}> : (memref<?x4xf32>, index, index, index) -> memref<?xf32, strided<[?], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 1] [%{{.*}}, 1] : memref<?x4xf32> to memref<?xf32, strided<[?], offset: ?>>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: -9223372036854775808, 1>}> : (memref<?x4xf32>, index, index, index) -> memref<?xf32, strided<[?], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 1] [%{{.*}}, 1] : memref<?x4xf32> to memref<?xf32, strided<[?], offset: ?>>
// CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
// CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @subview(%alloca, %1) : (memref<1xf32>, index) -> ()
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
// CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @subview(%alloca, %n1) : (memref<1xf32>, index) -> ()
// Slice runs out-of-bounds due to size
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 4>, static_strides = array<i64: -9223372036854775808, 1>}> : (memref<?x4xf32>, index, index, index) -> memref<?x4xf32, strided<[?, 1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 4] [%{{.*}}, 1] : memref<?x4xf32> to memref<?x4xf32, strided<[?, 1], offset: ?>>
// CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @subview_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
// Slice runs out-of-bounds due to stride
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 4>, static_strides = array<i64: -9223372036854775808, 1>}> : (memref<?x4xf32>, index, index, index) -> memref<?x4xf32, strided<[?, 1], offset: ?>>
+ // CHECK-NEXT: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 4] [%{{.*}}, 1] : memref<?x4xf32> to memref<?x4xf32, strided<[?, 1], offset: ?>>
// CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
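In the old generic form above, the -9223372036854775808 entries in static_offsets, static_sizes, and static_strides are ShapedType::kDynamic sentinels (INT64_MIN); the custom form now prints the corresponding SSA operands in their place. A sketch, with illustrative names, of the dynamic subview shape these checks match:

%view = memref.subview %m[%off, 0] [%sz, 1] [%st, 1]
    : memref<?x4xf32> to memref<?xf32, strided<[?], offset: ?>>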
diff --git a/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
index f37a6d6..a96b2be 100644
--- a/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
@@ -40,14 +40,14 @@ func.func @main() {
%alloc = tensor.empty() : tensor<5xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.cast"(%{{.*}}) : (tensor<?xf32>) -> tensor<10xf32>
+ // CHECK-NEXT: tensor.cast %{{.*}} : tensor<?xf32> to tensor<10xf32>
// CHECK-NEXT: ^ size mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%1 = tensor.cast %alloc : tensor<5xf32> to tensor<?xf32>
func.call @cast_to_static_dim(%1) : (tensor<?xf32>) -> (tensor<10xf32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.cast"(%{{.*}}) : (tensor<*xf32>) -> tensor<f32>
+ // CHECK-NEXT: tensor.cast %{{.*}} : tensor<*xf32> to tensor<f32>
// CHECK-NEXT: ^ rank mismatch
// CHECK-NEXT: Location: loc({{.*}})
%3 = tensor.cast %alloc : tensor<5xf32> to tensor<*xf32>
diff --git a/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
index e9e5c04..1a26ebe 100644
--- a/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
@@ -22,7 +22,7 @@ func.func @main() {
%tensor = tensor.empty() : tensor<1xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.dim"(%{{.*}}, %{{.*}}) : (tensor<1xf32>, index) -> index
+ // CHECK-NEXT: tensor.dim %{{.*}}, %{{.*}} : tensor<1xf32>
// CHECK-NEXT: ^ index is out of bounds
// CHECK-NEXT: Location: loc({{.*}})
%dim = tensor.dim %tensor, %c4 : tensor<1xf32>
diff --git a/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
index 73fcec4..cc252a2b 100644
--- a/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
@@ -44,19 +44,19 @@ func.func @main() {
%alloc_2x2x2 = tensor.empty(%2, %2, %2) : tensor<?x?x?xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract"(%{{.*}}, %{{.*}}) : (tensor<1xf32>, index) -> f32
+ // CHECK-NEXT: tensor.extract %{{.*}}[%{{.*}}] : tensor<1xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract(%alloca_1, %1) : (tensor<1xf32>, index) -> ()
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract"(%{{.*}}, %{{.*}}) : (tensor<?xf32>, index) -> f32
+ // CHECK-NEXT: tensor.extract %{{.*}}[%{{.*}}] : tensor<?xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_dynamic(%alloc_1, %1) : (tensor<?xf32>, index) -> ()
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract"(%{{.*}}, %{{.*}}) : (tensor<?x?x?xf32>, index, index, index) -> f32
+ // CHECK-NEXT: tensor.extract %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] : tensor<?x?x?xf32>
// CHECK-NEXT: ^ out-of-bounds access
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_nd_dynamic(%alloc_2x2x2, %1, %n1, %0) : (tensor<?x?x?xf32>, index, index, index) -> ()
diff --git a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
index 341a59e..0c7c4a6 100644
--- a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
@@ -47,47 +47,47 @@ func.func @main() {
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: -9223372036854775808, 1>}> : (tensor<?x4xf32>, index, index, index) -> tensor<?xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}, 0] [%{{.*}}, 1] [%{{.*}}, 1] : tensor<?x4xf32> to tensor<?xf32>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: -9223372036854775808, 1>}> : (tensor<?x4xf32>, index, index, index) -> tensor<?xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}, 0] [%{{.*}}, 1] [%{{.*}}, 1] : tensor<?x4xf32> to tensor<?xf32>
// CHECK-NEXT: ^ extract_slice runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_slice_dynamic_rank_reduce(%alloca_4_dyn, %5, %5, %1) : (tensor<?x4xf32>, index, index, index) -> ()
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (tensor<1xf32>, index) -> tensor<1xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor<1xf32> to tensor<1xf32>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (tensor<1xf32>, index) -> tensor<1xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor<1xf32> to tensor<1xf32>
// CHECK-NEXT: ^ extract_slice runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_slice(%alloca, %1) : (tensor<1xf32>, index) -> ()
// Offset is out-of-bounds and slice runs out-of-bounds
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (tensor<1xf32>, index) -> tensor<1xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor<1xf32> to tensor<1xf32>
// CHECK-NEXT: ^ offset 0 is out-of-bounds
// CHECK-NEXT: Location: loc({{.*}})
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (tensor<1xf32>, index) -> tensor<1xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor<1xf32> to tensor<1xf32>
// CHECK-NEXT: ^ extract_slice runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_slice(%alloca, %n1) : (tensor<1xf32>, index) -> ()
// Slice runs out-of-bounds due to size
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 4>, static_strides = array<i64: -9223372036854775808, 1>}> : (tensor<?x4xf32>, index, index, index) -> tensor<?x4xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}, 0] [%{{.*}}, 4] [%{{.*}}, 1] : tensor<?x4xf32> to tensor<?x4xf32>
// CHECK-NEXT: ^ extract_slice runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_slice_dynamic(%alloca_4_dyn, %0, %5, %1) : (tensor<?x4xf32>, index, index, index) -> ()
// Slice runs out-of-bounds due to stride
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: "tensor.extract_slice"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 4>, static_strides = array<i64: -9223372036854775808, 1>}> : (tensor<?x4xf32>, index, index, index) -> tensor<?x4xf32>
+ // CHECK-NEXT: tensor.extract_slice %{{.*}}[%{{.*}}, 0] [%{{.*}}, 4] [%{{.*}}, 1] : tensor<?x4xf32> to tensor<?x4xf32>
// CHECK-NEXT: ^ extract_slice runs out-of-bounds along dimension 0
// CHECK-NEXT: Location: loc({{.*}})
func.call @extract_slice_dynamic(%alloca_4_dyn, %0, %4, %4) : (tensor<?x4xf32>, index, index, index) -> ()
diff --git a/mlir/test/Target/Cpp/do.mlir b/mlir/test/Target/Cpp/do.mlir
new file mode 100644
index 0000000..38cbc81
--- /dev/null
+++ b/mlir/test/Target/Cpp/do.mlir
@@ -0,0 +1,168 @@
+// RUN: mlir-translate -mlir-to-cpp %s | FileCheck %s -check-prefix=CPP-DEFAULT
+
+
+// CPP-DEFAULT-LABEL: void emitc_do(
+// CPP-DEFAULT: int32_t* [[VAL_1:v[0-9]+]]) {
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: printf("%d", *[[VAL_1]]);
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_3]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_4]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= 10);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_do(%arg0 : !emitc.ptr<i32>) {
+ %var = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %0 = literal "10" : i32
+ %1 = literal "1" : i32
+
+ do {
+ verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ %var_load = load %var : <i32>
+ %tmp_add = add %var_load, %1 : (i32, i32) -> i32
+ "emitc.assign"(%var, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var, %0 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_load = load %var : <i32>
+ %cmp = cmp le, %var_load, %0 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
+// CPP-DEFAULT-LABEL: void emitc_do_with_expression(
+// CPP-DEFAULT: int32_t* [[VAL_1:v[0-9]+]]) {
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = 10 + 1;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: printf("%d", *[[VAL_1]]);
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_5]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= [[VAL_3]]);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_do_with_expression(%arg0 : !emitc.ptr<i32>) {
+ %var = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %0 = literal "10" : i32
+ %1 = literal "1" : i32
+
+ %add = expression %0, %1 : (i32, i32) -> i32 {
+ %add = add %0, %1 : (i32, i32) -> i32
+ yield %add : i32
+ }
+
+ do {
+ verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ %var_load = load %var : <i32>
+ %tmp_add = add %var_load, %1 : (i32, i32) -> i32
+ "emitc.assign"(%var, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var, %add : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_load = load %var : <i32>
+ %cmp = cmp le, %var_load, %add : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
+// CPP-DEFAULT-LABEL: void emitc_double_do()
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]] = 0;
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = [[VAL_1]];
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: printf("i = %d, j = %d", [[VAL_3]], [[VAL_4]]);
+// CPP-DEFAULT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_5]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= 5);
+// CPP-DEFAULT: int32_t [[VAL_6:v[0-9]+]] = [[VAL_3]] + 1;
+// CPP-DEFAULT: [[VAL_1]] = [[VAL_6]];
+// CPP-DEFAULT: } while ([[VAL_1]] <= 3);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_double_do() {
+ %var_1 = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %var_2 = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+
+ %step = literal "1" : i32
+ %end_1 = literal "3" : i32
+ %end_2 = literal "5" : i32
+
+ do {
+ %var_1_load = load %var_1 : <i32>
+
+ do {
+ %var_2_load = load %var_2 : <i32>
+ verbatim "printf(\"i = %d, j = %d\", {}, {});" args %var_1_load, %var_2_load : i32, i32
+ %tmp_add = add %var_2_load, %step : (i32, i32) -> i32
+ "emitc.assign"(%var_2, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var_2, %end_2 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_2_load = load %var_2 : <i32>
+ %cmp = cmp le, %var_2_load, %end_2 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ %tmp_add = add %var_1_load, %step : (i32, i32) -> i32
+ "emitc.assign"(%var_1, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var_1, %end_1 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_1_load = load %var_1 : <i32>
+ %cmp = cmp le, %var_1_load, %end_1 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
+// CPP-DEFAULT-LABEL: bool payload_do_with_empty_body(
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]]) {
+// CPP-DEFAULT: bool [[VAL_3:v[0-9]+]] = [[VAL_1]] < [[VAL_2]];
+// CPP-DEFAULT: return [[VAL_3]];
+// CPP-DEFAULT: }
+// CPP-DEFAULT: void emitc_do_with_empty_body(
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]]) {
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: } while (payload_do_with_empty_body([[VAL_1]], [[VAL_2]]));
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @payload_do_with_empty_body(%1 : i32, %2 : i32) -> i1 {
+ %cmp = emitc.cmp lt, %1, %2 : (i32, i32) -> i1
+ return %cmp : i1
+}
+func.func @emitc_do_with_empty_body(%arg1 : i32, %arg2 : i32) {
+ emitc.do {
+ } while {
+ %r = emitc.expression %arg1, %arg2 : (i32, i32) -> i1 {
+ %call = emitc.call @payload_do_with_empty_body(%arg1, %arg2) : (i32, i32) -> i1
+ emitc.yield %call : i1
+ }
+ emitc.yield %r: i1
+ }
+
+ return
+}
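The emitc.do op exercised throughout this file carries two regions: a body and a condition region that must yield an i1, with the comparison wrapped in emitc.expression so the emitter can inline it into the while clause. A minimal do-while sketch distilled from the tests above (function name illustrative):

emitc.func @count_to_ten() {
  %var = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
  %one = literal "1" : i32
  %ten = literal "10" : i32
  do {
    %v = load %var : <i32>
    %next = add %v, %one : (i32, i32) -> i32
    "emitc.assign"(%var, %next) : (!emitc.lvalue<i32>, i32) -> ()
  } while {
    // The condition region yields the i1 loop test.
    %r = expression %var, %ten : (!emitc.lvalue<i32>, i32) -> i1 {
      %v = load %var : <i32>
      %cmp = cmp lt, %v, %ten : (i32, i32) -> i1
      yield %cmp : i1
    }
    yield %r : i1
  }
  return
}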
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
index 04163b5..9928992 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
@@ -3,9 +3,9 @@
// CHECK-LABEL: @convert_f32x2_to_fp6x2_packed
llvm.func @convert_f32x2_to_fp6x2_packed(%srcA : f32, %srcB : f32) {
//CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f6x2 <e2m3> %srcA, %srcB : i16
+ %res1 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : i16 (f6E2M3FN)
//CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f6x2 <e3m2> %srcA, %srcB : i16
+ %res2 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : i16 (f6E3M2FN)
llvm.return
}
@@ -13,9 +13,9 @@ llvm.func @convert_f32x2_to_fp6x2_packed(%srcA : f32, %srcB : f32) {
llvm.func @convert_f32x2_to_fp6x2_vector(%srcA : f32, %srcB : f32) {
//CHECK: %[[res0:.*]] = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %{{.*}}, float %{{.*}})
//CHECK-NEXT: %{{.*}} = bitcast i16 %[[res0]] to <2 x i8>
- %res1 = nvvm.convert.f32x2.to.f6x2 <e2m3> %srcA, %srcB : vector<2xi8>
+ %res1 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : vector<2xi8> (f6E2M3FN)
//CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %{{.*}}, float %{{.*}})
//CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res2 = nvvm.convert.f32x2.to.f6x2 <e3m2> %srcA, %srcB : vector<2xi8>
+ %res2 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : vector<2xi8> (f6E3M2FN)
llvm.return
}
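The syntax change in this file swaps the conversion-kind enum for the destination element type, spelled as a builtin float type in trailing parentheses: <e2m3> becomes (f6E2M3FN) and <e3m2> becomes (f6E3M2FN). Old and new spellings side by side, with operands as above:

// old: %res = nvvm.convert.f32x2.to.f6x2 <e2m3> %srcA, %srcB : i16
%res = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : i16 (f6E2M3FN)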
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
index 4a15efb..de21826 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
@@ -5,31 +5,31 @@
// CHECK-LABEL: @convert_f32x2_to_f8x2_e4m3
llvm.func @convert_f32x2_to_f8x2_e4m3(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
llvm.return
}
// CHECK-LABEL: @convert_f32x2_to_f8x2_e5m2
llvm.func @convert_f32x2_to_f8x2_e5m2(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <e5m2> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <e5m2> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
llvm.return
}
// CHECK-LABEL: @convert_f32x2_to_f8x2_ue8m0
llvm.func @convert_f32x2_to_f8x2_ue8m0(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float %{{.*}}, float %{{.*}})
- %res3 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res3 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float %{{.*}}, float %{{.*}})
- %res4 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res4 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E8M0FNU)
llvm.return
}
@@ -37,10 +37,10 @@ llvm.func @convert_f32x2_to_f8x2_ue8m0(%srcA : f32, %srcB : f32) {
llvm.func @convert_f32x2_to_f8x2_vector_return(%srcA : f32, %srcB : f32) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float %{{.*}}, float %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8>
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8> (f8E4M3FN)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float %{{.*}}, float %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8>
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8> (f8E4M3FN)
llvm.return
}
@@ -49,18 +49,18 @@ llvm.func @convert_f32x2_to_f8x2_vector_return(%srcA : f32, %srcB : f32) {
// CHECK-LABEL: @convert_f16x2_to_f8x2_e4m3
llvm.func @convert_f16x2_to_f8x2_e4m3(%src : vector<2xf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> %{{.*}})
- %res1 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src : vector<2xf16> -> i16
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E4M3FN)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> %{{.*}})
- %res2 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src {relu = true} : vector<2xf16> -> i16
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src {relu = true} : vector<2xf16> -> i16 (f8E4M3FN)
llvm.return
}
// CHECK-LABEL: @convert_f16x2_to_f8x2_e5m2
llvm.func @convert_f16x2_to_f8x2_e5m2(%src : vector<2xf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> %{{.*}})
- %res1 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src : vector<2xf16> -> i16
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E5M2)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> %{{.*}})
- %res2 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src {relu = true} : vector<2xf16> -> i16
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src {relu = true} : vector<2xf16> -> i16 (f8E5M2)
llvm.return
}
@@ -68,10 +68,10 @@ llvm.func @convert_f16x2_to_f8x2_e5m2(%src : vector<2xf16>) {
llvm.func @convert_f16x2_to_f8x2_vector_return(%src : vector<2xf16>) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src : vector<2xf16> -> vector<2xi8>
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> vector<2xi8> (f8E4M3FN)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src : vector<2xf16> -> vector<2xi8>
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> vector<2xi8> (f8E5M2)
llvm.return
}
@@ -80,13 +80,13 @@ llvm.func @convert_f16x2_to_f8x2_vector_return(%src : vector<2xf16>) {
// CHECK-LABEL: @convert_bf16x2_to_f8x2_ue8m0
llvm.func @convert_bf16x2_to_f8x2_ue8m0(%src : vector<2xbf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %{{.*}})
- %res1 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16
+ %res1 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> %{{.*}})
- %res2 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xbf16> -> i16
+ %res2 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> %{{.*}})
- %res3 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16
+ %res3 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %{{.*}})
- %res4 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16
+ %res4 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16 (f8E8M0FNU)
llvm.return
}
@@ -94,9 +94,9 @@ llvm.func @convert_bf16x2_to_f8x2_ue8m0(%src : vector<2xbf16>) {
llvm.func @convert_bf16x2_to_f8x2_vector_return(%src : vector<2xbf16>) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> vector<2xi8>
+ %res1 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> vector<2xi8> (f8E8M0FNU)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> vector<2xi8>
+ %res2 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> vector<2xi8> (f8E8M0FNU)
llvm.return
}
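The f8x2 conversions follow the same scheme, with <e4m3>, <e5m2>, and <ue8m0> mapping to the builtin types f8E4M3FN, f8E5M2, and f8E8M0FNU respectively; the rnd, sat, and relu attributes are unchanged. For example, with operands as above:

// old: %res = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16
%res = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16 (f8E8M0FNU)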
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 383f482..0b36154 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -175,64 +175,64 @@ llvm.func @nvvm_match_sync_any(%val32: i32, %thread_mask: i32) {
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_e4m3(%a : f32, %b : f32) {
- // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e4m3> %a, %b {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16
+ // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_e5m2(%a : f32, %b : f32) {
- // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e5m2> %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16
+ // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_ue8m0(%a : f32, %b : f32) {
- // expected-error @below {{Only RZ or RP rounding modes are supported for conversions from f32x2 to .ue8m0x2 type}}
- %res = nvvm.convert.f32x2.to.f8x2 <ue8m0> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>} : i16
+ // expected-error @below {{Only RZ and RP rounding modes are supported for conversions from f32x2 to 'f8E8M0FNU' type}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>} : i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_saturation_e4m3(%a : f32, %b : f32) {
- // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e4m3> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16
+ // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16 (f8E4M3FN)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_saturation_e5m2(%a : f32, %b : f32) {
- // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e5m2> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16
+ // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16 (f8E5M2)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_relu_not_supported_ue8m0(%a : f32, %b : f32) {
- // expected-error @below {{relu not supported for conversions to .ue8m0x2 type}}
- %res = nvvm.convert.f32x2.to.f8x2 <ue8m0> %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, relu = true} : i16
+ // expected-error @below {{relu not supported for conversions to 'f8E8M0FNU' type}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, relu = true} : i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_f16x2_to_f8x2_invalid_type(%src : vector<2xf16>) {
- // expected-error @below {{Only .e4m3 or .e5m2 types are supported for conversions from f16x2 to f8x2.}}
- %res = nvvm.convert.f16x2.to.f8x2 <ue8m0> %src : vector<2xf16> -> i16
+ // expected-error @below {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f16x2 to f8x2.}}
+ %res = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_type(%src : vector<2xbf16>) {
- // expected-error @below {{Only .ue8m0 type is supported for conversions from bf16x2 to f8x2.}}
- %res = nvvm.convert.bf16x2.to.f8x2 <e4m3> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16
+ // expected-error @below {{Only 'f8E8M0FNU' type is supported for conversions from bf16x2 to f8x2.}}
+ %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16 (f8E4M3FN)
llvm.return
}
@@ -240,7 +240,15 @@ llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_type(%src : vector<2xbf16>) {
llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_rounding(%src : vector<2xbf16>) {
// expected-error @below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}}
- %res = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16> -> i16
+ %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16> -> i16 (f8E8M0FNU)
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_cvt_f32x2_to_f6x2_invalid_type(%a : f32, %b : f32) {
+ // expected-error @below {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x2 to f6x2.}}
+ %res = nvvm.convert.f32x2.to.f6x2 %a, %b : i16 (f8E8M0FNU)
llvm.return
}
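The new negative test above feeds an f8 type to the f6x2 conversion. For contrast, a valid target type, taken from convert_fp6x2.mlir earlier in this diff, with operands as above:

%ok = nvvm.convert.f32x2.to.f6x2 %a, %b : i16 (f6E2M3FN)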
diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
index d57b41c..eb0d980 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "TestDenseDataFlowAnalysis.h"
-#include "TestDialect.h"
#include "TestOps.h"
#include "mlir/Analysis/DataFlow/DenseAnalysis.h"
#include "mlir/Analysis/DataFlow/Utils.h"
@@ -23,12 +22,15 @@
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/TypeID.h"
+#include "llvm/Support/DebugLog.h"
#include "llvm/Support/raw_ostream.h"
using namespace mlir;
using namespace mlir::dataflow;
using namespace mlir::dataflow::test;
+#define DEBUG_TYPE "test-next-access"
+
namespace {
class NextAccess : public AbstractDenseLattice, public AccessLatticeBase {
@@ -72,6 +74,7 @@ public:
// means "we don't know what the next access is" rather than "there is no next
// access". But it's unclear how to differentiate the two cases...
void setToExitState(NextAccess *lattice) override {
+ LDBG() << "setToExitState: setting lattice to unknown state";
propagateIfChanged(lattice, lattice->setKnownToUnknown());
}
@@ -87,16 +90,23 @@ public:
LogicalResult NextAccessAnalysis::visitOperation(Operation *op,
const NextAccess &after,
NextAccess *before) {
+ LDBG() << "visitOperation: "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
+ LDBG() << " after state: " << after;
+ LDBG() << " before state: " << *before;
+
auto memory = dyn_cast<MemoryEffectOpInterface>(op);
// If we can't reason about the memory effects, conservatively assume we can't
// say anything about the next access.
if (!memory) {
+ LDBG() << " No memory effect interface, setting to exit state";
setToExitState(before);
return success();
}
SmallVector<MemoryEffects::EffectInstance> effects;
memory.getEffects(effects);
+ LDBG() << " Found " << effects.size() << " memory effects";
// First, check if all underlying values are already known. Otherwise, avoid
// propagating and stay in the "undefined" state to avoid incorrectly
@@ -110,6 +120,7 @@ LogicalResult NextAccessAnalysis::visitOperation(Operation *op,
// Effects with unspecified value are treated conservatively and we cannot
// assume anything about the next access.
if (!value) {
+ LDBG() << " Effect has unspecified value, setting to exit state";
setToExitState(before);
return success();
}
@@ -124,38 +135,63 @@ LogicalResult NextAccessAnalysis::visitOperation(Operation *op,
});
// If the underlying value is not known yet, don't propagate.
- if (!underlyingValue)
+ if (!underlyingValue) {
+ LDBG() << " Underlying value not known for " << value
+ << ", skipping propagation";
return success();
+ }
+ LDBG() << " Found underlying value " << *underlyingValue << " for "
+ << value;
underlyingValues.push_back(*underlyingValue);
}
// Update the state if all underlying values are known.
+ LDBG() << " All underlying values known, updating state";
ChangeResult result = before->meet(after);
for (const auto &[effect, value] : llvm::zip(effects, underlyingValues)) {
// If the underlying value is known to be unknown, set to fixpoint.
if (!value) {
+ LDBG() << " Underlying value is unknown, setting to exit state";
setToExitState(before);
return success();
}
+ LDBG() << " Setting next access for value " << value << " to operation "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
result |= before->set(value, op);
}
+ LDBG() << " Final result: "
+ << (result == ChangeResult::Change ? "changed" : "no change");
propagateIfChanged(before, result);
return success();
}
void NextAccessAnalysis::buildOperationEquivalentLatticeAnchor(Operation *op) {
+ LDBG() << "buildOperationEquivalentLatticeAnchor: "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
if (isMemoryEffectFree(op)) {
+ LDBG() << " Operation is memory effect free, unioning lattice anchors";
unionLatticeAnchors<NextAccess>(getProgramPointBefore(op),
getProgramPointAfter(op));
+ } else {
+ LDBG() << " Operation has memory effects, not unioning lattice anchors";
}
}
void NextAccessAnalysis::visitCallControlFlowTransfer(
CallOpInterface call, CallControlFlowAction action, const NextAccess &after,
NextAccess *before) {
+ LDBG() << "visitCallControlFlowTransfer: "
+ << OpWithFlags(call.getOperation(), OpPrintingFlags().skipRegions());
+ LDBG() << " action: "
+ << (action == CallControlFlowAction::ExternalCallee ? "ExternalCallee"
+ : action == CallControlFlowAction::EnterCallee ? "EnterCallee"
+ : "ExitCallee");
+ LDBG() << " assumeFuncReads: " << assumeFuncReads;
+
if (action == CallControlFlowAction::ExternalCallee && assumeFuncReads) {
+ LDBG() << " Handling external callee with assumed function reads";
SmallVector<Value> underlyingValues;
underlyingValues.reserve(call->getNumOperands());
for (Value operand : call.getArgOperands()) {
@@ -165,15 +201,26 @@ void NextAccessAnalysis::visitCallControlFlowTransfer(
return getOrCreateFor<UnderlyingValueLattice>(
getProgramPointBefore(call.getOperation()), value);
});
- if (!underlyingValue)
+ if (!underlyingValue) {
+ LDBG() << " Underlying value not known for operand " << operand
+ << ", returning";
return;
+ }
+ LDBG() << " Found underlying value " << *underlyingValue
+ << " for operand " << operand;
underlyingValues.push_back(*underlyingValue);
}
+ LDBG() << " Setting next access for " << underlyingValues.size()
+ << " operands";
ChangeResult result = before->meet(after);
for (Value operand : underlyingValues) {
+ LDBG() << " Setting next access for operand " << operand << " to call "
+ << call;
result |= before->set(operand, call);
}
+ LDBG() << " Call control flow result: "
+ << (result == ChangeResult::Change ? "changed" : "no change");
return propagateIfChanged(before, result);
}
auto testCallAndStore =
@@ -182,8 +229,10 @@ void NextAccessAnalysis::visitCallControlFlowTransfer(
testCallAndStore.getStoreBeforeCall()) ||
(action == CallControlFlowAction::ExitCallee &&
!testCallAndStore.getStoreBeforeCall()))) {
+ LDBG() << " Handling TestCallAndStoreOp with special logic";
(void)visitOperation(call, after, before);
} else {
+ LDBG() << " Using default call control flow transfer logic";
AbstractDenseBackwardDataFlowAnalysis::visitCallControlFlowTransfer(
call, action, after, before);
}
@@ -192,6 +241,11 @@ void NextAccessAnalysis::visitCallControlFlowTransfer(
void NextAccessAnalysis::visitRegionBranchControlFlowTransfer(
RegionBranchOpInterface branch, RegionBranchPoint regionFrom,
RegionBranchPoint regionTo, const NextAccess &after, NextAccess *before) {
+ LDBG() << "visitRegionBranchControlFlowTransfer: "
+ << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions());
+ LDBG() << " regionFrom: " << (regionFrom.isParent() ? "parent" : "region");
+ LDBG() << " regionTo: " << (regionTo.isParent() ? "parent" : "region");
+
auto testStoreWithARegion =
dyn_cast<::test::TestStoreWithARegion>(branch.getOperation());
@@ -199,9 +253,11 @@ void NextAccessAnalysis::visitRegionBranchControlFlowTransfer(
((regionTo.isParent() && !testStoreWithARegion.getStoreBeforeRegion()) ||
(regionFrom.isParent() &&
testStoreWithARegion.getStoreBeforeRegion()))) {
+ LDBG() << " Handling TestStoreWithARegion with special logic";
(void)visitOperation(branch, static_cast<const NextAccess &>(after),
static_cast<NextAccess *>(before));
} else {
+ LDBG() << " Using default region branch control flow transfer logic";
propagateIfChanged(before, before->meet(after));
}
}
@@ -278,6 +334,11 @@ struct TestNextAccessPass
void runOnOperation() override {
Operation *op = getOperation();
+ LDBG() << "runOnOperation: Starting test-next-access pass on "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
+ LDBG() << " interprocedural: " << interprocedural;
+ LDBG() << " assumeFuncReads: " << assumeFuncReads;
+
SymbolTableCollection symbolTable;
auto config = DataFlowConfig().setInterprocedural(interprocedural);
@@ -285,15 +346,20 @@ struct TestNextAccessPass
loadBaselineAnalyses(solver);
solver.load<NextAccessAnalysis>(symbolTable, assumeFuncReads);
solver.load<UnderlyingValueAnalysis>();
+ LDBG() << " Initializing and running dataflow solver";
if (failed(solver.initializeAndRun(op))) {
emitError(op->getLoc(), "dataflow solver failed");
return signalPassFailure();
}
+ LDBG() << " Dataflow solver completed successfully";
+ LDBG() << " Walking operations to set next access attributes";
op->walk([&](Operation *op) {
auto tag = op->getAttrOfType<StringAttr>(kTagAttrName);
if (!tag)
return;
+ LDBG() << " Processing tagged operation: "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
const NextAccess *nextAccess =
solver.lookupState<NextAccess>(solver.getProgramPointAfter(op));
op->setAttr(kNextAccessAttrName,
diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt
index 3b7bd9b..e31140a 100644
--- a/mlir/test/lib/Dialect/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/CMakeLists.txt
@@ -12,6 +12,7 @@ add_subdirectory(Math)
add_subdirectory(MemRef)
add_subdirectory(Shard)
add_subdirectory(NVGPU)
+add_subdirectory(OpenACC)
add_subdirectory(SCF)
add_subdirectory(Shape)
add_subdirectory(SPIRV)
diff --git a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
new file mode 100644
index 0000000..f84055d
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_mlir_library(MLIROpenACCTestPasses
+ TestOpenACC.cpp
+ TestPointerLikeTypeInterface.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+)
+mlir_target_link_libraries(MLIROpenACCTestPasses PUBLIC
+ MLIRIR
+ MLIRArithDialect
+ MLIRFuncDialect
+ MLIRMemRefDialect
+ MLIROpenACCDialect
+ MLIRPass
+ MLIRSupport
+)
+
diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
new file mode 100644
index 0000000..9886240
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
@@ -0,0 +1,23 @@
+//===- TestOpenACC.cpp - OpenACC Test Registration ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains unified registration for all OpenACC test passes.
+//
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace test {
+
+// Forward declarations of individual test pass registration functions
+void registerTestPointerLikeTypeInterfacePass();
+
+// Unified registration function for all OpenACC tests
+void registerTestOpenACC() { registerTestPointerLikeTypeInterfacePass(); }
+
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp b/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp
new file mode 100644
index 0000000..85f9283
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp
@@ -0,0 +1,305 @@
+//===- TestPointerLikeTypeInterface.cpp - Test PointerLikeType interface -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains test passes for testing the OpenACC PointerLikeType
+// interface methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
+struct OperationTracker : public OpBuilder::Listener {
+ SmallVector<Operation *> insertedOps;
+
+ void notifyOperationInserted(Operation *op,
+ OpBuilder::InsertPoint previous) override {
+ insertedOps.push_back(op);
+ }
+};
+
+struct TestPointerLikeTypeInterfacePass
+ : public PassWrapper<TestPointerLikeTypeInterfacePass,
+ OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPointerLikeTypeInterfacePass)
+
+ TestPointerLikeTypeInterfacePass() = default;
+ TestPointerLikeTypeInterfacePass(const TestPointerLikeTypeInterfacePass &pass)
+ : PassWrapper(pass) {
+ testMode = pass.testMode;
+ }
+
+ Pass::Option<std::string> testMode{
+ *this, "test-mode",
+ llvm::cl::desc("Test mode: walk, alloc, copy, or free"),
+ llvm::cl::init("walk")};
+
+ StringRef getArgument() const override {
+ return "test-acc-pointer-like-interface";
+ }
+
+ StringRef getDescription() const override {
+ return "Test OpenACC PointerLikeType interface methods on any implementing "
+ "type";
+ }
+
+ void runOnOperation() override;
+
+ void getDependentDialects(DialectRegistry &registry) const override {
+ registry.insert<acc::OpenACCDialect>();
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ }
+
+private:
+ void walkAndPrint();
+ void testGenAllocate(Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder);
+ void testGenFree(Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder);
+ void testGenCopy(Operation *srcOp, Operation *destOp, Value srcResult,
+ Value destResult, PointerLikeType pointerType,
+ OpBuilder &builder);
+
+ struct PointerCandidate {
+ Operation *op;
+ Value result;
+ PointerLikeType pointerType;
+ };
+};
+
+void TestPointerLikeTypeInterfacePass::runOnOperation() {
+ if (testMode == "walk") {
+ walkAndPrint();
+ return;
+ }
+
+ auto func = getOperation();
+ OpBuilder builder(&getContext());
+
+ if (testMode == "alloc" || testMode == "free") {
+ // Collect all candidates first
+ SmallVector<PointerCandidate> candidates;
+ func.walk([&](Operation *op) {
+ if (op->hasAttr("test.ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ candidates.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break; // Only take the first PointerLikeType result
+ }
+ }
+ }
+ });
+
+ // Now test all candidates
+ for (const auto &candidate : candidates) {
+ if (testMode == "alloc")
+ testGenAllocate(candidate.op, candidate.result, candidate.pointerType,
+ builder);
+ else if (testMode == "free")
+ testGenFree(candidate.op, candidate.result, candidate.pointerType,
+ builder);
+ }
+ } else if (testMode == "copy") {
+ // Collect all source and destination candidates
+ SmallVector<PointerCandidate> sources, destinations;
+
+ func.walk([&](Operation *op) {
+ if (op->hasAttr("test.src_ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ sources.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break;
+ }
+ }
+ }
+ if (op->hasAttr("test.dest_ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ destinations.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break;
+ }
+ }
+ }
+ });
+
+ // Try copying from each source to each destination
+ for (const auto &src : sources)
+ for (const auto &dest : destinations)
+ testGenCopy(src.op, dest.op, src.result, dest.result, src.pointerType,
+ builder);
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::walkAndPrint() {
+ auto func = getOperation();
+
+ func.walk([&](Operation *op) {
+ // Look for operations marked with "test.ptr", "test.src_ptr", or
+ // "test.dest_ptr"
+ if (op->hasAttr("test.ptr") || op->hasAttr("test.src_ptr") ||
+ op->hasAttr("test.dest_ptr")) {
+ llvm::errs() << "Operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Check each result to see if it's a PointerLikeType
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ llvm::errs() << " Result " << result.getResultNumber()
+ << " is PointerLikeType: ";
+ result.getType().print(llvm::errs());
+ llvm::errs() << "\n";
+ } else {
+ llvm::errs() << " Result " << result.getResultNumber()
+ << " is NOT PointerLikeType: ";
+ result.getType().print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ }
+
+ if (op->getNumResults() == 0)
+ llvm::errs() << " Operation has no results\n";
+
+ llvm::errs() << "\n";
+ }
+ });
+}
+
+void TestPointerLikeTypeInterfacePass::testGenAllocate(
+ Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder) {
+ Location loc = op->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(op->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(op);
+
+ // Call the genAllocate API
+ Value allocRes = pointerType.genAllocate(newBuilder, loc, "test_alloc",
+ result.getType(), result);
+
+ if (allocRes) {
+ llvm::errs() << "Successfully generated alloc for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate alloc for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::testGenFree(Operation *op, Value result,
+ PointerLikeType pointerType,
+ OpBuilder &builder) {
+ Location loc = op->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(op->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(op);
+
+ // Call the genFree API
+ auto typedResult = cast<TypedValue<PointerLikeType>>(result);
+ bool success =
+ pointerType.genFree(newBuilder, loc, typedResult, result.getType());
+
+ if (success) {
+ llvm::errs() << "Successfully generated free for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate free for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::testGenCopy(
+ Operation *srcOp, Operation *destOp, Value srcResult, Value destResult,
+ PointerLikeType pointerType, OpBuilder &builder) {
+ Location loc = destOp->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(destOp->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(destOp);
+
+ // Call the genCopy API with the provided source and destination
+ auto typedSrc = cast<TypedValue<PointerLikeType>>(srcResult);
+ auto typedDest = cast<TypedValue<PointerLikeType>>(destResult);
+ bool success = pointerType.genCopy(newBuilder, loc, typedDest, typedSrc,
+ srcResult.getType());
+
+ if (success) {
+ llvm::errs() << "Successfully generated copy from source: ";
+ srcOp->print(llvm::errs());
+ llvm::errs() << " to destination: ";
+ destOp->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate copy from source: ";
+ srcOp->print(llvm::errs());
+ llvm::errs() << " to destination: ";
+ destOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass Registration
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace test {
+void registerTestPointerLikeTypeInterfacePass() {
+ PassRegistration<TestPointerLikeTypeInterfacePass>();
+}
+} // namespace test
+} // namespace mlir
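A hypothetical input for the new pass, using the test.ptr marker it walks for. The pass argument and attribute name come from the code above; the RUN line, function name, and the choice of memref.alloca (assuming memref types implement the OpenACC PointerLikeType interface) are illustrative:

// RUN: mlir-opt %s -test-acc-pointer-like-interface="test-mode=alloc"
func.func @candidates() {
  // Marked so the pass picks this result up as a PointerLikeType candidate.
  %a = memref.alloca() {test.ptr} : memref<10xf32>
  return
}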
diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py
index 5f92f5b..8e6208e 100644
--- a/mlir/test/python/pass_manager.py
+++ b/mlir/test/python/pass_manager.py
@@ -435,3 +435,23 @@ def testPrintIrTree():
print_file_tree(temp_dir)
log("// Tree printing end")
+
+
+# CHECK-LABEL: TEST: testEnableStatistics
+@run
+def testEnableStatistics():
+ with Context() as ctx:
+ module = ModuleOp.parse(
+ """
+ module {
+ func.func @main() {
+ %0 = arith.constant 10
+ return
+ }
+ }
+ """
+ )
+ pm = PassManager.parse("builtin.module(canonicalize)")
+ pm.enable_statistics()
+ # CHECK: Pass statistics report
+ pm.run(module)