aboutsummaryrefslogtreecommitdiff
path: root/flang/test/Lower/CUDA/cuda-device-proc.cuf
diff options
context:
space:
mode:
Diffstat (limited to 'flang/test/Lower/CUDA/cuda-device-proc.cuf')
-rw-r--r--flang/test/Lower/CUDA/cuda-device-proc.cuf72
1 files changed, 42 insertions, 30 deletions
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 09b4302..27ef8e0 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -14,15 +14,14 @@ attributes(global) subroutine devsub()
integer :: smalltime
integer(4) :: res, offset
integer(8) :: resl
+ real(2) :: r2a(2)
+ real(2) :: tmp2(2)
integer :: tid
tid = threadIdx%x
call syncthreads()
call syncwarp(1)
- call threadfence()
- call threadfence_block()
- call threadfence_system()
ret = syncthreads_and(1)
res = syncthreads_and(tid > offset)
ret = syncthreads_count(1)
@@ -34,6 +33,7 @@ attributes(global) subroutine devsub()
al = atomicadd(al, 1_8)
af = atomicadd(af, 1.0_4)
ad = atomicadd(ad, 1.0_8)
+ ai = atomicadd(r2a, tmp2)
ai = atomicsub(ai, 1_4)
al = atomicsub(al, 1_8)
@@ -102,32 +102,30 @@ end
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: nvvm.barrier0
-! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
-! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<and> %c1{{.*}} -> i32
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%[[CONV]])
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<and> %[[CONV]] -> i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<popc> %c1{{.*}} -> i32
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%[[CONV]]) fastmath<contract> : (i32) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<popc> %[[CONV]] -> i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<or> %c1{{.*}} -> i32
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%[[CONV]]) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<or> %[[CONV]] -> i32
! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16>
! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
@@ -215,10 +213,10 @@ end
! CHECK-LABEL: func.func @_QPhost1()
! CHECK: cuf.kernel
! CHECK: nvvm.barrier0
-! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
-! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
-! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
-! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
+! CHECK: nvvm.barrier #nvvm.reduction<and> %c1{{.*}} -> i32
+! CHECK: nvvm.barrier #nvvm.reduction<popc> %c1{{.*}} -> i32
+! CHECK: nvvm.barrier #nvvm.reduction<or> %c1{{.*}} -> i32
attributes(device) subroutine testMatch()
integer :: a, ipred, mask, v32
@@ -436,11 +434,11 @@ end subroutine
! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
-! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64
+! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64
! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
-! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32
+! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64
attributes(global) subroutine test_fence()
@@ -479,7 +477,7 @@ end subroutine
! CHECK: %[[DST_7:.*]] = llvm.addrspacecast %[[DST_PTR]] : !llvm.ptr to !llvm.ptr<7>
! CHECK: %[[SRC_PTR:.*]] = fir.convert %[[SRC]] : (!fir.ref<f64>) -> !llvm.ptr
! CHECK: %[[SRC_3:.*]] = llvm.addrspacecast %[[SRC_PTR]] : !llvm.ptr to !llvm.ptr<1>
-! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : <7>, <1>
+! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : !llvm.ptr<7>, <1>
attributes(global) subroutine test_bulk_s2g(a)
real(8), device :: a(*)
@@ -490,7 +488,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_bulk_s2g
! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(device) subroutine testAtomicCasLoop(aa, n)
@@ -515,7 +513,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_barrier_try_wait()
! CHECK: scf.while
-! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32
+! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
attributes(global) subroutine test_barrier_try_wait_sleep()
integer :: istat
@@ -526,7 +524,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep()
end subroutine
! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep()
-! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
+! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
attributes(global) subroutine test_tma_bulk_load_c4(a, n)
integer(8), shared :: barrier1
@@ -540,6 +538,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f32>> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_c4Etmp"} -> !fir.ref<!fir.array<1024xcomplex<f32>>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -559,6 +558,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f64>> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_c8Etmp"} -> !fir.ref<!fir.array<1024xcomplex<f64>>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -578,6 +578,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xi32> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_i4Etmp"} -> !fir.ref<!fir.array<1024xi32>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -597,6 +598,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xi64> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_i8Etmp"} -> !fir.ref<!fir.array<1024xi64>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -616,6 +618,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xf16> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r2Etmp"} -> !fir.ref<!fir.array<1024xf16>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -635,6 +638,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xf32> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r4Etmp"} -> !fir.ref<!fir.array<1024xf32>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -654,6 +658,7 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.shared_memory !fir.array<1024xf64> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r8Etmp"} -> !fir.ref<!fir.array<1024xf64>>
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -670,8 +675,9 @@ attributes(global) subroutine test_tma_bulk_store_c4(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4
+! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f32>> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_c4Etmpa"} -> !fir.ref<!fir.array<1024xcomplex<f32>>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(global) subroutine test_tma_bulk_store_c8(c, n)
@@ -683,8 +689,9 @@ attributes(global) subroutine test_tma_bulk_store_c8(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8
+! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f64>> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_c8Etmpa"} -> !fir.ref<!fir.array<1024xcomplex<f64>>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(global) subroutine test_tma_bulk_store_i4(c, n)
@@ -696,8 +703,9 @@ attributes(global) subroutine test_tma_bulk_store_i4(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4
+! CHECK: cuf.shared_memory !fir.array<1024xi32> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_i4Etmpa"} -> !fir.ref<!fir.array<1024xi32>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(global) subroutine test_tma_bulk_store_i8(c, n)
@@ -709,8 +717,9 @@ attributes(global) subroutine test_tma_bulk_store_i8(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8
+! CHECK: cuf.shared_memory !fir.array<1024xi64> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_i8Etmpa"} -> !fir.ref<!fir.array<1024xi64>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
@@ -723,8 +732,9 @@ attributes(global) subroutine test_tma_bulk_store_r2(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2
+! CHECK: cuf.shared_memory !fir.array<1024xf16> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_r2Etmpa"} -> !fir.ref<!fir.array<1024xf16>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(global) subroutine test_tma_bulk_store_r4(c, n)
@@ -736,8 +746,9 @@ attributes(global) subroutine test_tma_bulk_store_r4(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4
+! CHECK: cuf.shared_memory !fir.array<1024xf32> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_r4Etmpa"} -> !fir.ref<!fir.array<1024xf32>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(global) subroutine test_tma_bulk_store_r8(c, n)
@@ -749,6 +760,7 @@ attributes(global) subroutine test_tma_bulk_store_r8(c, n)
end subroutine
! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8
+! CHECK: cuf.shared_memory !fir.array<1024xf64> align 16 {bindc_name = "tmpa", uniq_name = "_QFtest_tma_bulk_store_r8Etmpa"} -> !fir.ref<!fir.array<1024xf64>>
! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
-! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;"
! CHECK: nvvm.cp.async.bulk.wait_group 0