diff options
Diffstat (limited to 'flang/test/Lower/CUDA/cuda-device-proc.cuf')
| -rw-r--r-- | flang/test/Lower/CUDA/cuda-device-proc.cuf | 262 |
1 files changed, 261 insertions, 1 deletions
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 5c4c3c6..e5d3c43 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -468,7 +468,18 @@ attributes(global) subroutine test_bulk_g2s(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_g2s -! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1> +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %4 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEbarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[DST:.*]]:2 = hlfir.declare %16(%17) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEtmpa"} : (!fir.ref<!fir.array<1024xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1024xf64>>, !fir.ref<!fir.array<1024xf64>>) +! CHECK: %[[COUNT:.*]]:2 = hlfir.declare %19 {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_bulk_g2sEtx_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[SRC:.*]] = hlfir.designate %{{.*}} (%{{.*}}) : (!fir.box<!fir.array<?xf64>>, i64) -> !fir.ref<f64> +! CHECK: %[[COUNT_LOAD:.*]] = fir.load %20#0 : !fir.ref<i32> +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: %[[BARRIER_3:.*]] = llvm.addrspacecast %[[BARRIER_PTR]] : !llvm.ptr to !llvm.ptr<3> +! CHECK: %[[DST_PTR:.*]] = fir.convert %[[DST]]#0 : (!fir.ref<!fir.array<1024xf64>>) -> !llvm.ptr +! CHECK: %[[DST_7:.*]] = llvm.addrspacecast %[[DST_PTR]] : !llvm.ptr to !llvm.ptr<7> +! CHECK: %[[SRC_PTR:.*]] = fir.convert %[[SRC]] : (!fir.ref<f64>) -> !llvm.ptr +! CHECK: %[[SRC_3:.*]] = llvm.addrspacecast %[[SRC_PTR]] : !llvm.ptr to !llvm.ptr<1> +! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : <7>, <1> attributes(global) subroutine test_bulk_s2g(a) real(8), device :: a(*) @@ -479,6 +490,8 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) integer :: a @@ -492,3 +505,250 @@ end subroutine ! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32 ! CHECK: %{{.*}} = arith.constant 1 : i32 ! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32 + +attributes(global) subroutine test_barrier_try_wait() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + istat = barrier_try_wait(barrier1, token) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() +! CHECK: scf.while +! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_barrier_try_wait_sleep() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + integer(4) :: sleep_time + istat = barrier_try_wait_sleep(barrier1, token, sleep_time) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() +! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_tma_bulk_load_c4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(4), device :: r8(n) + complex(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_c8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(8), device :: r8(n) + complex(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(4), device :: r8(n) + integer(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(8), device :: r8(n) + integer(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r2(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(2), device :: r8(n) + real(2), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(4), device :: r8(n) + real(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(8), device :: r8(n) + real(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_store_c4(c, n) + integer, value :: n + complex(4), device :: c(n) + complex(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_c8(c, n) + integer, value :: n + complex(8), device :: c(n) + complex(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i4(c, n) + integer, value :: n + integer(4), device :: c(n) + integer(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i8(c, n) + integer, value :: n + integer(8), device :: c(n) + integer(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + + +attributes(global) subroutine test_tma_bulk_store_r2(c, n) + integer, value :: n + real(2), device :: c(n) + real(2), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r4(c, n) + integer, value :: n + real(4), device :: c(n) + real(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r8(c, n) + integer, value :: n + real(8), device :: c(n) + real(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 |
