aboutsummaryrefslogtreecommitdiff
path: root/flang/test/Lower
diff options
context:
space:
mode:
Diffstat (limited to 'flang/test/Lower')
-rw-r--r--flang/test/Lower/CUDA/cuda-device-proc.cuf264
-rw-r--r--flang/test/Lower/OpenMP/atomic-read-complex.f9034
-rw-r--r--flang/test/Lower/OpenMP/atomic-write-complex.f9034
-rw-r--r--flang/test/Lower/forall-pointer-assignment.f90 (renamed from flang/test/Lower/forall-polymorphic.f90)87
-rw-r--r--flang/test/Lower/inline_directive.f9061
5 files changed, 477 insertions, 3 deletions
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 5c4c3c6..09b4302 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -431,7 +431,7 @@ end subroutine
! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32
! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
-! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32
+! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32
! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
@@ -468,7 +468,18 @@ attributes(global) subroutine test_bulk_g2s(a)
end subroutine
! CHECK-LABEL: func.func @_QPtest_bulk_g2s
-! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1>
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %4 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEbarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[DST:.*]]:2 = hlfir.declare %16(%17) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEtmpa"} : (!fir.ref<!fir.array<1024xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1024xf64>>, !fir.ref<!fir.array<1024xf64>>)
+! CHECK: %[[COUNT:.*]]:2 = hlfir.declare %19 {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_bulk_g2sEtx_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[SRC:.*]] = hlfir.designate %{{.*}} (%{{.*}}) : (!fir.box<!fir.array<?xf64>>, i64) -> !fir.ref<f64>
+! CHECK: %[[COUNT_LOAD:.*]] = fir.load %20#0 : !fir.ref<i32>
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: %[[BARRIER_3:.*]] = llvm.addrspacecast %[[BARRIER_PTR]] : !llvm.ptr to !llvm.ptr<3>
+! CHECK: %[[DST_PTR:.*]] = fir.convert %[[DST]]#0 : (!fir.ref<!fir.array<1024xf64>>) -> !llvm.ptr
+! CHECK: %[[DST_7:.*]] = llvm.addrspacecast %[[DST_PTR]] : !llvm.ptr to !llvm.ptr<7>
+! CHECK: %[[SRC_PTR:.*]] = fir.convert %[[SRC]] : (!fir.ref<f64>) -> !llvm.ptr
+! CHECK: %[[SRC_3:.*]] = llvm.addrspacecast %[[SRC_PTR]] : !llvm.ptr to !llvm.ptr<1>
+! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : <7>, <1>
attributes(global) subroutine test_bulk_s2g(a)
real(8), device :: a(*)
@@ -479,6 +490,8 @@ end subroutine
! CHECK-LABEL: func.func @_QPtest_bulk_s2g
! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
attributes(device) subroutine testAtomicCasLoop(aa, n)
integer :: a
@@ -492,3 +505,250 @@ end subroutine
! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32
! CHECK: %{{.*}} = arith.constant 1 : i32
! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32
+
+attributes(global) subroutine test_barrier_try_wait()
+ integer :: istat
+ integer(8), shared :: barrier1
+ integer(8) :: token
+ istat = barrier_try_wait(barrier1, token)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_barrier_try_wait()
+! CHECK: scf.while
+! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32
+
+attributes(global) subroutine test_barrier_try_wait_sleep()
+ integer :: istat
+ integer(8), shared :: barrier1
+ integer(8) :: token
+ integer(4) :: sleep_time
+ istat = barrier_try_wait_sleep(barrier1, token, sleep_time)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep()
+! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
+
+attributes(global) subroutine test_tma_bulk_load_c4(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ complex(4), device :: r8(n)
+ complex(4), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_c8(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ complex(8), device :: r8(n)
+ complex(8), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_i4(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ integer(4), device :: r8(n)
+ integer(4), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_i8(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ integer(8), device :: r8(n)
+ integer(8), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r2(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ real(2), device :: r8(n)
+ real(2), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r4(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ real(4), device :: r8(n)
+ real(4), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r8(a, n)
+ integer(8), shared :: barrier1
+ integer, value :: n
+ real(8), device :: r8(n)
+ real(8), shared :: tmp(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_store_c4(c, n)
+ integer, value :: n
+ complex(4), device :: c(n)
+ complex(4), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_c8(c, n)
+ integer, value :: n
+ complex(8), device :: c(n)
+ complex(8), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_i4(c, n)
+ integer, value :: n
+ integer(4), device :: c(n)
+ integer(4), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_i8(c, n)
+ integer, value :: n
+ integer(8), device :: c(n)
+ integer(8), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+
+attributes(global) subroutine test_tma_bulk_store_r2(c, n)
+ integer, value :: n
+ real(2), device :: c(n)
+ real(2), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_r4(c, n)
+ integer, value :: n
+ real(4), device :: c(n)
+ real(4), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_r8(c, n)
+ integer, value :: n
+ real(8), device :: c(n)
+ real(8), shared :: tmpa(1024)
+ integer(4) :: j, elem_count
+ call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
diff --git a/flang/test/Lower/OpenMP/atomic-read-complex.f90 b/flang/test/Lower/OpenMP/atomic-read-complex.f90
new file mode 100644
index 0000000..2f51f03
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-read-complex.f90
@@ -0,0 +1,34 @@
+! Test lowering of atomic read to LLVM IR for complex types.
+! This is a regression test for issue #165184.
+
+! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s
+
+! Test that atomic read operations with complex types emit the correct
+! size parameter to __atomic_load:
+! - complex(4) (8 bytes total): should call __atomic_load(i64 8, ...)
+! - complex(8) (16 bytes total): should call __atomic_load(i64 16, ...)
+
+program atomic_read_complex
+ implicit none
+
+ ! Test complex(4) - single precision (8 bytes)
+ complex(4) :: c41, c42
+ ! Test complex(8) - double precision (16 bytes)
+ complex(8) :: c81, c82
+
+ c42 = (1.0_4, 1.0_4)
+ c82 = (1.0_8, 1.0_8)
+
+ ! CHECK-LABEL: define {{.*}} @_QQmain
+
+ ! Single precision complex: 8 bytes
+ ! CHECK: call void @__atomic_load(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic read
+ c41 = c42
+
+ ! Double precision complex: 16 bytes (this was broken before the fix)
+ ! CHECK: call void @__atomic_load(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic read
+ c81 = c82
+
+end program atomic_read_complex
diff --git a/flang/test/Lower/OpenMP/atomic-write-complex.f90 b/flang/test/Lower/OpenMP/atomic-write-complex.f90
new file mode 100644
index 0000000..48cfe26
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-write-complex.f90
@@ -0,0 +1,34 @@
+! Test lowering of atomic write to LLVM IR for complex types.
+! This is a regression test for issue #165184.
+
+! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s
+
+! Test that atomic write operations with complex types emit the correct
+! size parameter to __atomic_store:
+! - complex(4) (8 bytes total): should call __atomic_store(i64 8, ...)
+! - complex(8) (16 bytes total): should call __atomic_store(i64 16, ...)
+
+program atomic_write_complex
+ implicit none
+
+ ! Test complex(4) - single precision (8 bytes)
+ complex(4) :: c41, c42
+ ! Test complex(8) - double precision (16 bytes)
+ complex(8) :: c81, c82
+
+ c42 = (1.0_4, 1.0_4)
+ c82 = (1.0_8, 1.0_8)
+
+ ! CHECK-LABEL: define {{.*}} @_QQmain
+
+ ! Single precision complex: 8 bytes
+ ! CHECK: call void @__atomic_store(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic write
+ c41 = c42
+
+ ! Double precision complex: 16 bytes (this was broken before the fix)
+ ! CHECK: call void @__atomic_store(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic write
+ c81 = c82
+
+end program atomic_write_complex
diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90
index 2b7a51f..ec142e3 100644
--- a/flang/test/Lower/forall-polymorphic.f90
+++ b/flang/test/Lower/forall-pointer-assignment.f90
@@ -1,6 +1,7 @@
-! Test lower of FORALL polymorphic pointer assignment
+! Test lower of FORALL pointer assignment
! RUN: bbc -emit-fir %s -o - | FileCheck %s
+
!! Test when LHS is polymorphic and RHS is not polymorphic
! CHECK-LABEL: c.func @_QPforallpolymorphic
subroutine forallPolymorphic()
@@ -46,6 +47,7 @@
end subroutine forallPolymorphic
+
!! Test when LHS is not polymorphic but RHS is polymorphic
! CHECK-LABEL: c.func @_QPforallpolymorphic2(
! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) {
@@ -87,3 +89,86 @@
end subroutine forallPolymorphic2
+
+!! Test when LHS is unlimited polymorphic and RHS non-polymorphic intrinsic
+!! type target.
+! CHECK-LABEL: c.func @_QPforallpolymorphic3
+subroutine forallPolymorphic3()
+ TYPE :: DT
+ CLASS(*), POINTER :: Ptr => NULL()
+ END TYPE
+
+ TYPE(DT) :: D1(10)
+ CHARACTER*1, TARGET :: TAR1(10)
+ INTEGER :: I
+
+ FORALL (I=1:10)
+ D1(I)%Ptr => Tar1(I)
+ END FORALL
+
+! CHECK: %[[V_7:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> {bindc_name = "d1", uniq_name = "_QFforallpolymorphic3Ed1"}
+! CHECK: %[[V_8:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_9:[0-9]+]] = fir.declare %[[V_7]](%[[V_8]]) {uniq_name = "_QFforallpolymorphic3Ed1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>
+! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphic3Etar1"}
+! CHECK: %[[V_17:[0-9]+]] = fir.declare %[[V_16]](%[[V_8]]) typeparams %c1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphic3Etar1"} : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, index) -> !fir.ref<!fir.array<10x!fir.char<1>>>
+! CHECK: %[[V_24:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_25:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg0 = %[[V_24]] to %[[V_25]] step %c1
+! CHECK: {
+! CHECK: %[[V_26:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: %[[V_27:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64
+! CHECK: %[[V_28:[0-9]+]] = fir.array_coor %[[V_9]](%[[V_8]]) %[[V_27]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>
+! CHECK: %[[V_29:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>
+! CHECK: %[[V_30:[0-9]+]] = fir.coordinate_of %[[V_28]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>) -> !fir.ref<!fir.class<!fir.ptr<none>>>
+! CHECK: %[[V_31:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64
+! CHECK: %[[V_32:[0-9]+]] = fir.array_coor %[[V_17]](%[[V_8]]) %31 : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.char<1>>
+! CHECK: %[[V_33:[0-9]+]] = fir.embox %[[V_32]] : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.ptr<!fir.char<1>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.rebox %[[V_33]] : (!fir.box<!fir.ptr<!fir.char<1>>>) -> !fir.class<!fir.ptr<none>>
+! CHECK: fir.store %[[V_34]] to %[[V_30]] : !fir.ref<!fir.class<!fir.ptr<none>>>
+! CHECK: }
+
+end subroutine forallPolymorphic3
+
+
+!! Test the LHS of a pointer assignment gets the isPointer flag from the
+!! RHS that is a reference to a function that returns a pointer.
+! CHECK-LABEL: c.func @_QPforallpointerassignment1
+ subroutine forallPointerAssignment1()
+ type base
+ real, pointer :: data => null()
+ end type
+
+ interface
+ pure function makeData (i)
+ real, pointer :: makeData
+ integer*4, intent(in) :: i
+ end function
+ end interface
+
+ type(base) :: co1(10)
+
+ forall (i=1:10)
+ co1(i)%data => makeData (i)
+ end forall
+
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i64
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"}
+! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = ".result"}
+! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
+! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1
+! CHECK: {
+! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref<i32>
+! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>) -> !fir.box<!fir.ptr<f32>>
+! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box<!fir.ptr<f32>>, !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box<!fir.ptr<f32>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
+! CHECK: }
+
+ end subroutine forallPointerAssignment1
diff --git a/flang/test/Lower/inline_directive.f90 b/flang/test/Lower/inline_directive.f90
new file mode 100644
index 0000000..347df85
--- /dev/null
+++ b/flang/test/Lower/inline_directive.f90
@@ -0,0 +1,61 @@
+! RUN: %flang_fc1 -emit-fir -o - %s | FileCheck %s
+
+subroutine test_inline()
+ integer :: x, y
+!CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_inlineEx"}
+!CHECK: %[[VAL_1:.*]] = fir.declare %[[VAL_0]] {uniq_name = "_QFtest_inlineEx"} : (!fir.ref<i32>) -> !fir.ref<i32>
+!CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest_inlineEy"}
+!CHECK: %[[VAL_3:.*]] = fir.declare %[[VAL_2]] {uniq_name = "_QFtest_inlineEy"} : (!fir.ref<i32>) -> !fir.ref<i32>
+
+ !dir$ forceinline
+ y = g(x)
+ !CHECK: %[[VAL_4:.*]] = fir.call @_QFtest_inlinePg(%[[VAL_1]]) fastmath<contract> {inline_attr = #fir.inline_attrs<always_inline>} : (!fir.ref<i32>) -> i32
+ !CHECK: fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<i32>
+
+ !dir$ forceinline
+ call f(x, y)
+ !CHECK: fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<always_inline>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+
+ !dir$ noinline
+ y = g(x) + 7 * (8 + g(y))
+ !CHECK: %[[VAL_8:.*]] = fir.call @_QFtest_inlinePg(%[[VAL_1]]) fastmath<contract> {inline_attr = #fir.inline_attrs<no_inline>} : (!fir.ref<i32>) -> i32
+ !CHECK: %[[VAL_9:.*]] = fir.call @_QFtest_inlinePg(%[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<no_inline>} : (!fir.ref<i32>) -> i32
+ !CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[C8:.*]] : i32
+ !CHECK: %[[VAL_11:.*]] = fir.no_reassoc %[[VAL_10]] : i32
+ !CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[C7:.*]] : i32
+ !CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32
+ !CHECK: fir.store %[[VAL_13]] to %[[VAL_3]] : !fir.ref<i32>
+
+ !dir$ noinline
+ call f(x, y)
+ !CHECK: fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<no_inline>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+
+ !dir$ inline
+ call f(x, y)
+ !CHECK: fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<inline_hint>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+
+ !dir$ forceinline
+ do i = 1, 100
+ !CHECK: fir.do_loop %[[ARG_0:.*]] = %[[FROM:.*]] to %[[TO:.*]] step %[[C1:.*]] iter_args(%[[ARG_1:.*]] = {{.*}}) -> (i32) {
+ !CHECK: fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<always_inline>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+ call f(x, y)
+ enddo
+
+ !dir$ inline
+ do i = 1, 100
+ !CHECK: fir.do_loop %[[ARG_0:.*]] = %[[FROM:.*]] to %[[TO:.*]] step %[[C1:.*]] iter_args(%[[ARG_1:.*]] = {{.*}}) -> (i32) {
+ !CHECK: fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<inline_hint>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+ call f(x, y)
+ enddo
+!CHECK: return
+ contains
+ subroutine f(x, y)
+ integer, intent(in) :: x
+ integer, intent(out) :: y
+ y = x*2
+ end subroutine f
+ integer function g(x)
+ integer :: x
+ g = x*2
+ end function g
+end subroutine test_inline