/* { dg-do run } */ /* { dg-additional-options "-lm" } */ /* Note: At the time this program was written, Nvptx was not asynchronous enough to trigger the issue (with a 'nowait' added); however, one AMD GPUs, it triggered. */ /* Test whether nowait / dependency is handled correctly. Motivated by OpenMP_VV's 5.1/interop/test_interop_target.c The code actually only creates a streaming object without actually using it, except for dependency tracking. Note that there is a difference between having a steaming (targetsync) object and not (= omp_interop_none); at least if one assumes that omp_interop_none does not include 'targetsync' as (effective) interop type - in that case, 'nowait' has no effect and the 'depend' is active as included task, otherwise the code continues with the depend being active only for the about to be destroyed or used thread. The OpenMP spec states (here 6.0): "If the interop-type set includes 'targetsync', an empty mergeable task is generated. If the 'nowait' clause is not present on the construct then the task is also an included task. If the interop-type set does not include 'targetsync', the 'nowait' clause has no effect. Any depend clauses that are present on the construct apply to the generated task. */ #include void test_async (const int dev) { constexpr int N = 2048; constexpr int ulp = 4; constexpr double M_PI = 2.0 * __builtin_acos (0.0); omp_interop_t obj1, obj2; double A[N] = { }; int B[N] = { }; /* Create interop object. */ #pragma omp interop device(dev) init(targetsync : obj1, obj2) if (dev == omp_initial_device || dev == omp_get_num_devices ()) { if (obj1 != omp_interop_none || obj2 != omp_interop_none) __builtin_abort (); } else { if (obj1 == omp_interop_none || obj2 == omp_interop_none) __builtin_abort (); } /* DOUBLE */ /* Now in the background update it, slowly enough that the code afterwards is reached while still running asynchronously. As OpenMP_VV's Issue #863 shows, the overhead is high enough to fail even when only doing an atomic integer increment. */ #pragma omp target device(dev) map(A) depend(out: A[:N]) nowait for (int i = 0; i < N; i++) #pragma omp atomic update A[i] += __builtin_sin (2*i*M_PI/N); /* DESTROY take care of the dependeny such that ... */ if (obj1 == omp_interop_none) { // Same as below as 'nowait' is ignored. #pragma omp interop destroy(obj1) depend(in: A[:N]) nowait } else { #pragma omp interop destroy(obj1) depend(in: A[:N]) } /* ... this code is only executed once the dependency as been fulfilled. */ /* Check the value - part I: quick, avoid A[0] == sin(0) = 0. */ for (int i = 1; i < N; i++) if (A[i] == 0.0) __builtin_abort (); /* Check the value - part II: throughly */ for (int i = 0; i < N; i++) { double x = A[i]; double y = __builtin_sin (2*i*M_PI/N); if (__builtin_fabs (x - y) > ulp * __builtin_fabs (x+y) * __DBL_EPSILON__) __builtin_abort (); } /* Integer */ #pragma omp target device(dev) map(B) depend(out: B[:N]) nowait for (int i = 0; i < N; i++) #pragma omp atomic update B[i] += 42; /* Same - but using USE. */ if (obj2 == omp_interop_none) { // Same as below as 'nowait' is ignored. #pragma omp interop use(obj2) depend(in: B[:N]) nowait } else { #pragma omp interop use(obj2) depend(in: B[:N]) } for (int i = 0; i < N; i++) if (B[i] != 42) __builtin_abort (); #pragma omp interop destroy(obj2) } int main () { int ndev = omp_get_num_devices (); for (int dev = 0; dev <= ndev; dev++) test_async (dev); test_async (omp_initial_device); return 0; }