1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
/* { dg-do run } */
/* { dg-additional-options "-lm" } */
/* Note: At the time this program was written, Nvptx was not asynchronous
enough to trigger the issue (with a 'nowait' added); however, one
AMD GPUs, it triggered. */
/* Test whether nowait / dependency is handled correctly.
Motivated by OpenMP_VV's 5.1/interop/test_interop_target.c
The code actually only creates a streaming object without actually using it,
except for dependency tracking.
Note that there is a difference between having a steaming (targetsync) object
and not (= omp_interop_none); at least if one assumes that omp_interop_none
does not include 'targetsync' as (effective) interop type - in that case,
'nowait' has no effect and the 'depend' is active as included task, otherwise
the code continues with the depend being active only for the about to be
destroyed or used thread.
The OpenMP spec states (here 6.0):
"If the interop-type set includes 'targetsync', an empty mergeable task is
generated. If the 'nowait' clause is not present on the construct then
the task is also an included task. If the interop-type set does not
include 'targetsync', the 'nowait' clause has no effect. Any depend
clauses that are present on the construct apply to the generated task. */
#include <omp.h>
void
test_async (const int dev)
{
constexpr int N = 2048;
constexpr int ulp = 4;
constexpr double M_PI = 2.0 * __builtin_acos (0.0);
omp_interop_t obj1, obj2;
double A[N] = { };
int B[N] = { };
/* Create interop object. */
#pragma omp interop device(dev) init(targetsync : obj1, obj2)
if (dev == omp_initial_device || dev == omp_get_num_devices ())
{
if (obj1 != omp_interop_none || obj2 != omp_interop_none)
__builtin_abort ();
}
else
{
if (obj1 == omp_interop_none || obj2 == omp_interop_none)
__builtin_abort ();
}
/* DOUBLE */
/* Now in the background update it, slowly enough that the
code afterwards is reached while still running asynchronously.
As OpenMP_VV's Issue #863 shows, the overhead is high enough to
fail even when only doing an atomic integer increment. */
#pragma omp target device(dev) map(A) depend(out: A[:N]) nowait
for (int i = 0; i < N; i++)
#pragma omp atomic update
A[i] += __builtin_sin (2*i*M_PI/N);
/* DESTROY take care of the dependeny such that ... */
if (obj1 == omp_interop_none)
{
// Same as below as 'nowait' is ignored.
#pragma omp interop destroy(obj1) depend(in: A[:N]) nowait
}
else
{
#pragma omp interop destroy(obj1) depend(in: A[:N])
}
/* ... this code is only executed once the dependency as been fulfilled. */
/* Check the value - part I: quick, avoid A[0] == sin(0) = 0. */
for (int i = 1; i < N; i++)
if (A[i] == 0.0)
__builtin_abort ();
/* Check the value - part II: throughly */
for (int i = 0; i < N; i++)
{
double x = A[i];
double y = __builtin_sin (2*i*M_PI/N);
if (__builtin_fabs (x - y) > ulp * __builtin_fabs (x+y) * __DBL_EPSILON__)
__builtin_abort ();
}
/* Integer */
#pragma omp target device(dev) map(B) depend(out: B[:N]) nowait
for (int i = 0; i < N; i++)
#pragma omp atomic update
B[i] += 42;
/* Same - but using USE. */
if (obj2 == omp_interop_none)
{
// Same as below as 'nowait' is ignored.
#pragma omp interop use(obj2) depend(in: B[:N]) nowait
}
else
{
#pragma omp interop use(obj2) depend(in: B[:N])
}
for (int i = 0; i < N; i++)
if (B[i] != 42)
__builtin_abort ();
#pragma omp interop destroy(obj2)
}
int
main ()
{
int ndev = omp_get_num_devices ();
for (int dev = 0; dev <= ndev; dev++)
test_async (dev);
test_async (omp_initial_device);
return 0;
}
|