1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
/* { dg-additional-options "-fopenacc-dim=16:16" } */
#include <openacc.h>
#include <string.h>
#include <stdio.h>
#include <gomp-constants.h>
/* Report where the calling iteration is executing.

   On the host the result is 0.  When running on a non-host device the
   result has bit 24 set as an "offloaded" marker and carries the gang,
   worker and vector indices shifted left by 16, 8 and 0 bits
   respectively (the fields stay distinct as long as each index is
   below 256).  */
#pragma acc routine
static int __attribute__ ((noinline)) coord ()
{
  /* Host execution: nothing to encode.  */
  if (!acc_on_device (acc_device_not_host))
    return 0;

  const int gang = __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
  const int worker = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
  const int vector = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);

  return (1 << 24) | (gang << 16) | (worker << 8) | vector;
}
/* Validate the SIZE coordinate words stored in ARY.

   Each word is decoded as: bits 16-23 gang index, bits 8-15 worker
   index, bits 0-7 vector index.  The word shifted right by 24 is added
   to a running "offloaded" count.  GP, WP and VP are the exclusive
   upper bounds for the gang, worker and vector indices.

   Returns 0 when the offloaded count is zero (nothing ran on a device),
   1 when the offloaded count differs from SIZE, and otherwise 1 if any
   word was out of range or any index in a dimension was hit a different
   number of times than index 0 of that dimension, else 0.  Diagnostics
   are written to stdout.  */
int check (const int *ary, int size, int gp, int wp, int vp)
{
  int status = 0;
  int n_offloaded = 0;
  int i;

  /* Per-index hit counters, one array per partitioning dimension.  */
  int *gang_hits = (int *) __builtin_alloca (gp * sizeof (int));
  int *worker_hits = (int *) __builtin_alloca (wp * sizeof (int));
  int *vector_hits = (int *) __builtin_alloca (vp * sizeof (int));

  memset (gang_hits, 0, gp * sizeof (int));
  memset (worker_hits, 0, wp * sizeof (int));
  memset (vector_hits, 0, vp * sizeof (int));

  for (i = 0; i < size; i++)
    {
      const int word = ary[i];
      const int g = (word >> 16) & 0xff;
      const int w = (word >> 8) & 0xff;
      const int v = word & 0xff;

      if (g < gp && w < wp && v < vp)
        {
          vector_hits[v]++;
          worker_hits[w]++;
          gang_hits[g]++;
        }
      else
        {
          printf ("unexpected cpu %#x used\n", word);
          status = 1;
        }

      n_offloaded += word >> 24;
    }

  /* Nothing was offloaded: there is no distribution to verify.  */
  if (n_offloaded == 0)
    {
      return 0;
    }

  if (n_offloaded != size)
    {
      printf ("offloaded %d times, expected %d\n", n_offloaded, size);
      return 1;
    }

  /* Every index within a dimension must have been used as often as
     index 0 of that dimension.  */
  for (i = 0; i < gp; i++)
    {
      if (gang_hits[i] != gang_hits[0])
        {
          printf ("gang %d not used %d times\n", i, gang_hits[0]);
          status = 1;
        }
    }

  for (i = 0; i < wp; i++)
    {
      if (worker_hits[i] != worker_hits[0])
        {
          printf ("worker %d not used %d times\n", i, worker_hits[0]);
          status = 1;
        }
    }

  for (i = 0; i < vp; i++)
    {
      if (vector_hits[i] != vector_hits[0])
        {
          printf ("vector %d not used %d times\n", i, vector_hits[0]);
          status = 1;
        }
    }

  return status;
}
/* Iteration count of each partitioned loop.  */
#define N (32 * 32 * 32)

/* Run three parallel regions, each partitioning an N-iteration loop
   over exactly one dimension (gang, then worker, then vector), and
   validate the coordinates recorded by coord () with check ().

   GP, WP and VP are the partition sizes passed to check () for the
   dimension under test; the other two dimensions are checked against
   a size of 1.  Returns nonzero if any of the three checks failed.  */
int test_1 (int gp, int wp, int vp)
{
  int ary[N];
  int gang_status;
  int worker_status;
  int vector_status;

  /* Gang partitioning only.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop gang (static:1)
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  gang_status = check (ary, N, gp, 1, 1);

  /* Worker partitioning only.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop worker
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  worker_status = check (ary, N, 1, wp, 1);

  /* Vector partitioning only.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop vector
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  vector_status = check (ary, N, 1, 1, vp);

  return gang_status | worker_status | vector_status;
}
/* Entry point: run test_1 with 16 gangs and 16 workers, and a vector
   size selected per target; the exit status is test_1's result.  */
int main ()
{
#ifdef ACC_DEVICE_TYPE_radeon
  /* AMD GCN uses the autovectorizer for the vector dimension: the use
     of a function call in vector-partitioned code in this test is not
     currently supported.  */
  const int vector_size = 1;
#else
  const int vector_size = 32;
#endif

  return test_1 (16, 16, vector_size);
}
|