1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#if !defined(__hppa__) || !defined(__hpux__)
#include <complex.h>
#endif
/* Relative tolerance used when comparing results.  Single float has 23
   bits of fraction; FRAC is 2^-20, i.e. three bits coarser than that.  */
#define FRAC (1.0f / (1 << 20))
typedef float _Complex Type;

/* Return nonzero when B agrees with A to within a relative error of FRAC.

   The comparison is done on squared magnitudes:
     |A - B|^2 / |A|^2 < FRAC^2
   A is the value the error is measured against, so the check is not
   symmetric in its arguments.  */
int close_enough (Type a, Type b)
{
  Type diff = a - b;
  float mag2_a = __real__ (a) * __real__ (a) + __imag__ (a) * __imag__ (a);
  float mag2_diff = (__real__ (diff) * __real__ (diff)
		     + __imag__ (diff) * __imag__ (diff));

  /* With a zero squared magnitude for A the ratio below cannot express a
     relative error: an exact match would evaluate 0/0 (NaN) and be
     rejected.  Fall back to exact equality in that case.  */
  if (mag2_a == 0.0f)
    return a == b;

  return mag2_diff / mag2_a < (FRAC * FRAC);
}
/* Number of elements in the array being reduced.  */
#define N 100

/* Sum and multiply the N elements of ARY in an OpenACC parallel region
   whose loop is partitioned across vector lanes (vector_length 32), then
   compare the reduced values with the expected SUM and PROD.

   Returns 0 when both results pass close_enough, 1 otherwise.  */
static int __attribute__ ((noinline))
vector (Type ary[N], Type sum, Type prod)
{
  /* Start from the identity of each reduction operator.  */
  Type got_sum = 0;
  Type got_prod = 1;

#pragma acc parallel vector_length(32) copyin(ary[0:N])
  {
#pragma acc loop vector reduction(+:got_sum) reduction (*:got_prod)
    for (int i = 0; i < N; i++)
      {
	got_sum += ary[i];
	got_prod *= ary[i];
      }
  }

  /* The sum is checked first; the product only if the sum passed.  */
  return !close_enough (sum, got_sum) || !close_enough (prod, got_prod);
}
/* Sum and multiply the N elements of ARY in an OpenACC parallel region
   whose loop is partitioned across workers (num_workers 32), then compare
   the reduced values with the expected SUM and PROD.

   Returns 0 when both results pass close_enough, 1 otherwise.  */
static int __attribute__ ((noinline))
worker (Type ary[N], Type sum, Type prod)
{
  /* Start from the identity of each reduction operator.  */
  Type got_sum = 0;
  Type got_prod = 1;

#pragma acc parallel num_workers(32) copyin(ary[0:N])
  {
#pragma acc loop worker reduction(+:got_sum) reduction (*:got_prod)
    for (int i = 0; i < N; i++)
      {
	got_sum += ary[i];
	got_prod *= ary[i];
      }
  }

  /* The sum is checked first; the product only if the sum passed.  */
  return !close_enough (sum, got_sum) || !close_enough (prod, got_prod);
}
/* Sum and multiply the N elements of ARY in an OpenACC parallel region
   whose loop is partitioned across gangs (num_gangs 32), then compare the
   reduced values with the expected SUM and PROD.

   Returns 0 when both results pass close_enough, 1 otherwise.  */
static int __attribute__ ((noinline))
gang (Type ary[N], Type sum, Type prod)
{
  /* Start from the identity of each reduction operator.  */
  Type got_sum = 0;
  Type got_prod = 1;

#pragma acc parallel num_gangs (32) copyin(ary[0:N])
  {
#pragma acc loop gang reduction(+:got_sum) reduction (*:got_prod)
    for (int i = 0; i < N; i++)
      {
	got_sum += ary[i];
	got_prod *= ary[i];
      }
  }

  /* The sum is checked first; the product only if the sum passed.  */
  return !close_enough (sum, got_sum) || !close_enough (prod, got_prod);
}
/* Fill the input array, accumulate the reference sum and product in a
   plain sequential loop, then run the vector, worker and gang variants
   against those references.

   Exit status is 0 when every variant agrees, 1 as soon as one fails.  */
int main (void)
{
  Type ary[N];
  Type sum = 0;
  Type prod = 1;

  for (int i = 0; i < N; i++)
    {
      /* Starts at 1.0 and grows by 1/1024 per element.  */
      float scale = i * (1.0f / 1024) + 1.0f;

      /* Real part is scale, imaginary part is 2 * scale - 1.  */
      ary[i] = scale + scale * 2.0j - 1.0j;
      sum += ary[i];
      prod *= ary[i];
    }

  /* Variants run in order; the first failure skips the rest.  */
  return (vector (ary, sum, prod)
	  || worker (ary, sum, prod)
	  || gang (ary, sum, prod));
}
|