1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
#include "common.h"
#include <assert.h>
#include <stdlib.h>
#include "util.h"
void thread_entry(int cid, int nc)
{
const int R = 8;
int m, n, p;
if (have_vec) {
m = HCBM;
n = HCBN;
p = HCBK;
} else {
m = CBM;
n = CBN;
p = CBK;
}
t a[m*p];
t b[p*n];
t c[m*n];
for (size_t i = 0; i < m; i++)
for (size_t j = 0; j < p; j++)
a[i*p+j] = i+j;
for (size_t i = 0; i < p; i++)
for (size_t j = 0; j < n; j++)
b[i*n+j] = i-j;
memset(c, 0, m*n*sizeof(c[0]));
size_t instret, cycles;
if (have_vec) {
for (int i = 0; i < R; i++)
{
instret = -rdinstret();
cycles = -rdcycle();
mm_rb_hwacha(m, n, p, a, p, b, n, c, n);
instret += rdinstret();
cycles += rdcycle();
}
} else {
for (int i = 0; i < R; i++)
{
instret = -rdinstret();
cycles = -rdcycle();
mm(m, n, p, a, p, b, n, c, n);
instret += rdinstret();
cycles += rdcycle();
}
}
printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n",
cid, RBM, RBN, RBK, CBM, CBN, CBK);
printf("C%d: %d instructions\n", cid, (int)(instret));
printf("C%d: %d cycles\n", cid, (int)(cycles));
printf("C%d: %d flops\n", cid, 2*m*n*p);
printf("C%d: %d Mflops @ 1 GHz\n", cid, 2000*m*n*p/(cycles));
#if 1
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < n; j++)
{
t s = 0;
for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++)
s += (t)aik*(t)bkj;
if (fabs(c[i*n+j]-s*R) > 1e-6*s)
{
printf("C%d: c[%lu][%lu] %f != %f\n", cid, i, j, c[i*n+j], s);
exit(1);
}
}
}
#endif
barrier(nc);
exit(0);
}
|