1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
|
#include "stdlib.h"
#include "util.h"
#include "dataset.h"
/*
 * Fixed problem geometry used by the kernel below.
 * NOTE(review): the loop bounds are hard-coded to 32 and do not follow lda;
 * presumably the dataset is 32x32 with lda == 32 -- confirm against the caller.
 */
enum {
    MATMUL_DIM        = 32, /* rows of C processed and length of the k reduction */
    MATMUL_STRIP_COLS = 16  /* columns of C accumulated together per row */
};

/*
 * Accumulate A*B into a 16-column-wide strip of C.
 *
 * For every row j in [row_begin, row_end) and every column c in
 * [col0, col0 + 16):
 *
 *     C[j*lda + c] += sum over k in [0, 32) of A[j*lda + k] * B[k*lda + c]
 *
 * The existing contents of C are the starting value of each sum, and the
 * products are added in increasing k order.
 *
 * lda       - distance, in elements, between consecutive rows of A, B and C
 * A, B      - input matrices (read only)
 * C         - output matrix, updated in place
 * row_begin - first row processed (inclusive)
 * row_end   - last row processed (exclusive)
 * col0      - first column of the strip
 */
static inline void matmul_strip16(const int lda, const data_t A[], const data_t B[], data_t C[],
                                  const int row_begin, const int row_end, const int col0)
{
    int j, k;

    for (j = row_begin; j < row_end; j++) {
        const data_t *a_row = A + j*lda;
        data_t *c = C + j*lda + col0;

        /* Sixteen scalar accumulators: the strip is kept in locals for the
         * whole k loop and written back to C once per row. */
        data_t t0  = c[0],  t1  = c[1],  t2  = c[2],  t3  = c[3];
        data_t t4  = c[4],  t5  = c[5],  t6  = c[6],  t7  = c[7];
        data_t t8  = c[8],  t9  = c[9],  t10 = c[10], t11 = c[11];
        data_t t12 = c[12], t13 = c[13], t14 = c[14], t15 = c[15];

        for (k = 0; k < MATMUL_DIM; k++) {
            const data_t a = a_row[k];
            const data_t *b = B + k*lda + col0;

            t0  += a * b[0];
            t1  += a * b[1];
            t2  += a * b[2];
            t3  += a * b[3];
            t4  += a * b[4];
            t5  += a * b[5];
            t6  += a * b[6];
            t7  += a * b[7];
            t8  += a * b[8];
            t9  += a * b[9];
            t10 += a * b[10];
            t11 += a * b[11];
            t12 += a * b[12];
            t13 += a * b[13];
            t14 += a * b[14];
            t15 += a * b[15];
        }

        c[0]  = t0;
        c[1]  = t1;
        c[2]  = t2;
        c[3]  = t3;
        c[4]  = t4;
        c[5]  = t5;
        c[6]  = t6;
        c[7]  = t7;
        c[8]  = t8;
        c[9]  = t9;
        c[10] = t10;
        c[11] = t11;
        c[12] = t12;
        c[13] = t13;
        c[14] = t14;
        c[15] = t15;
    }
}

/*
 * Two-core matrix multiply-accumulate: C += A * B over a 32x32 problem.
 *
 * Work is split by columns of C:
 *   - core 0 updates columns 0..15 of every row;
 *   - core 1 updates columns 16..31 of every row;
 *   - when ncores == 1, the single caller also performs the second half, so
 *     core 0 ends up updating all 32 columns.
 * Callers with coreid > 1 return immediately without touching C.
 *
 * coreid - id of the calling core
 * ncores - number of cores taking part
 * lda    - distance, in elements, between consecutive rows of A, B and C
 * A, B   - input matrices (read only)
 * C      - output matrix, accumulated into in place
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
{
    if (coreid > 1) return;

    // feel free to make a separate function for MI and MSI versions.

    if (coreid == 0) {
        /* Left half of C: columns 0..15, rows 0..31. */
        matmul_strip16(lda, A, B, C, 0, MATMUL_DIM, 0);
    }

    if (coreid == 1 || ncores == 1) {
        /* Right half of C: columns 16..31. Rows 16..31 are processed before
         * rows 0..15, exactly as in the original code.
         * NOTE(review): presumably this keeps the two cores on different rows
         * of A/C at the same time -- confirm the intent before reordering. */
        matmul_strip16(lda, A, B, C, MATMUL_DIM / 2, MATMUL_DIM, MATMUL_STRIP_COLS);
        matmul_strip16(lda, A, B, C, 0, MATMUL_DIM / 2, MATMUL_STRIP_COLS);
    }
}
|