1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
/*
matmul.c : Matrix Multiplication with tiling for openmp4 example
*/
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
/*
#define BLOCK_SIZE 32
*/
#define NSECPERSEC 1000000000L
typedef struct {
int width;
int height;
int stride;
int hpad;
float* elements;
} Matrix;
/* Correctly extract the number of nanoseconds from the two time structures */
long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
long int nanosecs;
if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
else nanosecs =
(((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
return nanosecs;
}
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
int verify(float* v_res, float* v_ref, int len) {
int passed = 1;
int i;
for (i = 0; i < len; ++i) {
if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
__builtin_abort ();
}
}
return passed;
}
int main(int argc, char* argv[]){
Matrix A,B,Bt,C,Cref;
int a1,a2,a3,i,j;
struct timespec start_time1, end_time1;
struct timespec start_time2, end_time2;
long int nanosecs,total_ops;
float gflopsTiled,gflopsCPU;
a1 = 35;
a2 = 28;
a3 = 47;
A.height = a1;
A.width = a2;
A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
B.height = a2;
B.width = a3;
B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
/* Bt is same as B but stored in column-major order */
Bt.height = B.height;
Bt.width = B.width;
Bt.stride = B.stride;
Bt.hpad = B.hpad;
Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
C.height = a1;
C.width = a3;
C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
Cref.height = a1;
Cref.width = a3;
Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
for(i = 0; i < A.hpad ; i++)
for(j = 0; j < A.stride; j++) {
if (( j<A.width ) && (i<A.height)) {
A.elements[i*A.stride + j] = (i % 3);
} else {
A.elements[i*A.stride + j] = 0.0;
}
}
/* Initialize B and Bt */
for(i = 0; i < B.hpad ; i++)
for(j = 0; j < B.stride; j++) {
if (( j<B.width ) && (i<B.height)) {
B.elements[i*B.stride+j] = (j % 2);
Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
} else {
B.elements[i*B.stride+j] = 0.0;
Bt.elements[j*Bt.stride+i] = 0.0;
}
}
/* zero C, and Cref */
for(i = 0; i < C.hpad; i++)
for(j = 0; j < C.stride; j++) {
C.elements[i*C.stride+j] = 0.0;
Cref.elements[i*Cref.stride+j] = 0.0;
}
simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
verify(C.elements, Cref.elements, C.height * C.stride);
return 0;
}
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) {
/* A,B, and C are in row-major order */
int c_row,c_col,inner;
float sum;
for (c_col = 0 ; c_col<N; c_col++ ) {
for (c_row = 0 ; c_row<M; c_row++ ) {
sum = 0.0 ;
for (inner = 0 ; inner<K; inner++ ) {
sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
}
C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
}
}
}
/***************************
tiled_sgemm_tt: Tiled matrix multiplication:
***************************/
void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
const float*B, const int LDB, const float beta, float*C, const int LDC){
#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
#pragma omp distribute collapse(2)
for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
// We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
// The grid global dimensions are M,N,1
// The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
// -------------------------------------------------------------------
// The rest of this code forms the HSAIL kernel with the
// pairs of "parallel for collapse(2)" loops replaced with a barrier.
// The kernel initializes these values
// C_row_start = get_group_id(0) * BLOCK_SIZE
// C_col_start = get_group_id(1) * BLOCK_SIZE
// row=get_local_id(0)
// col=get_local_id(1)
// -------------------------------------------------------------------
// Each team has a local copy of these mini matrices
float As[BLOCK_SIZE][BLOCK_SIZE];
float Bs[BLOCK_SIZE][BLOCK_SIZE];
float Cs[BLOCK_SIZE][BLOCK_SIZE];
int C_row, C_col;
/* Zero Cs for this BLOCK */
// - - - - - - - - - - - - - - - - - - - -
// REPLACE NEXT THREE LINES WITH A BARRIER
#pragma omp parallel for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++) {
for (int col=0 ; col < BLOCK_SIZE ; col++) {
// END BARRIER
// - - - - - - - - - - - - - - - - - - - -
Cs[row][col] = 0.0;
}
}
// This kblock loop is run on the master thread of each team
for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {
// Copy global memory values to local memory
// - - - - - - - - - - - - - - - - - - - -
// REPLACE NEXT THREE LINES WITH A BARRIER
#pragma omp parallel for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++) {
for (int col=0 ; col < BLOCK_SIZE ; col++) {
// END BARRIER
// - - - - - - - - - - - - - - - - - - - -
C_row = C_row_start + row;
C_col = C_col_start + col;
if ((C_row < M) && (kblock + col < K))
As[row][col] = A[(C_row*LDA)+ kblock + col];
else
As[row][col] = 0;
if ((kblock + row < K) && C_col < N)
Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
else
Bs[row][col] = 0;
}
}
// Calculate Cs <- Sum(As X Bs) across all kblocks
// - - - - - - - - - - - - - - - - - - - -
// REPLACE NEXT THREE LINES WITH A BARRIER
#pragma omp parallel for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++) {
for (int col=0 ; col < BLOCK_SIZE ; col++) {
// END BARRIER
// - - - - - - - - - - - - - - - - - - - -
for (int e = 0; e < BLOCK_SIZE; ++e)
Cs[row][col] += As[row][e] * Bs[e][col];
}
}
} /* End for kblock .. */
// Scale Update actual C from Cs
// - - - - - - - - - - - - - - - - - - - -
// REPLACE NEXT THREE LINES WITH A BARRIER
#pragma omp parallel for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++) {
for (int col=0 ; col < BLOCK_SIZE ; col++) {
// END BARRIER
// - - - - - - - - - - - - - - - - - - - -
C_row = C_row_start + row;
C_col = C_col_start + col;
if ((C_row < M) && (C_col < N)) {
C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
}
}
}
// -------------------------------------------------------------------
// This is the end of the kernel
}
}
}
|