1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
/*
matmul.c : Matrix Multiplication with tiling for openmp4 example
*/
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
/*
#define BLOCK_SIZE 32
*/
#define NSECPERSEC 1000000000L
typedef struct {
int width;
int height;
int stride;
int hpad;
float* elements;
} Matrix;
/* Correctly extract the number of nanoseconds from the two time structures */
long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
long int nanosecs;
if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
else nanosecs =
(((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
return nanosecs;
}
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
int verify(float* v_res, float* v_ref, int len) {
int passed = 1;
int i;
for (i = 0; i < len; ++i) {
if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
__builtin_abort ();
}
}
return passed;
}
int main(int argc, char* argv[]){
Matrix A,B,Bt,C,Cref;
int a1,a2,a3,i,j;
struct timespec start_time1, end_time1;
struct timespec start_time2, end_time2;
long int nanosecs,total_ops;
float gflopsTiled,gflopsCPU;
a1 = 35;
a2 = 28;
a3 = 47;
A.height = a1;
A.width = a2;
A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
B.height = a2;
B.width = a3;
B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
/* Bt is same as B but stored in column-major order */
Bt.height = B.height;
Bt.width = B.width;
Bt.stride = B.stride;
Bt.hpad = B.hpad;
Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
C.height = a1;
C.width = a3;
C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
Cref.height = a1;
Cref.width = a3;
Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
for(i = 0; i < A.hpad ; i++)
for(j = 0; j < A.stride; j++) {
if (( j<A.width ) && (i<A.height)) {
A.elements[i*A.stride + j] = (i % 3);
} else {
A.elements[i*A.stride + j] = 0.0;
}
}
/* Initialize B and Bt */
for(i = 0; i < B.hpad ; i++)
for(j = 0; j < B.stride; j++) {
if (( j<B.width ) && (i<B.height)) {
B.elements[i*B.stride+j] = (j % 2);
Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
} else {
B.elements[i*B.stride+j] = 0.0;
Bt.elements[j*Bt.stride+i] = 0.0;
}
}
/* zero C, and Cref */
for(i = 0; i < C.hpad; i++)
for(j = 0; j < C.stride; j++) {
C.elements[i*C.stride+j] = 0.0;
Cref.elements[i*Cref.stride+j] = 0.0;
}
simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
verify(C.elements, Cref.elements, C.height * C.stride);
return 0;
}
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
const float* B,const int LDB, const float beta,float* C, const int LDC) {
/* A,B, and C are in row-major order */
int c_row,c_col,inner;
float sum;
for (c_col = 0 ; c_col<N; c_col++ ) {
for (c_row = 0 ; c_row<M; c_row++ ) {
sum = 0.0 ;
for (inner = 0 ; inner<K; inner++ ) {
sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
}
C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
}
}
}
/***************************
tiled_sgemm_tt: Tiled matrix multiplication:
***************************/
void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
const float*B, const int LDB, const float beta, float*C, const int LDC){
#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
#pragma omp distribute collapse(2)
for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
{
// Each team has a local copy of these mini matrices
float As[BLOCK_SIZE][BLOCK_SIZE];
float Bs[BLOCK_SIZE][BLOCK_SIZE];
#pragma omp parallel
{
int C_row, C_col;
float Cval = 0.0;
for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
{
#pragma omp for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++)
for (int col=0 ; col < BLOCK_SIZE ; col++)
{
C_row = C_row_start + row;
C_col = C_col_start + col;
if ((C_row < M) && (kblock + col < K))
As[row][col] = A[(C_row*LDA)+ kblock + col];
else
As[row][col] = 0;
if ((kblock + row < K) && C_col < N)
Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
else
Bs[row][col] = 0;
}
#pragma omp for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++)
for (int col=0 ; col < BLOCK_SIZE ; col++)
{
for (int e = 0; e < BLOCK_SIZE; ++e)
Cval += As[row][e] * Bs[e][col];
}
} /* End for kblock .. */
#pragma omp for collapse(2)
for (int row=0 ; row < BLOCK_SIZE ; row++)
for (int col=0 ; col < BLOCK_SIZE ; col++)
{
C_row = C_row_start + row;
C_col = C_col_start + col;
if ((C_row < M) && (C_col < N))
C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col];
}
} /* end parallel */
} /* end target teams distribute */
}
|