/* Source blob: 3b7977d0722955317bc6f2b1f43d2a680def4514 (plain view) */
#include "stdlib.h"
#include "util.h"
#include "dataset.h"
/*
 * Multi-core square matrix multiply: C = A * B.
 *
 * A, B and C are lda x lda matrices stored row-major in flat arrays
 * (element (r, c) lives at index r*lda + c).  The rows of C are split into
 * contiguous bands, one band per core; each core computes only its own band.
 *
 * coreid  index of the calling core, expected in [0, ncores)
 * ncores  number of cores taking part; must be non-zero
 * lda     matrix dimension (rows == columns)
 * A, B    input matrices, read only
 * C       output matrix; only the rows owned by `coreid` are written
 *
 * This is the MI variant: core 0 walks the inner (k) dimension in ascending
 * order and every other core walks it in descending order.  The MSI variant
 * is the same kernel with all cores walking k in ascending order.
 * NOTE(review): the opposite walk directions presumably keep cores from
 * touching the same parts of B at the same time under an MI coherence
 * protocol -- confirm against the target memory system.
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
{
  /*
   * Band boundaries.  Using (coreid + 1) * lda / ncores as the upper bound
   * gives the same split as lda / ncores rows per core when lda is a
   * multiple of ncores, and otherwise spreads the leftover rows so that
   * every row of C is computed exactly once.
   */
  const int row_begin = coreid * lda / ncores;
  const int row_end = (coreid + 1) * lda / ncores;

  /* Largest multiple of 4 not exceeding lda: limit of the unrolled loop. */
  const int unrolled_end = lda - lda % 4;

  /* Core 0 accumulates with k ascending, all other cores with k descending. */
  const int ascending = (coreid == 0);

  int i, j, k;

  /* Presumably lines all cores up before the kernel starts (see util.h). */
  barrier(ncores);

  for (j = row_begin; j < row_end; j++) {
    const data_t *a_row = &A[j * lda];
    data_t *c_row = &C[j * lda];

    /* Main body: four adjacent columns of C per pass, sharing each A load. */
    for (i = 0; i < unrolled_end; i += 4) {
      data_t Cval0 = 0;
      data_t Cval1 = 0;
      data_t Cval2 = 0;
      data_t Cval3 = 0;

      if (ascending) {
        for (k = 0; k < lda; k++) {
          const data_t a = a_row[k];
          const data_t *b = &B[k * lda + i];
          Cval0 += a * b[0];
          Cval1 += a * b[1];
          Cval2 += a * b[2];
          Cval3 += a * b[3];
        }
      } else {
        for (k = lda - 1; k >= 0; k--) {
          const data_t a = a_row[k];
          const data_t *b = &B[k * lda + i];
          Cval0 += a * b[0];
          Cval1 += a * b[1];
          Cval2 += a * b[2];
          Cval3 += a * b[3];
        }
      }

      c_row[i] = Cval0;
      c_row[i + 1] = Cval1;
      c_row[i + 2] = Cval2;
      c_row[i + 3] = Cval3;
    }

    /*
     * Tail: the last lda % 4 columns, one at a time, so the kernel never
     * reads or writes past the end of a row when lda is not a multiple of 4.
     */
    for (; i < lda; i++) {
      data_t Cval = 0;

      if (ascending) {
        for (k = 0; k < lda; k++) {
          Cval += a_row[k] * B[k * lda + i];
        }
      } else {
        for (k = lda - 1; k >= 0; k--) {
          Cval += a_row[k] * B[k * lda + i];
        }
      }

      c_row[i] = Cval;
    }
  }
}
|