aboutsummaryrefslogtreecommitdiff
path: root/mt/ak_matmul.c
blob: e4b34e4e34c9d762bbc81b7677799ea5964c9732 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include "stdlib.h"

#include "util.h"

#include "dataset.h"
/**
 * Multi-core, cache-blocked matrix multiply-accumulate on square, row-major
 * lda x lda matrices:
 *
 *     C[j*lda + i] += sum over k in [0, lda) of A[j*lda + k] * B[k*lda + i]
 *
 * C is accumulated into, not overwritten, so the caller supplies its initial
 * contents.
 *
 * Work split: the rows of C are grouped into bands of BLOCK_DIM rows. The
 * caller with index `coreid` handles bands coreid, coreid + ncores,
 * coreid + 2*ncores, ...  As long as every caller passes a distinct coreid
 * in [0, ncores), no two callers write the same element of C.
 *
 * Any lda is handled: columns that do not fill a whole STRIP-wide strip are
 * processed one at a time, and the k loop covers exactly [0, lda), so no
 * element outside the lda x lda matrices is ever read or written.
 *
 * @param coreid  index of the calling core; selects which row bands it owns
 * @param ncores  number of cooperating cores (stride between owned bands);
 *                must be positive
 * @param lda     matrix dimension (rows == columns == row stride)
 * @param A       left operand, lda*lda elements, read only
 * @param B       right operand, lda*lda elements, read only
 * @param C       accumulator / result, lda*lda elements, updated in place
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,  const data_t A[], const data_t B[], data_t C[] )
{
  enum {
    BLOCK_DIM = 16, /* tile edge, in elements, for both rows and columns */
    STRIP     = 8   /* output columns kept in scalar accumulators at once */
  };

  for (int jj = BLOCK_DIM * coreid; jj < lda; jj += BLOCK_DIM * ncores) {
    /* Clamp the tile to the matrix edge once instead of re-testing per row. */
    const int j_end = (jj + BLOCK_DIM < lda) ? jj + BLOCK_DIM : lda;

    for (int ii = 0; ii < lda; ii += BLOCK_DIM) {
      const int i_end = (ii + BLOCK_DIM < lda) ? ii + BLOCK_DIM : lda;

      for (int j = jj; j < j_end; j++) {
        const data_t *a_row = &A[j * lda];
        data_t *c_row = &C[j * lda];
        int i = ii;

        /*
         * Full strips: STRIP adjacent outputs of row j are accumulated in
         * scalars so each A[j][k] is loaded once and reused STRIP times.
         * Only entered when all STRIP columns lie inside the tile.
         */
        for (; i + STRIP <= i_end; i += STRIP) {
          data_t c1 = c_row[i];
          data_t c2 = c_row[i + 1];
          data_t c3 = c_row[i + 2];
          data_t c4 = c_row[i + 3];
          data_t c5 = c_row[i + 4];
          data_t c6 = c_row[i + 5];
          data_t c7 = c_row[i + 6];
          data_t c8 = c_row[i + 7];

          for (int k = 0; k < lda; k++) {
            const data_t a = a_row[k];
            const data_t *b = &B[k * lda + i];

            c1 += a * b[0];
            c2 += a * b[1];
            c3 += a * b[2];
            c4 += a * b[3];
            c5 += a * b[4];
            c6 += a * b[5];
            c7 += a * b[6];
            c8 += a * b[7];
          }

          c_row[i]     = c1;
          c_row[i + 1] = c2;
          c_row[i + 2] = c3;
          c_row[i + 3] = c4;
          c_row[i + 4] = c5;
          c_row[i + 5] = c6;
          c_row[i + 6] = c7;
          c_row[i + 7] = c8;
        }

        /*
         * Leftover columns (fewer than STRIP remain, which only happens when
         * lda is not a multiple of STRIP): one output element at a time.
         */
        for (; i < i_end; i++) {
          data_t c = c_row[i];

          for (int k = 0; k < lda; k++) {
            c += a_row[k] * B[k * lda + i];
          }

          c_row[i] = c;
        }
      }
    }
  }
}