aboutsummaryrefslogtreecommitdiff
path: root/mt/af_matmul.c
blob: a147b6299b44a2044328ea1589e3298d41d983c9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include "stdlib.h"

#include "util.h"

#include "dataset.h"
/*
 * matmul - multi-core matrix multiply, C = A * B.
 *
 * A, B and C are treated as square lda x lda matrices stored row-major
 * (element (r, c) lives at index r * lda + c).  The rows of C are split into
 * ncores contiguous bands; the caller identified by coreid computes rows
 * [coreid * lda / ncores, (coreid + 1) * lda / ncores) and writes only those
 * rows of C.  A and B are only read.
 *
 * coreid  index of the calling core, expected in [0, ncores)
 * ncores  number of cores sharing the work, expected > 0
 * lda     matrix dimension, expected > 0
 *
 * Arguments outside those ranges make the call a no-op instead of indexing
 * outside the matrices.
 *
 * Implementation notes:
 *  - Two rows of C are produced per pass so every loaded element of B is
 *    used twice, and the k dimension is consumed four at a time.
 *  - Partial sums live in small stack buffers and are stored to C once per
 *    column tile; C is never read.
 *  - For each output element the four products of a k-group are summed left
 *    to right and then added to the accumulator, so a given lda always
 *    produces the same rounding when data_t is a floating-point type.
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,  const data_t A[], const data_t B[], data_t C[] )
{
  /* Width of the on-stack accumulator tile (columns of C handled at once). */
  enum { MATMUL_TILE = 32 };

  if (lda <= 0 || ncores <= 0 || coreid < 0 || coreid >= ncores) {
    return;
  }

  const size_t n = (size_t)lda;
  const size_t row_begin = (size_t)coreid * n / (size_t)ncores;
  const size_t row_end = ((size_t)coreid + 1) * n / (size_t)ncores;

  data_t acc0[MATMUL_TILE];
  data_t acc1[MATMUL_TILE];

  for (size_t r = row_begin; r < row_end; r += 2) {
    /*
     * A band with an odd number of rows ends on a lone row.  In that case the
     * second accumulator just mirrors the first and is never stored, so no
     * row outside this core's band (or outside the matrix) is touched.
     */
    const int paired = (r + 1 < row_end);
    const data_t *a_row0 = A + r * n;
    const data_t *a_row1 = paired ? a_row0 + n : a_row0;

    for (size_t j0 = 0; j0 < n; j0 += (size_t)MATMUL_TILE) {
      const size_t width = (n - j0 < (size_t)MATMUL_TILE) ? n - j0 : (size_t)MATMUL_TILE;
      size_t j;
      size_t k = 0;

      for (j = 0; j < width; j++) {
        acc0[j] = 0;
        acc1[j] = 0;
      }

      /* Main part: four consecutive k values (four rows of B) per step. */
      for (; k + 4 <= n; k += 4) {
        const data_t x0 = a_row0[k];
        const data_t x1 = a_row0[k + 1];
        const data_t x2 = a_row0[k + 2];
        const data_t x3 = a_row0[k + 3];

        const data_t y0 = a_row1[k];
        const data_t y1 = a_row1[k + 1];
        const data_t y2 = a_row1[k + 2];
        const data_t y3 = a_row1[k + 3];

        const data_t *b0 = B + k * n + j0;
        const data_t *b1 = b0 + n;
        const data_t *b2 = b1 + n;
        const data_t *b3 = b2 + n;

        /* Every B access stays inside [j0, j0 + width) of its own row. */
        for (j = 0; j < width; j++) {
          acc0[j] += x0 * b0[j] + x1 * b1[j] + x2 * b2[j] + x3 * b3[j];
          acc1[j] += y0 * b0[j] + y1 * b1[j] + y2 * b2[j] + y3 * b3[j];
        }
      }

      /* Tail: remaining k values when lda is not a multiple of four. */
      for (; k < n; k++) {
        const data_t x0 = a_row0[k];
        const data_t y0 = a_row1[k];
        const data_t *b0 = B + k * n + j0;

        for (j = 0; j < width; j++) {
          acc0[j] += x0 * b0[j];
          acc1[j] += y0 * b0[j];
        }
      }

      /* Store the finished tile; C is written exactly once per element. */
      for (j = 0; j < width; j++) {
        C[r * n + j0 + j] = acc0[j];
      }
      if (paired) {
        for (j = 0; j < width; j++) {
          C[(r + 1) * n + j0 + j] = acc1[j];
        }
      }
    }
  }
}