aboutsummaryrefslogtreecommitdiff
path: root/mt/bk_matmul.c
blob: dae49fc9cc81735da91f2c8d0231bf7d6c66042b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#include "stdlib.h"

#include "util.h"

#include "dataset.h"
/* Fixed problem geometry for this hand-tuned kernel. */
enum {
   MATMUL_DIM     = 32,              /* matrices are MATMUL_DIM x MATMUL_DIM, row stride MATMUL_DIM */
   MATMUL_HALF    = MATMUL_DIM / 2,  /* first row handled by the second core */
   MATMUL_K_BLOCK = 16,              /* number of k iterations per blocking step */
   MATMUL_UNROLL  = 8                /* output columns accumulated per inner step */
};

/*
 * Accumulate A*B into rows [row_begin, row_end) of C.
 *
 * Element (row j, column i) of each matrix lives at index j*MATMUL_DIM + i.
 * The existing contents of C are used as the starting value of every sum,
 * i.e. this computes C += A*B for the selected rows.
 *
 * The k dimension is walked in blocks of MATMUL_K_BLOCK (outermost loop), and
 * MATMUL_UNROLL adjacent output columns are kept in scalar temporaries while a
 * k block is accumulated (presumably to improve register/cache reuse --
 * confirm with measurements before changing the loop order).
 */
static void matmul_row_range(const int row_begin, const int row_end,
                             const data_t A[], const data_t B[], data_t C[])
{
   int i, j, k, kk;

   for ( kk = 0; kk < MATMUL_DIM; kk += MATMUL_K_BLOCK )
   {
      /* MATMUL_DIM is a multiple of MATMUL_K_BLOCK, so k_end never exceeds MATMUL_DIM. */
      const int k_end = kk + MATMUL_K_BLOCK;

      for ( j = row_begin; j < row_end; j++ )
      {
         const data_t *a_row = &A[j*MATMUL_DIM];
         data_t *c_row = &C[j*MATMUL_DIM];

         for ( i = 0; i < MATMUL_DIM; i += MATMUL_UNROLL )
         {
            /* Load the partial sums produced by earlier k blocks (or the caller). */
            data_t temp0 = c_row[i];
            data_t temp1 = c_row[i+1];
            data_t temp2 = c_row[i+2];
            data_t temp3 = c_row[i+3];
            data_t temp4 = c_row[i+4];
            data_t temp5 = c_row[i+5];
            data_t temp6 = c_row[i+6];
            data_t temp7 = c_row[i+7];

            for ( k = kk; k < k_end; k++ )
            {
               const data_t tempA = a_row[k];
               const data_t *b_row = &B[k*MATMUL_DIM + i];

               temp0 += tempA * b_row[0];
               temp1 += tempA * b_row[1];
               temp2 += tempA * b_row[2];
               temp3 += tempA * b_row[3];
               temp4 += tempA * b_row[4];
               temp5 += tempA * b_row[5];
               temp6 += tempA * b_row[6];
               temp7 += tempA * b_row[7];
            }

            c_row[i]   = temp0;
            c_row[i+1] = temp1;
            c_row[i+2] = temp2;
            c_row[i+3] = temp3;
            c_row[i+4] = temp4;
            c_row[i+5] = temp5;
            c_row[i+6] = temp6;
            c_row[i+7] = temp7;
         }
      }
   }
}

/*
 * Two-core blocked matrix multiply: C += A*B for 32x32 matrices.
 *
 * coreid  index of the calling core. Core 0 processes rows 0..15, core 1
 *         processes rows 16..31, and any core with coreid > 1 returns
 *         without doing work.
 * ncores  number of participating cores. When it is 1, the calling core
 *         also processes rows 16..31, so a single core 0 covers the whole
 *         matrix.
 * lda     ignored: the dimension and row stride are hard-coded to 32.
 * A, B    input matrices, element (row, col) at index row*32 + col.
 * C       output matrix, same layout; results are added to its current
 *         contents.
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,  const data_t A[], const data_t B[], data_t C[] )
{
   (void)lda; /* kernel is specialised for MATMUL_DIM; see header comment */

   if ( coreid > 1 )
      return;

   if ( coreid == 0 )
      matmul_row_range(0, MATMUL_HALF, A, B, C);

   if ( coreid == 1 || ncores == 1 )
      matmul_row_range(MATMUL_HALF, MATMUL_DIM, A, B, C);
}