aboutsummaryrefslogtreecommitdiff
path: root/mt/ad_matmul.c
blob: 60e6e6c73479a48195b88d5b0723b77e9dcd3490 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#include "stdlib.h"

#include "util.h"

#include "dataset.h"
/*
 * Multithreaded matrix-multiply kernel: accumulates the product of A and B
 * into C for the band of rows owned by the calling core.
 *
 * coreid  index of the calling core; selects which band of rows it handles.
 * ncores  number of cores sharing the work; must be non-zero (it is used as
 *         a divisor).
 * lda     number of rows to distribute.  It is used ONLY for partitioning:
 *         the row stride and the inner dimension are fixed at ROW_STRIDE
 *         (32) elements, so results are only complete when lda == 32.
 * A, B    read-only inputs, indexed as row * 32 + column.
 * C       output, same layout.  It is accumulated into (+=), never cleared
 *         here, so its initial contents contribute to the result
 *         (presumably the caller zero-initialises it -- TODO confirm).
 *
 * Partitioning: core `coreid` handles rows
 *   [coreid * (lda / ncores), (coreid + 1) * (lda / ncores)).
 * The division truncates, so when lda is not a multiple of ncores the last
 * lda % ncores rows are not processed by any core.
 *
 * Synchronisation: barrier(ncores) is called once per pair of k steps for
 * every row.  barrier() is defined outside this file; presumably it is a
 * rendezvous of all ncores cores, in which case every core must make the same
 * number of calls.  That holds here because every band has the same number
 * of rows -- keep it in mind before changing the partitioning.
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,  const data_t A[], const data_t B[], data_t C[] )
{
   /* Elements per matrix row; also the length of the k (inner) dimension. */
   enum { ROW_STRIDE = 32 };

   const int rows_per_core = lda / ncores;
   const int row_end = (coreid + 1) * rows_per_core;
   int row;

   for ( row = coreid * rows_per_core; row < row_end; row++ )
   {
      const int row_off = row * ROW_STRIDE;
      const data_t *a_row = A + row_off;
      data_t *c_row = C + row_off;
      int k;

      /* Two k steps per iteration: each pass applies two rows of B. */
      for ( k = 0; k < ROW_STRIDE; k += 2 )
      {
         const data_t a0 = a_row[k];
         const data_t a1 = a_row[k + 1];
         /* B is read-only, so the row pointers keep the const qualifier. */
         const data_t *b0 = B + k * ROW_STRIDE;
         const data_t *b1 = b0 + ROW_STRIDE;
         int i;

         /*
          * Hand-unrolled by four columns.  The two contributions to each
          * element stay as separate accumulations, in the original order, so
          * the arithmetic is unchanged whatever data_t is.
          */
         for ( i = 0; i < ROW_STRIDE; i += 4 )
         {
            c_row[i]     += a0 * b0[i];
            c_row[i]     += a1 * b1[i];
            c_row[i + 1] += a0 * b0[i + 1];
            c_row[i + 1] += a1 * b1[i + 1];
            c_row[i + 2] += a0 * b0[i + 2];
            c_row[i + 2] += a1 * b1[i + 2];
            c_row[i + 3] += a0 * b0[i + 3];
            c_row[i + 3] += a1 * b1[i + 3];
         }

         /* NOTE(review): barrier sits inside the k loop in the original;
          * placement is preserved as-is. */
         barrier(ncores);
      }
   }
}