aboutsummaryrefslogtreecommitdiff
path: root/benchmarks/dgemm/dgemm_main.c
blob: 9f28c07e11c4f6569e9129f72a1b93533d2acd38 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//**************************************************************************
// Double-precision general matrix multiplication benchmark
//--------------------------------------------------------------------------

#include "util.h"

//--------------------------------------------------------------------------
// Input/Reference Data

#include "dataset1.h"

//--------------------------------------------------------------------------
// square_dgemm function

void square_dgemm( long n0, const double a0[], const double b0[], double c0[] )
{
  long n = (n0+2)/3*3;
  double a[n*n], b[n*n], c[n*n];

  for (long i = 0; i < n0; i++)
  {
    long j;
    for (j = 0; j < n0; j++)
    {
      a[i*n+j] = a0[i*n0+j];
      b[i*n+j] = b0[j*n0+i];
    }
    for ( ; j < n; j++)
    {
      a[i*n+j] = b[i*n+j] = 0;
    }
  }
  for (long i = n0; i < n; i++)
    for (long j = 0; j < n; j++)
      a[i*n+j] = b[i*n+j] = 0;

  long i, j, k;
  for (i = 0; i < n; i+=3)
  {
    for (j = 0; j < n; j+=3)
    {
      double *a0 = a + (i+0)*n, *b0 = b + (j+0)*n;
      double *a1 = a + (i+1)*n, *b1 = b + (j+1)*n;
      double *a2 = a + (i+2)*n, *b2 = b + (j+2)*n;

      double s00 = 0, s01 = 0, s02 = 0;
      double s10 = 0, s11 = 0, s12 = 0;
      double s20 = 0, s21 = 0, s22 = 0;

      while (a0 < a + (i+1)*n)
      {
        double a00 = a0[0], a01 = a0[1], a02 = a0[2];
        double b00 = b0[0], b01 = b0[1], b02 = b0[2];
        double a10 = a1[0], a11 = a1[1], a12 = a1[2];
        double b10 = b1[0], b11 = b1[1], b12 = b1[2];
        asm ("" ::: "memory");
        double a20 = a2[0], a21 = a2[1], a22 = a2[2];
        double b20 = b2[0], b21 = b2[1], b22 = b2[2];

        s00 = a00*b00 + (a01*b01 + (a02*b02 + s00));
        s01 = a00*b10 + (a01*b11 + (a02*b12 + s01));
        s02 = a00*b20 + (a01*b21 + (a02*b22 + s02));
        s10 = a10*b00 + (a11*b01 + (a12*b02 + s10));
        s11 = a10*b10 + (a11*b11 + (a12*b12 + s11));
        s12 = a10*b20 + (a11*b21 + (a12*b22 + s12));
        s20 = a20*b00 + (a21*b01 + (a22*b02 + s20));
        s21 = a20*b10 + (a21*b11 + (a22*b12 + s21));
        s22 = a20*b20 + (a21*b21 + (a22*b22 + s22));

        a0 += 3; b0 += 3;
        a1 += 3; b1 += 3;
        a2 += 3; b2 += 3;
      }

      c[(i+0)*n+j+0] = s00; c[(i+0)*n+j+1] = s01; c[(i+0)*n+j+2] = s02;
      c[(i+1)*n+j+0] = s10; c[(i+1)*n+j+1] = s11; c[(i+1)*n+j+2] = s12;
      c[(i+2)*n+j+0] = s20; c[(i+2)*n+j+1] = s21; c[(i+2)*n+j+2] = s22;
    }
  }

  for (long i = 0; i < n0; i++)
  {
    long j;
    for (j = 0; j < n0 - 2; j+=3)
    {
      c0[i*n0+j+0] = c[i*n+j+0];
      c0[i*n0+j+1] = c[i*n+j+1];
      c0[i*n0+j+2] = c[i*n+j+2];
    }
    for ( ; j < n0; j++)
      c0[i*n0+j] = c[i*n+j];
  }
}

//--------------------------------------------------------------------------
// Main

int main( int argc, char* argv[] )
{
  double results_data[DATA_SIZE*DATA_SIZE];

  // Output the input array
  printDoubleArray( "input1", DATA_SIZE*DATA_SIZE, input1_data );
  printDoubleArray( "input2", DATA_SIZE*DATA_SIZE, input2_data );
  printDoubleArray( "verify", DATA_SIZE*DATA_SIZE, verify_data );

#if PREALLOCATE
  // If needed we preallocate everything in the caches
  square_dgemm( DATA_SIZE, input1_data, input2_data, results_data );
#endif

  // Do the dgemm
  setStats(1);
  square_dgemm( DATA_SIZE, input1_data, input2_data, results_data );
  setStats(0);

  // Print out the results
  printDoubleArray( "results", DATA_SIZE*DATA_SIZE, results_data );

  // Check the results
  return verifyDouble( DATA_SIZE*DATA_SIZE, results_data, verify_data );
}