/* Source: benchmarks/mm/mm_main.c (blob 522768a4fb11276e4145dc1cf1ed58867c816afb) */
#include "common.h"
#include <assert.h>
#include <stdlib.h>
#include "util.h"

/*
 * Benchmark driver: times a blocked matrix multiply and verifies the result.
 *
 * Builds an m x p matrix a and a p x n matrix b as stack VLAs, zeroes c once,
 * then calls the multiply kernel R times (mm_rb_hwacha() when have_vec is
 * set, mm() otherwise).  The instruction and cycle counters are re-armed on
 * every repetition, so the figures printed afterwards describe the last
 * repetition only.  The verification step expects c to hold R times the
 * product, i.e. the kernels are expected to accumulate into c.
 *
 * cid: tag printed in front of every output line (presumably the core id).
 * nc:  forwarded unchanged to barrier() (presumably the core count).
 *
 * Ends by calling exit(): status 1 on the first mismatching element,
 * status 0 after barrier(nc) otherwise.
 */
void thread_entry(int cid, int nc)
{
  const int R = 8;
  int m, n, p;
  
  if (have_vec) {
    m = HCBM;
    n = HCBN;
    p = HCBK;
  } else {
    m = CBM;
    n = CBN;
    p = CBK;
  }

  t a[m*p];
  t b[p*n];
  t c[m*n];

  for (size_t i = 0; i < m; i++)
    for (size_t j = 0; j < p; j++)
      a[i*p+j] = i+j;
  /* i-j is evaluated in size_t, so entries with i < j wrap around before
     being converted to t; the reference computation below mirrors this. */
  for (size_t i = 0; i < p; i++)
    for (size_t j = 0; j < n; j++)
      b[i*n+j] = i-j;
  memset(c, 0, m*n*sizeof(c[0]));

  size_t instret = 0, cycles = 0;
  if (have_vec) {
    for (int i = 0; i < R; i++)
    {
      /* Store the negated start value, then add the end value: the
         unsigned wraparound leaves end - start. */
      instret = -rdinstret();
      cycles = -rdcycle();
      mm_rb_hwacha(m, n, p, a, p, b, n, c, n);
      instret += rdinstret();
      cycles += rdcycle();
    }
  } else {
    for (int i = 0; i < R; i++)
    {
      instret = -rdinstret();
      cycles = -rdcycle();
      mm(m, n, p, a, p, b, n, c, n);
      instret += rdinstret();
      cycles += rdcycle();
    }
  }

  /* Report the block sizes that were actually used (m, n, p), which differ
     from CBM/CBN/CBK on the have_vec path. */
  printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n",
         cid, RBM, RBN, RBK, m, n, p);
  printf("C%d: %d instructions\n", cid, (int)(instret));
  printf("C%d: %d cycles\n", cid, (int)(cycles));
  printf("C%d: %d flops\n", cid, 2*m*n*p);
  /* Compute in size_t so the numerator cannot overflow int, and cast the
     result so the argument type matches %d. */
  printf("C%d: %d Mflops @ 1 GHz\n", cid, (int)((size_t)2000*m*n*p/cycles));

#if 1
  for (size_t i = 0; i < m; i++)
  {
    for (size_t j = 0; j < n; j++)
    {
      /* Reference dot product for element (i, j): aik walks a[i][k] = i+k
         and bkj walks b[k][j] = k-j, both in wrapping size_t arithmetic to
         match the initialisation above. */
      t s = 0;
      for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++)
        s += (t)aik*(t)bkj;
      if (fabs(c[i*n+j]-s*R) > 1e-6*s)
      {
        /* Print the value c was compared against (s*R), not the
           single-pass sum. */
        printf("C%d: c[%lu][%lu] %f != %f\n", cid,
               (unsigned long)i, (unsigned long)j, c[i*n+j], s*R);
        exit(1);
      }
    }
  }
#endif

  barrier(nc);
  exit(0);
}