author     Yunsup Lee <yunsup@cs.berkeley.edu>  2014-04-14 22:46:46 -0700
committer  Yunsup Lee <yunsup@cs.berkeley.edu>  2014-04-14 22:46:46 -0700
commit     f4203e9e56f882af52960d15b98ab881712f776d (patch)
tree       ff67ecb4da6f1d3decb5faad273aea6bbf6d00ee
parent     c31d7c5eb4109fdcce58d27b132e20596ece2d07 (diff)
add mm benchmark for eos20 (eos20-bringup)
-rw-r--r--  benchmarks/Makefile           |   4
-rw-r--r--  benchmarks/common/crt-mt.S    |   6
-rw-r--r--  benchmarks/common/syscalls.c  |  10
-rw-r--r--  benchmarks/mm/bmark.mk        |  32
-rw-r--r--  benchmarks/mm/common.h        |  35
-rw-r--r--  benchmarks/mm/gen.scala       |  81
-rw-r--r--  benchmarks/mm/hwacha.S        | 110
-rw-r--r--  benchmarks/mm/hwacha.h        | 137
-rw-r--r--  benchmarks/mm/mm.c            | 150
-rw-r--r--  benchmarks/mm/mm_main.c       |  82
-rw-r--r--  benchmarks/mm/rb.h            | 210
11 files changed, 847 insertions(+), 10 deletions(-)
diff --git a/benchmarks/Makefile b/benchmarks/Makefile
index cc8e881..6007a45 100644
--- a/benchmarks/Makefile
+++ b/benchmarks/Makefile
@@ -21,7 +21,7 @@ bmarks = \
towers \
vvadd \
multiply \
- dgemm \
+ mm \
dhrystone \
spmv \
vec-vvadd \
@@ -36,7 +36,7 @@ bmarks_host = \
towers \
vvadd \
multiply \
- dgemm \
+ mm \
spmv \
vec-vvadd \
vec-cmplxmult \
diff --git a/benchmarks/common/crt-mt.S b/benchmarks/common/crt-mt.S
index 283b3bf..7f35f9b 100644
--- a/benchmarks/common/crt-mt.S
+++ b/benchmarks/common/crt-mt.S
@@ -96,7 +96,7 @@ _start:
mfpcr a0,cr10
lw a1, 4(zero)
- slli a2, a0, 13
+ slli a2, a0, 16
la sp, stacktop
sub sp, sp, a2
@@ -110,7 +110,7 @@ _start:
.globl tlstop
.align 4
- .skip 32768
+ .skip 131072
stacktop:
- .skip 65536
+ .skip 131072
tlstop:
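Note on these hunks: _start carves each hart's stack by subtracting hartid << shift from stacktop, so the shift amount is the per-hart stack size. The change widens it from 8 KiB (1 << 13) to 64 KiB (1 << 16), and both .skip reservations grow to 128 KiB to match. A minimal C sketch of the address arithmetic, assuming (as the surrounding code suggests) that cr10 holds the hart id:

    #include <stdint.h>

    /* Sketch of the stack carving in _start above: each hart's sp sits
     * hartid * 64 KiB below stacktop, so a 128 KiB .skip holds two harts. */
    static uintptr_t per_hart_sp(uintptr_t stacktop, unsigned hartid)
    {
        return stacktop - ((uintptr_t)hartid << 16);  /* 1 << 16 = 64 KiB */
    }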
diff --git a/benchmarks/common/syscalls.c b/benchmarks/common/syscalls.c
index f95dde4..a882462 100644
--- a/benchmarks/common/syscalls.c
+++ b/benchmarks/common/syscalls.c
@@ -6,7 +6,7 @@
void exit(int code)
{
volatile uint64_t magic_mem[8] = {0};
- magic_mem[0] = 1;
+ magic_mem[0] = 93;
magic_mem[1] = code;
__sync_synchronize();
mtpcr(PCR_TOHOST, (long)magic_mem);
@@ -16,7 +16,7 @@ void exit(int code)
void printstr(const char* s)
{
volatile uint64_t magic_mem[8] = {0};
- magic_mem[0] = 4;
+ magic_mem[0] = 64;
magic_mem[1] = 1;
magic_mem[2] = (unsigned long)s;
magic_mem[3] = strlen(s);
@@ -28,8 +28,8 @@ void printstr(const char* s)
int putchar(int ch)
{
#define buffered_putch_bufsize 64
- static char buf[buffered_putch_bufsize];
- static int buflen = 0;
+ static __thread char buf[buffered_putch_bufsize];
+ static __thread int buflen = 0;
if(ch != -1)
buf[buflen++] = ch;
@@ -37,7 +37,7 @@ int putchar(int ch)
if(ch == -1 || buflen == buffered_putch_bufsize)
{
volatile uint64_t magic_mem[8] = {0};
- magic_mem[0] = 4;
+ magic_mem[0] = 64;
magic_mem[1] = 1;
magic_mem[2] = (long)buf;
magic_mem[3] = buflen;
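The magic_mem[0] constants are syscall numbers passed to the host over tohost: the diff moves from the old values (1 = exit, 4 = write) to the Linux RISC-V ABI numbers (93 = exit, 64 = write), and makes putchar's buffer __thread so each core flushes its own output. A hedged sketch of the call pattern the three hunks share; htif_syscall is a hypothetical helper, not part of the commit, and mtpcr/PCR_TOHOST are the primitives the file already uses:

    #include <stdint.h>

    /* Hypothetical factoring of the tohost call pattern above. */
    static void htif_syscall(uint64_t num, uint64_t arg0, uint64_t arg1, uint64_t arg2)
    {
      volatile uint64_t magic_mem[8] = {0};
      magic_mem[0] = num;    /* 93 = exit, 64 = write (Linux RISC-V ABI) */
      magic_mem[1] = arg0;
      magic_mem[2] = arg1;
      magic_mem[3] = arg2;
      __sync_synchronize();  /* order the stores before signaling the host */
      mtpcr(PCR_TOHOST, (long)magic_mem);
    }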
diff --git a/benchmarks/mm/bmark.mk b/benchmarks/mm/bmark.mk
new file mode 100644
index 0000000..a446713
--- /dev/null
+++ b/benchmarks/mm/bmark.mk
@@ -0,0 +1,32 @@
+#=======================================================================
+# UCB CS250 Makefile fragment for benchmarks
+#-----------------------------------------------------------------------
+#
+# Each benchmark directory should have its own fragment which
+# essentially lists what the source files are and how to link them
+# into an riscv and/or host executable. All variables should include
+# the benchmark name as a prefix so that they are unique.
+#
+
+mm_c_src = \
+ mm_main.c \
+ mm.c \
+ syscalls.c \
+
+mm_riscv_src = \
+ hwacha.S \
+ crt-mt.S \
+
+mm_c_objs = $(patsubst %.c, %.o, $(mm_c_src))
+mm_riscv_objs = $(patsubst %.S, %.o, $(mm_riscv_src))
+
+mm_host_bin = mm.host
+$(mm_host_bin) : $(mm_c_src)
+ $(HOST_COMP) $^ -o $(mm_host_bin)
+
+mm_riscv_bin = mm.riscv
+$(mm_riscv_bin) : $(mm_c_objs) $(mm_riscv_objs)
+ $(RISCV_LINK_MT) $(mm_c_objs) $(mm_riscv_objs) -o $(mm_riscv_bin) $(RISCV_LINK_OPTS)
+
+junk += $(mm_c_objs) $(mm_riscv_objs) \
+ $(mm_host_bin) $(mm_riscv_bin)
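With this fragment picked up by the top-level benchmarks/Makefile, the riscv build links the C objects against the multithreaded startup code (crt-mt.S) and the Hwacha kernels via $(RISCV_LINK_MT) to produce mm.riscv, while the host build compiles the C sources directly into mm.host; the exact targets and flags come from the enclosing Makefile, not this fragment.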
diff --git a/benchmarks/mm/common.h b/benchmarks/mm/common.h
new file mode 100644
index 0000000..f0e6709
--- /dev/null
+++ b/benchmarks/mm/common.h
@@ -0,0 +1,35 @@
+#ifndef _MM_H
+#define _MM_H
+
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#ifdef SP
+typedef float t;
+#define fma fmaf
+#else
+typedef double t;
+#endif
+
+#define inline inline __attribute__((always_inline))
+
+#define alloca_aligned(s, a) ((void*)(((uintptr_t)alloca((s)+(a)-1)+(a)-1)&~((a)-1)))
+
+#include "rb.h"
+#include "hwacha.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void mm(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc);
+
+#ifdef __cplusplus
+}
+#endif
+
+//void rb(t* a, t* b, t* c, size_t lda, size_t ldb, size_t ldc);
+
+#endif
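alloca_aligned over-allocates by a-1 bytes and rounds the pointer up to the next multiple of a, so a must be a power of two. A standalone demonstration of the same rounding, done on a heap buffer since alloca is frame-scoped:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Same arithmetic as alloca_aligned: allocate s+a-1 bytes, then
     * round up to the next a-byte boundary (a must be a power of two). */
    int main(void)
    {
      size_t s = 1000, a = 8192;
      void* raw = malloc(s + a - 1);
      void* aligned = (void*)(((uintptr_t)raw + a - 1) & ~((uintptr_t)a - 1));
      printf("raw %p -> aligned %p (mod %zu = %zu)\n",
             raw, aligned, a, (size_t)((uintptr_t)aligned % a));
      free(raw);
      return 0;
    }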
diff --git a/benchmarks/mm/gen.scala b/benchmarks/mm/gen.scala
new file mode 100644
index 0000000..2d3dc34
--- /dev/null
+++ b/benchmarks/mm/gen.scala
@@ -0,0 +1,81 @@
+import scala.sys.process._
+object MMGen {
+ implicit def i2s(i: Int) = i.toString
+ def writeFile(name: String, contents: String) = {
+ val f = new java.io.FileWriter(name)
+ f.write(contents)
+ f.close
+ }
+
+ var indent = 0
+ def spacing = " " * indent
+ def assign(lhs: String, rhs: String) =
+ spacing + lhs + " = " + rhs + ";\n"
+ def init(t: String, n: String, v: String) =
+ assign(t+" "+n, v)
+ def open_block(s: String = "") = {
+ val result = (if (s != "") spacing + s else "") + spacing + "{\n"
+ indent = indent + 1
+ result
+ }
+ def close_block = {
+ indent = indent - 1
+ spacing + "}\n"
+ }
+
+ def ar(m: String, i: String) = m+"["+i+"]"
+ def r(a: String, b: String*) = (a :: b.toList).reduceLeft(_+"_"+_)
+
+ def rb(m: Int, n: Int, p: Int) = {
+ var s = open_block("static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)\n")
+
+ for (i <- 0 until m)
+ s += init("t*", r("c", i), "&"+ar("c", "ldc*"+i))
+ for (i <- 0 until m; j <- 0 until n)
+ s += init("t", r("c", i, j), ar(r("c", i), j))
+
+ def doit(m: Int, n: Int, p: Int) = {
+ for (i <- 0 until m)
+ s += init("t*", r("a", i), "&"+ar("a", "lda*"+i))
+ for (k <- 0 until p)
+ s += init("t*", r("b", k), "&"+ar("b", "ldb*"+k))
+ for (i <- 0 until m; j <- 0 until n; k <- 0 until p)
+ s += assign(r("c", i, j), "fma(" + ar(r("a", i), k) + ", " + ar(r("b", k), j) + ", " + r("c", i, j) + ")")
+ }
+
+ s += open_block("for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)\n")
+ doit(m, n, p)
+ s += close_block
+
+ s += open_block("for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)\n")
+ doit(m, n, 1)
+ s += close_block
+
+ for (i <- 0 until m; j <- 0 until n)
+ s += assign(ar(r("c", i), j), r("c", i, j))
+ s += close_block
+
+ s
+ }
+ def gcd(a: Int, b: Int): Int = if (b == 0) a else gcd(b, a%b)
+ def lcm(a: Int, b: Int): Int = a*b/gcd(a, b)
+ def lcm(a: Seq[Int]): Int = {
+ if (a.tail.isEmpty) a.head
+ else lcm(a.head, lcm(a.tail))
+ }
+ def test1(m: Int, n: Int, p: Int, m1: Int, n1: Int, p1: Int) = {
+ val decl = "static const int RBM = "+m+", RBN = "+n+", RBK = "+p+";\n" +
+ "static const int CBM = "+m1+", CBN = "+n1+", CBK = "+p1+";\n"
+ writeFile("rb.h", decl + rb(m, n, p))
+ //"make"!!
+
+ "make run"!
+
+ ("cp a.out " + Seq("b", m, n, p, m1, n1, p1, "run").reduce(_+"."+_))!
+ }
+ def main(args: Array[String]): Unit = {
+ test1(4, 5, 6, 36, 35, 36)
+ //for (i <- 4 to 6; j <- 4 to 6; k <- 4 to 6)
+ // test1(i, j, k, if (i == 5) 35 else 36, if (j == 5) 35 else 36, if (k == 5) 35 else 36)
+ }
+}
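rb() emits a fully unrolled m x n x p register block: it hoists the C tile into scalars, generates one fma per (i, j, k) triple inside the main k loop, handles the k remainder with a second loop, and writes the scalars back. The call test1(4, 5, 6, 36, 35, 36) in main matches the constants at the top of rb.h below (RBM = 4, RBN = 5, RBK = 6; CBM = 36, CBN = 35, CBK = 36), so the checked-in rb.h appears to be this generator's output; the commented-out loop sweeps nearby block shapes.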
diff --git a/benchmarks/mm/hwacha.S b/benchmarks/mm/hwacha.S
new file mode 100644
index 0000000..e99303f
--- /dev/null
+++ b/benchmarks/mm/hwacha.S
@@ -0,0 +1,110 @@
+ .text
+ .align 2
+
+ .globl hwacha_mm_0
+hwacha_mm_0:
+ mxtf.d f19, x1
+ fmadd.d f0,f19,f20,f0
+ stop
+
+ .globl hwacha_mm_1
+hwacha_mm_1:
+ mxtf.d f18, x2
+ fmadd.d f1,f18,f20,f1
+ stop
+
+ .globl hwacha_mm_2
+hwacha_mm_2:
+ mxtf.d f19, x1
+ fmadd.d f2,f19,f20,f2
+ stop
+
+ .globl hwacha_mm_3
+hwacha_mm_3:
+ mxtf.d f18, x2
+ fmadd.d f3,f18,f20,f3
+ stop
+
+ .globl hwacha_mm_4
+hwacha_mm_4:
+ mxtf.d f19, x1
+ fmadd.d f4,f19,f20,f4
+ stop
+
+ .globl hwacha_mm_5
+hwacha_mm_5:
+ mxtf.d f18, x2
+ fmadd.d f5,f18,f20,f5
+ stop
+
+ .globl hwacha_mm_6
+hwacha_mm_6:
+ mxtf.d f19, x1
+ fmadd.d f6,f19,f20,f6
+ stop
+
+ .globl hwacha_mm_7
+hwacha_mm_7:
+ mxtf.d f18, x2
+ fmadd.d f7,f18,f20,f7
+ stop
+
+ .globl hwacha_mm_8
+hwacha_mm_8:
+ mxtf.d f19, x1
+ fmadd.d f8,f19,f20,f8
+ stop
+
+ .globl hwacha_mm_9
+hwacha_mm_9:
+ mxtf.d f18, x2
+ fmadd.d f9,f18,f20,f9
+ stop
+
+ .globl hwacha_mm_10
+hwacha_mm_10:
+ mxtf.d f19, x1
+ fmadd.d f10,f19,f20,f10
+ stop
+
+ .globl hwacha_mm_11
+hwacha_mm_11:
+ mxtf.d f18, x2
+ fmadd.d f11,f18,f20,f11
+ stop
+
+ .globl hwacha_mm_12
+hwacha_mm_12:
+ mxtf.d f19, x1
+ fmadd.d f12,f19,f20,f12
+ stop
+
+ .globl hwacha_mm_13
+hwacha_mm_13:
+ mxtf.d f18, x2
+ fmadd.d f13,f18,f20,f13
+ stop
+
+ .globl hwacha_mm_14
+hwacha_mm_14:
+ mxtf.d f19, x1
+ fmadd.d f14,f19,f20,f14
+ stop
+
+ .globl hwacha_mm_15
+hwacha_mm_15:
+ mxtf.d f18, x2
+ fmadd.d f15,f18,f20,f15
+ stop
+
+ .globl hwacha_mm_16
+hwacha_mm_16:
+ mxtf.d f19, x1
+ fmadd.d f16,f19,f20,f16
+ stop
+
+ .globl hwacha_mm_17
+hwacha_mm_17:
+ mxtf.d f18, x2
+ fmadd.d f17,f18,f20,f17
+ stop
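Each hwacha_mm_i block is a vector-fetch target: mxtf.d converts the scalar broadcast into it through vx1 or vx2 to a double, and fmadd.d accumulates one row of the product into that row's dedicated accumulator register (f0..f17, with f20 holding the current row of B). The blocks alternate between x1 and x2, presumably so the caller can write the next scalar into one register while the previous block still reads the other.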
diff --git a/benchmarks/mm/hwacha.h b/benchmarks/mm/hwacha.h
new file mode 100644
index 0000000..b042308
--- /dev/null
+++ b/benchmarks/mm/hwacha.h
@@ -0,0 +1,137 @@
+static const int HCBM = 18;
+static const int HCBN = 80;
+static const int HCBK = 16;
+
+static const int HRBM = 18;
+static const int HRBN = 80;
+static const int HRBK = 1;
+
+extern void hwacha_mm_0();
+extern void hwacha_mm_1();
+extern void hwacha_mm_2();
+extern void hwacha_mm_3();
+extern void hwacha_mm_4();
+extern void hwacha_mm_5();
+extern void hwacha_mm_6();
+extern void hwacha_mm_7();
+extern void hwacha_mm_8();
+extern void hwacha_mm_9();
+extern void hwacha_mm_10();
+extern void hwacha_mm_11();
+extern void hwacha_mm_12();
+extern void hwacha_mm_13();
+extern void hwacha_mm_14();
+extern void hwacha_mm_15();
+extern void hwacha_mm_16();
+extern void hwacha_mm_17();
+
+static inline void nloop(int s, int e, t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[0*lda]));
+ asm volatile ("vfld vf20, %[ptr]" : : [ptr]"r"(b) : );
+ if (s) asm volatile ("vfld vf0, %[ptr]" : : [ptr]"r"(&c[ldc*0]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_0) : );
+ if (e) asm volatile ("vfsd vf0, %[ptr]" : : [ptr]"r"(&c[ldc*0]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[1*lda]));
+ if (s) asm volatile ("vfld vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_1) : );
+ if (e) asm volatile ("vfsd vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[2*lda]));
+ if (s) asm volatile ("vfld vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_2) : );
+ if (e) asm volatile ("vfsd vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[3*lda]));
+ if (s) asm volatile ("vfld vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_3) : );
+ if (e) asm volatile ("vfsd vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[4*lda]));
+ if (s) asm volatile ("vfld vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_4) : );
+ if (e) asm volatile ("vfsd vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[5*lda]));
+ if (s) asm volatile ("vfld vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_5) : );
+ if (e) asm volatile ("vfsd vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[6*lda]));
+ if (s) asm volatile ("vfld vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_6) : );
+ if (e) asm volatile ("vfsd vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[7*lda]));
+ if (s) asm volatile ("vfld vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_7) : );
+ if (e) asm volatile ("vfsd vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[8*lda]));
+ if (s) asm volatile ("vfld vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_8) : );
+ if (e) asm volatile ("vfsd vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[9*lda]));
+ if (s) asm volatile ("vfld vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_9) : );
+ if (e) asm volatile ("vfsd vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[10*lda]));
+ if (s) asm volatile ("vfld vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_10) : );
+ if (e) asm volatile ("vfsd vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[11*lda]));
+ if (s) asm volatile ("vfld vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_11) : );
+ if (e) asm volatile ("vfsd vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[12*lda]));
+ if (s) asm volatile ("vfld vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_12) : );
+ if (e) asm volatile ("vfsd vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[13*lda]));
+ if (s) asm volatile ("vfld vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_13) : );
+ if (e) asm volatile ("vfsd vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[14*lda]));
+ if (s) asm volatile ("vfld vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_14) : );
+ if (e) asm volatile ("vfsd vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[15*lda]));
+ if (s) asm volatile ("vfld vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_15) : );
+ if (e) asm volatile ("vfsd vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : );
+
+ asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[16*lda]));
+ if (s) asm volatile ("vfld vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_16) : );
+ if (e) asm volatile ("vfsd vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : );
+
+ asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[17*lda]));
+ if (s) asm volatile ("vfld vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : );
+ asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_17) : );
+ if (e) asm volatile ("vfsd vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : );
+}
+
+static inline void mm_rb_hwacha(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ int vl;
+ asm volatile ("vvcfgivl %[gvl], %[nvl], 4, 21" : [gvl]"=r"(vl) : [nvl]"r"(n) : );
+
+ size_t mb = m/HRBM*HRBM, nk=p/HRBK*HRBK;
+
+ for (size_t i=0; i<mb; i+=HRBM)
+ {
+ for (size_t k=0; k<nk; k+=HRBK)
+ {
+ nloop(k==0, k==(p-1), a+i*lda+k, lda, b+k*ldb, ldb, c+i*ldc, ldc);
+ }
+ }
+}
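nloop's s and e flags implement load/store elision for the accumulators: the 18 C rows are loaded into vf0..vf17 only on the first k step (s) and written back only on the last (e), staying in vector registers in between; mm_rb_hwacha passes s = (k == 0) and e = (k == p-1). A scalar C analogue of that protocol, a sketch only (the real code keeps the accumulators in Hwacha vector registers, not an array):

    #include <stddef.h>

    /* Scalar analogue of nloop's s/e protocol for an 18-row block:
     * read c only when s, write it only when e, and keep the partial
     * sums in acc (standing in for vf0..vf17) otherwise. */
    static void nloop_scalar(int s, int e, const double* a, size_t lda,
                             const double* b, size_t n,
                             double* c, size_t ldc, double* acc)
    {
      for (int i = 0; i < 18; i++)
        for (size_t j = 0; j < n; j++)
        {
          double t = s ? c[i*ldc + j] : acc[i*n + j];
          t += a[i*lda] * b[j];             /* the fmadd.d in hwacha.S */
          if (e) c[i*ldc + j] = t; else acc[i*n + j] = t;
        }
    }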
diff --git a/benchmarks/mm/mm.c b/benchmarks/mm/mm.c
new file mode 100644
index 0000000..8abe8e6
--- /dev/null
+++ b/benchmarks/mm/mm.c
@@ -0,0 +1,150 @@
+#include "common.h"
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <alloca.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+static void mm_naive(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ for (size_t i = 0; i < m; i++)
+ {
+ for (size_t j = 0; j < n; j++)
+ {
+ t s0 = c[i*ldc+j], s1 = 0, s2 = 0, s3 = 0;
+ for (size_t k = 0; k < p/4*4; k+=4)
+ {
+ s0 = fma(a[i*lda+k+0], b[(k+0)*ldb+j], s0);
+ s1 = fma(a[i*lda+k+1], b[(k+1)*ldb+j], s1);
+ s2 = fma(a[i*lda+k+2], b[(k+2)*ldb+j], s2);
+ s3 = fma(a[i*lda+k+3], b[(k+3)*ldb+j], s3);
+ }
+ for (size_t k = p/4*4; k < p; k++)
+ s0 = fma(a[i*lda+k], b[k*ldb+j], s0);
+ c[i*ldc+j] = (s0 + s1) + (s2 + s3);
+ }
+ }
+}
+
+static inline void mm_rb(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ size_t mb = m/RBM*RBM, nb = n/RBN*RBN;
+ for (size_t i = 0; i < mb; i += RBM)
+ {
+ for (size_t j = 0; j < nb; j += RBN)
+ kloop(p, a+i*lda, lda, b+j, ldb, c+i*ldc+j, ldc);
+ mm_naive(RBM, n - nb, p, a+i*lda, lda, b+nb, ldb, c+i*ldc+nb, ldc);
+ }
+ mm_naive(m - mb, n, p, a+mb*lda, lda, b, ldb, c+mb*ldc, ldc);
+}
+
+static inline void repack(t* a, size_t lda, const t* a0, size_t lda0, size_t m, size_t p)
+{
+ for (size_t i = 0; i < m; i++)
+ {
+ for (size_t j = 0; j < p/8*8; j+=8)
+ {
+ t t0 = a0[i*lda0+j+0];
+ t t1 = a0[i*lda0+j+1];
+ t t2 = a0[i*lda0+j+2];
+ t t3 = a0[i*lda0+j+3];
+ t t4 = a0[i*lda0+j+4];
+ t t5 = a0[i*lda0+j+5];
+ t t6 = a0[i*lda0+j+6];
+ t t7 = a0[i*lda0+j+7];
+ a[i*lda+j+0] = t0;
+ a[i*lda+j+1] = t1;
+ a[i*lda+j+2] = t2;
+ a[i*lda+j+3] = t3;
+ a[i*lda+j+4] = t4;
+ a[i*lda+j+5] = t5;
+ a[i*lda+j+6] = t6;
+ a[i*lda+j+7] = t7;
+ }
+ for (size_t j = p/8*8; j < p; j++)
+ a[i*lda+j] = a0[i*lda0+j];
+ }
+}
+
+static void mm_cb(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ size_t nmb = m/CBM, nnb = n/CBN, npb = p/CBK;
+ size_t mb = nmb*CBM, nb = nnb*CBN, pb = npb*CBK;
+ //t a1[mb*pb], b1[pb*nb], c1[mb*nb];
+ t* a1 = (t*)alloca_aligned(sizeof(t)*mb*pb, 8192);
+ t* b1 = (t*)alloca_aligned(sizeof(t)*pb*nb, 8192);
+ t* c1 = (t*)alloca_aligned(sizeof(t)*mb*nb, 8192);
+
+ for (size_t i = 0; i < mb; i += CBM)
+ for (size_t j = 0; j < pb; j += CBK)
+ repack(a1 + (npb*(i/CBM) + j/CBK)*(CBM*CBK), CBK, a + i*lda + j, lda, CBM, CBK);
+
+ for (size_t i = 0; i < pb; i += CBK)
+ for (size_t j = 0; j < nb; j += CBN)
+ repack(b1 + (nnb*(i/CBK) + j/CBN)*(CBK*CBN), CBN, b + i*ldb + j, ldb, CBK, CBN);
+
+ for (size_t i = 0; i < mb; i += CBM)
+ for (size_t j = 0; j < nb; j += CBN)
+ repack(c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, c + i*ldc + j, ldc, CBM, CBN);
+
+ for (size_t i = 0; i < mb; i += CBM)
+ {
+ for (size_t j = 0; j < nb; j += CBN)
+ {
+ for (size_t k = 0; k < pb; k += CBK)
+ {
+ mm_rb(CBM, CBN, CBK,
+ a1 + (npb*(i/CBM) + k/CBK)*(CBM*CBK), CBK,
+ b1 + (nnb*(k/CBK) + j/CBN)*(CBK*CBN), CBN,
+ c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
+ }
+ if (pb < p)
+ {
+ mm_rb(CBM, CBN, p - pb,
+ a + i*lda + pb, lda,
+ b + pb*ldb + j, ldb,
+ c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
+ }
+ }
+ if (nb < n)
+ {
+ for (size_t k = 0; k < p; k += CBK)
+ {
+ mm_rb(CBM, n - nb, MIN(p - k, CBK),
+ a + i*lda + k, lda,
+ b + k*ldb + nb, ldb,
+ c + i*ldc + nb, ldc);
+ }
+ }
+ }
+ if (mb < m)
+ {
+ for (size_t j = 0; j < n; j += CBN)
+ {
+ for (size_t k = 0; k < p; k += CBK)
+ {
+ mm_rb(m - mb, MIN(n - j, CBN), MIN(p - k, CBK),
+ a + mb*lda + k, lda,
+ b + k*ldb + j, ldb,
+ c + mb*ldc + j, ldc);
+ }
+ }
+ }
+
+ for (size_t i = 0; i < mb; i += CBM)
+ for (size_t j = 0; j < nb; j += CBN)
+ repack(c + i*ldc + j, ldc, c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, CBM, CBN);
+}
+
+void mm(size_t m, size_t n, size_t p,
+ t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+ if (__builtin_expect(m <= 2*CBM && n <= 2*CBN && p <= 2*CBK, 1))
+ mm_rb(m, n, p, a, lda, b, ldb, c, ldc);
+ else
+ mm_cb(m, n, p, a, lda, b, ldb, c, ldc);
+}
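mm dispatches on problem size: matrices within twice the cache-block bounds go straight to the register-blocked mm_rb, while larger ones take mm_cb, which first repacks CBM x CBK, CBK x CBN, and CBM x CBN blocks of A, B, and C into contiguous 8 KiB-aligned scratch buffers so mm_rb can run over short, fixed leading dimensions. A standalone sketch of the packed-layout index arithmetic, with block sizes taken from rb.h and example dimensions chosen as multiples of them:

    #include <stdio.h>

    /* Where each CBM x CBK block of A lands in the packed a1 buffer:
     * block (bi, bk) sits at block index npb*bi + bk, i.e. blocks are
     * stored contiguously in row-major block order. */
    int main(void)
    {
      const int CBM = 36, CBK = 36;   /* cache-block sizes from rb.h */
      const int m = 108, p = 72;      /* hypothetical block-multiple dims */
      const int npb = p / CBK;
      for (int i = 0; i < m; i += CBM)
        for (int j = 0; j < p; j += CBK)
          printf("A block (%d,%d) -> a1 offset %d\n",
                 i/CBM, j/CBK, (npb*(i/CBM) + j/CBK)*(CBM*CBK));
      return 0;
    }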
diff --git a/benchmarks/mm/mm_main.c b/benchmarks/mm/mm_main.c
new file mode 100644
index 0000000..a4c04cc
--- /dev/null
+++ b/benchmarks/mm/mm_main.c
@@ -0,0 +1,82 @@
+#define ncores 2
+#include "common.h"
+#include <assert.h>
+#include <stdlib.h>
+#include "util.h"
+
+void thread_entry(int cid, int nc)
+{
+ const int R = 8;
+ int m, n, p;
+ int have_vec = 1;
+
+ if (have_vec) {
+ m = HCBM;
+ n = HCBN;
+ p = HCBK;
+ } else {
+ m = CBM;
+ n = CBN;
+ p = CBK;
+ }
+
+ t a[m*p];
+ t b[p*n];
+ t c[m*n];
+
+ for (size_t i = 0; i < m; i++)
+ for (size_t j = 0; j < p; j++)
+ a[i*p+j] = i+j;
+ for (size_t i = 0; i < p; i++)
+ for (size_t j = 0; j < n; j++)
+ b[i*n+j] = i-j;
+ memset(c, 0, m*n*sizeof(c[0]));
+
+ size_t instret, cycles;
+ if (have_vec) {
+ for (int i = 0; i < R; i++)
+ {
+ instret = -rdinstret();
+ cycles = -rdcycle();
+ mm_rb_hwacha(m, n, p, a, p, b, n, c, n);
+ instret += rdinstret();
+ cycles += rdcycle();
+ }
+ } else {
+ for (int i = 0; i < R; i++)
+ {
+ instret = -rdinstret();
+ cycles = -rdcycle();
+ mm(m, n, p, a, p, b, n, c, n);
+ instret += rdinstret();
+ cycles += rdcycle();
+ }
+ }
+
+ printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n",
+ cid, HRBM, HRBN, HRBK, HCBM, HCBN, HCBK);
+ printf("C%d: %d instructions\n", cid, (int)(instret));
+ printf("C%d: %d cycles\n", cid, (int)(cycles));
+ printf("C%d: %d flops\n", cid, 2*m*n*p);
+ printf("C%d: %d Mflops @ 1 GHz\n", cid, 2000*m*n*p/(cycles));
+
+#if 1
+ for (size_t i = 0; i < m; i++)
+ {
+ for (size_t j = 0; j < n; j++)
+ {
+ t s = 0;
+ for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++)
+ s += (t)aik*(t)bkj;
+ if (fabs(c[i*n+j]-s*R) > 1e-6*s)
+ {
+ printf("C%d: c[%lu][%lu] %u != %u\n", cid, i, j, (unsigned long)c[i*n+j], (unsigned long)s);
+ exit(1);
+ }
+ }
+ }
+#endif
+
+ //barrier(nc);
+ exit(0);
+}
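The self-check recomputes each entry from the closed forms of the inputs (a[i][k] = i+k, b[k][j] = k-j, evaluated with the same unsigned arithmetic as the fill loops) and compares against s*R, since the timed kernel runs R = 8 times and accumulates into c on every repetition.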
diff --git a/benchmarks/mm/rb.h b/benchmarks/mm/rb.h
new file mode 100644
index 0000000..c5d5890
--- /dev/null
+++ b/benchmarks/mm/rb.h
@@ -0,0 +1,210 @@
+static const int RBM = 4, RBN = 5, RBK = 6;
+static const int CBM = 36, CBN = 35, CBK = 36;
+static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
+{
+ t* c_0 = &c[ldc*0];
+ t* c_1 = &c[ldc*1];
+ t* c_2 = &c[ldc*2];
+ t* c_3 = &c[ldc*3];
+ t c_0_0 = c_0[0];
+ t c_0_1 = c_0[1];
+ t c_0_2 = c_0[2];
+ t c_0_3 = c_0[3];
+ t c_0_4 = c_0[4];
+ t c_1_0 = c_1[0];
+ t c_1_1 = c_1[1];
+ t c_1_2 = c_1[2];
+ t c_1_3 = c_1[3];
+ t c_1_4 = c_1[4];
+ t c_2_0 = c_2[0];
+ t c_2_1 = c_2[1];
+ t c_2_2 = c_2[2];
+ t c_2_3 = c_2[3];
+ t c_2_4 = c_2[4];
+ t c_3_0 = c_3[0];
+ t c_3_1 = c_3[1];
+ t c_3_2 = c_3[2];
+ t c_3_3 = c_3[3];
+ t c_3_4 = c_3[4];
+ for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
+ {
+ t* a_0 = &a[lda*0];
+ t* a_1 = &a[lda*1];
+ t* a_2 = &a[lda*2];
+ t* a_3 = &a[lda*3];
+ t* b_0 = &b[ldb*0];
+ t* b_1 = &b[ldb*1];
+ t* b_2 = &b[ldb*2];
+ t* b_3 = &b[ldb*3];
+ t* b_4 = &b[ldb*4];
+ t* b_5 = &b[ldb*5];
+ c_0_0 = fma(a_0[0], b_0[0], c_0_0);
+ c_0_0 = fma(a_0[1], b_1[0], c_0_0);
+ c_0_0 = fma(a_0[2], b_2[0], c_0_0);
+ c_0_0 = fma(a_0[3], b_3[0], c_0_0);
+ c_0_0 = fma(a_0[4], b_4[0], c_0_0);
+ c_0_0 = fma(a_0[5], b_5[0], c_0_0);
+ c_0_1 = fma(a_0[0], b_0[1], c_0_1);
+ c_0_1 = fma(a_0[1], b_1[1], c_0_1);
+ c_0_1 = fma(a_0[2], b_2[1], c_0_1);
+ c_0_1 = fma(a_0[3], b_3[1], c_0_1);
+ c_0_1 = fma(a_0[4], b_4[1], c_0_1);
+ c_0_1 = fma(a_0[5], b_5[1], c_0_1);
+ c_0_2 = fma(a_0[0], b_0[2], c_0_2);
+ c_0_2 = fma(a_0[1], b_1[2], c_0_2);
+ c_0_2 = fma(a_0[2], b_2[2], c_0_2);
+ c_0_2 = fma(a_0[3], b_3[2], c_0_2);
+ c_0_2 = fma(a_0[4], b_4[2], c_0_2);
+ c_0_2 = fma(a_0[5], b_5[2], c_0_2);
+ c_0_3 = fma(a_0[0], b_0[3], c_0_3);
+ c_0_3 = fma(a_0[1], b_1[3], c_0_3);
+ c_0_3 = fma(a_0[2], b_2[3], c_0_3);
+ c_0_3 = fma(a_0[3], b_3[3], c_0_3);
+ c_0_3 = fma(a_0[4], b_4[3], c_0_3);
+ c_0_3 = fma(a_0[5], b_5[3], c_0_3);
+ c_0_4 = fma(a_0[0], b_0[4], c_0_4);
+ c_0_4 = fma(a_0[1], b_1[4], c_0_4);
+ c_0_4 = fma(a_0[2], b_2[4], c_0_4);
+ c_0_4 = fma(a_0[3], b_3[4], c_0_4);
+ c_0_4 = fma(a_0[4], b_4[4], c_0_4);
+ c_0_4 = fma(a_0[5], b_5[4], c_0_4);
+ c_1_0 = fma(a_1[0], b_0[0], c_1_0);
+ c_1_0 = fma(a_1[1], b_1[0], c_1_0);
+ c_1_0 = fma(a_1[2], b_2[0], c_1_0);
+ c_1_0 = fma(a_1[3], b_3[0], c_1_0);
+ c_1_0 = fma(a_1[4], b_4[0], c_1_0);
+ c_1_0 = fma(a_1[5], b_5[0], c_1_0);
+ c_1_1 = fma(a_1[0], b_0[1], c_1_1);
+ c_1_1 = fma(a_1[1], b_1[1], c_1_1);
+ c_1_1 = fma(a_1[2], b_2[1], c_1_1);
+ c_1_1 = fma(a_1[3], b_3[1], c_1_1);
+ c_1_1 = fma(a_1[4], b_4[1], c_1_1);
+ c_1_1 = fma(a_1[5], b_5[1], c_1_1);
+ c_1_2 = fma(a_1[0], b_0[2], c_1_2);
+ c_1_2 = fma(a_1[1], b_1[2], c_1_2);
+ c_1_2 = fma(a_1[2], b_2[2], c_1_2);
+ c_1_2 = fma(a_1[3], b_3[2], c_1_2);
+ c_1_2 = fma(a_1[4], b_4[2], c_1_2);
+ c_1_2 = fma(a_1[5], b_5[2], c_1_2);
+ c_1_3 = fma(a_1[0], b_0[3], c_1_3);
+ c_1_3 = fma(a_1[1], b_1[3], c_1_3);
+ c_1_3 = fma(a_1[2], b_2[3], c_1_3);
+ c_1_3 = fma(a_1[3], b_3[3], c_1_3);
+ c_1_3 = fma(a_1[4], b_4[3], c_1_3);
+ c_1_3 = fma(a_1[5], b_5[3], c_1_3);
+ c_1_4 = fma(a_1[0], b_0[4], c_1_4);
+ c_1_4 = fma(a_1[1], b_1[4], c_1_4);
+ c_1_4 = fma(a_1[2], b_2[4], c_1_4);
+ c_1_4 = fma(a_1[3], b_3[4], c_1_4);
+ c_1_4 = fma(a_1[4], b_4[4], c_1_4);
+ c_1_4 = fma(a_1[5], b_5[4], c_1_4);
+ c_2_0 = fma(a_2[0], b_0[0], c_2_0);
+ c_2_0 = fma(a_2[1], b_1[0], c_2_0);
+ c_2_0 = fma(a_2[2], b_2[0], c_2_0);
+ c_2_0 = fma(a_2[3], b_3[0], c_2_0);
+ c_2_0 = fma(a_2[4], b_4[0], c_2_0);
+ c_2_0 = fma(a_2[5], b_5[0], c_2_0);
+ c_2_1 = fma(a_2[0], b_0[1], c_2_1);
+ c_2_1 = fma(a_2[1], b_1[1], c_2_1);
+ c_2_1 = fma(a_2[2], b_2[1], c_2_1);
+ c_2_1 = fma(a_2[3], b_3[1], c_2_1);
+ c_2_1 = fma(a_2[4], b_4[1], c_2_1);
+ c_2_1 = fma(a_2[5], b_5[1], c_2_1);
+ c_2_2 = fma(a_2[0], b_0[2], c_2_2);
+ c_2_2 = fma(a_2[1], b_1[2], c_2_2);
+ c_2_2 = fma(a_2[2], b_2[2], c_2_2);
+ c_2_2 = fma(a_2[3], b_3[2], c_2_2);
+ c_2_2 = fma(a_2[4], b_4[2], c_2_2);
+ c_2_2 = fma(a_2[5], b_5[2], c_2_2);
+ c_2_3 = fma(a_2[0], b_0[3], c_2_3);
+ c_2_3 = fma(a_2[1], b_1[3], c_2_3);
+ c_2_3 = fma(a_2[2], b_2[3], c_2_3);
+ c_2_3 = fma(a_2[3], b_3[3], c_2_3);
+ c_2_3 = fma(a_2[4], b_4[3], c_2_3);
+ c_2_3 = fma(a_2[5], b_5[3], c_2_3);
+ c_2_4 = fma(a_2[0], b_0[4], c_2_4);
+ c_2_4 = fma(a_2[1], b_1[4], c_2_4);
+ c_2_4 = fma(a_2[2], b_2[4], c_2_4);
+ c_2_4 = fma(a_2[3], b_3[4], c_2_4);
+ c_2_4 = fma(a_2[4], b_4[4], c_2_4);
+ c_2_4 = fma(a_2[5], b_5[4], c_2_4);
+ c_3_0 = fma(a_3[0], b_0[0], c_3_0);
+ c_3_0 = fma(a_3[1], b_1[0], c_3_0);
+ c_3_0 = fma(a_3[2], b_2[0], c_3_0);
+ c_3_0 = fma(a_3[3], b_3[0], c_3_0);
+ c_3_0 = fma(a_3[4], b_4[0], c_3_0);
+ c_3_0 = fma(a_3[5], b_5[0], c_3_0);
+ c_3_1 = fma(a_3[0], b_0[1], c_3_1);
+ c_3_1 = fma(a_3[1], b_1[1], c_3_1);
+ c_3_1 = fma(a_3[2], b_2[1], c_3_1);
+ c_3_1 = fma(a_3[3], b_3[1], c_3_1);
+ c_3_1 = fma(a_3[4], b_4[1], c_3_1);
+ c_3_1 = fma(a_3[5], b_5[1], c_3_1);
+ c_3_2 = fma(a_3[0], b_0[2], c_3_2);
+ c_3_2 = fma(a_3[1], b_1[2], c_3_2);
+ c_3_2 = fma(a_3[2], b_2[2], c_3_2);
+ c_3_2 = fma(a_3[3], b_3[2], c_3_2);
+ c_3_2 = fma(a_3[4], b_4[2], c_3_2);
+ c_3_2 = fma(a_3[5], b_5[2], c_3_2);
+ c_3_3 = fma(a_3[0], b_0[3], c_3_3);
+ c_3_3 = fma(a_3[1], b_1[3], c_3_3);
+ c_3_3 = fma(a_3[2], b_2[3], c_3_3);
+ c_3_3 = fma(a_3[3], b_3[3], c_3_3);
+ c_3_3 = fma(a_3[4], b_4[3], c_3_3);
+ c_3_3 = fma(a_3[5], b_5[3], c_3_3);
+ c_3_4 = fma(a_3[0], b_0[4], c_3_4);
+ c_3_4 = fma(a_3[1], b_1[4], c_3_4);
+ c_3_4 = fma(a_3[2], b_2[4], c_3_4);
+ c_3_4 = fma(a_3[3], b_3[4], c_3_4);
+ c_3_4 = fma(a_3[4], b_4[4], c_3_4);
+ c_3_4 = fma(a_3[5], b_5[4], c_3_4);
+ }
+ for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
+ {
+ t* a_0 = &a[lda*0];
+ t* a_1 = &a[lda*1];
+ t* a_2 = &a[lda*2];
+ t* a_3 = &a[lda*3];
+ t* b_0 = &b[ldb*0];
+ c_0_0 = fma(a_0[0], b_0[0], c_0_0);
+ c_0_1 = fma(a_0[0], b_0[1], c_0_1);
+ c_0_2 = fma(a_0[0], b_0[2], c_0_2);
+ c_0_3 = fma(a_0[0], b_0[3], c_0_3);
+ c_0_4 = fma(a_0[0], b_0[4], c_0_4);
+ c_1_0 = fma(a_1[0], b_0[0], c_1_0);
+ c_1_1 = fma(a_1[0], b_0[1], c_1_1);
+ c_1_2 = fma(a_1[0], b_0[2], c_1_2);
+ c_1_3 = fma(a_1[0], b_0[3], c_1_3);
+ c_1_4 = fma(a_1[0], b_0[4], c_1_4);
+ c_2_0 = fma(a_2[0], b_0[0], c_2_0);
+ c_2_1 = fma(a_2[0], b_0[1], c_2_1);
+ c_2_2 = fma(a_2[0], b_0[2], c_2_2);
+ c_2_3 = fma(a_2[0], b_0[3], c_2_3);
+ c_2_4 = fma(a_2[0], b_0[4], c_2_4);
+ c_3_0 = fma(a_3[0], b_0[0], c_3_0);
+ c_3_1 = fma(a_3[0], b_0[1], c_3_1);
+ c_3_2 = fma(a_3[0], b_0[2], c_3_2);
+ c_3_3 = fma(a_3[0], b_0[3], c_3_3);
+ c_3_4 = fma(a_3[0], b_0[4], c_3_4);
+ }
+ c_0[0] = c_0_0;
+ c_0[1] = c_0_1;
+ c_0[2] = c_0_2;
+ c_0[3] = c_0_3;
+ c_0[4] = c_0_4;
+ c_1[0] = c_1_0;
+ c_1[1] = c_1_1;
+ c_1[2] = c_1_2;
+ c_1[3] = c_1_3;
+ c_1[4] = c_1_4;
+ c_2[0] = c_2_0;
+ c_2[1] = c_2_1;
+ c_2[2] = c_2_2;
+ c_2[3] = c_2_3;
+ c_2[4] = c_2_4;
+ c_3[0] = c_3_0;
+ c_3[1] = c_3_1;
+ c_3[2] = c_3_2;
+ c_3[3] = c_3_3;
+ c_3[4] = c_3_4;
+}