From f4203e9e56f882af52960d15b98ab881712f776d Mon Sep 17 00:00:00 2001
From: Yunsup Lee <yunsup@cs.berkeley.edu>
Date: Mon, 14 Apr 2014 22:46:46 -0700
Subject: add mm benchmark for eos20

---
 benchmarks/Makefile          |   4 +-
 benchmarks/common/crt-mt.S   |   6 +-
 benchmarks/common/syscalls.c |  10 +--
 benchmarks/mm/bmark.mk       |  32 +++
 benchmarks/mm/common.h       |  35 ++++++
 benchmarks/mm/gen.scala      |  81 +++++++++++++++++
 benchmarks/mm/hwacha.S       | 110 +++++++++++++++++++++++
 benchmarks/mm/hwacha.h       | 137 ++++++++++++++++++++++++++++
 benchmarks/mm/mm.c           | 150 +++++++++++++++++++++++++++++++
 benchmarks/mm/mm_main.c      |  82 +++++++++++++++++
 benchmarks/mm/rb.h           | 210 +++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 847 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/mm/bmark.mk
 create mode 100644 benchmarks/mm/common.h
 create mode 100644 benchmarks/mm/gen.scala
 create mode 100644 benchmarks/mm/hwacha.S
 create mode 100644 benchmarks/mm/hwacha.h
 create mode 100644 benchmarks/mm/mm.c
 create mode 100644 benchmarks/mm/mm_main.c
 create mode 100644 benchmarks/mm/rb.h

diff --git a/benchmarks/Makefile b/benchmarks/Makefile
index cc8e881..6007a45 100644
--- a/benchmarks/Makefile
+++ b/benchmarks/Makefile
@@ -21,7 +21,7 @@ bmarks = \
 	towers \
 	vvadd \
 	multiply \
-	dgemm \
+	mm \
 	dhrystone \
 	spmv \
 	vec-vvadd \
@@ -36,7 +36,7 @@ bmarks_host = \
 	towers \
 	vvadd \
 	multiply \
-	dgemm \
+	mm \
 	spmv \
 	vec-vvadd \
 	vec-cmplxmult \

diff --git a/benchmarks/common/crt-mt.S b/benchmarks/common/crt-mt.S
index 283b3bf..7f35f9b 100644
--- a/benchmarks/common/crt-mt.S
+++ b/benchmarks/common/crt-mt.S
@@ -96,7 +96,7 @@ _start:
   mfpcr a0,cr10
   lw a1, 4(zero)
-  slli a2, a0, 13
+  slli a2, a0, 16
   la sp, stacktop
   sub sp, sp, a2
 
@@ -110,7 +110,7 @@ _start:
 
   .globl tlstop
   .align 4
-  .skip 32768
+  .skip 131072
 stacktop:
-  .skip 65536
+  .skip 131072
 tlstop:

diff --git a/benchmarks/common/syscalls.c b/benchmarks/common/syscalls.c
index f95dde4..a882462 100644
--- a/benchmarks/common/syscalls.c
+++ b/benchmarks/common/syscalls.c
@@ -6,7 +6,7 @@
 void exit(int code)
 {
   volatile uint64_t magic_mem[8] = {0};
-  magic_mem[0] = 1;
+  magic_mem[0] = 93;
   magic_mem[1] = code;
   __sync_synchronize();
   mtpcr(PCR_TOHOST, (long)magic_mem);
@@ -16,7 +16,7 @@ void exit(int code)
 void printstr(const char* s)
 {
   volatile uint64_t magic_mem[8] = {0};
-  magic_mem[0] = 4;
+  magic_mem[0] = 64;
   magic_mem[1] = 1;
   magic_mem[2] = (unsigned long)s;
   magic_mem[3] = strlen(s);
@@ -28,8 +28,8 @@ void printstr(const char* s)
 int putchar(int ch)
 {
 #define buffered_putch_bufsize 64
-  static char buf[buffered_putch_bufsize];
-  static int buflen = 0;
+  static __thread char buf[buffered_putch_bufsize];
+  static __thread int buflen = 0;
 
   if(ch != -1)
     buf[buflen++] = ch;
@@ -37,7 +37,7 @@ int putchar(int ch)
   if(ch == -1 || buflen == buffered_putch_bufsize)
   {
     volatile uint64_t magic_mem[8] = {0};
-    magic_mem[0] = 4;
+    magic_mem[0] = 64;
     magic_mem[1] = 1;
     magic_mem[2] = (long)buf;
     magic_mem[3] = buflen;
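The syscalls.c hunks above renumber the HTIF syscall packet: 93 and 64 are the RISC-V Linux syscall numbers for exit and write, replacing the legacy numbers 1 and 4, so the front-end server presumably expects the newer numbering after this change. All three call sites follow one convention, which a hypothetical wrapper makes explicit (the name and factoring are mine, not in the patch):

    static void htif_syscall(uint64_t num, uint64_t arg0, uint64_t arg1, uint64_t arg2)
    {
      volatile uint64_t magic_mem[8] = {0};
      magic_mem[0] = num;    /* 93 = exit, 64 = write (RISC-V Linux numbering) */
      magic_mem[1] = arg0;   /* exit code, or fd for write */
      magic_mem[2] = arg1;   /* buffer pointer for write */
      magic_mem[3] = arg2;   /* buffer length for write */
      __sync_synchronize();  /* order the stores before signaling the host */
      mtpcr(PCR_TOHOST, (long)magic_mem);
    }

The putchar change is independent: making buf and buflen __thread keeps the cores (mm_main.c below sets ncores to 2) from interleaving bytes in one shared line buffer.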
diff --git a/benchmarks/mm/bmark.mk b/benchmarks/mm/bmark.mk
new file mode 100644
index 0000000..a446713
--- /dev/null
+++ b/benchmarks/mm/bmark.mk
@@ -0,0 +1,32 @@
+#=======================================================================
+# UCB CS250 Makefile fragment for benchmarks
+#-----------------------------------------------------------------------
+#
+# Each benchmark directory should have its own fragment which
+# essentially lists what the source files are and how to link them
+# into a riscv and/or host executable. All variables should include
+# the benchmark name as a prefix so that they are unique.
+#
+
+mm_c_src = \
+	mm_main.c \
+	mm.c \
+	syscalls.c \
+
+mm_riscv_src = \
+	hwacha.S \
+	crt-mt.S \
+
+mm_c_objs = $(patsubst %.c, %.o, $(mm_c_src))
+mm_riscv_objs = $(patsubst %.S, %.o, $(mm_riscv_src))
+
+mm_host_bin = mm.host
+$(mm_host_bin) : $(mm_c_src)
+	$(HOST_COMP) $^ -o $(mm_host_bin)
+
+mm_riscv_bin = mm.riscv
+$(mm_riscv_bin) : $(mm_c_objs) $(mm_riscv_objs)
+	$(RISCV_LINK_MT) $(mm_c_objs) $(mm_riscv_objs) -o $(mm_riscv_bin) $(RISCV_LINK_OPTS)
+
+junk += $(mm_c_objs) $(mm_riscv_objs) \
+	$(mm_host_bin) $(mm_riscv_bin)

diff --git a/benchmarks/mm/common.h b/benchmarks/mm/common.h
new file mode 100644
index 0000000..f0e6709
--- /dev/null
+++ b/benchmarks/mm/common.h
@@ -0,0 +1,35 @@
+#ifndef _MM_H
+#define _MM_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+
+#ifdef SP
+typedef float t;
+#define fma fmaf
+#else
+typedef double t;
+#endif
+
+#define inline inline __attribute__((always_inline))
+
+#define alloca_aligned(s, a) ((void*)(((uintptr_t)alloca((s)+(a)-1)+(a)-1)&~((a)-1)))
+
+#include "rb.h"
+#include "hwacha.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void mm(size_t m, size_t n, size_t p,
+        t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc);
+
+#ifdef __cplusplus
+}
+#endif
+
+//void rb(t* a, t* b, t* c, size_t lda, size_t ldb, size_t ldc);
+
+#endif
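common.h's alloca_aligned over-allocates by a-1 bytes and rounds the raw pointer up to the next multiple of the alignment, which only works when a is a power of two. A minimal hypothetical use (names mine), with common.h in scope:

    #include <alloca.h>   /* assumption: some libcs declare alloca() here */

    void demo(void)
    {
      /* 8192-byte alignment, the same value mm.c below uses for its repack buffers */
      t* buf = (t*)alloca_aligned(512 * sizeof(t), 8192);
      /* ((uintptr_t)buf & 8191) == 0: adding a-1 and then clearing the
         low bits rounds the alloca() result up to an a-byte boundary */
    }

Note also the #define of inline: every helper marked inline in this benchmark is force-inlined via __attribute__((always_inline)).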
diff --git a/benchmarks/mm/gen.scala b/benchmarks/mm/gen.scala
new file mode 100644
index 0000000..2d3dc34
--- /dev/null
+++ b/benchmarks/mm/gen.scala
@@ -0,0 +1,81 @@
+import scala.sys.process._
+object MMGen {
+  implicit def i2s(i: Int) = i.toString
+  def writeFile(name: String, contents: String) = {
+    val f = new java.io.FileWriter(name)
+    f.write(contents)
+    f.close
+  }
+
+  var indent = 0
+  def spacing = "  " * indent
+  def assign(lhs: String, rhs: String) =
+    spacing + lhs + " = " + rhs + ";\n"
+  def init(t: String, n: String, v: String) =
+    assign(t+" "+n, v)
+  def open_block(s: String = "") = {
+    val result = (if (s != "") spacing + s else "") + spacing + "{\n"
+    indent = indent + 1
+    result
+  }
+  def close_block = {
+    indent = indent - 1
+    spacing + "}\n"
+  }
+
+  def ar(m: String, i: String) = m+"["+i+"]"
+  def r(a: String, b: String*) = (a :: b.toList).reduceLeft(_+"_"+_)
+
+  def rb(m: Int, n: Int, p: Int) = {
+    var s = open_block("static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)\n")
+
+    for (i <- 0 until m)
+      s += init("t*", r("c", i), "&"+ar("c", "ldc*"+i))
+    for (i <- 0 until m; j <- 0 until n)
+      s += init("t", r("c", i, j), ar(r("c", i), j))
+
+    def doit(m: Int, n: Int, p: Int) = {
+      for (i <- 0 until m)
+        s += init("t*", r("a", i), "&"+ar("a", "lda*"+i))
+      for (k <- 0 until p)
+        s += init("t*", r("b", k), "&"+ar("b", "ldb*"+k))
+      for (i <- 0 until m; j <- 0 until n; k <- 0 until p)
+        s += assign(r("c", i, j), "fma(" + ar(r("a", i), k) + ", " + ar(r("b", k), j) + ", " + r("c", i, j) + ")")
+    }
+
+    s += open_block("for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)\n")
+    doit(m, n, p)
+    s += close_block
+
+    s += open_block("for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)\n")
+    doit(m, n, 1)
+    s += close_block
+
+    for (i <- 0 until m; j <- 0 until n)
+      s += assign(ar(r("c", i), j), r("c", i, j))
+    s += close_block
+
+    s
+  }
+  def gcd(a: Int, b: Int): Int = if (b == 0) a else gcd(b, a%b)
+  def lcm(a: Int, b: Int): Int = a*b/gcd(a, b)
+  def lcm(a: Seq[Int]): Int = {
+    if (a.tail.isEmpty) a.head
+    else lcm(a.head, lcm(a.tail))
+  }
+  def test1(m: Int, n: Int, p: Int, m1: Int, n1: Int, p1: Int) = {
+    val decl = "static const int RBM = "+m+", RBN = "+n+", RBK = "+p+";\n" +
+               "static const int CBM = "+m1+", CBN = "+n1+", CBK = "+p1+";\n"
+    writeFile("rb.h", decl + rb(m, n, p))
+    //"make"!!
+
+    "make run"!
+
+    ("cp a.out " + Seq("b", m, n, p, m1, n1, p1, "run").reduce(_+"."+_))!
+  }
+  def main(args: Array[String]): Unit = {
+    test1(4, 5, 6, 36, 35, 36)
+    //for (i <- 4 to 6; j <- 4 to 6; k <- 4 to 6)
+    //  test1(i, j, k, if (i == 5) 35 else 36, if (j == 5) 35 else 36, if (k == 5) 35 else 36)
+  }
+}
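gen.scala is the generator for rb.h: each (i, j, k) triple in doit emits one `c_i_j = fma(a_i[k], b_k[j], c_i_j);` statement, and test1(4, 5, 6, 36, 35, 36) in main writes exactly the rb.h that closes this patch, then shells out to `make run`. For a feel of the output shape, rb(1, 1, 2) would emit the following (derived by hand from the builders above; the accompanying header line would set RBK = 2):

    static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
    {
      t* c_0 = &c[ldc*0];
      t c_0_0 = c_0[0];
      for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
      {
        t* a_0 = &a[lda*0];
        t* b_0 = &b[ldb*0];
        t* b_1 = &b[ldb*1];
        c_0_0 = fma(a_0[0], b_0[0], c_0_0);   /* k = 0 */
        c_0_0 = fma(a_0[1], b_1[0], c_0_0);   /* k = 1 */
      }
      for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
      {
        t* a_0 = &a[lda*0];
        t* b_0 = &b[ldb*0];
        c_0_0 = fma(a_0[0], b_0[0], c_0_0);   /* remainder of the k loop */
      }
      c_0[0] = c_0_0;
    }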
%[ptr]" : : [ptr]"r"(&c[ldc*0]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[1*lda])); + if (s) asm volatile ("vfld vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_1) : ); + if (e) asm volatile ("vfsd vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[2*lda])); + if (s) asm volatile ("vfld vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_2) : ); + if (e) asm volatile ("vfsd vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[3*lda])); + if (s) asm volatile ("vfld vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_3) : ); + if (e) asm volatile ("vfsd vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[4*lda])); + if (s) asm volatile ("vfld vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_4) : ); + if (e) asm volatile ("vfsd vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[5*lda])); + if (s) asm volatile ("vfld vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_5) : ); + if (e) asm volatile ("vfsd vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[6*lda])); + if (s) asm volatile ("vfld vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_6) : ); + if (e) asm volatile ("vfsd vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[7*lda])); + if (s) asm volatile ("vfld vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_7) : ); + if (e) asm volatile ("vfsd vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[8*lda])); + if (s) asm volatile ("vfld vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_8) : ); + if (e) asm volatile ("vfsd vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[9*lda])); + if (s) asm volatile ("vfld vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_9) : ); + if (e) asm volatile ("vfsd vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[10*lda])); + if (s) asm volatile ("vfld vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_10) : ); + if (e) asm volatile ("vfsd vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[11*lda])); + if (s) asm volatile ("vfld vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_11) : ); + if (e) asm volatile ("vfsd vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[12*lda])); + if (s) asm volatile ("vfld vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_12) : ); + if (e) asm volatile ("vfsd vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[13*lda])); + if (s) asm volatile ("vfld vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_13) : ); + if (e) asm 
volatile ("vfsd vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[14*lda])); + if (s) asm volatile ("vfld vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_14) : ); + if (e) asm volatile ("vfsd vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[15*lda])); + if (s) asm volatile ("vfld vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_15) : ); + if (e) asm volatile ("vfsd vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : ); + + asm volatile ("vmsv vx1, %[ptr]" : : [ptr]"r"(a[16*lda])); + if (s) asm volatile ("vfld vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_16) : ); + if (e) asm volatile ("vfsd vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : ); + + asm volatile ("vmsv vx2, %[ptr]" : : [ptr]"r"(a[17*lda])); + if (s) asm volatile ("vfld vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_17) : ); + if (e) asm volatile ("vfsd vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : ); +} + +static inline void mm_rb_hwacha(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + int vl; + asm volatile ("vvcfgivl %[gvl], %[nvl], 4, 21" : [gvl]"=r"(vl) : [nvl]"r"(n) : ); + + size_t mb = m/HRBM*HRBM, nk=p/HRBK*HRBK; + + for (size_t i=0; i +#include +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +static void mm_naive(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < n; j++) + { + t s0 = c[i*ldc+j], s1 = 0, s2 = 0, s3 = 0; + for (size_t k = 0; k < p/4*4; k+=4) + { + s0 = fma(a[i*lda+k+0], b[(k+0)*ldb+j], s0); + s1 = fma(a[i*lda+k+1], b[(k+1)*ldb+j], s1); + s2 = fma(a[i*lda+k+2], b[(k+2)*ldb+j], s2); + s3 = fma(a[i*lda+k+3], b[(k+3)*ldb+j], s3); + } + for (size_t k = p/4*4; k < p; k++) + s0 = fma(a[i*lda+k], b[k*ldb+j], s0); + c[i*ldc+j] = (s0 + s1) + (s2 + s3); + } + } +} + +static inline void mm_rb(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + size_t mb = m/RBM*RBM, nb = n/RBN*RBN; + for (size_t i = 0; i < mb; i += RBM) + { + for (size_t j = 0; j < nb; j += RBN) + kloop(p, a+i*lda, lda, b+j, ldb, c+i*ldc+j, ldc); + mm_naive(RBM, n - nb, p, a+i*lda, lda, b+nb, ldb, c+i*ldc+nb, ldc); + } + mm_naive(m - mb, n, p, a+mb*lda, lda, b, ldb, c+mb*ldc, ldc); +} + +static inline void repack(t* a, size_t lda, const t* a0, size_t lda0, size_t m, size_t p) +{ + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < p/8*8; j+=8) + { + t t0 = a0[i*lda0+j+0]; + t t1 = a0[i*lda0+j+1]; + t t2 = a0[i*lda0+j+2]; + t t3 = a0[i*lda0+j+3]; + t t4 = a0[i*lda0+j+4]; + t t5 = a0[i*lda0+j+5]; + t t6 = a0[i*lda0+j+6]; + t t7 = a0[i*lda0+j+7]; + a[i*lda+j+0] = t0; + a[i*lda+j+1] = t1; + a[i*lda+j+2] = t2; + a[i*lda+j+3] = t3; + a[i*lda+j+4] = t4; + a[i*lda+j+5] = t5; + a[i*lda+j+6] = t6; + a[i*lda+j+7] = t7; + } + for (size_t j = p/8*8; j < p; j++) + a[i*lda+j] = a0[i*lda0+j]; + } +} + +static void mm_cb(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + size_t nmb = m/CBM, nnb = n/CBN, npb = p/CBK; + size_t mb = nmb*CBM, nb = nnb*CBN, pb = npb*CBK; + //t a1[mb*pb], b1[pb*nb], c1[mb*nb]; + t* a1 = (t*)alloca_aligned(sizeof(t)*mb*pb, 8192); + t* b1 = (t*)alloca_aligned(sizeof(t)*pb*nb, 8192); + t* c1 = 
diff --git a/benchmarks/mm/mm.c b/benchmarks/mm/mm.c
new file mode 100644
--- /dev/null
+++ b/benchmarks/mm/mm.c
@@ -0,0 +1,150 @@
+#include "common.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+static void mm_naive(size_t m, size_t n, size_t p,
+  t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+  for (size_t i = 0; i < m; i++)
+  {
+    for (size_t j = 0; j < n; j++)
+    {
+      t s0 = c[i*ldc+j], s1 = 0, s2 = 0, s3 = 0;
+      for (size_t k = 0; k < p/4*4; k+=4)
+      {
+        s0 = fma(a[i*lda+k+0], b[(k+0)*ldb+j], s0);
+        s1 = fma(a[i*lda+k+1], b[(k+1)*ldb+j], s1);
+        s2 = fma(a[i*lda+k+2], b[(k+2)*ldb+j], s2);
+        s3 = fma(a[i*lda+k+3], b[(k+3)*ldb+j], s3);
+      }
+      for (size_t k = p/4*4; k < p; k++)
+        s0 = fma(a[i*lda+k], b[k*ldb+j], s0);
+      c[i*ldc+j] = (s0 + s1) + (s2 + s3);
+    }
+  }
+}
+
+static inline void mm_rb(size_t m, size_t n, size_t p,
+  t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+  size_t mb = m/RBM*RBM, nb = n/RBN*RBN;
+  for (size_t i = 0; i < mb; i += RBM)
+  {
+    for (size_t j = 0; j < nb; j += RBN)
+      kloop(p, a+i*lda, lda, b+j, ldb, c+i*ldc+j, ldc);
+    mm_naive(RBM, n - nb, p, a+i*lda, lda, b+nb, ldb, c+i*ldc+nb, ldc);
+  }
+  mm_naive(m - mb, n, p, a+mb*lda, lda, b, ldb, c+mb*ldc, ldc);
+}
+
+static inline void repack(t* a, size_t lda, const t* a0, size_t lda0, size_t m, size_t p)
+{
+  for (size_t i = 0; i < m; i++)
+  {
+    for (size_t j = 0; j < p/8*8; j+=8)
+    {
+      t t0 = a0[i*lda0+j+0];
+      t t1 = a0[i*lda0+j+1];
+      t t2 = a0[i*lda0+j+2];
+      t t3 = a0[i*lda0+j+3];
+      t t4 = a0[i*lda0+j+4];
+      t t5 = a0[i*lda0+j+5];
+      t t6 = a0[i*lda0+j+6];
+      t t7 = a0[i*lda0+j+7];
+      a[i*lda+j+0] = t0;
+      a[i*lda+j+1] = t1;
+      a[i*lda+j+2] = t2;
+      a[i*lda+j+3] = t3;
+      a[i*lda+j+4] = t4;
+      a[i*lda+j+5] = t5;
+      a[i*lda+j+6] = t6;
+      a[i*lda+j+7] = t7;
+    }
+    for (size_t j = p/8*8; j < p; j++)
+      a[i*lda+j] = a0[i*lda0+j];
+  }
+}
+
+static void mm_cb(size_t m, size_t n, size_t p,
+  t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+  size_t nmb = m/CBM, nnb = n/CBN, npb = p/CBK;
+  size_t mb = nmb*CBM, nb = nnb*CBN, pb = npb*CBK;
+  //t a1[mb*pb], b1[pb*nb], c1[mb*nb];
+  t* a1 = (t*)alloca_aligned(sizeof(t)*mb*pb, 8192);
+  t* b1 = (t*)alloca_aligned(sizeof(t)*pb*nb, 8192);
+  t* c1 = (t*)alloca_aligned(sizeof(t)*mb*nb, 8192);
+
+  for (size_t i = 0; i < mb; i += CBM)
+    for (size_t j = 0; j < pb; j += CBK)
+      repack(a1 + (npb*(i/CBM) + j/CBK)*(CBM*CBK), CBK, a + i*lda + j, lda, CBM, CBK);
+
+  for (size_t i = 0; i < pb; i += CBK)
+    for (size_t j = 0; j < nb; j += CBN)
+      repack(b1 + (nnb*(i/CBK) + j/CBN)*(CBK*CBN), CBN, b + i*ldb + j, ldb, CBK, CBN);
+
+  for (size_t i = 0; i < mb; i += CBM)
+    for (size_t j = 0; j < nb; j += CBN)
+      repack(c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, c + i*ldc + j, ldc, CBM, CBN);
+
+  for (size_t i = 0; i < mb; i += CBM)
+  {
+    for (size_t j = 0; j < nb; j += CBN)
+    {
+      for (size_t k = 0; k < pb; k += CBK)
+      {
+        mm_rb(CBM, CBN, CBK,
+              a1 + (npb*(i/CBM) + k/CBK)*(CBM*CBK), CBK,
+              b1 + (nnb*(k/CBK) + j/CBN)*(CBK*CBN), CBN,
+              c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
+      }
+      if (pb < p)
+      {
+        mm_rb(CBM, CBN, p - pb,
+              a + i*lda + pb, lda,
+              b + pb*ldb + j, ldb,
+              c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
+      }
+    }
+    if (nb < n)
+    {
+      for (size_t k = 0; k < p; k += CBK)
+      {
+        mm_rb(CBM, n - nb, MIN(p - k, CBK),
+              a + i*lda + k, lda,
+              b + k*ldb + nb, ldb,
+              c + i*ldc + nb, ldc);
+      }
+    }
+  }
+  if (mb < m)
+  {
+    for (size_t j = 0; j < n; j += CBN)
+    {
+      for (size_t k = 0; k < p; k += CBK)
+      {
+        mm_rb(m - mb, MIN(n - j, CBN), MIN(p - k, CBK),
+              a + mb*lda + k, lda,
+              b + k*ldb + j, ldb,
+              c + mb*ldc + j, ldc);
+      }
+    }
+  }
+
+  for (size_t i = 0; i < mb; i += CBM)
+    for (size_t j = 0; j < nb; j += CBN)
+      repack(c + i*ldc + j, ldc, c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, CBM, CBN);
+}
+
+void mm(size_t m, size_t n, size_t p,
+  t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
+{
+  if (__builtin_expect(m <= 2*CBM && n <= 2*CBN && p <= 2*CBK, 1))
+    mm_rb(m, n, p, a, lda, b, ldb, c, ldc);
+  else
+    mm_cb(m, n, p, a, lda, b, ldb, c, ldc);
+}
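mm() accumulates C += A*B on row-major operands with explicit leading dimensions; problems within twice the cache-block sizes (the expected case, per the __builtin_expect hint) skip the repacking entirely and run register-blocked. A minimal hypothetical driver (sizes and names mine):

    #include "common.h"

    enum { N = 64 };                   /* 64 <= 2*CBM, 2*CBN, 2*CBK: stays on the mm_rb path */
    static t A[N*N], B[N*N], C[N*N];   /* t is double unless SP is defined */

    void demo(void)
    {
      /* C must be initialized first: mm accumulates rather than overwrites */
      mm(N, N, N, A, N, B, N, C, N);
    }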
diff --git a/benchmarks/mm/mm_main.c b/benchmarks/mm/mm_main.c
new file mode 100644
index 0000000..a4c04cc
--- /dev/null
+++ b/benchmarks/mm/mm_main.c
@@ -0,0 +1,82 @@
+#define ncores 2
+#include "common.h"
+#include <string.h>
+#include <stdio.h>
+#include "util.h"
+
+void thread_entry(int cid, int nc)
+{
+  const int R = 8;
+  int m, n, p;
+  int have_vec = 1;
+
+  if (have_vec) {
+    m = HCBM;
+    n = HCBN;
+    p = HCBK;
+  } else {
+    m = CBM;
+    n = CBN;
+    p = CBK;
+  }
+
+  t a[m*p];
+  t b[p*n];
+  t c[m*n];
+
+  for (size_t i = 0; i < m; i++)
+    for (size_t j = 0; j < p; j++)
+      a[i*p+j] = i+j;
+  for (size_t i = 0; i < p; i++)
+    for (size_t j = 0; j < n; j++)
+      b[i*n+j] = i-j;
+  memset(c, 0, m*n*sizeof(c[0]));
+
+  size_t instret, cycles;
+  if (have_vec) {
+    for (int i = 0; i < R; i++)
+    {
+      instret = -rdinstret();
+      cycles = -rdcycle();
+      mm_rb_hwacha(m, n, p, a, p, b, n, c, n);
+      instret += rdinstret();
+      cycles += rdcycle();
+    }
+  } else {
+    for (int i = 0; i < R; i++)
+    {
+      instret = -rdinstret();
+      cycles = -rdcycle();
+      mm(m, n, p, a, p, b, n, c, n);
+      instret += rdinstret();
+      cycles += rdcycle();
+    }
+  }
+
+  printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n",
+         cid, HRBM, HRBN, HRBK, HCBM, HCBN, HCBK);
+  printf("C%d: %d instructions\n", cid, (int)(instret));
+  printf("C%d: %d cycles\n", cid, (int)(cycles));
+  printf("C%d: %d flops\n", cid, 2*m*n*p);
+  printf("C%d: %d Mflops @ 1 GHz\n", cid, 2000*m*n*p/(cycles));
+
+#if 1
+  for (size_t i = 0; i < m; i++)
+  {
+    for (size_t j = 0; j < n; j++)
+    {
+      t s = 0;
+      for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++)
+        s += (t)aik*(t)bkj;
+      if (fabs(c[i*n+j]-s*R) > 1e-6*s)
+      {
+        printf("C%d: c[%lu][%lu] %lu != %lu\n", cid, i, j, (unsigned long)c[i*n+j], (unsigned long)s);
+        exit(1);
+      }
+    }
+  }
+#endif
+
+  //barrier(nc);
+  exit(0);
+}
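The #if 1 block checks each element against a direct recomputation: a[i][k] = i+k and b[k][j] = k-j (in the same size_t arithmetic the fill loops use), scaled by R because c is never cleared between the R timed runs, so each run accumulates another A*B. The aik/bkj walk is equivalent to this reference (hypothetical refactoring of the loop above):

    /* expected value of c[i][j] after R accumulating runs */
    static t ref(size_t i, size_t j, size_t p, int R)
    {
      t s = 0;
      for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++)
        s += (t)aik * (t)bkj;   /* a[i][k] * b[k][j] with k = aik - i */
      return s * R;
    }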
diff --git a/benchmarks/mm/rb.h b/benchmarks/mm/rb.h
new file mode 100644
index 0000000..c5d5890
--- /dev/null
+++ b/benchmarks/mm/rb.h
@@ -0,0 +1,210 @@
+static const int RBM = 4, RBN = 5, RBK = 6;
+static const int CBM = 36, CBN = 35, CBK = 36;
+static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
+{
+  t* c_0 = &c[ldc*0];
+  t* c_1 = &c[ldc*1];
+  t* c_2 = &c[ldc*2];
+  t* c_3 = &c[ldc*3];
+  t c_0_0 = c_0[0];
+  t c_0_1 = c_0[1];
+  t c_0_2 = c_0[2];
+  t c_0_3 = c_0[3];
+  t c_0_4 = c_0[4];
+  t c_1_0 = c_1[0];
+  t c_1_1 = c_1[1];
+  t c_1_2 = c_1[2];
+  t c_1_3 = c_1[3];
+  t c_1_4 = c_1[4];
+  t c_2_0 = c_2[0];
+  t c_2_1 = c_2[1];
+  t c_2_2 = c_2[2];
+  t c_2_3 = c_2[3];
+  t c_2_4 = c_2[4];
+  t c_3_0 = c_3[0];
+  t c_3_1 = c_3[1];
+  t c_3_2 = c_3[2];
+  t c_3_3 = c_3[3];
+  t c_3_4 = c_3[4];
+  for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
+  {
+    t* a_0 = &a[lda*0];
+    t* a_1 = &a[lda*1];
+    t* a_2 = &a[lda*2];
+    t* a_3 = &a[lda*3];
+    t* b_0 = &b[ldb*0];
+    t* b_1 = &b[ldb*1];
+    t* b_2 = &b[ldb*2];
+    t* b_3 = &b[ldb*3];
+    t* b_4 = &b[ldb*4];
+    t* b_5 = &b[ldb*5];
+    c_0_0 = fma(a_0[0], b_0[0], c_0_0);
+    c_0_0 = fma(a_0[1], b_1[0], c_0_0);
+    c_0_0 = fma(a_0[2], b_2[0], c_0_0);
+    c_0_0 = fma(a_0[3], b_3[0], c_0_0);
+    c_0_0 = fma(a_0[4], b_4[0], c_0_0);
+    c_0_0 = fma(a_0[5], b_5[0], c_0_0);
+    c_0_1 = fma(a_0[0], b_0[1], c_0_1);
+    c_0_1 = fma(a_0[1], b_1[1], c_0_1);
+    c_0_1 = fma(a_0[2], b_2[1], c_0_1);
+    c_0_1 = fma(a_0[3], b_3[1], c_0_1);
+    c_0_1 = fma(a_0[4], b_4[1], c_0_1);
+    c_0_1 = fma(a_0[5], b_5[1], c_0_1);
+    c_0_2 = fma(a_0[0], b_0[2], c_0_2);
+    c_0_2 = fma(a_0[1], b_1[2], c_0_2);
+    c_0_2 = fma(a_0[2], b_2[2], c_0_2);
+    c_0_2 = fma(a_0[3], b_3[2], c_0_2);
+    c_0_2 = fma(a_0[4], b_4[2], c_0_2);
+    c_0_2 = fma(a_0[5], b_5[2], c_0_2);
+    c_0_3 = fma(a_0[0], b_0[3], c_0_3);
+    c_0_3 = fma(a_0[1], b_1[3], c_0_3);
+    c_0_3 = fma(a_0[2], b_2[3], c_0_3);
+    c_0_3 = fma(a_0[3], b_3[3], c_0_3);
+    c_0_3 = fma(a_0[4], b_4[3], c_0_3);
+    c_0_3 = fma(a_0[5], b_5[3], c_0_3);
+    c_0_4 = fma(a_0[0], b_0[4], c_0_4);
+    c_0_4 = fma(a_0[1], b_1[4], c_0_4);
+    c_0_4 = fma(a_0[2], b_2[4], c_0_4);
+    c_0_4 = fma(a_0[3], b_3[4], c_0_4);
+    c_0_4 = fma(a_0[4], b_4[4], c_0_4);
+    c_0_4 = fma(a_0[5], b_5[4], c_0_4);
+    c_1_0 = fma(a_1[0], b_0[0], c_1_0);
+    c_1_0 = fma(a_1[1], b_1[0], c_1_0);
+    c_1_0 = fma(a_1[2], b_2[0], c_1_0);
+    c_1_0 = fma(a_1[3], b_3[0], c_1_0);
+    c_1_0 = fma(a_1[4], b_4[0], c_1_0);
+    c_1_0 = fma(a_1[5], b_5[0], c_1_0);
+    c_1_1 = fma(a_1[0], b_0[1], c_1_1);
+    c_1_1 = fma(a_1[1], b_1[1], c_1_1);
+    c_1_1 = fma(a_1[2], b_2[1], c_1_1);
+    c_1_1 = fma(a_1[3], b_3[1], c_1_1);
+    c_1_1 = fma(a_1[4], b_4[1], c_1_1);
+    c_1_1 = fma(a_1[5], b_5[1], c_1_1);
+    c_1_2 = fma(a_1[0], b_0[2], c_1_2);
+    c_1_2 = fma(a_1[1], b_1[2], c_1_2);
+    c_1_2 = fma(a_1[2], b_2[2], c_1_2);
+    c_1_2 = fma(a_1[3], b_3[2], c_1_2);
+    c_1_2 = fma(a_1[4], b_4[2], c_1_2);
+    c_1_2 = fma(a_1[5], b_5[2], c_1_2);
+    c_1_3 = fma(a_1[0], b_0[3], c_1_3);
+    c_1_3 = fma(a_1[1], b_1[3], c_1_3);
+    c_1_3 = fma(a_1[2], b_2[3], c_1_3);
+    c_1_3 = fma(a_1[3], b_3[3], c_1_3);
+    c_1_3 = fma(a_1[4], b_4[3], c_1_3);
+    c_1_3 = fma(a_1[5], b_5[3], c_1_3);
+    c_1_4 = fma(a_1[0], b_0[4], c_1_4);
+    c_1_4 = fma(a_1[1], b_1[4], c_1_4);
+    c_1_4 = fma(a_1[2], b_2[4], c_1_4);
+    c_1_4 = fma(a_1[3], b_3[4], c_1_4);
+    c_1_4 = fma(a_1[4], b_4[4], c_1_4);
+    c_1_4 = fma(a_1[5], b_5[4], c_1_4);
+    c_2_0 = fma(a_2[0], b_0[0], c_2_0);
+    c_2_0 = fma(a_2[1], b_1[0], c_2_0);
+    c_2_0 = fma(a_2[2], b_2[0], c_2_0);
+    c_2_0 = fma(a_2[3], b_3[0], c_2_0);
+    c_2_0 = fma(a_2[4], b_4[0], c_2_0);
+    c_2_0 = fma(a_2[5], b_5[0], c_2_0);
+    c_2_1 = fma(a_2[0], b_0[1], c_2_1);
+    c_2_1 = fma(a_2[1], b_1[1], c_2_1);
+    c_2_1 = fma(a_2[2], b_2[1], c_2_1);
+    c_2_1 = fma(a_2[3], b_3[1], c_2_1);
+    c_2_1 = fma(a_2[4], b_4[1], c_2_1);
+    c_2_1 = fma(a_2[5], b_5[1], c_2_1);
+    c_2_2 = fma(a_2[0], b_0[2], c_2_2);
+    c_2_2 = fma(a_2[1], b_1[2], c_2_2);
+    c_2_2 = fma(a_2[2], b_2[2], c_2_2);
+    c_2_2 = fma(a_2[3], b_3[2], c_2_2);
+    c_2_2 = fma(a_2[4], b_4[2], c_2_2);
+    c_2_2 = fma(a_2[5], b_5[2], c_2_2);
+    c_2_3 = fma(a_2[0], b_0[3], c_2_3);
+    c_2_3 = fma(a_2[1], b_1[3], c_2_3);
+    c_2_3 = fma(a_2[2], b_2[3], c_2_3);
+    c_2_3 = fma(a_2[3], b_3[3], c_2_3);
+    c_2_3 = fma(a_2[4], b_4[3], c_2_3);
+    c_2_3 = fma(a_2[5], b_5[3], c_2_3);
+    c_2_4 = fma(a_2[0], b_0[4], c_2_4);
+    c_2_4 = fma(a_2[1], b_1[4], c_2_4);
+    c_2_4 = fma(a_2[2], b_2[4], c_2_4);
+    c_2_4 = fma(a_2[3], b_3[4], c_2_4);
+    c_2_4 = fma(a_2[4], b_4[4], c_2_4);
+    c_2_4 = fma(a_2[5], b_5[4], c_2_4);
+    c_3_0 = fma(a_3[0], b_0[0], c_3_0);
+    c_3_0 = fma(a_3[1], b_1[0], c_3_0);
+    c_3_0 = fma(a_3[2], b_2[0], c_3_0);
+    c_3_0 = fma(a_3[3], b_3[0], c_3_0);
+    c_3_0 = fma(a_3[4], b_4[0], c_3_0);
+    c_3_0 = fma(a_3[5], b_5[0], c_3_0);
+    c_3_1 = fma(a_3[0], b_0[1], c_3_1);
+    c_3_1 = fma(a_3[1], b_1[1], c_3_1);
+    c_3_1 = fma(a_3[2], b_2[1], c_3_1);
+    c_3_1 = fma(a_3[3], b_3[1], c_3_1);
+    c_3_1 = fma(a_3[4], b_4[1], c_3_1);
+    c_3_1 = fma(a_3[5], b_5[1], c_3_1);
+    c_3_2 = fma(a_3[0], b_0[2], c_3_2);
+    c_3_2 = fma(a_3[1], b_1[2], c_3_2);
+    c_3_2 = fma(a_3[2], b_2[2], c_3_2);
+    c_3_2 = fma(a_3[3], b_3[2], c_3_2);
+    c_3_2 = fma(a_3[4], b_4[2], c_3_2);
+    c_3_2 = fma(a_3[5], b_5[2], c_3_2);
+    c_3_3 = fma(a_3[0], b_0[3], c_3_3);
+    c_3_3 = fma(a_3[1], b_1[3], c_3_3);
+    c_3_3 = fma(a_3[2], b_2[3], c_3_3);
+    c_3_3 = fma(a_3[3], b_3[3], c_3_3);
+    c_3_3 = fma(a_3[4], b_4[3], c_3_3);
+    c_3_3 = fma(a_3[5], b_5[3], c_3_3);
+    c_3_4 = fma(a_3[0], b_0[4], c_3_4);
+    c_3_4 = fma(a_3[1], b_1[4], c_3_4);
+    c_3_4 = fma(a_3[2], b_2[4], c_3_4);
+    c_3_4 = fma(a_3[3], b_3[4], c_3_4);
+    c_3_4 = fma(a_3[4], b_4[4], c_3_4);
+    c_3_4 = fma(a_3[5], b_5[4], c_3_4);
+  }
+  for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
+  {
+    t* a_0 = &a[lda*0];
+    t* a_1 = &a[lda*1];
+    t* a_2 = &a[lda*2];
+    t* a_3 = &a[lda*3];
+    t* b_0 = &b[ldb*0];
+    c_0_0 = fma(a_0[0], b_0[0], c_0_0);
+    c_0_1 = fma(a_0[0], b_0[1], c_0_1);
+    c_0_2 = fma(a_0[0], b_0[2], c_0_2);
+    c_0_3 = fma(a_0[0], b_0[3], c_0_3);
+    c_0_4 = fma(a_0[0], b_0[4], c_0_4);
+    c_1_0 = fma(a_1[0], b_0[0], c_1_0);
+    c_1_1 = fma(a_1[0], b_0[1], c_1_1);
+    c_1_2 = fma(a_1[0], b_0[2], c_1_2);
+    c_1_3 = fma(a_1[0], b_0[3], c_1_3);
+    c_1_4 = fma(a_1[0], b_0[4], c_1_4);
+    c_2_0 = fma(a_2[0], b_0[0], c_2_0);
+    c_2_1 = fma(a_2[0], b_0[1], c_2_1);
+    c_2_2 = fma(a_2[0], b_0[2], c_2_2);
+    c_2_3 = fma(a_2[0], b_0[3], c_2_3);
+    c_2_4 = fma(a_2[0], b_0[4], c_2_4);
+    c_3_0 = fma(a_3[0], b_0[0], c_3_0);
+    c_3_1 = fma(a_3[0], b_0[1], c_3_1);
+    c_3_2 = fma(a_3[0], b_0[2], c_3_2);
+    c_3_3 = fma(a_3[0], b_0[3], c_3_3);
+    c_3_4 = fma(a_3[0], b_0[4], c_3_4);
+  }
+  c_0[0] = c_0_0;
+  c_0[1] = c_0_1;
+  c_0[2] = c_0_2;
+  c_0[3] = c_0_3;
+  c_0[4] = c_0_4;
+  c_1[0] = c_1_0;
+  c_1[1] = c_1_1;
+  c_1[2] = c_1_2;
+  c_1[3] = c_1_3;
+  c_1[4] = c_1_4;
+  c_2[0] = c_2_0;
+  c_2[1] = c_2_1;
+  c_2[2] = c_2_2;
+  c_2[3] = c_2_3;
+  c_2[4] = c_2_4;
+  c_3[0] = c_3_0;
+  c_3[1] = c_3_1;
+  c_3[2] = c_3_2;
+  c_3[3] = c_3_3;
+  c_3[4] = c_3_4;
+}
-- 
cgit v1.1