diff options
author | Henry Cook <hcook@eecs.berkeley.edu> | 2014-11-06 17:24:39 -0800 |
---|---|---|
committer | Henry Cook <hcook@eecs.berkeley.edu> | 2014-11-07 16:52:51 -0800 |
commit | d537de7deffa6036dab573ff174b7f8c8e470437 (patch) | |
tree | ddc921eb337cda4889570f0251bdba85059a2531 /mt/ce_matmul.c | |
parent | 5afc6b9bc2e3685220cffb3da66ad9f5f1f7b14f (diff) | |
download | riscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.zip riscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.tar.gz riscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.tar.bz2 |
Clean up canonical mt benchmarks and reorganize extra versions in /mt. All versions support support at least 1/2/4 threads.
Diffstat (limited to 'mt/ce_matmul.c')
-rwxr-xr-x | mt/ce_matmul.c | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/mt/ce_matmul.c b/mt/ce_matmul.c new file mode 100755 index 0000000..49ad295 --- /dev/null +++ b/mt/ce_matmul.c @@ -0,0 +1,157 @@ +#include "stdlib.h" + +#include "util.h" + +#include "dataset.h" +void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) +{ + + data_t a1; + data_t a2; + data_t a3; + data_t a4; + data_t a5; + data_t a6; + data_t a7; + data_t a8; + data_t *b1; + data_t *b2; + data_t *b3; + data_t *b4; + data_t *b5; + data_t *b6; + data_t *b7; + data_t *b8; + data_t c1; + data_t c2; + data_t c3; + data_t c4; + data_t c5; + data_t c6; + data_t c7; + data_t c8; + int i, j, k; + int start, end; + static data_t BB[1024]; + + + //transpose B + for ( k = 0; k < lda; k++) { + for ( i = coreid*(lda/ncores); i < (coreid+1)*(lda/ncores); i++ ) { + BB[i*lda + k] = B[k*lda + i]; + } + barrier(ncores); + } + + for ( int x = 0; x < ncores; x++) { + //split the i values into two chunks so the threads don't interfere on the B loads + //this could be generalized if needed, but I won't bother since it would be tricky + //and we already know the size and numthreads + start = x * (32 / ncores); + end = (x+1) * (32 / ncores); + for ( i = start; i < end; i+=8 ) { + for ( j = coreid*(lda/ncores); j < (coreid+1)*(lda/ncores); j++ ) { + c1=0;c2=0;c3=0;c4=0;c5=0;c6=0;c7=0;c8=0; + b1 = &BB[(i+0)*lda]; + b2 = &BB[(i+1)*lda]; + b3 = &BB[(i+2)*lda]; + b4 = &BB[(i+3)*lda]; + b5 = &BB[(i+4)*lda]; + b6 = &BB[(i+5)*lda]; + b7 = &BB[(i+6)*lda]; + b8 = &BB[(i+7)*lda]; + + for ( k = 0; k < lda; k+=8 ) { + a1 = A[j*lda + k+0]; + a2 = A[j*lda + k+1]; + a3 = A[j*lda + k+2]; + a4 = A[j*lda + k+3]; + a5 = A[j*lda + k+4]; + a6 = A[j*lda + k+5]; + a7 = A[j*lda + k+6]; + a8 = A[j*lda + k+7]; + + c1 += a1 * b1[k+0]; + c1 += a2 * b1[k+1]; + c1 += a3 * b1[k+2]; + c1 += a4 * b1[k+3]; + c1 += a5 * b1[k+4]; + c1 += a6 * b1[k+5]; + c1 += a7 * b1[k+6]; + c1 += a8 * b1[k+7]; + + c2 += a1 * b2[k+0]; + c2 += a2 * b2[k+1]; + c2 += a3 * b2[k+2]; + c2 += a4 * b2[k+3]; + c2 += a5 * b2[k+4]; + c2 += a6 * b2[k+5]; + c2 += a7 * b2[k+6]; + c2 += a8 * b2[k+7]; + + c3 += a1 * b3[k+0]; + c3 += a2 * b3[k+1]; + c3 += a3 * b3[k+2]; + c3 += a4 * b3[k+3]; + c3 += a5 * b3[k+4]; + c3 += a6 * b3[k+5]; + c3 += a7 * b3[k+6]; + c3 += a8 * b3[k+7]; + + c4 += a1 * b4[k+0]; + c4 += a2 * b4[k+1]; + c4 += a3 * b4[k+2]; + c4 += a4 * b4[k+3]; + c4 += a5 * b4[k+4]; + c4 += a6 * b4[k+5]; + c4 += a7 * b4[k+6]; + c4 += a8 * b4[k+7]; + + c5 += a1 * b5[k+0]; + c5 += a2 * b5[k+1]; + c5 += a3 * b5[k+2]; + c5 += a4 * b5[k+3]; + c5 += a5 * b5[k+4]; + c5 += a6 * b5[k+5]; + c5 += a7 * b5[k+6]; + c5 += a8 * b5[k+7]; + + c6 += a1 * b6[k+0]; + c6 += a2 * b6[k+1]; + c6 += a3 * b6[k+2]; + c6 += a4 * b6[k+3]; + c6 += a5 * b6[k+4]; + c6 += a6 * b6[k+5]; + c6 += a7 * b6[k+6]; + c6 += a8 * b6[k+7]; + + c7 += a1 * b7[k+0]; + c7 += a2 * b7[k+1]; + c7 += a3 * b7[k+2]; + c7 += a4 * b7[k+3]; + c7 += a5 * b7[k+4]; + c7 += a6 * b7[k+5]; + c7 += a7 * b7[k+6]; + c7 += a8 * b7[k+7]; + + c8 += a1 * b8[k+0]; + c8 += a2 * b8[k+1]; + c8 += a3 * b8[k+2]; + c8 += a4 * b8[k+3]; + c8 += a5 * b8[k+4]; + c8 += a6 * b8[k+5]; + c8 += a7 * b8[k+6]; + c8 += a8 * b8[k+7]; + } + C[i+0 + j*lda] += c1; + C[i+1 + j*lda] += c2; + C[i+2 + j*lda] += c3; + C[i+3 + j*lda] += c4; + C[i+4 + j*lda] += c5; + C[i+5 + j*lda] += c6; + C[i+6 + j*lda] += c7; + C[i+7 + j*lda] += c8; + } + } + } +} |