aboutsummaryrefslogtreecommitdiff
path: root/mt/dv_matmul.c
diff options
context:
space:
mode:
authorHenry Cook <hcook@eecs.berkeley.edu>2014-11-06 17:24:39 -0800
committerHenry Cook <hcook@eecs.berkeley.edu>2014-11-07 16:52:51 -0800
commitd537de7deffa6036dab573ff174b7f8c8e470437 (patch)
treeddc921eb337cda4889570f0251bdba85059a2531 /mt/dv_matmul.c
parent5afc6b9bc2e3685220cffb3da66ad9f5f1f7b14f (diff)
downloadriscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.zip
riscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.tar.gz
riscv-tests-d537de7deffa6036dab573ff174b7f8c8e470437.tar.bz2
Clean up canonical mt benchmarks and reorganize extra versions in /mt. All versions support at least 1/2/4 threads.
Diffstat (limited to 'mt/dv_matmul.c')
-rwxr-xr-xmt/dv_matmul.c98
1 files changed, 98 insertions, 0 deletions
diff --git a/mt/dv_matmul.c b/mt/dv_matmul.c
new file mode 100755
index 0000000..f76386d
--- /dev/null
+++ b/mt/dv_matmul.c
@@ -0,0 +1,98 @@
+#include "stdlib.h"
+
+#include "util.h"
+
+#include "dataset.h"
+void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
+{
+
+ // Adds A*B into C: C[j*32+i] += A[j*32+k] * B[k*32+i] for i, k in [0,32).
+ // The size 32 is hard-coded and lda is never read. C is accumulated into,
+ // not cleared, by this function. Rows j in [0,16) are processed when coreid
+ // is non-zero or ncores == 1; rows j in [16,32) when coreid == 0.
+ // NOTE(review): ii and jj are declared below but never used.
+ int i, j, k, ii, jj, kk;
+ if(coreid > 1) return; // cores with id above 1 do no work
+ if (coreid || ncores == 1) { // rows 0..15
+// The k range is split into two chunks of 16 (kk = 0, 16); each pass adds into C.
+ for ( kk = 0; kk < 32; kk+=16 )
+ for ( j = 0; j < 16; j++ )
+// j selects the row of A and of C.
+ {
+ for ( i = 0; i < 32; i+=8 )
+// Eight adjacent elements of row j of C are handled per iteration (temp0..temp7).
+ {
+ data_t temp0 = C[i+j*32];
+ data_t temp1 = C[i+j*32+1];
+ data_t temp2 = C[i+j*32+2];
+ data_t temp3 = C[i+j*32+3];
+ data_t temp4 = C[i+j*32+4];
+ data_t temp5 = C[i+j*32+5];
+ data_t temp6 = C[i+j*32+6];
+ data_t temp7 = C[i+j*32+7];
+ for ( k = kk; k < kk+16 && k < 32; k++ )
+// The k < 32 test never fails here: kk is 0 or 16, so kk+16 <= 32.
+ {
+ data_t tempA = A[j*32+k];
+ temp0 += tempA * B[k*32 + i];
+ temp1 += tempA * B[k*32 + i+1];
+ temp2 += tempA * B[k*32 + i+2];
+ temp3 += tempA * B[k*32 + i+3];
+ temp4 += tempA * B[k*32 + i+4];
+ temp5 += tempA * B[k*32 + i+5];
+ temp6 += tempA * B[k*32 + i+6];
+ temp7 += tempA * B[k*32 + i+7];
+ }
+ C[i+j*32] = temp0;
+ C[i+j*32+1] = temp1;
+ C[i+j*32+2] = temp2;
+ C[i+j*32+3] = temp3;
+ C[i+j*32+4] = temp4;
+ C[i+j*32+5] = temp5;
+ C[i+j*32+6] = temp6;
+ C[i+j*32+7] = temp7;
+ }
+ }
+ }
+ if(coreid == 0){ // rows 16..31
+// Same loop nest as the branch above; only the j range differs.
+ for ( kk = 0; kk < 32; kk+=16 )
+ for ( j = 16; j < 32; j++ )
+// j selects the row of A and of C.
+ {
+ for ( i = 0; i < 32; i+=8 )
+// Eight adjacent elements of row j of C are handled per iteration (temp0..temp7).
+ {
+ data_t temp0 = C[i+j*32];
+ data_t temp1 = C[i+j*32+1];
+ data_t temp2 = C[i+j*32+2];
+ data_t temp3 = C[i+j*32+3];
+ data_t temp4 = C[i+j*32+4];
+ data_t temp5 = C[i+j*32+5];
+ data_t temp6 = C[i+j*32+6];
+ data_t temp7 = C[i+j*32+7];
+ for ( k = kk; k < kk+16 && k < 32; k++ )
+ {
+ data_t tempA = A[j*32+k];
+ temp0 += tempA * B[k*32 + i];
+ temp1 += tempA * B[k*32 + i+1];
+ temp2 += tempA * B[k*32 + i+2];
+ temp3 += tempA * B[k*32 + i+3];
+ temp4 += tempA * B[k*32 + i+4];
+ temp5 += tempA * B[k*32 + i+5];
+ temp6 += tempA * B[k*32 + i+6];
+ temp7 += tempA * B[k*32 + i+7];
+ }
+ C[i+j*32] = temp0;
+ C[i+j*32+1] = temp1;
+ C[i+j*32+2] = temp2;
+ C[i+j*32+3] = temp3;
+ C[i+j*32+4] = temp4;
+ C[i+j*32+5] = temp5;
+ C[i+j*32+6] = temp6;
+ C[i+j*32+7] = temp7;
+ }
+
+ }
+ }
+}