diff options
Diffstat (limited to 'mt/bf_matmul.c')
-rw-r--r-- | mt/bf_matmul.c | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/mt/bf_matmul.c b/mt/bf_matmul.c new file mode 100644 index 0000000..04904b9 --- /dev/null +++ b/mt/bf_matmul.c @@ -0,0 +1,127 @@ +#include "stdlib.h" + +#include "util.h" + +#include "dataset.h" +void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) +{ + + // ***************************** // + // **** ADD YOUR CODE HERE ***** // + // ***************************** // + // + // feel free to make a separate function for MI and MSI versions. + int j, k, i; + data_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + data_t temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; + if(coreid == 0) { + for(j = 0; j < 32; j++) { + temp0 = 0; //C[j*lda]; + temp1 = 0; //C[1 + j*lda]; + temp2 = 0; //C[2 + j*lda]; + temp3 = 0; //C[3 + j*lda]; + temp4 = 0; //C[4 + j*lda]; + temp5 = 0; //C[5 + j*lda]; + temp6 = 0; //C[6 + j*lda]; + temp7 = 0; //C[7 + j*lda]; + temp8 = 0; //C[8 + j*lda]; + temp9 = 0; //C[9 + j*lda]; + temp10 = 0; //C[10 + j*lda]; + temp11 = 0; //C[11 + j*lda]; + temp12 = 0; //C[12 + j*lda]; + temp13 = 0; //C[13 + j*lda]; + temp14 = 0; //C[14 + j*lda]; + temp15 = 0; //C[15 + j*lda]; + for(k = 0; k < 32; k++) { + temp0 += A[j*lda + k] * B[k*lda]; + temp1 += A[j*lda + k] * B[1+k*lda]; + temp2 += A[j*lda + k] * B[2+k*lda]; + temp3 += A[j*lda + k] * B[3+k*lda]; + temp4 += A[j*lda + k] * B[4+k*lda]; + temp5 += A[j*lda + k] * B[5+k*lda]; + temp6 += A[j*lda + k] * B[6+k*lda]; + temp7 += A[j*lda + k] * B[7+k*lda]; + temp8 += A[j*lda + k] * B[8+k*lda]; + temp9 += A[j*lda + k] * B[9+k*lda]; + temp10 += A[j*lda + k] * B[10+k*lda]; + temp11 += A[j*lda + k] * B[11+k*lda]; + temp12 += A[j*lda + k] * B[12+k*lda]; + temp13 += A[j*lda + k] * B[13+k*lda]; + temp14 += A[j*lda + k] * B[14+k*lda]; + temp15 += A[j*lda + k] * B[15+k*lda]; + } + C[j*lda] = temp0; + C[1 + j*lda] = temp1; + C[2 + j*lda] = temp2; + C[3 + j*lda] = temp3; + C[4 + j*lda] = temp4; + C[5 + j*lda] = temp5; + C[6 + j*lda] = temp6; + C[7 + j*lda] = temp7; + C[8 + j*lda] = temp8; + C[9 + j*lda] = temp9; + C[10 + j*lda] = temp10; + C[11 + j*lda] = temp11; + C[12 + j*lda] = temp12; + C[13 + j*lda] = temp13; + C[14 + j*lda] = temp14; + C[15 + j*lda] = temp15; + } + } + + if(coreid == 1 || ncores == 1) { + for(j = 0; j < 32; j++) { + temp0 = 0; //C[16+j*lda]; + temp1 = 0; //C[17+j*lda]; + temp2 = 0; //C[18+j*lda]; + temp3 = 0; //C[19+j*lda]; + temp4 = 0; //C[20+j*lda]; + temp5 = 0; //C[21+j*lda]; + temp6 = 0; //C[22+j*lda]; + temp7 = 0; //C[23+j*lda]; + temp8 = 0; //C[24+j*lda]; + temp9 = 0; //C[25+j*lda]; + temp10 = 0; //C[26+j*lda]; + temp11 = 0; //C[27+j*lda]; + temp12 = 0; //C[28+j*lda]; + temp13 = 0; //C[29+j*lda]; + temp14 = 0; //C[30+j*lda]; + temp15 = 0; //C[31+j*lda]; + for(k = 0; k < 32; k++) { + temp0 += A[j*lda + k] * B[16+k*lda]; + temp1 += A[j*lda + k] * B[17+k*lda]; + temp2 += A[j*lda + k] * B[18+k*lda]; + temp3 += A[j*lda + k] * B[19+k*lda]; + temp4 += A[j*lda + k] * B[20+k*lda]; + temp5 += A[j*lda + k] * B[21+k*lda]; + temp6 += A[j*lda + k] * B[22+k*lda]; + temp7 += A[j*lda + k] * B[23+k*lda]; + temp8 += A[j*lda + k] * B[24+k*lda]; + temp9 += A[j*lda + k] * B[25+k*lda]; + temp10 += A[j*lda + k] * B[26+k*lda]; + temp11 += A[j*lda + k] * B[27+k*lda]; + temp12 += A[j*lda + k] * B[28+k*lda]; + temp13 += A[j*lda + k] * B[29+k*lda]; + temp14 += A[j*lda + k] * B[30+k*lda]; + temp15 += A[j*lda + k] * B[31+k*lda]; + } + C[16 + j*lda] = temp0; + C[17 + j*lda] = temp1; + C[18 + j*lda] = temp2; + C[19 + j*lda] = temp3; + C[20 + j*lda] = temp4; + C[21 + j*lda] = temp5; + C[22 + j*lda] = temp6; + C[23 + j*lda] = temp7; + C[24 + j*lda] = temp8; + C[25 + j*lda] = temp9; + C[26 + j*lda] = temp10; + C[27 + j*lda] = temp11; + C[28 + j*lda] = temp12; + C[29 + j*lda] = temp13; + C[30 + j*lda] = temp14; + C[31 + j*lda] = temp15; + } + } + +} |