diff options
Diffstat (limited to 'mt/dv_matmul.c')
-rwxr-xr-x | mt/dv_matmul.c | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/mt/dv_matmul.c b/mt/dv_matmul.c new file mode 100755 index 0000000..f76386d --- /dev/null +++ b/mt/dv_matmul.c @@ -0,0 +1,98 @@ +#include "stdlib.h" + +#include "util.h" + +#include "dataset.h" +void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) +{ + + // ***************************** // + // **** ADD YOUR CODE HERE ***** // + // ***************************** // + // + // feel free to make a separate function for MI and MSI versions. + int i, j, k, ii, jj, kk; + if(coreid > 1) return; + if (coreid || ncores == 1) { +// for ( ii = 0; ii < 32; ii+=IC ) + for ( kk = 0; kk < 32; kk+=16 ) + for ( j = 0; j < 16; j++ ) +// for ( j = 0; j < 16; j++ ) + { + for ( i = 0; i < 32; i+=8 ) +// for ( i = ii; i < ii + IC && i < 32; i+=8 ) + { + data_t temp0 = C[i+j*32]; + data_t temp1 = C[i+j*32+1]; + data_t temp2 = C[i+j*32+2]; + data_t temp3 = C[i+j*32+3]; + data_t temp4 = C[i+j*32+4]; + data_t temp5 = C[i+j*32+5]; + data_t temp6 = C[i+j*32+6]; + data_t temp7 = C[i+j*32+7]; + for ( k = kk; k < kk+16 && k < 32; k++ ) +// for ( k = 0; k < 32; k++ ) + { + data_t tempA = A[j*32+k]; + temp0 += tempA * B[k*32 + i]; + temp1 += tempA * B[k*32 + i+1]; + temp2 += tempA * B[k*32 + i+2]; + temp3 += tempA * B[k*32 + i+3]; + temp4 += tempA * B[k*32 + i+4]; + temp5 += tempA * B[k*32 + i+5]; + temp6 += tempA * B[k*32 + i+6]; + temp7 += tempA * B[k*32 + i+7]; + } + C[i+j*32] = temp0; + C[i+j*32+1] = temp1; + C[i+j*32+2] = temp2; + C[i+j*32+3] = temp3; + C[i+j*32+4] = temp4; + C[i+j*32+5] = temp5; + C[i+j*32+6] = temp6; + C[i+j*32+7] = temp7; + } + } + } + if(coreid == 0){ +// for ( ii = 0; ii < 32; ii+=IC ) + for ( kk = 0; kk < 32; kk+=16 ) + for ( j = 16; j < 32; j++ ) +// for ( j = 16; j < 32; j++ ) + { + for ( i = 0; i < 32; i+=8 ) +// for ( i = ii; i < ii + IC && i < 32; i+=8 ) + { + data_t temp0 = C[i+j*32]; + data_t temp1 = C[i+j*32+1]; + data_t temp2 = C[i+j*32+2]; + data_t temp3 = C[i+j*32+3]; + data_t temp4 = C[i+j*32+4]; + data_t temp5 = C[i+j*32+5]; + data_t temp6 = C[i+j*32+6]; + data_t temp7 = C[i+j*32+7]; + for ( k = kk; k < kk+16 && k < 32; k++ ) + { + data_t tempA = A[j*32+k]; + temp0 += tempA * B[k*32 + i]; + temp1 += tempA * B[k*32 + i+1]; + temp2 += tempA * B[k*32 + i+2]; + temp3 += tempA * B[k*32 + i+3]; + temp4 += tempA * B[k*32 + i+4]; + temp5 += tempA * B[k*32 + i+5]; + temp6 += tempA * B[k*32 + i+6]; + temp7 += tempA * B[k*32 + i+7]; + } + C[i+j*32] = temp0; + C[i+j*32+1] = temp1; + C[i+j*32+2] = temp2; + C[i+j*32+3] = temp3; + C[i+j*32+4] = temp4; + C[i+j*32+5] = temp5; + C[i+j*32+6] = temp6; + C[i+j*32+7] = temp7; + } + + } + } +} |