//************************************************************************** // Vector-vector add benchmark //-------------------------------------------------------------------------- // Author : Andrew Waterman // TA : Christopher Celio // Student : // // This benchmark adds two vectors and writes the results to a // third vector. The input data (and reference data) should be // generated using the vvadd_gendata.pl perl script and dumped // to a file named dataset.h // to print out arrays, etc. //#define DEBUG //-------------------------------------------------------------------------- // Includes #include #include #include //-------------------------------------------------------------------------- // Input/Reference Data typedef float data_t; #include "dataset.h" //-------------------------------------------------------------------------- // Basic Utilities and Multi-thread Support __thread unsigned long coreid; unsigned long ncores; #include "util.h" #define stringify_1(s) #s #define stringify(s) stringify_1(s) #define stats(code) do { \ unsigned long _c = -rdcycle(), _i = -rdinstret(); \ code; \ _c += rdcycle(), _i += rdinstret(); \ if (coreid == 0) \ printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \ stringify(code), _c, _c/DATA_SIZE, 10*_c/DATA_SIZE%10, _c/_i, 10*_c/_i%10); \ } while(0) //-------------------------------------------------------------------------- // Helper functions void printArray( char name[], int n, data_t arr[] ) { int i; if (coreid != 0) return; printf( " %10s :", name ); for ( i = 0; i < n; i++ ) printf( " %4ld ", (long) arr[i] ); printf( "\n" ); } void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct) { if (coreid != 0) return; size_t i; for (i = 0; i < n; i++) { if (test[i] != correct[i]) { printf("FAILED test[%d]= %4ld, correct[%d]= %4ld\n", i, (long) test[i], i, (long)correct[i]); exit(-1); } } return; } //-------------------------------------------------------------------------- // vvadd function //perform in-place vvadd void __attribute__((noinline)) vvadd(size_t n, data_t* __restrict__ x, const data_t* __restrict__ y) { size_t i; // interleave accesses for (i = coreid; i < n; i+=ncores) { x[i] = x[i] + y[i]; } } void __attribute__((noinline)) vvadd_opt(size_t n, data_t* __restrict__ x, const data_t* __restrict__ y) { size_t i; size_t start = n * coreid / ncores; size_t end = (coreid == ncores - 1) ? n : n * (coreid+1)/ ncores; for (i = start; i < end; i++) { x[i] = x[i] + y[i]; } } //-------------------------------------------------------------------------- // Main // // all threads start executing thread_entry(). Use their "coreid" to // differentiate between threads (each thread is running on a separate core). void thread_entry(int cid, int nc) { coreid = cid; ncores = nc; // static allocates data in the binary, which is visible to both threads static data_t results_data[DATA_SIZE]; // because we're going to perform an in-place vvadd (and we're going to run // it a couple of times) let's copy the input data to a temporary results // array size_t i; if (coreid == 0) { for (i = 0; i < DATA_SIZE; i++) results_data[i] = input1_data[i]; } // Execute the provided, terrible vvadd barrier(); stats(vvadd(DATA_SIZE, results_data, input2_data); barrier()); // verify verify(DATA_SIZE, results_data, verify_data); // reset results from the first trial if (coreid == 0) { for (i=0; i < DATA_SIZE; i++) results_data[i] = input1_data[i]; } barrier(); // Execute your faster vvadd barrier(); stats(vvadd_opt(DATA_SIZE, results_data, input2_data); barrier()); #ifdef DEBUG printArray("results: ", DATA_SIZE, results_data); printArray("verify : ", DATA_SIZE, verify_data); #endif // verify verify(DATA_SIZE, results_data, verify_data); barrier(); exit(0); }