multithreading tests from 152 lab 5

author: Henry Cook <hcook@eecs.berkeley.edu> 2013-06-13 15:30:16 -0700
committer: Henry Cook <hcook@eecs.berkeley.edu> 2013-06-13 15:30:16 -0700
commit: 60f056880ec6929c5f23af4d66aea0f0cb7b0245 (patch)
tree: a2f4cbc9902df362534ede13d65883ee47fba2d8 /mt/am_matmul
parent: 4412b96c81ca09dcce6305579dd86d4bf3b808da (diff)
download: riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.zip
riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.tar.gz
riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.tar.bz2
11 files changed, 2131 insertions, 0 deletions
diff --git a/mt/am_matmul/am_matmul.c b/mt/am_matmul/am_matmul.c
new file mode 100755
index 0000000..7fe737b
--- /dev/null
+++ b/mt/am_matmul/am_matmul.c
@@ -0,0 +1,216 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third array. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;
+unsigned long ncores;
+
+#include "util.h"
+   
+#define stringify_1(s) #s  // turns its argument into a string literal
+#define stringify(s) stringify_1(s)  // extra level so macro arguments are expanded before stringification
+#define stats(code) do { /* run `code` between two counter samples; core 0 prints the deltas */ \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); /* start at minus the "before" sample so the += below leaves the delta */ \
+    code; \
+    _c += rdcycle(), _i += rdinstret(); /* _c = rdcycle() delta, _i = rdinstret() delta */ \
+    if (coreid == 0) \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", /* each "%ld.%ld" pair: integer part, then first decimal digit */ \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+// Print `name` and then the first n entries of arr on one line; only core 0 produces output.
+void printArray( char name[], int n, data_t arr[] )
+{
+   if (coreid == 0)
+   {
+      int idx = 0;
+      printf( " %10s :", name );
+      while ( idx < n )
+         printf( " %3ld ", (long) arr[idx++] );  // each value is converted to long for printing
+      printf( "\n" );
+   }
+}
+// Core 0 only: compare test[0..n) with correct[0..n); on the first mismatch print it and exit(-1).
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   if (coreid != 0)
+      return;
+
+   size_t i;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] != correct[i])
+      {
+         // i is a size_t: convert it to long so each argument matches its %ld conversion
+         // (handing a size_t to %d is a format/argument mismatch, i.e. undefined behaviour).
+         printf("FAILED test[%ld]= %3ld, correct[%ld]= %3ld\n", 
+            (long)i, (long)test[i], (long)i, (long)correct[i]);
+         exit(-1);
+      }
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// Single-threaded reference: accumulates the lda x lda row-major product A*B into C on core 0 only.
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int i, j, k;
+
+   if (coreid > 0)  // every core other than 0 returns without doing any work
+      return;
+  // i selects the output column, j the output row, k walks the dot product
+   for ( i = 0; i < lda; i++ )
+      for ( j = 0; j < lda; j++ )  
+      {
+         for ( k = 0; k < lda; k++ ) 
+         {
+            C[i + j*lda] += A[j*lda + k] * B[k*lda + i];  // += : C must already hold zeros for a pure product
+         }
+      }
+
+}
+ 
+// Multi-threaded 32x32 matmul: each core computes a contiguous band of C rows, two rows per pass.
+// The row stride, the j loop and the scratch buffers are fixed at 32, so lda is expected to be 32.
+// C is overwritten (not accumulated into), so it does not need to be zeroed beforehand.
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+  size_t i, j, k, l;
+  int row, row2, column, column2, column3, column4;
+  data_t element, element2, element3, element4, element5, element6, element7, element8;
+  // running sums for the two C rows of the current pass
+  data_t temp_mat[32]={0};
+  data_t temp_mat2[32]={0};
+  // Hand out the 16 row *pairs* (not the 32 single rows) to the cores, so every band starts on
+  // an even row and has even length. Splitting single rows would let the l+1 row below overlap
+  // the next core's band, or run past row 31, whenever a band boundary lands on an odd row.
+  size_t first_row = 2*(coreid*16/ncores);
+  size_t end_row = 2*((coreid+1)*16/ncores);
+  for (l=first_row; l<end_row; l+=2){
+    row=l*32;
+    row2=(l+1)*32;
+    // consume four columns of A (= four rows of B) per step, for both rows of the pair
+    for (i=0; i<lda; i+=4){
+      element = A[row+i];
+      element2 = A[row+i+1];
+      element3 = A[row+i+2];
+      element4 = A[row+i+3];
+      element5 = A[row2+i];
+      element6 = A[row2+i+1];
+      element7 = A[row2+i+2];
+      element8 = A[row2+i+3];
+      column=i*32;
+      column2=(i+1)*32;
+      column3=(i+2)*32;
+      column4=(i+3)*32;
+      // j loop unrolled by four; every B term feeds both output rows
+      for (j=0; j<32; j+=4){
+	temp_mat[j]+=element*B[column+j]+element2*B[column2+j]+element3*B[column3+j]+element4*B[column4+j];
+	temp_mat[j+1]+=element*B[column+j+1]+element2*B[column2+j+1]+element3*B[column3+j+1]+element4*B[column4+j+1];
+	temp_mat[j+2]+=element*B[column+j+2]+element2*B[column2+j+2]+element3*B[column3+j+2]+element4*B[column4+j+2];
+	temp_mat[j+3]+=element*B[column+j+3]+element2*B[column2+j+3]+element3*B[column3+j+3]+element4*B[column4+j+3];
+	temp_mat2[j]+=element5*B[column+j]+element6*B[column2+j]+element7*B[column3+j]+element8*B[column4+j];
+	temp_mat2[j+1]+=element5*B[column+j+1]+element6*B[column2+j+1]+element7*B[column3+j+1]+element8*B[column4+j+1];
+	temp_mat2[j+2]+=element5*B[column+j+2]+element6*B[column2+j+2]+element7*B[column3+j+2]+element8*B[column4+j+2];
+	temp_mat2[j+3]+=element5*B[column+j+3]+element6*B[column2+j+3]+element7*B[column3+j+3]+element8*B[column4+j+3];
+      }
+    }
+    // the pair is complete: store both rows to C and clear the sums for the next pair
+    for(k=0; k<32; k++){
+	  C[row+k]=temp_mat[k];
+	  C[row2+k]=temp_mat2[k];
+	  temp_mat[k]=0;
+	  temp_mat2[k]=0;
+    }
+  }
+  
+   // ***************************** //
+   // **** ADD YOUR CODE HERE ***** //
+   // ***************************** //
+   //
+   // feel free to make a separate function for MI and MSI versions.
+
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+// Per-core entry point: record this core's id and the core count, then time and verify matmul().
+void thread_entry(int cid, int nc)
+{
+   coreid = cid;   // coreid is declared __thread, so each core keeps its own value
+   ncores = nc;    // ncores is an ordinary global written by every core
+
+   // static allocates data in the binary, which is visible to both threads
+   static data_t results_data[ARRAY_SIZE];
+
+   // The naive baseline run, its verification and the result clearing are disabled below.
+//   // Execute the provided, naive matmul
+//   barrier();
+//   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+// 
+//   
+//   // verify
+//   verify(ARRAY_SIZE, results_data, verify_data);
+//   
+//   // clear results from the first trial
+//   size_t i;
+//   if (coreid == 0) 
+//      for (i=0; i < ARRAY_SIZE; i++)
+//         results_data[i] = 0;
+//   barrier();
+
+   // barrier() is provided by util.h (not shown); presumably it holds a core until all arrive.
+   // Execute your faster matmul
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+   // the second barrier() is part of the timed statement, so the measurement includes it
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+   
+   // verify
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+
+   exit(0);
+}
+
diff --git a/mt/am_matmul/dataset.h b/mt/am_matmul/dataset.h
new file mode 100755
index 0000000..dde3ee4
--- /dev/null
+++ b/mt/am_matmul/dataset.h
@@ -0,0 +1,174 @@
+// Number of elements in each array below (DIM_SIZE * DIM_SIZE).
+#define ARRAY_SIZE 1024 
+
+// Matrix dimension; the benchmark sources divide cycle counts by DIM_SIZE^3.
+#define DIM_SIZE 32 
+
+static data_t input1_data[ARRAY_SIZE] = 
+{
+    0,   3,   2,   0,   3,   1,   0,   3,   2,   3,   2,   0,   3,   3,   1,   2,   3,   0,   0,   1, 
+    1,   1,   2,   3,   1,   2,   3,   1,   1,   3,   2,   2,   0,   1,   3,   2,   2,   2,   0,   0, 
+    1,   0,   1,   3,   3,   0,   3,   3,   3,   3,   0,   3,   2,   1,   2,   2,   0,   0,   3,   0, 
+    1,   1,   0,   3,   3,   1,   2,   3,   3,   0,   1,   2,   1,   0,   1,   2,   2,   1,   0,   3, 
+    1,   0,   2,   2,   1,   1,   1,   1,   1,   1,   2,   0,   3,   1,   1,   2,   2,   3,   3,   1, 
+    3,   2,   0,   0,   0,   3,   3,   3,   2,   1,   2,   3,   1,   0,   0,   0,   0,   1,   2,   2, 
+    1,   1,   3,   3,   3,   1,   1,   2,   3,   1,   3,   3,   2,   3,   2,   1,   2,   3,   0,   2, 
+    2,   1,   1,   0,   0,   0,   0,   0,   1,   3,   3,   1,   1,   1,   2,   2,   3,   2,   1,   1, 
+    1,   1,   3,   0,   2,   2,   1,   3,   2,   1,   2,   2,   1,   3,   1,   3,   1,   3,   2,   3, 
+    1,   2,   1,   3,   2,   2,   0,   1,   0,   0,   1,   2,   3,   3,   1,   0,   0,   0,   3,   1, 
+    2,   3,   2,   3,   2,   0,   0,   0,   0,   0,   3,   1,   3,   0,   0,   0,   3,   1,   1,   1, 
+    1,   2,   1,   2,   3,   2,   0,   0,   2,   2,   3,   0,   3,   0,   0,   3,   0,   3,   1,   3, 
+    3,   1,   1,   1,   2,   2,   1,   3,   0,   3,   3,   1,   0,   0,   3,   2,   1,   3,   3,   3, 
+    1,   0,   1,   1,   2,   1,   0,   1,   1,   2,   2,   3,   1,   2,   2,   2,   0,   1,   3,   3, 
+    3,   2,   2,   1,   0,   1,   2,   0,   1,   1,   1,   1,   2,   3,   2,   2,   3,   3,   0,   0, 
+    2,   0,   0,   0,   3,   0,   1,   0,   3,   0,   0,   0,   3,   0,   0,   2,   0,   2,   0,   0, 
+    2,   3,   2,   0,   0,   3,   3,   2,   1,   1,   0,   2,   0,   0,   3,   3,   2,   3,   3,   0, 
+    1,   0,   2,   2,   0,   3,   3,   1,   1,   0,   2,   3,   2,   1,   1,   0,   1,   2,   1,   2, 
+    2,   0,   0,   1,   0,   1,   1,   0,   1,   0,   2,   3,   3,   2,   0,   0,   1,   3,   0,   3, 
+    3,   0,   0,   0,   0,   3,   3,   1,   0,   0,   3,   3,   2,   1,   2,   1,   3,   3,   0,   1, 
+    3,   0,   2,   3,   1,   3,   3,   3,   3,   3,   0,   1,   1,   3,   0,   2,   2,   3,   1,   2, 
+    2,   2,   1,   3,   3,   0,   3,   0,   0,   2,   0,   2,   3,   0,   1,   3,   2,   2,   0,   0, 
+    2,   3,   0,   2,   2,   2,   3,   1,   0,   3,   3,   3,   3,   1,   0,   3,   3,   2,   0,   3, 
+    2,   0,   3,   0,   2,   0,   0,   2,   2,   1,   0,   2,   3,   1,   1,   1,   1,   2,   3,   3, 
+    3,   0,   0,   3,   3,   3,   2,   3,   3,   1,   2,   2,   3,   1,   2,   1,   1,   3,   0,   1, 
+    2,   0,   2,   0,   0,   1,   3,   2,   0,   1,   3,   2,   3,   3,   0,   0,   0,   1,   0,   3, 
+    3,   2,   2,   2,   1,   1,   2,   2,   1,   3,   2,   0,   1,   3,   2,   0,   2,   1,   3,   0, 
+    0,   0,   1,   3,   3,   2,   2,   2,   3,   1,   0,   0,   1,   1,   2,   1,   3,   1,   1,   2, 
+    2,   3,   2,   3,   0,   2,   3,   3,   0,   3,   0,   0,   1,   0,   0,   0,   1,   3,   1,   1, 
+    2,   3,   2,   1,   1,   2,   2,   2,   3,   0,   1,   1,   2,   1,   2,   0,   2,   3,   1,   3, 
+    0,   1,   1,   3,   0,   2,   3,   0,   1,   2,   3,   2,   0,   0,   3,   3,   2,   1,   1,   2, 
+    3,   0,   1,   1,   1,   1,   2,   0,   1,   2,   0,   1,   1,   1,   0,   1,   3,   2,   3,   1, 
+    0,   2,   1,   2,   1,   3,   3,   1,   0,   2,   2,   3,   1,   3,   1,   3,   0,   1,   0,   3, 
+    0,   3,   2,   0,   3,   3,   3,   0,   3,   2,   2,   2,   1,   3,   0,   0,   1,   1,   3,   0, 
+    1,   2,   1,   0,   0,   0,   3,   2,   2,   0,   0,   2,   1,   3,   0,   0,   3,   0,   0,   2, 
+    1,   1,   2,   2,   1,   3,   2,   2,   1,   1,   2,   1,   3,   2,   1,   1,   3,   0,   1,   3, 
+    3,   2,   2,   1,   0,   3,   2,   2,   2,   3,   0,   1,   3,   3,   2,   3,   0,   3,   2,   3, 
+    1,   1,   0,   0,   0,   2,   3,   0,   3,   0,   1,   1,   3,   1,   3,   2,   1,   1,   2,   1, 
+    3,   2,   0,   2,   1,   0,   2,   3,   2,   3,   2,   1,   2,   3,   0,   0,   1,   1,   0,   0, 
+    2,   1,   0,   1,   2,   2,   2,   2,   0,   3,   3,   1,   0,   0,   0,   0,   3,   1,   1,   0, 
+    0,   0,   0,   1,   2,   2,   1,   3,   0,   2,   3,   2,   3,   2,   2,   1,   2,   2,   3,   3, 
+    1,   3,   0,   2,   2,   3,   3,   1,   2,   2,   2,   3,   1,   1,   1,   0,   0,   0,   3,   0, 
+    1,   0,   3,   1,   1,   3,   0,   1,   2,   2,   0,   0,   3,   3,   3,   3,   2,   1,   0,   0, 
+    1,   0,   2,   0,   1,   1,   0,   0,   3,   3,   2,   1,   1,   1,   0,   1,   1,   2,   2,   1, 
+    1,   2,   0,   3,   1,   3,   1,   0,   3,   0,   3,   1,   1,   1,   0,   2,   0,   3,   1,   0, 
+    1,   0,   2,   0,   2,   3,   3,   3,   1,   2,   3,   2,   2,   0,   1,   1,   0,   3,   3,   1, 
+    3,   3,   2,   0,   2,   0,   2,   2,   3,   3,   3,   0,   2,   3,   3,   1,   3,   2,   2,   2, 
+    0,   2,   3,   0,   2,   0,   3,   2,   2,   1,   1,   0,   2,   2,   2,   0,   2,   2,   0,   1, 
+    3,   2,   1,   3,   2,   2,   0,   3,   3,   1,   2,   2,   0,   0,   3,   2,   1,   2,   2,   1, 
+    3,   1,   2,   0,   0,   1,   1,   2,   1,   3,   2,   2,   3,   0,   2,   1,   3,   2,   1,   3, 
+    2,   3,   3,   1,   2,   1,   2,   2,   0,   0,   0,   3,   0,   2,   3,   1,   0,   0,   2,   3, 
+    3,   2,   2,   1
+};
+
+static data_t input2_data[ARRAY_SIZE] = 
+{
+    1,   1,   0,   3,   1,   2,   0,   0,   0,   0,   0,   2,   1,   2,   3,   0,   0,   3,   3,   2, 
+    2,   1,   2,   3,   3,   0,   2,   2,   1,   1,   2,   2,   0,   2,   2,   1,   2,   3,   2,   2, 
+    3,   3,   2,   2,   1,   1,   1,   1,   2,   1,   2,   2,   3,   3,   3,   0,   0,   3,   2,   3, 
+    2,   3,   1,   2,   1,   1,   2,   2,   0,   1,   0,   3,   2,   1,   1,   1,   2,   0,   1,   2, 
+    2,   0,   2,   1,   3,   3,   2,   3,   2,   0,   3,   1,   3,   3,   2,   0,   1,   0,   1,   1, 
+    2,   2,   1,   1,   2,   2,   1,   2,   3,   3,   1,   3,   2,   2,   2,   3,   3,   1,   0,   2, 
+    1,   0,   0,   0,   1,   1,   2,   0,   3,   2,   3,   3,   0,   2,   3,   1,   0,   0,   2,   1, 
+    2,   0,   2,   1,   1,   2,   3,   1,   3,   2,   1,   0,   0,   0,   0,   0,   2,   2,   0,   2, 
+    1,   2,   0,   3,   2,   2,   0,   0,   3,   2,   1,   1,   3,   0,   2,   0,   0,   1,   0,   2, 
+    3,   3,   1,   3,   3,   0,   0,   2,   2,   0,   0,   0,   1,   0,   0,   1,   3,   0,   2,   1, 
+    3,   2,   2,   1,   3,   2,   0,   1,   2,   2,   3,   2,   1,   1,   1,   1,   3,   0,   1,   3, 
+    2,   2,   3,   1,   1,   2,   0,   2,   1,   1,   2,   3,   1,   0,   1,   0,   1,   1,   0,   0, 
+    2,   0,   3,   0,   3,   0,   3,   2,   2,   3,   3,   2,   1,   0,   2,   2,   1,   1,   0,   3, 
+    3,   2,   2,   0,   0,   3,   0,   1,   0,   0,   1,   2,   0,   1,   3,   0,   1,   2,   2,   0, 
+    0,   3,   0,   3,   0,   1,   1,   2,   0,   0,   0,   3,   0,   0,   2,   1,   1,   1,   0,   2, 
+    1,   3,   1,   2,   0,   3,   0,   3,   1,   3,   0,   0,   2,   2,   2,   2,   3,   3,   2,   1, 
+    2,   2,   1,   1,   2,   2,   2,   2,   0,   3,   0,   0,   2,   0,   1,   2,   0,   3,   2,   3, 
+    2,   0,   2,   1,   2,   1,   0,   2,   1,   1,   3,   2,   2,   3,   1,   0,   3,   3,   1,   0, 
+    3,   2,   2,   0,   0,   3,   0,   0,   2,   0,   3,   2,   3,   1,   1,   0,   0,   2,   3,   0, 
+    0,   1,   1,   1,   2,   1,   3,   2,   1,   3,   0,   1,   3,   3,   1,   1,   1,   1,   1,   1, 
+    0,   0,   2,   3,   2,   2,   2,   3,   2,   3,   1,   2,   3,   2,   2,   2,   0,   1,   3,   0, 
+    1,   1,   0,   1,   0,   1,   1,   3,   3,   1,   2,   2,   3,   2,   0,   2,   2,   0,   1,   3, 
+    0,   1,   3,   2,   1,   3,   3,   2,   0,   1,   3,   2,   0,   2,   1,   1,   0,   3,   0,   1, 
+    1,   1,   1,   1,   3,   0,   0,   1,   0,   2,   3,   1,   3,   0,   2,   1,   3,   0,   3,   0, 
+    3,   2,   2,   0,   0,   2,   1,   3,   3,   2,   3,   2,   2,   1,   2,   2,   3,   0,   3,   2, 
+    2,   0,   3,   2,   3,   2,   0,   0,   1,   2,   0,   0,   2,   0,   0,   3,   3,   2,   0,   0, 
+    3,   3,   0,   2,   3,   1,   0,   1,   0,   2,   1,   0,   2,   1,   0,   1,   0,   3,   0,   2, 
+    2,   3,   0,   0,   2,   1,   0,   1,   0,   0,   0,   2,   2,   3,   2,   0,   3,   3,   2,   1, 
+    0,   0,   3,   1,   2,   3,   3,   1,   0,   3,   1,   1,   0,   3,   3,   3,   2,   2,   2,   0, 
+    1,   2,   0,   3,   0,   1,   0,   1,   1,   0,   1,   2,   0,   3,   2,   0,   1,   2,   2,   0, 
+    2,   0,   0,   1,   0,   3,   0,   3,   2,   1,   1,   1,   1,   3,   2,   1,   1,   1,   1,   0, 
+    2,   1,   1,   3,   2,   0,   2,   1,   1,   0,   2,   2,   1,   3,   0,   2,   1,   0,   1,   2, 
+    0,   1,   3,   2,   3,   2,   1,   0,   2,   0,   2,   2,   3,   1,   1,   3,   2,   3,   2,   2, 
+    0,   2,   0,   0,   0,   3,   2,   0,   2,   2,   3,   3,   3,   2,   1,   2,   0,   0,   3,   0, 
+    2,   0,   3,   2,   2,   3,   0,   3,   2,   1,   2,   2,   1,   2,   0,   0,   3,   1,   2,   0, 
+    2,   3,   2,   2,   1,   1,   1,   3,   3,   3,   3,   3,   1,   3,   0,   1,   3,   2,   2,   1, 
+    0,   1,   1,   2,   1,   2,   3,   1,   2,   2,   1,   2,   1,   1,   0,   3,   3,   1,   1,   3, 
+    2,   0,   0,   1,   2,   0,   1,   3,   1,   0,   0,   2,   2,   3,   3,   0,   2,   3,   2,   1, 
+    1,   3,   0,   2,   2,   3,   3,   1,   2,   3,   3,   3,   1,   3,   0,   3,   1,   1,   2,   2, 
+    2,   1,   0,   3,   2,   3,   0,   2,   3,   2,   3,   1,   2,   3,   3,   1,   2,   1,   0,   0, 
+    0,   3,   3,   3,   3,   0,   3,   3,   3,   3,   2,   1,   0,   3,   0,   3,   2,   3,   1,   0, 
+    0,   1,   3,   1,   0,   2,   2,   3,   1,   0,   2,   1,   1,   3,   1,   1,   3,   1,   2,   1, 
+    0,   0,   3,   2,   1,   1,   1,   1,   3,   2,   1,   3,   3,   1,   0,   3,   1,   1,   2,   0, 
+    0,   0,   2,   3,   3,   2,   2,   3,   0,   2,   3,   1,   3,   3,   0,   2,   1,   2,   2,   2, 
+    1,   0,   1,   3,   2,   3,   1,   1,   2,   1,   1,   0,   0,   2,   3,   2,   1,   0,   3,   1, 
+    3,   0,   1,   1,   2,   2,   1,   3,   3,   1,   1,   0,   0,   3,   3,   0,   0,   0,   0,   0, 
+    3,   1,   3,   0,   0,   0,   3,   3,   2,   1,   3,   0,   1,   3,   1,   1,   1,   0,   1,   0, 
+    1,   2,   2,   2,   3,   3,   0,   2,   3,   2,   1,   3,   3,   1,   1,   3,   0,   3,   3,   2, 
+    1,   1,   2,   0,   3,   0,   1,   2,   1,   1,   0,   0,   1,   2,   2,   0,   3,   1,   1,   1, 
+    3,   3,   3,   1,   0,   3,   3,   2,   2,   2,   1,   2,   0,   1,   1,   3,   0,   3,   1,   0, 
+    2,   2,   0,   1,   2,   3,   2,   1,   2,   0,   3,   2,   1,   3,   0,   1,   2,   0,   3,   0, 
+    1,   1,   2,   1
+};
+
+static data_t verify_data[ARRAY_SIZE] = 
+{
+   72,  75,  88, 101,  80,  88,  73,  75,  80,  81,  58,  75,  86,  65,  60,  80,  84,  83,  87,  83, 
+  108,  93,  85,  76,  72,  98,  79,  86,  80,  96,  91,  85,  72,  64,  70,  83,  68,  92,  51,  54, 
+   85,  85,  60,  58,  90,  64,  55,  69,  72,  48,  94,  77,  91,  83,  70,  69,  67,  77,  59,  50, 
+   67,  74,  77,  67,  67,  62,  72,  71,  68,  79,  54,  61,  67,  61,  55,  62,  78,  60,  53,  64, 
+   67,  69,  99,  68,  88,  60,  66,  63,  70,  62,  65,  50,  53,  66,  70,  72,  75,  78,  85,  95, 
+   71,  89,  70,  68,  86,  88,  58,  77,  84,  70,  65,  68,  73,  75,  91,  96, 105,  92,  76,  68, 
+   86,  69,  80,  59,  73,  83,  88,  75,  64,  63,  71,  99,  77,  77,  69,  55,  80,  73,  54,  73, 
+   87,  78,  60,  69,  65,  78,  86,  89,  95,  92,  63,  69,  89,  61,  80,  65,  70,  77,  89,  77, 
+   79,  79,  73,  92,  64,  81,  60,  78,  81,  80,  61,  63,  89,  65,  56,  83,  77,  65, 102,  70, 
+   98,  86,  96,  68,  72,  89,  73,  73,  70,  89,  84,  76,  48,  61,  63,  70,  70,  79,  50,  53, 
+   64,  63,  43,  51,  59,  62,  43,  63,  55,  77,  79,  74,  75,  74,  64,  44,  65,  69,  72,  66, 
+   54,  71,  74,  72,  69,  76,  68,  89,  94,  75,  65,  53,  85,  79,  65,  74,  82,  73,  58,  70, 
+   84,  77,  99,  72,  92,  84,  78,  62,  59,  83,  71,  74,  63,  85,  80,  78,  71,  72,  79,  83, 
+   73,  82,  60,  85,  76,  82,  60,  70,  82,  68,  54,  85,  84,  70,  86,  74, 100,  88,  98,  68, 
+   67,  87,  69,  73,  68,  88,  76,  71,  47,  43,  47,  80,  54,  65,  40,  37,  59,  53,  33,  48, 
+   62,  40,  36,  55,  36,  62,  53,  57,  70,  69,  45,  43,  53,  61,  42,  57,  56,  63,  51,  47, 
+   59,  75,  64,  89,  83,  75,  59,  75,  91,  92,  58,  64,  83,  74,  58,  60,  76,  66,  97,  69, 
+   90,  95,  92,  64,  78,  75,  77,  73,  65,  78,  82,  75,  47,  54,  59,  71,  59,  56,  53,  42, 
+   60,  55,  40,  51,  60,  46,  36,  59,  46,  57,  67,  43,  51,  53,  53,  38,  54,  56,  55,  48, 
+   41,  46,  63,  63,  80,  77,  89, 102,  89,  98,  74,  86,  98,  93,  63,  76,  98,  77,  48, 101, 
+   86,  88, 100,  82, 102,  90,  95,  75,  86, 103,  83,  98,  80, 104,  98,  86,  71,  74,  80,  90, 
+   86,  87,  73,  70,  81,  83,  55,  66,  90,  66,  58,  84,  77,  84,  93,  72,  99,  75,  85,  65, 
+   70,  89,  71,  82,  64,  79,  82,  80,  67,  73,  86, 101,  78,  97,  66,  64,  84,  80,  55,  64, 
+   79,  73,  51,  79,  89,  68,  94,  77, 109, 102,  82,  61,  66,  93,  88,  70,  82,  82,  85,  69, 
+   69,  72,  66,  97,  85,  90,  70,  59,  76,  89,  53,  56,  90,  79,  71,  64,  70,  67, 100,  92, 
+  106,  89,  83,  78,  73,  80,  70,  72,  65,  70,  92,  88,  57,  76,  55,  85,  66,  80,  61,  63, 
+   63,  78,  54,  58,  71,  73,  54,  63,  63,  62,  89,  76,  86,  81,  83,  54,  70,  81,  78,  64, 
+   56,  72,  74,  81,  75,  63,  68,  89,  65,  77,  58,  68,  75,  83,  52,  62,  82,  63,  55,  75, 
+   51,  70,  95,  66,  83,  77,  86,  61,  64,  77,  48,  70,  66,  82,  72,  75,  79,  71,  72,  89, 
+   78,  78,  66,  59,  91,  80,  55,  64,  79,  68,  54,  71,  67,  75,  87,  84, 100, 101,  76,  58, 
+   74,  82,  61,  74,  75,  97,  85,  79,  61,  55,  69,  68,  72,  65,  52,  64,  80,  73,  48,  54, 
+   71,  66,  42,  61,  66,  63,  92,  64,  85,  77,  73,  54,  74,  73,  76,  66,  62,  79,  85,  70, 
+   71,  84,  87,  81,  88,  86,  77,  77,  93,  88,  78,  71, 101,  89,  58,  84,  95,  81,  89,  97, 
+  104,  79,  83,  76,  90,  81,  91,  74,  70,  76,  91,  80,  51,  48,  56,  69,  47,  63,  54,  42, 
+   63,  63,  42,  52,  66,  56,  39,  59,  61,  52,  59,  63,  62,  68,  57,  35,  67,  58,  56,  52, 
+   61,  63,  60,  47,  85,  75,  89, 106,  88,  95,  74,  82, 107, 107,  64,  78,  98,  90,  62,  91, 
+   79,  87, 111,  84, 104, 106,  96,  68,  94,  99,  81,  89,  79, 105,  95,  86,  65,  63,  77,  89, 
+   66,  88,  56,  73,  82,  92,  41,  62,  85,  66,  50,  81,  57,  71,  77,  78,  86,  89,  77,  53, 
+   67,  78,  61,  63,  72,  82,  69,  66,  59,  46,  55,  70,  56,  64,  45,  50,  65,  64,  42,  56, 
+   78,  49,  51,  52,  38,  56,  72,  55,  73,  72,  61,  50,  63,  60,  47,  57,  55,  73,  53,  68, 
+   85,  88,  91,  96,  82,  89,  73,  76,  87,  86,  67,  69,  96,  84,  57,  89,  87,  89,  99,  88, 
+  104,  90,  85,  75,  88,  92,  85,  75,  74,  87, 103,  94,  55,  48,  56,  65,  72,  50,  45,  51, 
+   63,  62,  47,  57,  79,  53,  36,  63,  54,  68,  71,  59,  63,  61,  63,  41,  50,  73,  57,  59, 
+   56,  76,  73,  65,  61,  64,  61,  79,  53,  73,  57,  44,  61,  59,  59,  56,  81,  59,  49,  62, 
+   65,  55,  69,  72,  79,  70,  58,  57,  68,  61,  62,  50,  57,  60,  66,  66,  63,  77,  81,  89, 
+   85,  81,  76,  73,  78,  95,  59,  70,  81,  77,  46,  79,  78,  79,  83,  81,  84,  82,  85,  48, 
+   74,  85,  85,  74,  74,  80,  80,  74,  60,  76,  80,  97,  88,  93,  66,  66,  73,  84,  56,  70, 
+   90,  63,  58,  78,  73,  93,  90,  78,  94,  88,  82,  67,  85,  70,  81,  86,  74,  82,  88,  82, 
+   68,  73,  75,  91,  78,  97,  71,  66,  74,  85,  50,  59,  86,  77,  70,  74,  75,  74,  99,  82, 
+   99,  91,  86,  65,  80,  77,  72,  69,  60,  78,  90,  87,  79,  69,  74,  98,  70,  86,  81,  67, 
+   69,  78,  48,  65,  88,  70,  70,  70,  69,  72,  96,  90,  99,  82,  81,  76,  98,  73,  74,  71, 
+   69,  73,  94,  89
+};
+
diff --git a/mt/am_matmul/matmul2.c b/mt/am_matmul/matmul2.c
new file mode 100644
index 0000000..30c705d
--- /dev/null
+++ b/mt/am_matmul/matmul2.c
@@ -0,0 +1,73 @@
+/*size_t i;
+  size_t j;
+  size_t max_dim = lda*lda;
+  if (coreid==0){
+    for (i=0; i<max_dim/(ncores*2); i+=8){
+      data_t elementA1 = A[i];
+      data_t elementA12 = A[i+1];
+      data_t elementA13 = A[i+2];
+      data_t elementA14 = A[i+3];
+      data_t elementA15 = A[i+4];
+      data_t elementA16 = A[i+5];
+      data_t elementA17 = A[i+6];
+      data_t elementA18 = A[i+7];
+      data_t elementA2 = A[i+32*8];
+      data_t elementA21 = A[i+32*8+1];
+      data_t elementA22 = A[i+32*8+2];
+      data_t elementA23 = A[i+32*8+3];
+      data_t elementA24 = A[i+32*8+4];
+      data_t elementA25 = A[i+32*8+5];
+      data_t elementA26 = A[i+32*8+6];
+      data_t elementA27 = A[i+32*8+7];
+      int row= (int)(i/32)*32;
+      int row2 = row+8*32;
+      int column1 = i%32*32;
+      int column12 = (i+1)%32*32;
+      int column13 = (i+2)%32*32;
+      int column14 = (i+3)%32*32;
+      int column15 = (i+4)%32*32;
+      int column16 = (i+5)%32*32;
+      int column17 = (i+6)%32*32;
+      int column18 = (i+7)%32*32;
+    
+      for (j=0; j<lda; j++){
+	C[row+j]+=elementA1*B[column1+j]+elementA12*B[column12+j]+elementA13*B[column13+j]+elementA14*B[column14+j]+elementA15*B[column15+j]+elementA16*B[column16+j]+elementA17*B[column17+j]+elementA18*B[column18+j]
+ 
+	C[row2+j]+=elementA2*B[column1+j]+elementA21*B[column12+j]+elementA22*B[column13+j]+elementA23*B[column14+j]+elementA24*B[column15+j]+elementA25*B[column16+j]+elementA26*B[column17+j]+elementA27*B[column18+j];
+      }
+  }}else{
+    for (i=max_dim/2; i<(max_dim/(ncores*2)+max_dim/2); i+=8){
+      data_t elementA1 = A[i];
+      data_t elementA12 = A[i+1];
+      data_t elementA13 = A[i+2];
+      data_t elementA14 = A[i+3];
+      data_t elementA15 = A[i+4];
+      data_t elementA16 = A[i+5];
+      data_t elementA17 = A[i+6];
+      data_t elementA18 = A[i+7];
+      data_t elementA2 = A[i+32*8];
+      data_t elementA21 = A[i+32*8+1];
+      data_t elementA22 = A[i+32*8+2];
+      data_t elementA23 = A[i+32*8+3];
+      data_t elementA24 = A[i+32*8+4];
+      data_t elementA25 = A[i+32*8+5];
+      data_t elementA26 = A[i+32*8+6];
+      data_t elementA27 = A[i+32*8+7];
+      int row= (int)(i/32)*32;
+      int row2 = row+8*32;
+      int column1 = i%32*32;
+      int column12 = (i+1)%32*32;
+      int column13 = (i+2)%32*32;
+      int column14 = (i+3)%32*32;
+      int column15 = (i+4)%32*32;
+      int column16 = (i+5)%32*32;
+      int column17 = (i+6)%32*32;
+      int column18 = (i+7)%32*32;
+    
+      for (j=0; j<lda; j++){
+	C[row+j]+=elementA1*B[column1+j]+elementA12*B[column12+j]+elementA13*B[column13+j]+elementA14*B[column14+j]+elementA15*B[column15+j]+elementA16*B[column16+j]+elementA17*B[column17+j]+elementA18*B[column18+j];
+	C[row2+j]+=elementA2*B[column1+j]+elementA21*B[column12+j]+elementA22*B[column13+j]+elementA23*B[column14+j]+elementA24*B[column15+j]+elementA25*B[column16+j]+elementA26*B[column17+j]+elementA27*B[column18+j];
+
+      }
+    }
+    }*/
diff --git a/mt/am_matmul/matmul2.c~ b/mt/am_matmul/matmul2.c~
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/mt/am_matmul/matmul2.c~
diff --git a/mt/am_matmul/matmul3.c b/mt/am_matmul/matmul3.c
new file mode 100755
index 0000000..9a79baa
--- /dev/null
+++ b/mt/am_matmul/matmul3.c
@@ -0,0 +1,221 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third array. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;
+unsigned long ncores;
+
+#include "util.h"
+   
+#define stringify_1(s) #s  // turns its argument into a string literal
+#define stringify(s) stringify_1(s)  // extra level so macro arguments are expanded before stringification
+#define stats(code) do { /* run `code` between two counter samples; core 0 prints the deltas */ \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); /* start at minus the "before" sample so the += below leaves the delta */ \
+    code; \
+    _c += rdcycle(), _i += rdinstret(); /* _c = rdcycle() delta, _i = rdinstret() delta */ \
+    if (coreid == 0) \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", /* each "%ld.%ld" pair: integer part, then first decimal digit */ \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+// Print `name` and then the first n entries of arr on one line; only core 0 produces output.
+void printArray( char name[], int n, data_t arr[] )
+{
+   if (coreid == 0)
+   {
+      int idx = 0;
+      printf( " %10s :", name );
+      while ( idx < n )
+         printf( " %3ld ", (long) arr[idx++] );  // each value is converted to long for printing
+      printf( "\n" );
+   }
+}
+// Core 0 only: compare test[0..n) with correct[0..n); on the first mismatch print it and exit(-1).
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   if (coreid != 0)
+      return;
+
+   size_t i;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] != correct[i])
+      {
+         // i is a size_t: convert it to long so each argument matches its %ld conversion
+         // (handing a size_t to %d is a format/argument mismatch, i.e. undefined behaviour).
+         printf("FAILED test[%ld]= %3ld, correct[%ld]= %3ld\n", 
+            (long)i, (long)test[i], (long)i, (long)correct[i]);
+         exit(-1);
+      }
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// Single-threaded reference: accumulates the lda x lda row-major product A*B into C on core 0 only.
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int i, j, k;
+
+   if (coreid > 0)  // every core other than 0 returns without doing any work
+      return;
+  // i selects the output column, j the output row, k walks the dot product
+   for ( i = 0; i < lda; i++ )
+      for ( j = 0; j < lda; j++ )  
+      {
+         for ( k = 0; k < lda; k++ ) 
+         {
+            C[i + j*lda] += A[j*lda + k] * B[k*lda + i];  // += : C must already hold zeros for a pure product
+         }
+      }
+
+}
+ 
+// Multi-threaded 32x32 matmul, variant: each core takes a contiguous band of C rows, splits it
+// into an upper and a lower half, and computes one row from each half in the same pass.
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+  size_t i;
+  size_t j;
+  size_t k;
+  // Hand out the 16 row pairs to the cores so every band holds an even number of whole rows.
+  size_t first_row = 2*(coreid*16/ncores);
+  size_t end_row = 2*((coreid+1)*16/ncores);
+  size_t start = first_row*32;              // index of the band's first element
+  size_t half = (end_row-first_row)/2*32;   // element count of one half of the band
+  data_t temp_mat[32]={0};
+  data_t temp_mat2[32]={0};
+  // Walk the upper half only, eight A entries per step; the partner row sits `half` elements on.
+  // The bound must be start+half: halving the absolute end index leaves core 1 with no work.
+   for (i=start; i<start+half; i+=8){
+    data_t element=A[i];
+    data_t element2 = A[i+1];
+    data_t element3 = A[i+2];
+    data_t element4 = A[i+3];
+    data_t element5 = A[i+4];
+    data_t element6 = A[i+5];
+    data_t element7 = A[i+6];
+    data_t element8 = A[i+7];
+    data_t elementA2 = A[i+half];
+      data_t elementA21 = A[i+half+1];
+      data_t elementA22 = A[i+half+2];
+      data_t elementA23 = A[i+half+3];
+      data_t elementA24 = A[i+half+4];
+      data_t elementA25 = A[i+half+5];
+      data_t elementA26 = A[i+half+6];
+      data_t elementA27 = A[i+half+7];
+    int row= (int)(i/32)*32;
+    int row2 = row+(int)half;
+    int column = i%32*32;
+    int column2 = (i+1)%32*32;
+    int column3 = (i+2)%32*32;
+    int column4 = (i+3)%32*32;
+    int column5 = (i+4)%32*32;
+    int column6 = (i+5)%32*32;
+    int column7 = (i+6)%32*32;
+    int column8 = (i+7)%32*32;
+    
+    for (j=0; j<32; j++){
+      temp_mat[j]+=element*B[column+j]+element2*B[column2+j]+element3*B[column3+j]+element4*B[column4+j]+element5*B[column5+j]+element6*B[column6+j]+element7*B[column7+j]+element8*B[column8+j];
+      temp_mat2[j]+=elementA2*B[column+j]+elementA21*B[column2+j]+elementA22*B[column3+j]+elementA23*B[column4+j]+elementA24*B[column5+j]+elementA25*B[column6+j]+elementA26*B[column7+j]+elementA27*B[column8+j];
+    }
+    // last eight-wide chunk of this row pair: store both rows and clear the sums
+    if (i%32==24){
+      for(k=0; k<32; k++){
+	C[row+k]=temp_mat[k];
+	C[row2+k]=temp_mat2[k];
+	temp_mat[k]=0;
+	temp_mat2[k]=0;
+      }
+    }
+    }
+
+   // ***************************** //
+   // **** ADD YOUR CODE HERE ***** //
+   // ***************************** //
+   //
+   // feel free to make a separate function for MI and MSI versions.
+
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+// Per-core entry point: time and verify matmul_naive(), clear the results, then time and verify matmul().
+void thread_entry(int cid, int nc)
+{
+   coreid = cid;   // coreid is declared __thread, so each core keeps its own value
+   ncores = nc;    // ncores is an ordinary global written by every core
+
+   // static allocates data in the binary, which is visible to both threads
+   static data_t results_data[ARRAY_SIZE];
+
+   // barrier() is provided by util.h (not shown); presumably it holds a core until all arrive.
+   // Execute the provided, naive matmul
+   barrier();
+   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+   // the second barrier() is part of the timed statement, so the measurement includes it
+   
+   // verify
+   verify(ARRAY_SIZE, results_data, verify_data);
+   // only core 0 writes the zeros; the barrier() after the loop follows on every core
+   // clear results from the first trial
+   size_t i;
+   if (coreid == 0) 
+      for (i=0; i < ARRAY_SIZE; i++)
+         results_data[i] = 0;
+   barrier();
+
+   
+   // Execute your faster matmul
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+ 
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+   
+   // verify
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+
+   exit(0);
+}
+
diff --git a/mt/am_matmul/matmul4.c b/mt/am_matmul/matmul4.c
new file mode 100755
index 0000000..05a1aa4
--- /dev/null
+++ b/mt/am_matmul/matmul4.c
@@ -0,0 +1,282 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third vector. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;
+unsigned long ncores;
+
+#include "util.h"
+   
+// stringify(s) yields the text of s as a string literal after s itself has
+// been macro-expanded; stringify_1 is the inner step that applies '#'.
+#define stringify_1(s) #s
+#define stringify(s) stringify_1(s)
+// stats(code): execute `code` once and, on core 0 only, print its source text
+// together with the elapsed cycle count, the cycle count divided by
+// DIM_SIZE^3 (labelled cycles/iter) and cycles divided by instructions
+// (labelled CPI), each ratio with one decimal digit.
+// rdcycle()/rdinstret() are not defined in this file; presumably they read the
+// cycle and retired-instruction counters -- confirm against util.h.
+#define stats(code) do { \
+    /* start each counter at minus the current reading ... */ \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
+    code; \
+    /* ... so adding the later reading leaves the elapsed difference */ \
+    _c += rdcycle(), _i += rdinstret(); \
+    if (coreid == 0) \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+    
+// Print `name` followed by the n elements of arr on one line, each element
+// converted to long.  Only core 0 prints; every other core returns silently.
+void printArray( char name[], int n, data_t arr[] )
+{
+   int idx = 0;
+
+   if (coreid == 0)
+   {
+      printf( " %10s :", name );
+      while ( idx < n )
+      {
+         long truncated = (long) arr[idx];
+         printf( " %3ld ", truncated );
+         idx++;
+      }
+      printf( "\n" );
+   }
+}
+      
+// Compare the first n elements of `test` against `correct` (exact equality).
+// Runs on core 0 only; other cores return immediately.  On the first mismatch
+// the index and both values (converted to long) are printed and the program
+// terminates via exit(-1).  Returns normally when all elements match.
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   if (coreid != 0)
+      return;
+
+   size_t i;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] != correct[i])
+      {
+         // the index is a size_t but the format uses %d: convert explicitly so
+         // the argument type matches the conversion specifier
+         printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n", 
+            (int)i, (long)test[i], (int)i, (long)correct[i]);
+         exit(-1);
+      }
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// single-thread, naive version
+// Single-threaded reference kernel: C += A * B for lda x lda row-major
+// matrices.  Only core 0 computes; all other cores return at once.
+// The output column is the outermost loop and the reduction index the
+// innermost, with every product added straight into C.
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int col, row, inner;
+
+   if (coreid != 0)
+      return;
+
+   for ( col = 0; col < lda; col++ )
+   {
+      for ( row = 0; row < lda; row++ )
+      {
+         const int row_base = row*lda;     // start of this row in A and in C
+         const int out_idx  = col + row_base;
+
+         for ( inner = 0; inner < lda; inner++ )
+            C[out_idx] += A[row_base + inner] * B[inner*lda + col];
+      }
+   }
+}
+ 
+
+
+// Multi-threaded matrix multiply: C += A * B for lda x lda row-major matrices.
+//
+// The rows of C are split into ncores contiguous bands and the calling core
+// (coreid) computes only its own band, so no element of C is written by more
+// than one core and the kernel needs no synchronization of its own.
+// Products are accumulated into C: pass in a cleared C to get the plain
+// product.  Nothing is done when lda <= 0 or ncores == 0.
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int i, j, k;
+   int first_row, last_row, k_blocked;
+
+   if (lda <= 0 || ncores == 0)
+      return;
+
+   // band [first_row, last_row) owned by this core; consecutive cores' bands
+   // are adjacent and together cover rows 0..lda-1, also when ncores does not
+   // divide lda
+   first_row = (int)(coreid * lda / ncores);
+   last_row  = (int)((coreid + 1) * lda / ncores);
+
+   // largest multiple of 4 not above lda: the part of the reduction that is
+   // processed four rows of B at a time
+   k_blocked = lda - lda % 4;
+
+   for (i = first_row; i < last_row; i++)
+   {
+      const data_t *a_row = A + i*lda;
+      data_t *c_row = C + i*lda;
+
+      // four elements of A's row are held in locals and applied to four rows
+      // of B per sweep, so each C element is updated once per four products
+      for (k = 0; k < k_blocked; k += 4)
+      {
+         const data_t a0 = a_row[k];
+         const data_t a1 = a_row[k+1];
+         const data_t a2 = a_row[k+2];
+         const data_t a3 = a_row[k+3];
+         const data_t *b0 = B + k*lda;
+         const data_t *b1 = b0 + lda;
+         const data_t *b2 = b1 + lda;
+         const data_t *b3 = b2 + lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j] + a1*b1[j] + a2*b2[j] + a3*b3[j];
+      }
+
+      // leftover rows of B when lda is not a multiple of 4
+      for (; k < lda; k++)
+      {
+         const data_t a0 = a_row[k];
+         const data_t *b0 = B + k*lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j];
+      }
+   }
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+  
+// Per-core entry point: cid is this core's index, nc the number of cores.
+// Runs the naive kernel and then matmul() on the same inputs under stats(),
+// checks each result against verify_data, and finally calls exit(0).
+void thread_entry(int cid, int nc)
+{
+   // static storage: a single results buffer in the binary, seen by all threads
+   static data_t results_data[ARRAY_SIZE];
+   size_t slot;
+
+   coreid = cid;
+   ncores = nc;
+
+   // Pass 1: the provided reference kernel; the trailing barrier() is part of
+   // the measured region
+   barrier();
+   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+   verify(ARRAY_SIZE, results_data, verify_data);
+
+   // core 0 wipes the buffer before the second pass; everyone meets at the
+   // barrier afterwards
+   if (coreid == 0)
+   {
+      slot = 0;
+      while (slot < ARRAY_SIZE)
+      {
+         results_data[slot] = 0;
+         slot++;
+      }
+   }
+   barrier();
+
+   // Pass 2: the optimized kernel, measured the same way
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+
+   exit(0);
+}
+
diff --git a/mt/am_matmul/matmul_gendata.pl b/mt/am_matmul/matmul_gendata.pl
new file mode 100755
index 0000000..f21bb46
--- /dev/null
+++ b/mt/am_matmul/matmul_gendata.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# matmul_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : April 29, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data
+# for the matmul benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+# Print the option summary followed by the script description held in
+# $usageMsg, then terminate the script.
+sub usage()
+{
+  my @helpLines = (
+    "",
+    " Usage: matmul_gendata.pl [options] ",
+    "",
+    " Options:",
+    "  --help  print this message",
+    "  --size  size of input data [1000]",
+    "  --seed  random seed [1]",
+  );
+
+  print "$_\n" for @helpLines;
+  print "$usageMsg";
+
+  exit();
+}
+
+# Fill %opts with the defaults (help=0, size=1000, seed=1) and let GetOptions
+# override them from the command line.  Shows the usage text and exits when
+# parsing fails, when --help is given, or when the size is not positive.
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 1000;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+  # 'size:i' makes the value optional, so a bare --size arrives here as 0, and
+  # negative numbers parse as well.  Either would leave the matrices empty and
+  # make the later dimension lookup die with an unrelated message, so reject
+  # them here.
+  if ( $opts{"size"} <= 0 ) {
+    print STDERR "matmul_gendata.pl: --size must be a positive integer\n";
+    usage();
+  }
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+# Print the array behind $_[1] as a C initializer named $_[0]:
+#   static data_t NAME[ARRAY_SIZE] = { ... };
+# Elements are formatted with %3d, separated by ", ", and laid out 20 per
+# row with a two-space indent.  An empty array still produces one blank
+# indented row.
+sub printArray
+{
+  my ($arrayName, $arrayRef) = @_;
+
+  my $numCols  = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+  my $lastIdx  = $arrayLen - 1;
+
+  print "static data_t ".$arrayName."[ARRAY_SIZE] = \n";
+  print "{\n";
+
+  # Walk the array in chunks of $numCols; the body runs at least once so that
+  # the empty case emits its single blank row.
+  my $start = 0;
+  do {
+    my $end = $start + $numCols - 1;
+    $end = $lastIdx if $end > $lastIdx;
+
+    print "  ";
+    for my $index ( $start .. $end ) {
+      print sprintf("%3d",$arrayRef->[$index]);
+      # every element except the very last one is followed by a separator
+      print ", " if $index != $lastIdx;
+    }
+    print "\n";
+
+    $start += $numCols;
+  } while ( $start < $arrayLen );
+
+  print "};\n\n";
+}
+
+
+
+#--------------------------------------------------------------------------
+# Matmul
+#--------------------------------------------------------------------------
+
+# http://answers.oreilly.com/topic/418-how-to-multiply-matrices-in-perl/
+
+# Multiply two matrices given as references to arrays of row arrays and
+# return a reference to the product.  The result has as many rows as the
+# first operand and as many columns as the second; the reduction runs over
+# the columns of the first operand.
+sub mmult {
+    my ($left, $right) = @_;
+    my ($leftRows, $leftCols) = matdim($left);
+    # only the column count of the right operand is used
+    my (undef, $rightCols)    = matdim($right);
+
+    my $product = [  ];
+
+    for my $r (0 .. $leftRows - 1) {
+        for my $c (0 .. $rightCols - 1) {
+            for my $t (0 .. $leftCols - 1) {
+                $product->[$r][$c] += $left->[$r][$t] * $right->[$t][$c];
+            }
+        }
+    }
+    return $product;
+}
+
+# Return the list 0 .. N-1 for the given count N (empty when N is 0).
+sub range {
+    my ($count) = @_;
+    return (0 .. $count - 1);
+}
+
+
+# Return the number of elements in the array referenced by the argument.
+# Dies when the argument is not an ARRAY reference.
+sub veclen {
+    my ($ary_ref) = @_;
+    my $type = ref $ary_ref;
+    die "$type is bad array ref for $ary_ref" unless $type eq "ARRAY";
+    return scalar(@$ary_ref);
+}
+
+# Return (rows, cols) of a matrix stored as a reference to an array of row
+# arrays; the column count is taken from the first row.
+sub matdim {
+    my ($matrix) = @_;
+    return ( veclen($matrix), veclen($matrix->[0]) );
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+# Generate the data set: parse the options, seed the generator, build two
+# size x size matrices of random integers in 0..3, multiply them, and print
+# the ARRAY_SIZE / DIM_SIZE defines followed by the three flattened arrays.
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+
+  my $dim = $opts{"size"};
+
+  # Random inputs.  The two rand() draws are interleaved element by element,
+  # which fixes the values each matrix receives for a given seed.
+  my $mat_values1;
+  my $mat_values2;
+  foreach my $i ( 0 .. $dim-1 ) {
+    foreach my $j ( 0 .. $dim-1 ) {
+      $mat_values1->[$i][$j] = int(rand(4));
+      $mat_values2->[$i][$j] = int(rand(4));
+    }
+  }
+
+  # reference product
+  my $mat_results = mmult( $mat_values1, $mat_values2 );
+
+  # flatten the three matrices row by row into plain lists
+  my ( @values1, @values2, @results );
+  foreach my $i ( 0 .. $dim-1 ) {
+    push( @values1, @{ $mat_values1->[$i] } );
+    push( @values2, @{ $mat_values2->[$i] } );
+    push( @results, @{ $mat_results->[$i] } );
+  }
+
+  print "\n\#define ARRAY_SIZE ".($dim*$dim)." \n\n";
+  print "\n\#define DIM_SIZE ".$dim." \n\n";
+
+  printArray( "input1_data", \@values1 );
+  printArray( "input2_data", \@values2 );
+  printArray( "verify_data", \@results);
+
+}
+
+main();
+
diff --git a/mt/am_matmul/matmul_mi.c b/mt/am_matmul/matmul_mi.c
new file mode 100755
index 0000000..841a4b5
--- /dev/null
+++ b/mt/am_matmul/matmul_mi.c
@@ -0,0 +1,249 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third vector. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;
+unsigned long ncores;
+
+#include "util.h"
+   
+// stringify(s) yields the text of s as a string literal after s itself has
+// been macro-expanded; stringify_1 is the inner step that applies '#'.
+#define stringify_1(s) #s
+#define stringify(s) stringify_1(s)
+// stats(code): execute `code` once and, on core 0 only, print its source text
+// together with the elapsed cycle count, the cycle count divided by
+// DIM_SIZE^3 (labelled cycles/iter) and cycles divided by instructions
+// (labelled CPI), each ratio with one decimal digit.
+// rdcycle()/rdinstret() are not defined in this file; presumably they read the
+// cycle and retired-instruction counters -- confirm against util.h.
+#define stats(code) do { \
+    /* start each counter at minus the current reading ... */ \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
+    code; \
+    /* ... so adding the later reading leaves the elapsed difference */ \
+    _c += rdcycle(), _i += rdinstret(); \
+    if (coreid == 0) \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+    
+// Print `name` followed by the n elements of arr on one line, each element
+// converted to long.  Only core 0 prints; every other core returns silently.
+void printArray( char name[], int n, data_t arr[] )
+{
+   int idx = 0;
+
+   if (coreid == 0)
+   {
+      printf( " %10s :", name );
+      while ( idx < n )
+      {
+         long truncated = (long) arr[idx];
+         printf( " %3ld ", truncated );
+         idx++;
+      }
+      printf( "\n" );
+   }
+}
+      
+// Compare the first n elements of `test` against `correct` (exact equality).
+// Runs on core 0 only; other cores return immediately.  On the first mismatch
+// the index and both values (converted to long) are printed and the program
+// terminates via exit(-1).  Returns normally when all elements match.
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   if (coreid != 0)
+      return;
+
+   size_t i;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] != correct[i])
+      {
+         // the index is a size_t but the format uses %d: convert explicitly so
+         // the argument type matches the conversion specifier
+         printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n", 
+            (int)i, (long)test[i], (int)i, (long)correct[i]);
+         exit(-1);
+      }
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// single-thread, naive version
+// Single-threaded reference kernel: C += A * B for lda x lda row-major
+// matrices.  Only core 0 computes; all other cores return at once.
+// The output column is the outermost loop and the reduction index the
+// innermost, with every product added straight into C.
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int col, row, inner;
+
+   if (coreid != 0)
+      return;
+
+   for ( col = 0; col < lda; col++ )
+   {
+      for ( row = 0; row < lda; row++ )
+      {
+         const int row_base = row*lda;     // start of this row in A and in C
+         const int out_idx  = col + row_base;
+
+         for ( inner = 0; inner < lda; inner++ )
+            C[out_idx] += A[row_base + inner] * B[inner*lda + col];
+      }
+   }
+}
+ 
+
+
+// Multi-threaded matrix multiply: C += A * B for lda x lda row-major matrices.
+//
+// The rows of C are split into ncores contiguous bands and the calling core
+// (coreid) computes only its own band, so no element of C is written by more
+// than one core and the kernel needs no synchronization of its own.
+// Products are accumulated into C: pass in a cleared C to get the plain
+// product.  Nothing is done when lda <= 0 or ncores == 0.
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int i, j, k;
+   int first_row, last_row, k_blocked;
+
+   if (lda <= 0 || ncores == 0)
+      return;
+
+   // band [first_row, last_row) owned by this core; consecutive cores' bands
+   // are adjacent and together cover rows 0..lda-1, also when ncores does not
+   // divide lda
+   first_row = (int)(coreid * lda / ncores);
+   last_row  = (int)((coreid + 1) * lda / ncores);
+
+   // largest multiple of 4 not above lda: the part of the reduction that is
+   // processed four rows of B at a time
+   k_blocked = lda - lda % 4;
+
+   for (i = first_row; i < last_row; i++)
+   {
+      const data_t *a_row = A + i*lda;
+      data_t *c_row = C + i*lda;
+
+      // four elements of A's row are held in locals and applied to four rows
+      // of B per sweep, so each C element is updated once per four products
+      for (k = 0; k < k_blocked; k += 4)
+      {
+         const data_t a0 = a_row[k];
+         const data_t a1 = a_row[k+1];
+         const data_t a2 = a_row[k+2];
+         const data_t a3 = a_row[k+3];
+         const data_t *b0 = B + k*lda;
+         const data_t *b1 = b0 + lda;
+         const data_t *b2 = b1 + lda;
+         const data_t *b3 = b2 + lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j] + a1*b1[j] + a2*b2[j] + a3*b3[j];
+      }
+
+      // leftover rows of B when lda is not a multiple of 4
+      for (; k < lda; k++)
+      {
+         const data_t a0 = a_row[k];
+         const data_t *b0 = B + k*lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j];
+      }
+   }
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+  
+// Per-core entry point: cid is this core's index, nc the number of cores.
+// Only matmul() is run and verified in this variant; the naive reference pass
+// (run, verify, clear) is not executed, so results_data still holds the zeros
+// of its static initialization when matmul() starts.
+void thread_entry(int cid, int nc)
+{
+   // static storage: a single results buffer in the binary, seen by all threads
+   static data_t results_data[ARRAY_SIZE];
+
+   coreid = cid;
+   ncores = nc;
+
+   // the optimized kernel; the trailing barrier() is part of the measured
+   // region
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+
+   exit(0);
+}
+
diff --git a/mt/am_matmul/matmul_mi.c~ b/mt/am_matmul/matmul_mi.c~
new file mode 100755
index 0000000..858f363
--- /dev/null
+++ b/mt/am_matmul/matmul_mi.c~
@@ -0,0 +1,290 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third vector. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;
+unsigned long ncores;
+
+#include "util.h"
+   
+// stringify(s) yields the text of s as a string literal after s itself has
+// been macro-expanded; stringify_1 is the inner step that applies '#'.
+#define stringify_1(s) #s
+#define stringify(s) stringify_1(s)
+// stats(code): execute `code` once and, on core 0 only, print its source text
+// together with the elapsed cycle count, the cycle count divided by
+// DIM_SIZE^3 (labelled cycles/iter) and cycles divided by instructions
+// (labelled CPI), each ratio with one decimal digit.
+// rdcycle()/rdinstret() are not defined in this file; presumably they read the
+// cycle and retired-instruction counters -- confirm against util.h.
+#define stats(code) do { \
+    /* start each counter at minus the current reading ... */ \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
+    code; \
+    /* ... so adding the later reading leaves the elapsed difference */ \
+    _c += rdcycle(), _i += rdinstret(); \
+    if (coreid == 0) \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+    
+// Print `name` followed by the n elements of arr on one line, each element
+// converted to long.  Only core 0 prints; every other core returns silently.
+void printArray( char name[], int n, data_t arr[] )
+{
+   int idx = 0;
+
+   if (coreid == 0)
+   {
+      printf( " %10s :", name );
+      while ( idx < n )
+      {
+         long truncated = (long) arr[idx];
+         printf( " %3ld ", truncated );
+         idx++;
+      }
+      printf( "\n" );
+   }
+}
+      
+// Compare the first n elements of `test` against `correct` (exact equality).
+// Runs on core 0 only; other cores return immediately.  On the first mismatch
+// the index and both values (converted to long) are printed and the program
+// terminates via exit(-1).  Returns normally when all elements match.
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   if (coreid != 0)
+      return;
+
+   size_t i;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] != correct[i])
+      {
+         // the index is a size_t but the format uses %d: convert explicitly so
+         // the argument type matches the conversion specifier
+         printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n", 
+            (int)i, (long)test[i], (int)i, (long)correct[i]);
+         exit(-1);
+      }
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// single-thread, naive version
+// Single-threaded reference kernel: C += A * B for lda x lda row-major
+// matrices.  Only core 0 computes; all other cores return at once.
+// The output column is the outermost loop and the reduction index the
+// innermost, with every product added straight into C.
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int col, row, inner;
+
+   if (coreid != 0)
+      return;
+
+   for ( col = 0; col < lda; col++ )
+   {
+      for ( row = 0; row < lda; row++ )
+      {
+         const int row_base = row*lda;     // start of this row in A and in C
+         const int out_idx  = col + row_base;
+
+         for ( inner = 0; inner < lda; inner++ )
+            C[out_idx] += A[row_base + inner] * B[inner*lda + col];
+      }
+   }
+}
+ 
+
+
+// Multi-threaded matrix multiply: C += A * B for lda x lda row-major matrices.
+//
+// The rows of C are split into ncores contiguous bands and the calling core
+// (coreid) computes only its own band, so no element of C is written by more
+// than one core and the kernel needs no synchronization of its own.
+// Products are accumulated into C: pass in a cleared C to get the plain
+// product.  Nothing is done when lda <= 0 or ncores == 0.
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   int i, j, k;
+   int first_row, last_row, k_blocked;
+
+   if (lda <= 0 || ncores == 0)
+      return;
+
+   // band [first_row, last_row) owned by this core; consecutive cores' bands
+   // are adjacent and together cover rows 0..lda-1, also when ncores does not
+   // divide lda
+   first_row = (int)(coreid * lda / ncores);
+   last_row  = (int)((coreid + 1) * lda / ncores);
+
+   // largest multiple of 4 not above lda: the part of the reduction that is
+   // processed four rows of B at a time
+   k_blocked = lda - lda % 4;
+
+   for (i = first_row; i < last_row; i++)
+   {
+      const data_t *a_row = A + i*lda;
+      data_t *c_row = C + i*lda;
+
+      // four elements of A's row are held in locals and applied to four rows
+      // of B per sweep, so each C element is updated once per four products
+      for (k = 0; k < k_blocked; k += 4)
+      {
+         const data_t a0 = a_row[k];
+         const data_t a1 = a_row[k+1];
+         const data_t a2 = a_row[k+2];
+         const data_t a3 = a_row[k+3];
+         const data_t *b0 = B + k*lda;
+         const data_t *b1 = b0 + lda;
+         const data_t *b2 = b1 + lda;
+         const data_t *b3 = b2 + lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j] + a1*b1[j] + a2*b2[j] + a3*b3[j];
+      }
+
+      // leftover rows of B when lda is not a multiple of 4
+      for (; k < lda; k++)
+      {
+         const data_t a0 = a_row[k];
+         const data_t *b0 = B + k*lda;
+
+         for (j = 0; j < lda; j++)
+            c_row[j] += a0*b0[j];
+      }
+   }
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+  
+// Per-core entry point: cid is this core's index, nc the number of cores.
+// Runs the naive kernel and then matmul() on the same inputs under stats(),
+// checks each result against verify_data, and finally calls exit(0).
+void thread_entry(int cid, int nc)
+{
+   // static storage: a single results buffer in the binary, seen by all threads
+   static data_t results_data[ARRAY_SIZE];
+   size_t slot;
+
+   coreid = cid;
+   ncores = nc;
+
+   // Pass 1: the provided reference kernel; the trailing barrier() is part of
+   // the measured region
+   barrier();
+   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+   verify(ARRAY_SIZE, results_data, verify_data);
+
+   // core 0 wipes the buffer before the second pass; everyone meets at the
+   // barrier afterwards
+   if (coreid == 0)
+   {
+      slot = 0;
+      while (slot < ARRAY_SIZE)
+      {
+         results_data[slot] = 0;
+         slot++;
+      }
+   }
+   barrier();
+
+   // Pass 2: the optimized kernel, measured the same way
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+
+   exit(0);
+}
+
diff --git a/mt/am_matmul/matmul_msi.c b/mt/am_matmul/matmul_msi.c
new file mode 100755
index 0000000..0b59f8c
--- /dev/null
+++ b/mt/am_matmul/matmul_msi.c
@@ -0,0 +1,216 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third vector. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;   // thread-local; set from cid in thread_entry()
+unsigned long ncores;            // one shared copy; set from nc in thread_entry()
+
+#include "util.h"
+// stats(code): run `code`, then have core 0 print the elapsed rdcycle() count and derived ratios.
+#define stringify_1(s) #s            // stringizes its argument as written
+#define stringify(s) stringify_1(s)  // indirection: macros in s are expanded before stringizing
+#define stats(code) do { \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); /* start at minus the initial readings */ \
+    code; \
+    _c += rdcycle(), _i += rdinstret(); /* adding the final readings leaves the deltas */ \
+    if (coreid == 0) /* cycles/iter = _c / DIM_SIZE^3, CPI = _c / _i; 10*x%10 yields one decimal digit */ \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+    
+void printArray( char name[], int n, data_t arr[] )
+{
+   // Core 0 only: print name, the n elements of arr (as long), and a newline.
+   int idx;
+   if (coreid == 0) {
+      printf( " %10s :", name );
+      for ( idx = 0; idx < n; ++idx )
+         printf( " %3ld ", (long) arr[idx] );
+      printf( "\n" );
+   }
+}
+      
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   // Compare test[] against correct[] element by element (exact equality).
+   // Runs on core 0 only; on the first mismatch it reports the index and
+   // both values (converted to long) and terminates with exit(-1).
+   size_t i;
+   if (coreid != 0)
+      return;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] == correct[i])
+         continue;
+      // i is a size_t; cast it so the argument type matches the %d conversion.
+      printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n", 
+         (int)i, (long)test[i], (int)i, (long)correct[i]);
+      exit(-1);
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// single-thread, naive version
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   // Reference triple loop, run by core 0 only. For every output element
+   // C[col + r*lda] it adds A[r*lda + k] * B[k*lda + col] for each k.
+   // C is accumulated into in place, not overwritten.
+   int col, r, k;
+
+   if (coreid == 0)
+   {
+      for ( col = 0; col < lda; col++ )
+         for ( r = 0; r < lda; r++ )
+         {
+            for ( k = 0; k < lda; k++ )
+               C[col + r*lda] += A[r*lda + k] * B[k*lda + col];
+         }
+   }
+}
+ 
+
+
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+  // Multi-core matrix multiply, C = A * B, for row-major matrices whose row
+  // stride is hard-coded to 32 (as is the width of the accumulators).
+  // The 32 rows of C are split into one contiguous slice per core,
+  // [32*coreid/ncores, 32*(coreid+1)/ncores). A core walks its slice two
+  // rows at a time, sums both output rows in local buffers and then stores
+  // them into C (overwriting, not accumulating).
+  // lda  : bound of the reduction loop, which advances in steps of 4.
+  // A, B : input matrices, only read.
+  // C    : output matrix; a core writes only the rows of its own slice.
+  size_t i, j, k, l;
+  size_t first_row = coreid*32/ncores;
+  size_t end_row = 32*(1+coreid)/ncores;
+  int row, row2, column, column2, column3, column4;
+  data_t element, element2, element3, element4, element5, element6, element7, element8;
+  data_t temp_mat[32]={0};
+  data_t temp_mat2[32]={0};
+  for (l=first_row; l<end_row; l+=2){
+    row=l*32;
+    // A slice with an odd number of rows has no partner for its last row.
+    // Point row2 at row in that case: the row is computed twice with the
+    // same result, and nothing outside the slice is read or written.
+    row2=(l+1<end_row) ? (l+1)*32 : row;
+    for (i=0; i<lda; i+=4){
+      // Four consecutive elements of A from each of the two rows.
+      element = A[row+i];
+      element2 = A[row+i+1];
+      element3 = A[row+i+2];
+      element4 = A[row+i+3];
+      element5 = A[row2+i];
+      element6 = A[row2+i+1];
+      element7 = A[row2+i+2];
+      element8 = A[row2+i+3];
+      // Offsets of the four rows of B that those elements multiply.
+      column=i*32;
+      column2=(i+1)*32;
+      column3=(i+2)*32;
+      column4=(i+3)*32;
+      for (j=0; j<32; j+=4){
+        temp_mat[j]+=element*B[column+j]+element2*B[column2+j]+element3*B[column3+j]+element4*B[column4+j];
+        temp_mat[j+1]+=element*B[column+j+1]+element2*B[column2+j+1]+element3*B[column3+j+1]+element4*B[column4+j+1];
+        temp_mat[j+2]+=element*B[column+j+2]+element2*B[column2+j+2]+element3*B[column3+j+2]+element4*B[column4+j+2];
+        temp_mat[j+3]+=element*B[column+j+3]+element2*B[column2+j+3]+element3*B[column3+j+3]+element4*B[column4+j+3];
+        temp_mat2[j]+=element5*B[column+j]+element6*B[column2+j]+element7*B[column3+j]+element8*B[column4+j];
+        temp_mat2[j+1]+=element5*B[column+j+1]+element6*B[column2+j+1]+element7*B[column3+j+1]+element8*B[column4+j+1];
+        temp_mat2[j+2]+=element5*B[column+j+2]+element6*B[column2+j+2]+element7*B[column3+j+2]+element8*B[column4+j+2];
+        temp_mat2[j+3]+=element5*B[column+j+3]+element6*B[column2+j+3]+element7*B[column3+j+3]+element8*B[column4+j+3];
+      }
+    }
+    // Row pair finished: store it and clear the accumulators for the next.
+    for (k=0; k<32; k++){
+      C[row+k]=temp_mat[k];
+      C[row2+k]=temp_mat2[k];
+      temp_mat[k]=0;
+      temp_mat2[k]=0;
+    }
+  }
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+  
+void thread_entry(int cid, int nc)
+{
+   // Shared result buffer: being static, it is allocated in the binary
+   // rather than on this thread's stack, so all threads use the same array.
+   static data_t results_data[ARRAY_SIZE];
+   size_t idx;
+   // Record which core this thread is and how many cores there are.
+   coreid = cid;
+   ncores = nc;
+
+   // Trial 1: time the provided naive matmul (barrier before and after).
+   barrier();
+   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+   // Check the naive result against the reference data (verify() only
+   // does the comparison on core 0).
+   verify(ARRAY_SIZE, results_data, verify_data);
+
+   // Core 0 zeroes the results of the first trial, then every core
+   // meets at barrier() before the second trial begins.
+   if (coreid == 0)
+   {
+      for (idx = 0; idx < ARRAY_SIZE; ++idx)
+         results_data[idx] = 0;
+   }
+   barrier();
+
+   // Trial 2: time the faster matmul.
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+
+   // Check the faster result, synchronize once more, then exit.
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+   exit(0);
+}
+
diff --git a/mt/am_matmul/matmul_msi.c~ b/mt/am_matmul/matmul_msi.c~
new file mode 100755
index 0000000..61016a7
--- /dev/null
+++ b/mt/am_matmul/matmul_msi.c~
@@ -0,0 +1,210 @@
+//**************************************************************************
+// Multi-threaded Matrix Multiply benchmark
+//--------------------------------------------------------------------------
+// TA     : Christopher Celio
+// Student: 
+//
+//
+// This benchmark multiplies two 2-D arrays together and writes the results to
+// a third vector. The input data (and reference data) should be generated
+// using the matmul_gendata.pl perl script and dumped to a file named
+// dataset.h. 
+
+
+// print out arrays, etc.
+//#define DEBUG
+
+//--------------------------------------------------------------------------
+// Includes 
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+typedef float data_t;
+#include "dataset.h"
+ 
+  
+//--------------------------------------------------------------------------
+// Basic Utilities and Multi-thread Support
+
+__thread unsigned long coreid;   // thread-local; set from cid in thread_entry()
+unsigned long ncores;            // one shared copy; set from nc in thread_entry()
+
+#include "util.h"
+// stats(code): run `code`, then have core 0 print the elapsed rdcycle() count and derived ratios.
+#define stringify_1(s) #s            // stringizes its argument as written
+#define stringify(s) stringify_1(s)  // indirection: macros in s are expanded before stringizing
+#define stats(code) do { \
+    unsigned long _c = -rdcycle(), _i = -rdinstret(); /* start at minus the initial readings */ \
+    code; \
+    _c += rdcycle(), _i += rdinstret(); /* adding the final readings leaves the deltas */ \
+    if (coreid == 0) /* cycles/iter = _c / DIM_SIZE^3, CPI = _c / _i; 10*x%10 yields one decimal digit */ \
+      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
+             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
+  } while(0)
+ 
+
+//--------------------------------------------------------------------------
+// Helper functions
+    
+void printArray( char name[], int n, data_t arr[] )
+{
+   // Core 0 only: print name, the n elements of arr (as long), and a newline.
+   int idx;
+   if (coreid == 0) {
+      printf( " %10s :", name );
+      for ( idx = 0; idx < n; ++idx )
+         printf( " %3ld ", (long) arr[idx] );
+      printf( "\n" );
+   }
+}
+      
+void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
+{
+   // Compare test[] against correct[] element by element (exact equality).
+   // Runs on core 0 only; on the first mismatch it reports the index and
+   // both values (converted to long) and terminates with exit(-1).
+   size_t i;
+   if (coreid != 0)
+      return;
+   for (i = 0; i < n; i++)
+   {
+      if (test[i] == correct[i])
+         continue;
+      // i is a size_t; cast it so the argument type matches the %d conversion.
+      printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n", 
+         (int)i, (long)test[i], (int)i, (long)correct[i]);
+      exit(-1);
+   }
+}
+ 
+//--------------------------------------------------------------------------
+// matmul function
+ 
+// single-thread, naive version
+void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   // Reference triple loop, run by core 0 only. For every output element
+   // C[col + r*lda] it adds A[r*lda + k] * B[k*lda + col] for each k.
+   // C is accumulated into in place, not overwritten.
+   int col, r, k;
+
+   if (coreid == 0)
+   {
+      for ( col = 0; col < lda; col++ )
+         for ( r = 0; r < lda; r++ )
+         {
+            for ( k = 0; k < lda; k++ )
+               C[col + r*lda] += A[r*lda + k] * B[k*lda + col];
+         }
+   }
+}
+ 
+
+
+void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+  // Multi-core matrix multiply, C = A * B, with the row stride and the
+  // accumulator width hard-coded to 32. Each core takes the rows
+  // [32*coreid/ncores, 32*(coreid+1)/ncores) of C, two at a time, sums them
+  // in local buffers and stores them into C (overwriting, not accumulating).
+  // lda bounds the reduction loop, which advances in steps of 4.
+  size_t i, j, k, l;
+  size_t first_row = coreid*32/ncores;
+  size_t end_row = 32*(1+coreid)/ncores;
+  int row, row2, column, column2, column3, column4;
+  data_t element, element2, element3, element4, element5, element6, element7, element8;
+  data_t temp_mat[32]={0};
+  data_t temp_mat2[32]={0};
+  for (l=first_row; l<end_row; l+=2){
+    row=l*32;
+    // No partner row left in this slice: reuse row so nothing outside it is touched.
+    row2=(l+1<end_row) ? (l+1)*32 : row;
+    for (i=0; i<lda; i+=4){
+      // Four consecutive elements of A from each row, and the matching B row offsets.
+      element = A[row+i];
+      element2 = A[row+i+1];
+      element3 = A[row+i+2];
+      element4 = A[row+i+3];
+      element5 = A[row2+i];
+      element6 = A[row2+i+1];
+      element7 = A[row2+i+2];
+      element8 = A[row2+i+3];
+      column=i*32;
+      column2=(i+1)*32;
+      column3=(i+2)*32;
+      column4=(i+3)*32;
+      for (j=0; j<32; j+=4){
+        temp_mat[j]+=element*B[column+j]+element2*B[column2+j]+element3*B[column3+j]+element4*B[column4+j];
+        temp_mat[j+1]+=element*B[column+j+1]+element2*B[column2+j+1]+element3*B[column3+j+1]+element4*B[column4+j+1];
+        temp_mat[j+2]+=element*B[column+j+2]+element2*B[column2+j+2]+element3*B[column3+j+2]+element4*B[column4+j+2];
+        temp_mat[j+3]+=element*B[column+j+3]+element2*B[column2+j+3]+element3*B[column3+j+3]+element4*B[column4+j+3];
+        temp_mat2[j]+=element5*B[column+j]+element6*B[column2+j]+element7*B[column3+j]+element8*B[column4+j];
+        temp_mat2[j+1]+=element5*B[column+j+1]+element6*B[column2+j+1]+element7*B[column3+j+1]+element8*B[column4+j+1];
+        temp_mat2[j+2]+=element5*B[column+j+2]+element6*B[column2+j+2]+element7*B[column3+j+2]+element8*B[column4+j+2];
+        temp_mat2[j+3]+=element5*B[column+j+3]+element6*B[column2+j+3]+element7*B[column3+j+3]+element8*B[column4+j+3];
+      }
+    }
+    // Store the finished pair once the whole reduction is done (not keyed to
+    // a particular value of i), then clear the accumulators.
+    for (k=0; k<32; k++){
+      C[row+k]=temp_mat[k];
+      C[row2+k]=temp_mat2[k];
+      temp_mat[k]=0;
+      temp_mat2[k]=0;
+    }
+  }
+}
+
+//--------------------------------------------------------------------------
+// Main
+//
+// all threads start executing thread_entry(). Use their "coreid" to
+// differentiate between threads (each thread is running on a separate core).
+  
+void thread_entry(int cid, int nc)
+{
+   // Shared result buffer: being static, it is allocated in the binary
+   // rather than on this thread's stack, so all threads use the same array.
+   static data_t results_data[ARRAY_SIZE];
+   size_t idx;
+   // Record which core this thread is and how many cores there are.
+   coreid = cid;
+   ncores = nc;
+
+   // Trial 1: time the provided naive matmul (barrier before and after).
+   barrier();
+   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+
+   // Check the naive result against the reference data (verify() only
+   // does the comparison on core 0).
+   verify(ARRAY_SIZE, results_data, verify_data);
+
+   // Core 0 zeroes the results of the first trial, then every core
+   // meets at barrier() before the second trial begins.
+   if (coreid == 0)
+   {
+      for (idx = 0; idx < ARRAY_SIZE; ++idx)
+         results_data[idx] = 0;
+   }
+   barrier();
+
+   // Trial 2: time the faster matmul.
+   barrier();
+   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
+#ifdef DEBUG
+   printArray("results:", ARRAY_SIZE, results_data);
+   printArray("verify :", ARRAY_SIZE, verify_data);
+#endif
+
+   // Check the faster result, synchronize once more, then exit.
+   verify(ARRAY_SIZE, results_data, verify_data);
+   barrier();
+   exit(0);
+}
+
author	Henry Cook <hcook@eecs.berkeley.edu>	2013-06-13 15:30:16 -0700
committer	Henry Cook <hcook@eecs.berkeley.edu>	2013-06-13 15:30:16 -0700
commit	60f056880ec6929c5f23af4d66aea0f0cb7b0245 (patch)
tree	a2f4cbc9902df362534ede13d65883ee47fba2d8 /mt/am_matmul
parent	4412b96c81ca09dcce6305579dd86d4bf3b808da (diff)
download	riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.zip riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.tar.gz riscv-tests-60f056880ec6929c5f23af4d66aea0f0cb7b0245.tar.bz2