rvv: import parallel vf(w)redsum hardware impl.

The number of vector FP ALUs and the implementation of vf(w)redsum can be passed as options, for example: "--varch=vlen:512,elen:32,slen:512,nalu:4,fredsum-impl:parallel". By default, 4 vector FP ALUs and the ordered vector FP reduction-sum implementation are assumed.
author: Zhen Wei <zhen.wei@sifive.com> 2020-02-21 15:06:04 +0800
committer: Chih-Min Chao <48193236+chihminchao@users.noreply.github.com> 2020-03-05 17:16:19 +0800
commit: 621340acc2bbeacfdb9863781ffa0f06b1338344 (patch)
tree: 3787481592e123a64cef19364500cddb491ecaad /riscv
parent: e799cf99af80ebe6849bfda0d0af27906f21f3ce (diff)
download: spike-621340acc2bbeacfdb9863781ffa0f06b1338344.zip
spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.gz
spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.bz2
7 files changed, 118 insertions, 16 deletions
diff --git a/riscv/decode.h b/riscv/decode.h
index d4ad98a..f0c1568 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -1738,6 +1738,59 @@ for (reg_t i = 0; i < vlmax && P.VU.vl != 0; ++i) { \
       break; \
   }; \
 
+#define VI_VFP_LOOP_REDUCTIONSUM_WIDEN_INIT /* Set-up for the parallel widening reduction sum; dedicated to f32 -> f64 */ \
+  float64_t vd_0 = f64(p->VU.elt<float64_t>(rs1_num, 0).v); /* accumulator seeded from element 0 of rs1 */ \
+  reg_t temp_len = 1, temp_vl = (vl > 0) ? (vl - 1) : 0; \
+  uint64_t valu = p->VU.get_valu(); /* configured number of vector ALUs (VU.VALU) */ \
+  float64_t* temp_arr = NULL; \
+  for (; temp_vl > 0; temp_len <<= 1) temp_vl >>= 1; /* temp_len = smallest power of two >= vl (1 when vl is 0 or 1) */ \
+  temp_arr = new float64_t[temp_len](); /* value-initialized scratch buffer; freed by VI_VFP_LOOP_REDUCTIONSUM_MERGE */ \
+  for (reg_t i=p->VU.vstart; i<vl; ++i) { /* Copy the rs2 elements, widened to f64, into the scratch buffer */  \
+    VI_LOOP_ELEMENT_SKIP(); /* NOTE(review): defined elsewhere; presumably skips masked-off elements, leaving their slots at the initial value - confirm */ \
+    temp_arr[i] = f32_to_f64(p->VU.elt<float32_t>(rs2_num, i)); \
+  }
+
+#define VI_VFP_LOOP_REDUCTIONSUM_INIT(width) /* Set-up for the parallel reduction sum on float<width>_t elements */ \
+  float##width##_t vd_0 = p->VU.elt<float##width##_t>(rd_num, 0); \
+  float##width##_t vs1_0 = p->VU.elt<float##width##_t>(rs1_num, 0); \
+  vd_0 = vs1_0; /* the accumulator starts from element 0 of rs1, replacing the value just read from rd */ \
+  reg_t temp_len = 1, temp_vl = (vl > 0) ? (vl - 1) : 0; \
+  uint64_t valu = p->VU.get_valu(); /* configured number of vector ALUs (VU.VALU) */ \
+  float##width##_t* temp_arr = NULL; \
+  for (; temp_vl > 0; temp_len <<= 1) temp_vl >>= 1; /* temp_len = smallest power of two >= vl (1 when vl is 0 or 1) */ \
+  temp_arr = new float##width##_t[temp_len](); /* value-initialized scratch buffer; freed by VI_VFP_LOOP_REDUCTIONSUM_MERGE */ \
+  for (reg_t i=p->VU.vstart; i<vl; ++i) { /* Copy the rs2 elements into the scratch buffer */  \
+    VI_LOOP_ELEMENT_SKIP(); /* NOTE(review): defined elsewhere; presumably skips masked-off elements, leaving their slots at the initial value - confirm */ \
+    temp_arr[i] = p->VU.elt<float##width##_t>(rs2_num, i); \
+  }
+
+#define VI_VFP_LOOP_REDUCTIONSUM_MERGE(width) /* Reduce temp_arr[0..temp_len) into temp_arr[0], then add it to vd_0; expects the locals declared by a *_INIT macro */ \
+  while (valu > 0) { /* valu is halved after each stage until it reaches 0 */ \
+    while (valu < temp_len) { /* one pass: add each block of valu elements to the block that follows it */ \
+      uint64_t write_pos = 0; \
+      uint64_t read_pos = 0; \
+      while (read_pos < temp_len) { \
+        temp_arr[write_pos] = f##width##_add(temp_arr[read_pos], temp_arr[read_pos + valu]); /* element plus its partner valu slots ahead */ \
+        set_fp_exceptions; \
+        ++write_pos; \
+        ++read_pos; \
+        if (read_pos % valu == 0) /* end of a block: jump over the partner block that was just consumed */ \
+          read_pos += valu; \
+      } \
+      temp_len = write_pos; /* the sums were compacted to the front; with power-of-two valu and temp_len this is half the previous length */ \
+    } \
+    valu >>= 1; \
+  } \
+  vd_0 = f##width##_add(vd_0, temp_arr[0]); /* fold the reduced vector sum into the scalar seed */ \
+  set_fp_exceptions; \
+  delete [] temp_arr; /* releases the buffer allocated with new[] in the *_INIT macro */
+
+#define VI_VFP_LOOP_REDUCTIONSUM_CLOSE(x) /* Finish the reduction: reset vstart and store vd_0; x is the SEW tag (e32/e64) selecting the destination element type */ \
+  P.VU.vstart = 0; \
+  if (vl > 0) { /* nothing is written to rd when vl is 0 */ \
+    P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true) = vd_0.v; /* store the raw bits of vd_0 into element 0 of rd */ \
+  }
+
 #define VI_VFP_VV_LOOP_WIDE_REDUCTION(BODY) \
   VI_VFP_LOOP_WIDE_REDUCTION_BASE \
   float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
diff --git a/riscv/insns/vfredsum_vs.h b/riscv/insns/vfredsum_vs.h
index 7b5cccc..ca8e91b 100644
--- a/riscv/insns/vfredsum_vs.h
+++ b/riscv/insns/vfredsum_vs.h
@@ -1,8 +1,8 @@
 // vfredsum: vd[0] =  sum( vs2[*] , vs1[0] )
-VI_VFP_VV_LOOP_REDUCTION
-({
-  vd_0 = f32_add(vd_0, vs2);
-},
-{
-  vd_0 = f64_add(vd_0, vs2);
-})
+
+if(p->VU.FREDSUM_IMPL == "ordered") {  // FREDSUM_IMPL comes from the "fredsum-impl" --varch option
+  #include "vfredosum_vs.h"
+} else if (p->VU.FREDSUM_IMPL == "parallel") {  // pairwise merge using VU.VALU
+  #include "vfredsum_vs_parallel.h"
+} else
+  require(0);  // any other implementation name is unsupported
diff --git a/riscv/insns/vfredsum_vs_parallel.h b/riscv/insns/vfredsum_vs_parallel.h
new file mode 100644
index 0000000..d611c60
--- /dev/null
+++ b/riscv/insns/vfredsum_vs_parallel.h
@@ -0,0 +1,24 @@
+// Parallel version of vfredsum: elements are summed pairwise by VI_VFP_LOOP_REDUCTIONSUM_MERGE
+// vfredsum: vd[0] =  sum( vs2[*] , vs1[0] )
+
+VI_CHECK_REDUCTION(false)
+VI_VFP_COMMON
+
+switch(p->VU.vsew) {  // pick the float width that matches the current SEW
+  case e32: {
+    VI_VFP_LOOP_REDUCTIONSUM_INIT(32)
+    VI_VFP_LOOP_REDUCTIONSUM_MERGE(32)
+    VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e32)
+    break;
+  }
+  case e64: {
+    VI_VFP_LOOP_REDUCTIONSUM_INIT(64)
+    VI_VFP_LOOP_REDUCTIONSUM_MERGE(64)
+    VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e64)
+    break;
+  }
+  case e16:  // only e32 and e64 are implemented; everything else falls through to require(0)
+  default:
+    require(0);
+    break;
+};
diff --git a/riscv/insns/vfwredsum_vs.h b/riscv/insns/vfwredsum_vs.h
index 3426ef8..bed3c64 100644
--- a/riscv/insns/vfwredsum_vs.h
+++ b/riscv/insns/vfwredsum_vs.h
@@ -1,8 +1,7 @@
 // vfwredsum.vs vd, vs2, vs1
-require_vector;
-require(P.VU.vsew * 2 <= P.VU.ELEN);
-require((insn.rs2() & (P.VU.vlmul - 1)) == 0);
-VI_VFP_VV_LOOP_WIDE_REDUCTION
-({
-  vd_0 = f64_add(vd_0, vs2);
-})
+if(p->VU.FREDSUM_IMPL == "ordered") {  // FREDSUM_IMPL comes from the "fredsum-impl" --varch option
+  #include "vfwredosum_vs.h"
+} else if (p->VU.FREDSUM_IMPL == "parallel") {  // pairwise merge using VU.VALU
+  #include "vfwredsum_vs_parallel.h"
+} else
+  require(0);  // any other implementation name is unsupported
diff --git a/riscv/insns/vfwredsum_vs_parallel.h b/riscv/insns/vfwredsum_vs_parallel.h
new file mode 100644
index 0000000..236d4d0
--- /dev/null
+++ b/riscv/insns/vfwredsum_vs_parallel.h
@@ -0,0 +1,10 @@
+// Parallel version of vfwredsum: f32 source elements are widened to f64 and summed pairwise
+require_vector;
+require(P.VU.vsew * 2 <= P.VU.ELEN);  // the doubled element width must not exceed ELEN
+require((insn.rs2() & (P.VU.vlmul - 1)) == 0);  // rs2 must be a multiple of vlmul
+
+VI_VFP_COMMON
+
+VI_VFP_LOOP_REDUCTIONSUM_WIDEN_INIT
+VI_VFP_LOOP_REDUCTIONSUM_MERGE(64)
+VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e64)
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 94c8c76..ee9fbaa 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -108,6 +108,8 @@ void processor_t::parse_varch_string(const char* s)
   int vlen = 0;
   int elen = 0;
   int slen = 0;
+  int valu = 4; // default
+  std::string fredsum_impl("ordered"); // default
 
   while (pos < len) {
     std::string attr = get_string_token(str, ':', pos);
@@ -120,16 +122,22 @@ void processor_t::parse_varch_string(const char* s)
       slen = get_int_token(str, ',', pos);
     else if (attr == "elen")
       elen = get_int_token(str, ',', pos);
+    else if (attr == "nalu")
+      valu = get_int_token(str, ',', pos);
+    else if (attr == "fredsum-impl")
+      fredsum_impl = get_string_token(str, ',', pos);
     else
       bad_varch_string(s, "Unsupported token");
 
     ++pos;
   }
 
-  // The integer should be the power of 2
-  if (!check_pow2(vlen) || !check_pow2(elen) || !check_pow2(slen)){
+  if (!check_pow2(vlen) || !check_pow2(elen) || !check_pow2(slen) || !check_pow2(valu)){
     bad_varch_string(s, "The integer value should be the power of 2");
   }
+  if (fredsum_impl != "ordered" && fredsum_impl != "parallel"){
+    bad_varch_string(s, "fredsum-impl now only supported ordered/parallel");
+  }
 
   /* Vector spec requirements. */
   if (vlen < elen)
@@ -144,10 +152,15 @@ void processor_t::parse_varch_string(const char* s)
   /* spike requirements. */
   if (vlen > 4096)
     bad_varch_string(s, "vlen must be <= 4096");
+  if (valu == 0){
+    bad_varch_string(s, "nalu (Number of vector ALUs) must be > 0");
+  }
 
   VU.VLEN = vlen;
   VU.ELEN = elen;
   VU.SLEN = slen;
+  VU.VALU = valu;
+  VU.FREDSUM_IMPL = fredsum_impl;
   VU.vlenb = vlen / 8;
 }
 
diff --git a/riscv/processor.h b/riscv/processor.h
index cf2fe33..36306e7 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -426,6 +426,8 @@ public:
       reg_t vstart, vxrm, vxsat, vl, vtype, vlenb;
       reg_t vediv, vsew, vlmul;
       reg_t ELEN, VLEN, SLEN;
+      reg_t VALU;
+      std::string FREDSUM_IMPL;
       bool vill;
 
       // vector element for varies SEW
@@ -469,6 +471,7 @@ public:
       reg_t get_vlen() { return VLEN; }
       reg_t get_elen() { return ELEN; }
       reg_t get_slen() { return SLEN; }
+      reg_t get_valu() { return VALU; }
 
       VRM get_vround_mode() {
         return (VRM)vxrm;
author	Zhen Wei <zhen.wei@sifive.com>	2020-02-21 15:06:04 +0800
committer	Chih-Min Chao <48193236+chihminchao@users.noreply.github.com>	2020-03-05 17:16:19 +0800
commit	621340acc2bbeacfdb9863781ffa0f06b1338344 (patch)
tree	3787481592e123a64cef19364500cddb491ecaad /riscv
parent	e799cf99af80ebe6849bfda0d0af27906f21f3ce (diff)
download	spike-621340acc2bbeacfdb9863781ffa0f06b1338344.zip spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.gz spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.bz2