diff options
author | Zhen Wei <zhen.wei@sifive.com> | 2020-02-21 15:06:04 +0800 |
---|---|---|
committer | Chih-Min Chao <48193236+chihminchao@users.noreply.github.com> | 2020-03-05 17:16:19 +0800 |
commit | 621340acc2bbeacfdb9863781ffa0f06b1338344 (patch) | |
tree | 3787481592e123a64cef19364500cddb491ecaad /riscv | |
parent | e799cf99af80ebe6849bfda0d0af27906f21f3ce (diff) | |
download | spike-621340acc2bbeacfdb9863781ffa0f06b1338344.zip spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.gz spike-621340acc2bbeacfdb9863781ffa0f06b1338344.tar.bz2 |
rvv: import parallel vf(w)redsum hardware impl.
The number of vector FP ALUs and the implementation of vf(w)redsum can be
passed as options, as in the following example:
"--varch=vlen:512,elen:32,slen:512,nalu:4,fredsum-impl:parallel"
By default, 4 vector FP ALUs and the ordered vector FP reduction-sum
implementation are assumed.
Diffstat (limited to 'riscv')
-rw-r--r-- | riscv/decode.h | 53 | ||||
-rw-r--r-- | riscv/insns/vfredsum_vs.h | 14 | ||||
-rw-r--r-- | riscv/insns/vfredsum_vs_parallel.h | 24 | ||||
-rw-r--r-- | riscv/insns/vfwredsum_vs.h | 13 | ||||
-rw-r--r-- | riscv/insns/vfwredsum_vs_parallel.h | 10 | ||||
-rw-r--r-- | riscv/processor.cc | 17 | ||||
-rw-r--r-- | riscv/processor.h | 3 |
7 files changed, 118 insertions, 16 deletions
diff --git a/riscv/decode.h b/riscv/decode.h
index d4ad98a..f0c1568 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -1738,6 +1738,59 @@ for (reg_t i = 0; i < vlmax && P.VU.vl != 0; ++i) { \
       break; \
   }; \
 
+#define VI_VFP_LOOP_REDUCTIONSUM_WIDEN_INIT /* Dedicated to f32 -> f64 */ \
+  float64_t vd_0 = f64(p->VU.elt<float64_t>(rs1_num, 0).v); \
+  reg_t temp_len = 1, temp_vl = (vl > 0) ? (vl - 1) : 0; \
+  uint64_t valu = p->VU.get_valu(); \
+  float64_t* temp_arr = NULL; \
+  for (; temp_vl > 0; temp_len <<= 1) temp_vl >>= 1; /* Calculate the bit length of vl */ \
+  temp_arr = new float64_t[temp_len](); \
+  for (reg_t i=p->VU.vstart; i<vl; ++i) { /* Allocate a temporary array for computing */ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    temp_arr[i] = f32_to_f64(p->VU.elt<float32_t>(rs2_num, i)); \
+  }
+
+#define VI_VFP_LOOP_REDUCTIONSUM_INIT(width) \
+  float##width##_t vd_0 = p->VU.elt<float##width##_t>(rd_num, 0); \
+  float##width##_t vs1_0 = p->VU.elt<float##width##_t>(rs1_num, 0); \
+  vd_0 = vs1_0; \
+  reg_t temp_len = 1, temp_vl = (vl > 0) ? (vl - 1) : 0; \
+  uint64_t valu = p->VU.get_valu(); \
+  float##width##_t* temp_arr = NULL; \
+  for (; temp_vl > 0; temp_len <<= 1) temp_vl >>= 1; /* Calculate the bit length of vl */ \
+  temp_arr = new float##width##_t[temp_len](); \
+  for (reg_t i=p->VU.vstart; i<vl; ++i) { /* Allocate a temporary array for computing */ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    temp_arr[i] = p->VU.elt<float##width##_t>(rs2_num, i); \
+  }
+
+#define VI_VFP_LOOP_REDUCTIONSUM_MERGE(width) \
+  while (valu > 0) { \
+    while (valu < temp_len) { \
+      uint64_t write_pos = 0; \
+      uint64_t read_pos = 0; \
+      while (read_pos < temp_len) { \
+        temp_arr[write_pos] = f##width##_add(temp_arr[read_pos], temp_arr[read_pos + valu]); \
+        set_fp_exceptions; \
+        ++write_pos; \
+        ++read_pos; \
+        if (read_pos % valu == 0) \
+          read_pos += valu; \
+      } \
+      temp_len = write_pos; \
+    } \
+    valu >>= 1; \
+  } \
+  vd_0 = f##width##_add(vd_0, temp_arr[0]); \
+  set_fp_exceptions; \
+  delete [] temp_arr;
+
+#define VI_VFP_LOOP_REDUCTIONSUM_CLOSE(x) \
+  P.VU.vstart = 0; \
+  if (vl > 0) { \
+    P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true) = vd_0.v; \
+  }
+
 #define VI_VFP_VV_LOOP_WIDE_REDUCTION(BODY) \
   VI_VFP_LOOP_WIDE_REDUCTION_BASE \
   float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
diff --git a/riscv/insns/vfredsum_vs.h b/riscv/insns/vfredsum_vs.h
index 7b5cccc..ca8e91b 100644
--- a/riscv/insns/vfredsum_vs.h
+++ b/riscv/insns/vfredsum_vs.h
@@ -1,8 +1,8 @@
 // vfredsum: vd[0] = sum( vs2[*] , vs1[0] )
-VI_VFP_VV_LOOP_REDUCTION
-({
-  vd_0 = f32_add(vd_0, vs2);
-},
-{
-  vd_0 = f64_add(vd_0, vs2);
-})
+
+if(p->VU.FREDSUM_IMPL == "ordered") {
+  #include "vfredosum_vs.h"
+} else if (p->VU.FREDSUM_IMPL == "parallel") {
+  #include "vfredsum_vs_parallel.h"
+} else
+  require(0);
diff --git a/riscv/insns/vfredsum_vs_parallel.h b/riscv/insns/vfredsum_vs_parallel.h
new file mode 100644
index 0000000..d611c60
--- /dev/null
+++ b/riscv/insns/vfredsum_vs_parallel.h
@@ -0,0 +1,24 @@
+// Parallel version of vfredsum
+// vfredsum: vd[0] = sum( vs2[*] , vs1[0] )
+
+VI_CHECK_REDUCTION(false)
+VI_VFP_COMMON
+
+switch(p->VU.vsew) {
+  case e32: {
+    VI_VFP_LOOP_REDUCTIONSUM_INIT(32)
+    VI_VFP_LOOP_REDUCTIONSUM_MERGE(32)
+    VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e32)
+    break;
+  }
+  case e64: {
+    VI_VFP_LOOP_REDUCTIONSUM_INIT(64)
+    VI_VFP_LOOP_REDUCTIONSUM_MERGE(64)
+    VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e64)
+    break;
+  }
+  case e16:
+  default:
+    require(0);
+    break;
+};
diff --git a/riscv/insns/vfwredsum_vs.h b/riscv/insns/vfwredsum_vs.h
index 3426ef8..bed3c64 100644
--- a/riscv/insns/vfwredsum_vs.h
+++ b/riscv/insns/vfwredsum_vs.h
@@ -1,8 +1,7 @@
 // vfwredsum.vs vd, vs2, vs1
-require_vector;
-require(P.VU.vsew * 2 <= P.VU.ELEN);
-require((insn.rs2() & (P.VU.vlmul - 1)) == 0);
-VI_VFP_VV_LOOP_WIDE_REDUCTION
-({
-  vd_0 = f64_add(vd_0, vs2);
-})
+if(p->VU.FREDSUM_IMPL == "ordered") {
+  #include "vfwredosum_vs.h"
+} else if (p->VU.FREDSUM_IMPL == "parallel") {
+  #include "vfwredsum_vs_parallel.h"
+} else
+  require(0);
diff --git a/riscv/insns/vfwredsum_vs_parallel.h b/riscv/insns/vfwredsum_vs_parallel.h
new file mode 100644
index 0000000..236d4d0
--- /dev/null
+++ b/riscv/insns/vfwredsum_vs_parallel.h
@@ -0,0 +1,10 @@
+// Parallel version of vfwredsum
+require_vector;
+require(P.VU.vsew * 2 <= P.VU.ELEN);
+require((insn.rs2() & (P.VU.vlmul - 1)) == 0);
+
+VI_VFP_COMMON
+
+VI_VFP_LOOP_REDUCTIONSUM_WIDEN_INIT
+VI_VFP_LOOP_REDUCTIONSUM_MERGE(64)
+VI_VFP_LOOP_REDUCTIONSUM_CLOSE(e64)
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 94c8c76..ee9fbaa 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -108,6 +108,8 @@ void processor_t::parse_varch_string(const char* s)
   int vlen = 0;
   int elen = 0;
   int slen = 0;
+  int valu = 4; // default
+  std::string fredsum_impl("ordered"); // default
 
   while (pos < len) {
     std::string attr = get_string_token(str, ':', pos);
@@ -120,16 +122,22 @@ void processor_t::parse_varch_string(const char* s)
       slen = get_int_token(str, ',', pos);
     else if (attr == "elen")
       elen = get_int_token(str, ',', pos);
+    else if (attr == "nalu")
+      valu = get_int_token(str, ',', pos);
+    else if (attr == "fredsum-impl")
+      fredsum_impl = get_string_token(str, ',', pos);
     else
       bad_varch_string(s, "Unsupported token");
 
     ++pos;
   }
 
-  // The integer should be the power of 2
-  if (!check_pow2(vlen) || !check_pow2(elen) || !check_pow2(slen)){
+  if (!check_pow2(vlen) || !check_pow2(elen) || !check_pow2(slen) || !check_pow2(valu)){
     bad_varch_string(s, "The integer value should be the power of 2");
   }
+  if (fredsum_impl != "ordered" && fredsum_impl != "parallel"){
+    bad_varch_string(s, "fredsum-impl now only supported ordered/parallel");
+  }
 
   /* Vector spec requirements. */
   if (vlen < elen)
@@ -144,10 +152,15 @@ void processor_t::parse_varch_string(const char* s)
   /* spike requirements. */
   if (vlen > 4096)
     bad_varch_string(s, "vlen must be <= 4096");
+  if (valu == 0){
+    bad_varch_string(s, "nalu (Number of vector ALUs) must be > 0");
+  }
 
   VU.VLEN = vlen;
   VU.ELEN = elen;
   VU.SLEN = slen;
+  VU.VALU = valu;
+  VU.FREDSUM_IMPL = fredsum_impl;
   VU.vlenb = vlen / 8;
 }
 
diff --git a/riscv/processor.h b/riscv/processor.h
index cf2fe33..36306e7 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -426,6 +426,8 @@ public:
     reg_t vstart, vxrm, vxsat, vl, vtype, vlenb;
     reg_t vediv, vsew, vlmul;
     reg_t ELEN, VLEN, SLEN;
+    reg_t VALU;
+    std::string FREDSUM_IMPL;
     bool vill;
 
     // vector element for varies SEW
@@ -469,6 +471,7 @@ public:
     reg_t get_vlen() { return VLEN; }
     reg_t get_elen() { return ELEN; }
     reg_t get_slen() { return SLEN; }
+    reg_t get_valu() { return VALU; }
 
     VRM get_vround_mode() {
       return (VRM)vxrm;