diff options
| author | Andrew Waterman <andrew@sifive.com> | 2026-04-29 10:58:27 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-04-29 10:58:27 -0700 |
| commit | b21cccdc5f4680d9c13a6bd7d9d00b75aea3cbb5 (patch) | |
| tree | 87c74cdee88663a7d7775799fc0006875a651b4b /riscv/p_ext_macros.h | |
| parent | 632777d37139298f0af1ee8d2a001f3ab0bde98c (diff) | |
| parent | f2aa295a31f6d0de376e807b2dfab5a62418c8dc (diff) | |
| download | riscv-isa-sim-master.tar.gz riscv-isa-sim-master.tar.bz2 riscv-isa-sim-master.zip | |
rvp for rv32/rv64
Diffstat (limited to 'riscv/p_ext_macros.h')
| -rw-r--r-- | riscv/p_ext_macros.h | 804 |
1 file changed, 804 insertions, 0 deletions
// p_ext_macros.h — helper macros for the RISC-V 'P' (packed-SIMD) extension.
//
// These macros build per-element loops over the packed lanes of the integer
// registers.  They are a coupled preprocessor DSL: a *_LOOP_BASE macro opens
// the loop and declares `rd_tmp`, `rs1`, `rs2`, `len`, and the index `i`
// (and, for reductions, `j`, `len_inner`, `p_res`); a *_LOOP_BODY macro
// extracts lane operands `p_rd`/`p_rs1`/`p_rs2` and splices in the
// instruction's BODY text; a *_LOOP_END macro closes the loop and writes the
// result back.  Instruction templates rely on these exact variable names, so
// none of them may be renamed.
#ifndef _RISCV_P_EXT_MACROS_H_
#define _RISCV_P_EXT_MACROS_H_

// rd temp
// Store the current lane value `p_rd` back into its lane of the accumulated
// destination value `rd_tmp`.  Lane position and width are derived from
// sizeof(p_rd), i.e. from the element type chosen by the *_PARAMS macro.
#define WRITE_P_RD() \
  rd_tmp = set_field(rd_tmp, make_mask64((i * sizeof(p_rd) * 8), sizeof(p_rd) * 8), p_rd);

// Field
// Extract lane INDEX of width SIZE from register value R, as a signed
// (P_FIELD) or unsigned (P_UFIELD) element of SIZE bits.
#define P_FIELD(R, INDEX, SIZE) \
  (type_sew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))

#define P_UFIELD(R, INDEX, SIZE) \
  (type_usew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))

// Params
// Declare the per-lane operand variables used by instruction BODY text.
// Naming: plain = lane i; INNER = inner reduction index j; EVEN/ODD =
// lane 2i / 2i+1; CROSS = partner lane (i^1); ZIP = lane i/2 + pos
// (pos comes from P_RD_RS1_RS2_ZIP_LOOP_BASE).  U* variants are unsigned.
#define P_RD_PARAMS(BIT) \
  auto p_rd = P_FIELD(rd_tmp, i, BIT);

#define P_RD_UPARAMS(BIT) \
  auto p_rd = P_UFIELD(rd_tmp, i, BIT);

#define P_RS1_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i, BIT);

#define P_RS1_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i, BIT);

#define P_RS1_INNER_PARAMS(BIT_INNER) \
  auto p_rs1 = P_FIELD(rs1, j, BIT_INNER);

#define P_RS1_INNER_UPARAMS(BIT_INNER) \
  auto p_rs1 = P_UFIELD(rs1, j, BIT_INNER);

#define P_RS1_EVEN_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i * 2, BIT);

#define P_RS1_ODD_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i * 2 + 1, BIT);

#define P_RS1_EVEN_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i * 2, BIT);

#define P_RS1_ODD_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i * 2 + 1, BIT);

#define P_RS1_ZIP_PARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i / 2 + pos, BIT);

#define P_RS2_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i, BIT);

#define P_RS2_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i, BIT);

#define P_RS2_CROSS_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, (i ^ 1), BIT);

#define P_RS2_CROSS_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, (i ^ 1), BIT);

#define P_RS2_INNER_PARAMS(BIT_INNER) \
  auto p_rs2 = P_FIELD(rs2, j, BIT_INNER);

#define P_RS2_INNER_UPARAMS(BIT_INNER) \
  auto p_rs2 = P_UFIELD(rs2, j, BIT_INNER);

#define P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  auto p_rs2 = P_FIELD(rs2, (j ^ 1), BIT_INNER);

#define P_RS2_EVEN_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i * 2, BIT);

#define P_RS2_ODD_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i * 2 + 1, BIT);

#define P_RS2_EVEN_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i * 2, BIT);

#define P_RS2_ODD_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i * 2 + 1, BIT);

#define P_RS2_ZIP_PARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i / 2 + pos, BIT);

// Loop base
// Each *_LOOP_BASE checks the extension/element width, loads the operand
// register values, and opens a descending per-lane for-loop (left open:
// the matching *_LOOP_END closes it).  DW variants operate on a register
// pair (P_*_PAIR) and therefore have twice as many lanes; WIDEN variants
// accumulate into a register pair; NARROW variants read from one.
#define P_RD_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_RS2_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

// Reduction bases: an outer loop over BIT-wide result lanes and an inner
// loop over the BIT_INNER-wide source lanes folded into each result.
// USE_RD selects whether the accumulator starts from rd or from zero.
#define P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len = 64 / BIT; \
  sreg_t len_inner = BIT / BIT_INNER; \
  for (sreg_t i = len - 1; i >= 0; --i) { \
    sreg_t p_res = P_FIELD(rd_tmp, i, BIT); \
    for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {

#define P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len = 64 / BIT; \
  sreg_t len_inner = BIT / BIT_INNER; \
  for (sreg_t i = len - 1; i >= 0; --i) { \
    sreg_t p_res = P_UFIELD(rd_tmp, i, BIT); \
    for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {

// Widening reductions fold all BIT_INNER lanes into one BIT*2 accumulator.
// NOTE(review): the signed variant iterates j descending while the unsigned
// variant iterates ascending; integer addition is order-independent, so
// results match, but the asymmetry looks accidental — confirm.
#define P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen_pair(P_RD_PAIR) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len_inner = BIT / BIT_INNER; \
  sreg_t p_res = P_FIELD(rd_tmp, 0, BIT * 2); \
  for (sreg_t j = len_inner - 1; j >= 0 ; --j) {

#define P_WIDEN_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen_pair(P_RD_PAIR) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len_inner = BIT / BIT_INNER; \
  sreg_t p_res = P_UFIELD(rd_tmp, 0, BIT * 2); \
  for (sreg_t j = 0; j < len_inner; ++j) {

#define P_RD_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RS1_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_WIDEN_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = P_RS1_PAIR; \
  reg_t rs2 = P_RS2_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_NARROW_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

// ZIP base additionally declares `pos`, the half-register offset selected
// by POS (0 = low half, 1 = high half), consumed by P_RS1/RS2_ZIP_PARAMS.
#define P_RD_RS1_RS2_ZIP_LOOP_BASE(BIT, POS) \
  require_rv64; \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  sreg_t pos = POS * len / 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

// Loop body
// Each *_LOOP_BODY declares lane operands via the *_PARAMS macros, splices
// in BODY (which computes into p_rd), then commits the lane with
// WRITE_P_RD().  SU variants read rs1 signed and rs2 unsigned.
#define P_RD_LOOP_BODY(BIT, BODY) { \
  P_RD_PARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RS1_LOOP_BODY(BIT_RS1, BODY) { \
  P_RS1_PARAMS(BIT_RS1) \
  BODY \
}

#define P_RS1_ULOOP_BODY(BIT_RS1, BODY) { \
  P_RS1_UPARAMS(BIT_RS1) \
  BODY \
}

#define P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_CROSS_LOOP_BODY(BIT, BODY) { \
  P_RD_PARAMS(BIT) \
  P_RS1_PARAMS(BIT) \
  P_RS2_CROSS_PARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

#define P_CROSS_ULOOP_BODY(BIT, BODY) { \
  P_RD_UPARAMS(BIT) \
  P_RS1_UPARAMS(BIT) \
  P_RS2_CROSS_UPARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

// EE/EO/OO bodies pair even/odd lanes of rs1 with even/odd lanes of rs2;
// E/O bodies use lane i of rs1 with the even/odd lane of rs2.
#define P_RD_RS1_RS2_EE_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_EVEN_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ODD_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EE_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_EVEN_UPARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_EVEN_UPARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_ODD_UPARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EE_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ODD_PARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_E_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_EVEN_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_O_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_E_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_O_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ZIP_PARAMS(BIT_RS1) \
  P_RS2_ZIP_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

// Loop end
// Close the loop(s) opened by a *_LOOP_BASE and commit rd_tmp.
#define P_RD_LOOP_END() \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

#define P_REDUCTION_LOOP_END(BIT, IS_SAT) \
  } \
  if (IS_SAT) { \
    p_res = P_SAT(BIT, p_res); \
  } \
  type_usew_t<BIT>::type p_rd = p_res; \
  WRITE_P_RD(); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// NOTE(review): IS_SAT is accepted but never used here, unlike the signed
// variant above — confirm unsigned reductions are intentionally unsaturated.
#define P_REDUCTION_ULOOP_END(BIT, IS_SAT) \
  } \
  type_usew_t<BIT>::type p_rd = p_res; \
  WRITE_P_RD(); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

#define P_REDUCTION_DW_LOOP_END(BIT, IS_SAT) \
  } \
  if (IS_SAT) { \
    p_res = P_SAT(BIT * 2, p_res); \
  } \
  WRITE_P_RD_PAIR(p_res);

#define P_RD_DW_LOOP_END() \
  } \
  WRITE_P_RD_PAIR(rd_tmp);

// Loop
// Convenience compositions: BASE + BODY + END for each operand pattern.
#define P_RD_LOOP(BIT_RD, BODY) \
  P_RD_LOOP_BASE(BIT_RD) \
  P_RD_LOOP_BODY(BIT_RD, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

// CROSS loops process two lanes per iteration (hence the extra --i).
// `sizeof(#BODY2) == 1` tests whether the BODY2 argument is empty:
// stringizing an empty argument yields "", whose sizeof is 1, in which
// case BODY1 is reused for the second lane.
#define P_CROSS_LOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_LOOP_BASE(BIT) \
  P_CROSS_LOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_LOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_LOOP_BODY(BIT, BODY2) \
  } \
  P_RD_LOOP_END()

#define P_CROSS_ULOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_LOOP_BASE(BIT) \
  P_CROSS_ULOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_ULOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_ULOOP_BODY(BIT, BODY2) \
  } \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_ZIP_LOOP(BIT_RD, BIT_RS1, BIT_RS2, POS, BODY) \
  P_RD_RS1_RS2_ZIP_LOOP_BASE(BIT_RD, POS) \
  P_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

// Gather the even (HIGH=0) or odd (HIGH=1) lanes of RS1 into the low half
// of rd and the corresponding lanes of RS2 into the high half (rv64 only).
#define P_UNZIP(BIT, HIGH) \
  require_rv64; \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16); \
  reg_t rd_tmp = 0; \
  for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64(i * BIT, BIT), \
                       P_UFIELD(RS1, i * 2 + HIGH, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * BIT + xlen / 2, BIT), \
                       P_UFIELD(RS2, i * 2 + HIGH, BIT)); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

#define P_RD_RS1_RS2_EE_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EO_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EE_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EO_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EE_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_E_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_E_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_O_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_O_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_E_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_E_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_O_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_O_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

#define P_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

#define P_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_UPARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_ULOOP_END(BIT, IS_SAT)

#define P_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

#define P_RD_DW_LOOP(BIT_RD, BODY) \
  P_RD_DW_LOOP_BASE(BIT_RD) \
  P_RD_LOOP_BODY(BIT_RD, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_DW_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_RS1_DW_LOOP(BIT_RS1, BODY) \
  P_RS1_DW_LOOP_BASE(BIT_RS1) \
  P_RS1_LOOP_BODY(BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RS1_DW_ULOOP(BIT_RS1, BODY) \
  P_RS1_DW_LOOP_BASE(BIT_RS1) \
  P_RS1_ULOOP_BODY(BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_WIDEN_RD_RS1_LOOP(BIT_RS1, BODY) \
  P_WIDEN_RD_RS1_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_LOOP_BODY((BIT_RS1) * 2, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_ULOOP(BIT_RS1, BODY) \
  P_WIDEN_RD_RS1_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_ULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BASE(BIT_RS1) \
  P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BODY((BIT_RS1 * 2), BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_LOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_LOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_ULOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_ULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_SULOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_SULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_UPARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

// NOTE(review): P_RD_RS1_DW_LOOP below is an identical redefinition of the
// macro defined earlier in this section (benign per the C/C++ standards,
// which permit identical redefinitions, but one copy could be dropped).
#define P_RD_RS1_DW_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_DW_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_RS2_DW_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_RS2_DW_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_NARROW_RD_RS1_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_NARROW_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_NARROW_RD_RS1_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_NARROW_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_CROSS_DW_LOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  P_CROSS_LOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_LOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_LOOP_BODY(BIT, BODY2) \
  } \
  P_RD_DW_LOOP_END()

#define P_CROSS_DW_ULOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  P_CROSS_ULOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_ULOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_ULOOP_BODY(BIT, BODY2) \
  } \
  P_RD_DW_LOOP_END()

// Misc
// Saturation helpers (GCC/Clang statement expressions): clamp R and set the
// fixed-point saturation flag (vxsat) if clamping occurred.
// P_SAT: signed clamp to [-2^(BIT-1), 2^(BIT-1)-1]; BIT==64 passes through.
#define P_SAT(BIT, R) ({ \
  sreg_t _psat_in = (R); \
  sreg_t _psat_out; \
  if ((BIT) == 64) _psat_out = _psat_in; \
  else if (_psat_in > (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1)) _psat_out = (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1); \
  else if (_psat_in < (sreg_t)(reg_t(-1) << ((BIT) - 1))) _psat_out = (sreg_t)(reg_t(-1) << ((BIT) - 1)); \
  else _psat_out = _psat_in; \
  if (_psat_out != _psat_in) P.VU.vxsat->write(1); \
  _psat_out; \
})

// P_USAT: clamp to [0, 2^(BIT-1)-1].
// NOTE(review): the upper bound here is the *signed* max (2^(BIT-1)-1),
// while P_USAT_FULL below clamps to the full unsigned range (2^BIT-1).
// Confirm against each call site that this asymmetry is intended.
#define P_USAT(BIT, R) ({ \
  sreg_t _pusat_in = (R); \
  sreg_t _pusat_out; \
  if (_pusat_in < 0) _pusat_out = 0; \
  else if ((BIT) == 64) _pusat_out = _pusat_in; \
  else if (_pusat_in > (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1)) _pusat_out = (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1); \
  else _pusat_out = _pusat_in; \
  if (_pusat_out != _pusat_in) P.VU.vxsat->write(1); \
  _pusat_out; \
})

// P_USAT_FULL: clamp to the full unsigned range [0, 2^BIT-1].
#define P_USAT_FULL(BIT, R) ({ \
  sreg_t _pusatf_in = (R); \
  sreg_t _pusatf_out; \
  if (_pusatf_in < 0) _pusatf_out = 0; \
  else if ((BIT) >= 64) _pusatf_out = _pusatf_in; \
  else if (_pusatf_in > (sreg_t)((reg_t(1) << (BIT)) - 1)) _pusatf_out = (sreg_t)((reg_t(1) << (BIT)) - 1); \
  else _pusatf_out = _pusatf_in; \
  if (_pusatf_out != _pusatf_in) P.VU.vxsat->write(1); \
  _pusatf_out; \
})

// Interleave lanes: even lanes of rd come from RS1 (offset X), odd lanes
// from RS2 (offset Y).
#define P_PACK(BIT, X, Y) \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16 || BIT == e32); \
  reg_t rd_tmp = 0; \
  for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
                       P_UFIELD(RS2, i * 2 + Y, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
                       P_UFIELD(RS1, i * 2 + X, BIT)); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// Register-pair variant of P_PACK: sources and destination are 64-bit pairs.
#define P_PACK_DW(BIT, X, Y) \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16); \
  reg_t rd_tmp = 0, rs1 = P_RS1_PAIR, rs2 = P_RS2_PAIR; \
  for (sreg_t i = 0; i < 64 / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
                       P_UFIELD(rs2, i * 2 + Y, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
                       P_UFIELD(rs1, i * 2 + X, BIT)); \
  } \
  WRITE_P_RD_PAIR(rd_tmp);

#endif
