aboutsummaryrefslogtreecommitdiff
path: root/riscv/p_ext_macros.h
diff options
context:
space:
mode:
authorAndrew Waterman <andrew@sifive.com>2026-04-29 10:58:27 -0700
committerGitHub <noreply@github.com>2026-04-29 10:58:27 -0700
commitb21cccdc5f4680d9c13a6bd7d9d00b75aea3cbb5 (patch)
tree87c74cdee88663a7d7775799fc0006875a651b4b /riscv/p_ext_macros.h
parent632777d37139298f0af1ee8d2a001f3ab0bde98c (diff)
parentf2aa295a31f6d0de376e807b2dfab5a62418c8dc (diff)
downloadriscv-isa-sim-master.tar.gz
riscv-isa-sim-master.tar.bz2
riscv-isa-sim-master.zip
Merge pull request #2246 from chihminchao/rvp-rv32-rv64HEADmaster
rvp for rv32/rv64
Diffstat (limited to 'riscv/p_ext_macros.h')
-rw-r--r--riscv/p_ext_macros.h804
1 file changed, 804 insertions, 0 deletions
diff --git a/riscv/p_ext_macros.h b/riscv/p_ext_macros.h
new file mode 100644
index 00000000..2501300f
--- /dev/null
+++ b/riscv/p_ext_macros.h
@@ -0,0 +1,804 @@
#ifndef _RISCV_P_EXT_MACROS_H_
#define _RISCV_P_EXT_MACROS_H_

// rd temp: write the per-element result `p_rd` (declared by a P_*_PARAMS
// macro) back into element slot `i` of the accumulated destination value
// `rd_tmp`.  Both `i` and `rd_tmp` are introduced by an enclosing
// P_*_LOOP_BASE macro; the element width is taken from sizeof(p_rd).
#define WRITE_P_RD() \
  rd_tmp = set_field(rd_tmp, make_mask64((i * sizeof(p_rd) * 8), sizeof(p_rd) * 8), p_rd);
+
// Field extraction: read element INDEX (SIZE bits wide) out of register
// value R.  P_FIELD yields a signed value of that width (type_sew_t),
// P_UFIELD an unsigned one (type_usew_t).
#define P_FIELD(R, INDEX, SIZE) \
  (type_sew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))

#define P_UFIELD(R, INDEX, SIZE) \
  (type_usew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))
+
// Params: bind one element of a source/destination register to a local
// (`p_rd`, `p_rs1`, `p_rs2`) for use by an instruction BODY.  The element
// index `i` (or inner index `j`, or zip offset `pos`) comes from the
// enclosing P_*_LOOP_BASE macro.  *_PARAMS reads signed, *_UPARAMS
// unsigned.
#define P_RD_PARAMS(BIT) \
  auto p_rd = P_FIELD(rd_tmp, i, BIT);

#define P_RD_UPARAMS(BIT) \
  auto p_rd = P_UFIELD(rd_tmp, i, BIT);

#define P_RS1_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i, BIT);

#define P_RS1_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i, BIT);

// INNER variants index with the inner-loop counter `j` (reduction loops).
#define P_RS1_INNER_PARAMS(BIT_INNER) \
  auto p_rs1 = P_FIELD(rs1, j, BIT_INNER);

#define P_RS1_INNER_UPARAMS(BIT_INNER) \
  auto p_rs1 = P_UFIELD(rs1, j, BIT_INNER);

// EVEN/ODD variants read element 2*i or 2*i + 1 of the source.
#define P_RS1_EVEN_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i * 2, BIT);

#define P_RS1_ODD_PARAMS(BIT) \
  auto p_rs1 = P_FIELD(rs1, i * 2 + 1, BIT);

#define P_RS1_EVEN_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i * 2, BIT);

#define P_RS1_ODD_UPARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i * 2 + 1, BIT);

// ZIP variant reads element i/2 + pos, where `pos` is set up by
// P_RD_RS1_RS2_ZIP_LOOP_BASE.
#define P_RS1_ZIP_PARAMS(BIT) \
  auto p_rs1 = P_UFIELD(rs1, i / 2 + pos, BIT);

#define P_RS2_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i, BIT);

#define P_RS2_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i, BIT);

// CROSS variants read the partner element (i ^ 1), i.e. they swap each
// even/odd element pair of rs2.
#define P_RS2_CROSS_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, (i ^ 1), BIT);

#define P_RS2_CROSS_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, (i ^ 1), BIT);

#define P_RS2_INNER_PARAMS(BIT_INNER) \
  auto p_rs2 = P_FIELD(rs2, j, BIT_INNER);

#define P_RS2_INNER_UPARAMS(BIT_INNER) \
  auto p_rs2 = P_UFIELD(rs2, j, BIT_INNER);

#define P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  auto p_rs2 = P_FIELD(rs2, (j ^ 1), BIT_INNER);

#define P_RS2_EVEN_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i * 2, BIT);

#define P_RS2_ODD_PARAMS(BIT) \
  auto p_rs2 = P_FIELD(rs2, i * 2 + 1, BIT);

#define P_RS2_EVEN_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i * 2, BIT);

#define P_RS2_ODD_UPARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i * 2 + 1, BIT);

#define P_RS2_ZIP_PARAMS(BIT) \
  auto p_rs2 = P_UFIELD(rs2, i / 2 + pos, BIT);
+
// Loop base: open a per-element loop for an instruction.  Each macro
// checks the extension / element-width requirements, snapshots the source
// and destination registers into locals (`rd_tmp`, `rs1`, `rs2`), computes
// the element count `len`, and opens a `for` over element index `i`
// (highest element first).  The open brace is closed by a matching
// P_*_LOOP_END() macro.
#define P_RD_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_RS2_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

// Reduction bases open TWO nested loops (outer element `i`, inner
// accumulation index `j`) and seed the accumulator `p_res` from element
// `i` of rd_tmp; both braces are closed by P_REDUCTION_*LOOP_END.
#define P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len = 64 / BIT; \
  sreg_t len_inner = BIT / BIT_INNER; \
  for (sreg_t i = len - 1; i >= 0; --i) { \
    sreg_t p_res = P_FIELD(rd_tmp, i, BIT); \
    for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {

#define P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len = 64 / BIT; \
  sreg_t len_inner = BIT / BIT_INNER; \
  for (sreg_t i = len - 1; i >= 0; --i) { \
    sreg_t p_res = P_UFIELD(rd_tmp, i, BIT); \
    for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {

// Widening reductions accumulate into a single 2*BIT-wide `p_res` read
// from a register pair; only one loop (over `j`) is opened.
#define P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen_pair(P_RD_PAIR) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len_inner = BIT / BIT_INNER; \
  sreg_t p_res = P_FIELD(rd_tmp, 0, BIT * 2); \
  for (sreg_t j = len_inner - 1; j >= 0 ; --j) {

// NOTE(review): this unsigned variant iterates j ascending while the
// signed one above iterates descending — presumably order-insensitive
// for the accumulations used with it; confirm against callers.
#define P_WIDEN_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  require_extension('P'); \
  require(BIT == e16 || BIT == e32 || BIT == e64); \
  reg_t rd_tmp = USE_RD ? zext_xlen_pair(P_RD_PAIR) : 0; \
  reg_t rs1 = zext_xlen(RS1); \
  reg_t rs2 = zext_xlen(RS2); \
  sreg_t len_inner = BIT / BIT_INNER; \
  sreg_t p_res = P_UFIELD(rd_tmp, 0, BIT * 2); \
  for (sreg_t j = 0; j < len_inner; ++j) {

// DW (double-word) bases operate on register pairs, so they iterate
// twice as many elements (xlen / BIT * 2).
#define P_RD_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RS1_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

// Widening bases: destination is a register pair, sources are single
// registers; the loop count follows the (narrower) source width.
#define P_WIDEN_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

#define P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = P_RD_PAIR; \
  reg_t rs1 = P_RS1_PAIR; \
  reg_t rs2 = P_RS2_PAIR; \
  sreg_t len = xlen / (BIT) * 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {

// Narrowing base: destination is a single register, source is a pair.
#define P_NARROW_RD_RS1_LOOP_BASE(BIT) \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = P_RS1_PAIR; \
  sreg_t len = xlen / (BIT); \
  for (sreg_t i = len - 1; i >= 0; --i) {

// ZIP base (RV64 only): computes the element offset
// pos = POS * len / 2 consumed by the *_ZIP_PARAMS macros.
#define P_RD_RS1_RS2_ZIP_LOOP_BASE(BIT, POS) \
  require_rv64; \
  require_extension('P'); \
  require((BIT) == e8 || (BIT) == e16 || (BIT) == e32); \
  reg_t rd_tmp = RD; \
  reg_t rs1 = RS1; \
  reg_t rs2 = RS2; \
  sreg_t len = xlen / (BIT); \
  sreg_t pos = POS * len / 2; \
  for (sreg_t i = len - 1; i >= 0; --i) {
+
// Loop body: one iteration of an element loop.  Each macro binds the
// element locals via P_*_PARAMS, runs the instruction-specific BODY
// (which is expected to assign `p_rd`), then commits the element with
// WRITE_P_RD().  The suffix encodes signedness/addressing:
//   LOOP = signed, ULOOP = unsigned, SULOOP = signed rs1 x unsigned rs2,
//   EE/EO/OO/E/O = even/odd element selection, CROSS = partner element.
#define P_RD_LOOP_BODY(BIT, BODY) { \
  P_RD_PARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

// rs1-only bodies: BODY consumes p_rs1 and performs its own writeback.
#define P_RS1_LOOP_BODY(BIT_RS1, BODY) { \
  P_RS1_PARAMS(BIT_RS1) \
  BODY \
}

#define P_RS1_ULOOP_BODY(BIT_RS1, BODY) { \
  P_RS1_UPARAMS(BIT_RS1) \
  BODY \
}

#define P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

// CROSS bodies pair rs1 element i with rs2 element (i ^ 1).
#define P_CROSS_LOOP_BODY(BIT, BODY) { \
  P_RD_PARAMS(BIT) \
  P_RS1_PARAMS(BIT) \
  P_RS2_CROSS_PARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

#define P_CROSS_ULOOP_BODY(BIT, BODY) { \
  P_RD_UPARAMS(BIT) \
  P_RS1_UPARAMS(BIT) \
  P_RS2_CROSS_UPARAMS(BIT) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EE_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_EVEN_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ODD_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EE_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_EVEN_UPARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_EVEN_UPARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_UPARAMS(BIT_RD) \
  P_RS1_ODD_UPARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_EE_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_EVEN_PARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_OO_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ODD_PARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

// E/O bodies: rs1 indexed by i, rs2 by even/odd element.
#define P_RD_RS1_RS2_E_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_EVEN_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_O_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_ODD_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_E_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_EVEN_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_O_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_PARAMS(BIT_RS1) \
  P_RS2_ODD_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

// Widening zip body: signed p_rd at the wide width, unsigned sources.
#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_UPARAMS(BIT_RS1) \
  P_RS2_UPARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}

#define P_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) { \
  P_RD_PARAMS(BIT_RD) \
  P_RS1_ZIP_PARAMS(BIT_RS1) \
  P_RS2_ZIP_PARAMS(BIT_RS2) \
  BODY \
  WRITE_P_RD(); \
}
+
// Loop end: close the brace(s) opened by a *_LOOP_BASE macro and commit
// the accumulated rd_tmp to the destination register.
#define P_RD_LOOP_END() \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// Close the inner reduction loop, optionally saturate the accumulator
// to BIT bits, store it into element i of rd_tmp, close the outer loop,
// then commit.
#define P_REDUCTION_LOOP_END(BIT, IS_SAT) \
  } \
  if (IS_SAT) { \
    p_res = P_SAT(BIT, p_res); \
  } \
  type_usew_t<BIT>::type p_rd = p_res; \
  WRITE_P_RD(); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// Unsigned variant.  NOTE(review): IS_SAT is accepted but unused here —
// presumably intentional (unsigned reductions used with it never
// saturate); confirm against callers.
#define P_REDUCTION_ULOOP_END(BIT, IS_SAT) \
  } \
  type_usew_t<BIT>::type p_rd = p_res; \
  WRITE_P_RD(); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// Close a widening reduction (single loop) and write the 2*BIT-wide
// accumulator to the destination register pair.
#define P_REDUCTION_DW_LOOP_END(BIT, IS_SAT) \
  } \
  if (IS_SAT) { \
    p_res = P_SAT(BIT * 2, p_res); \
  } \
  WRITE_P_RD_PAIR(p_res);

#define P_RD_DW_LOOP_END() \
  } \
  WRITE_P_RD_PAIR(rd_tmp);
+
// Loop: complete instruction patterns — a BASE, a BODY, and an END glued
// together.  These are what the instruction implementations invoke.
#define P_RD_LOOP(BIT_RD, BODY) \
  P_RD_LOOP_BASE(BIT_RD) \
  P_RD_LOOP_BODY(BIT_RD, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

// Cross loops process elements in pairs: element i with BODY1, element
// i-1 with BODY2.  `sizeof(#BODY2) == 1` stringizes BODY2 — an empty
// BODY2 argument stringizes to "" (sizeof 1), making both halves use
// BODY1.
#define P_CROSS_LOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_LOOP_BASE(BIT) \
  P_CROSS_LOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_LOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_LOOP_BODY(BIT, BODY2) \
  } \
  P_RD_LOOP_END()

#define P_CROSS_ULOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_LOOP_BASE(BIT) \
  P_CROSS_ULOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_ULOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_ULOOP_BODY(BIT, BODY2) \
  } \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_ZIP_LOOP(BIT_RD, BIT_RS1, BIT_RS2, POS, BODY) \
  P_RD_RS1_RS2_ZIP_LOOP_BASE(BIT_RD, POS) \
  P_RD_RS1_RS2_ZIP_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

// P_UNZIP (RV64 only, BIT = e8/e16): gather element (2*i + HIGH) of each
// pair — i.e. the even or odd elements — from RS1 into the low half of
// rd and from RS2 into the high half.
#define P_UNZIP(BIT, HIGH) \
  require_rv64; \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16); \
  reg_t rd_tmp = 0; \
  for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64(i * BIT, BIT), \
                       P_UFIELD(RS1, i * 2 + HIGH, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * BIT + xlen / 2, BIT), \
                       P_UFIELD(RS2, i * 2 + HIGH, BIT)); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));
+
// Even/odd element-selection loops: suffix EE/EO/OO/E/O names which rs1/rs2
// elements feed each destination element (see the matching *_LOOP_BODY).
#define P_RD_RS1_RS2_EE_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EO_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EE_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EO_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_EE_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_EE_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_OO_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_OO_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_E_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_E_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_O_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_O_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_E_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_E_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

#define P_RD_RS1_RS2_O_SULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_O_SULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_LOOP_END()

// Reduction loops: BODY accumulates into `p_res` from the inner-loop
// locals p_rs1/p_rs2 bound here (indexed by `j`).
#define P_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

#define P_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

#define P_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_UPARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_ULOOP_END(BIT, IS_SAT)

#define P_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_LOOP_END(BIT, IS_SAT)

// Register-pair (double-word) and widening/narrowing composed loops.
#define P_RD_DW_LOOP(BIT_RD, BODY) \
  P_RD_DW_LOOP_BASE(BIT_RD) \
  P_RD_LOOP_BODY(BIT_RD, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_DW_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_RS1_DW_LOOP(BIT_RS1, BODY) \
  P_RS1_DW_LOOP_BASE(BIT_RS1) \
  P_RS1_LOOP_BODY(BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_RS1_DW_ULOOP(BIT_RS1, BODY) \
  P_RS1_DW_LOOP_BASE(BIT_RS1) \
  P_RS1_ULOOP_BODY(BIT_RS1, BODY) \
  P_RD_LOOP_END()

// Widening loops: destination elements are twice the source width.
#define P_WIDEN_RD_RS1_LOOP(BIT_RS1, BODY) \
  P_WIDEN_RD_RS1_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_LOOP_BODY((BIT_RS1) * 2, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_ULOOP(BIT_RS1, BODY) \
  P_WIDEN_RD_RS1_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_ULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_ZIP_LOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BASE(BIT_RS1) \
  P_WIDEN_RD_RS1_RS2_ZIP_LOOP_BODY((BIT_RS1 * 2), BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_LOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_LOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_ULOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_ULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_RD_RS1_RS2_SULOOP(BIT_RS1, BIT_RS2, BODY) \
  P_WIDEN_RD_RS1_RS2_LOOP_BASE(BIT_RS1) \
  P_RD_RS1_RS2_SULOOP_BODY((BIT_RS1) * 2, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_WIDEN_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_UPARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_UPARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)

#define P_WIDEN_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
  P_WIDEN_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
  P_RS1_INNER_PARAMS(BIT_INNER) \
  P_RS2_INNER_CROSS_PARAMS(BIT_INNER) \
  BODY \
  P_REDUCTION_DW_LOOP_END(BIT, IS_SAT)
+
// NOTE: a second, token-identical definition of P_RD_RS1_DW_LOOP appeared
// here; it was removed as redundant — the macro is already defined earlier
// in this file (single point of definition).

// Unsigned register-pair loop: per-element BODY over a double-word
// destination/source pair, with unsigned element extraction.
#define P_RD_RS1_DW_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_RD_RS1_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_DW_LOOP_END()
+
// Three-operand register-pair loops.
#define P_RD_RS1_RS2_DW_LOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_LOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

#define P_RD_RS1_RS2_DW_ULOOP(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT_RD) \
  P_RD_RS1_RS2_ULOOP_BODY(BIT_RD, BIT_RS1, BIT_RS2, BODY) \
  P_RD_DW_LOOP_END()

// Narrowing loops: single-register destination from a register-pair source.
#define P_NARROW_RD_RS1_LOOP(BIT_RD, BIT_RS1, BODY) \
  P_NARROW_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_LOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

#define P_NARROW_RD_RS1_ULOOP(BIT_RD, BIT_RS1, BODY) \
  P_NARROW_RD_RS1_LOOP_BASE(BIT_RD) \
  P_RD_RS1_ULOOP_BODY(BIT_RD, BIT_RS1, BODY) \
  P_RD_LOOP_END()

// Cross loops over register pairs: element i uses BODY1, element i-1 uses
// BODY2; `sizeof(#BODY2) == 1` detects an empty BODY2 argument (it
// stringizes to "", whose sizeof is 1) and falls back to BODY1.
#define P_CROSS_DW_LOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  P_CROSS_LOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_LOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_LOOP_BODY(BIT, BODY2) \
  } \
  P_RD_DW_LOOP_END()

#define P_CROSS_DW_ULOOP(BIT, BODY1, BODY2) \
  P_RD_RS1_RS2_DW_LOOP_BASE(BIT) \
  P_CROSS_ULOOP_BODY(BIT, BODY1) \
  --i; \
  if (sizeof(#BODY2) == 1) { \
    P_CROSS_ULOOP_BODY(BIT, BODY1) \
  } \
  else { \
    P_CROSS_ULOOP_BODY(BIT, BODY2) \
  } \
  P_RD_DW_LOOP_END()
+
// Misc: saturation helpers.  Each is a GNU statement expression yielding
// the (possibly clamped) value; if clamping occurred, the vxsat flag is
// set via P.VU.vxsat->write(1).

// Signed saturation to the BIT-bit two's-complement range
// [-2^(BIT-1), 2^(BIT-1) - 1]; BIT == 64 passes the value through.
#define P_SAT(BIT, R) ({ \
  sreg_t _psat_in = (R); \
  sreg_t _psat_out; \
  if ((BIT) == 64) _psat_out = _psat_in; \
  else if (_psat_in > (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1)) _psat_out = (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1); \
  else if (_psat_in < (sreg_t)(reg_t(-1) << ((BIT) - 1))) _psat_out = (sreg_t)(reg_t(-1) << ((BIT) - 1)); \
  else _psat_out = _psat_in; \
  if (_psat_out != _psat_in) P.VU.vxsat->write(1); \
  _psat_out; \
})

// Clamp a signed value to [0, 2^(BIT-1) - 1].  NOTE(review): the upper
// bound uses (BIT) - 1, the same positive limit as P_SAT — presumably
// intentional for unsigned-saturating ops with a signed intermediate;
// confirm against the P-extension spec (cf. P_USAT_FULL below, which
// uses the full 2^BIT - 1 bound).
#define P_USAT(BIT, R) ({ \
  sreg_t _pusat_in = (R); \
  sreg_t _pusat_out; \
  if (_pusat_in < 0) _pusat_out = 0; \
  else if ((BIT) == 64) _pusat_out = _pusat_in; \
  else if (_pusat_in > (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1)) _pusat_out = (sreg_t)((reg_t(1) << ((BIT) - 1)) - 1); \
  else _pusat_out = _pusat_in; \
  if (_pusat_out != _pusat_in) P.VU.vxsat->write(1); \
  _pusat_out; \
})

// Clamp a signed value to the full BIT-bit unsigned range [0, 2^BIT - 1];
// BIT >= 64 passes the value through.
#define P_USAT_FULL(BIT, R) ({ \
  sreg_t _pusatf_in = (R); \
  sreg_t _pusatf_out; \
  if (_pusatf_in < 0) _pusatf_out = 0; \
  else if ((BIT) >= 64) _pusatf_out = _pusatf_in; \
  else if (_pusatf_in > (sreg_t)((reg_t(1) << (BIT)) - 1)) _pusatf_out = (sreg_t)((reg_t(1) << (BIT)) - 1); \
  else _pusatf_out = _pusatf_in; \
  if (_pusatf_out != _pusatf_in) P.VU.vxsat->write(1); \
  _pusatf_out; \
})
+
// P_PACK: interleave elements from RS1 and RS2 into rd.  For each pair
// slot i, destination element 2*i comes from RS1 element (2*i + X) and
// destination element 2*i + 1 from RS2 element (2*i + Y); X/Y select the
// even (0) or odd (1) source element.
#define P_PACK(BIT, X, Y) \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16 || BIT == e32); \
  reg_t rd_tmp = 0; \
  for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
                       P_UFIELD(RS2, i * 2 + Y, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
                       P_UFIELD(RS1, i * 2 + X, BIT)); \
  } \
  WRITE_RD(sext_xlen(rd_tmp));

// Register-pair variant of P_PACK: sources are the rs1/rs2 register
// pairs, iteration covers 64 bits, and the result is written to the
// destination register pair.
#define P_PACK_DW(BIT, X, Y) \
  require_extension('P'); \
  require(BIT == e8 || BIT == e16); \
  reg_t rd_tmp = 0, rs1 = P_RS1_PAIR, rs2 = P_RS2_PAIR; \
  for (sreg_t i = 0; i < 64 / BIT / 2; i++) { \
    rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
                       P_UFIELD(rs2, i * 2 + Y, BIT)); \
    rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
                       P_UFIELD(rs1, i * 2 + X, BIT)); \
  } \
  WRITE_P_RD_PAIR(rd_tmp);
+
#endif  // _RISCV_P_EXT_MACROS_H_