From d5c0339484323b5a9498576d70ec90eab2e13438 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Sun, 18 Jun 2023 17:10:53 -0700 Subject: Zvk: Infrastructure for Zvk extensions, element group handling Introduce types and macros useful across multiple Zvk sub-extensions, including Zvbb and Zvbc. Those will be used by upcoming per-sub-extension commits. In particular we introduce "Element Group" types and loop macros handling those element groups. The concept of element group is described in the RISC-V vector cryptography specification. Note that the element group access method is not implemented for WORDS_BIGENDIAN setup. As such, isa_parser.cc is modified to emit an error when WORDS_BIGENDIAN is defined and extensions using element groups are enabled. Signed-off-by: Eric Gouriou --- riscv/arith.h | 21 + riscv/isa_parser.cc | 10 +- riscv/v_ext_macros.h | 22 ++ riscv/vector_unit.cc | 55 +++ riscv/vector_unit.h | 19 +- riscv/zvk_ext_macros.h | 1023 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1148 insertions(+), 2 deletions(-) create mode 100644 riscv/zvk_ext_macros.h diff --git a/riscv/arith.h b/riscv/arith.h index 3b807e9..20b1504 100644 --- a/riscv/arith.h +++ b/riscv/arith.h @@ -7,6 +7,7 @@ #include #include #include +#include inline uint64_t mulhu(uint64_t a, uint64_t b) { @@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t return r; } +// Rotates right an unsigned integer by the given number of bits. +template +static inline T rotate_right(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t rshift = shiftamt & mask; + const std::size_t lshift = (-rshift) & mask; + return (x << lshift) | (x >> rshift); +} + +// Rotates left an unsigned integer by the given number of bits. 
+template +static inline T rotate_left(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t lshift = shiftamt & mask; + const std::size_t rshift = (-lshift) & mask; + return (x << lshift) | (x >> rshift); +} + #endif diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc index 6fb29ae..59472a4 100644 --- a/riscv/isa_parser.cc +++ b/riscv/isa_parser.cc @@ -361,7 +361,15 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) { bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64"); } - +#ifdef WORDS_BIGENDIAN + // Access to the vector registers as element groups is unimplemented on big-endian setups. + if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB] || + extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) { + bad_isa_string(str, + "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' " + "extensions are incompatible with WORDS_BIGENDIAN setups."); + } +#endif std::string lowercase = strtolower(priv); bool user = false, supervisor = false; diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h index 41256c7..908ff16 100644 --- a/riscv/v_ext_macros.h +++ b/riscv/v_ext_macros.h @@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize, type_usew_t::type vs1 = P.VU.elt::type>(rs1_num, i); \ type_usew_t::type vs2 = P.VU.elt::type>(rs2_num, i); +#define V_U_PARAMS(x) \ + type_usew_t::type &vd = P.VU.elt::type>(rd_num, i, true); \ + type_usew_t::type vs2 = P.VU.elt::type>(rs2_num, i); + #define VX_U_PARAMS(x) \ type_usew_t::type &vd = P.VU.elt::type>(rd_num, i, true); \ type_usew_t::type rs1 = (type_usew_t::type)RS1; \ @@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize, } \ VI_LOOP_END +#define VI_V_ULOOP(BODY) 
\ + VI_CHECK_SSS(false) \ + VI_LOOP_BASE \ + if (sew == e8) { \ + V_U_PARAMS(e8); \ + BODY; \ + } else if (sew == e16) { \ + V_U_PARAMS(e16); \ + BODY; \ + } else if (sew == e32) { \ + V_U_PARAMS(e32); \ + BODY; \ + } else if (sew == e64) { \ + V_U_PARAMS(e64); \ + BODY; \ + } \ + VI_LOOP_END + #define VI_VX_ULOOP(BODY) \ VI_CHECK_SSS(false) \ VI_LOOP_BASE \ diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc index 9128df6..08adc61 100644 --- a/riscv/vector_unit.cc +++ b/riscv/vector_unit.cc @@ -86,6 +86,56 @@ template T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write return regStart[n]; } +// The logic differences between 'elt()' and 'elt_group()' come from +// the fact that, while 'elt()' requires that the element is fully +// contained in a single vector register, the element group may span +// multiple registers in a single register group (LMUL>1). +// +// Notes: +// - We do NOT check that a single element - i.e., the T in the element +// group type std::array<T, N> - fits within a single register, or that +// T is smaller or equal to VSEW. Implementations of the instructions +// sometimes use a different T than what the specification suggests. +// Instruction implementations should 'require()' what the specification +// dictates. +// - We do NOT check that 'vReg' is a valid register group, and only +// assert() that 'n+1' element groups fit in the register group 'vReg'. +// It is the responsibility of the caller to validate those preconditions. 
+template EG& +vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) { +#ifdef WORDS_BIGENDIAN + fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n", + stderr); + abort(); +#endif + using T = typename EG::value_type; + constexpr std::size_t N = std::tuple_size::value; + assert(N > 0); + + assert(vsew != 0); + constexpr reg_t elt_group_size = N * sizeof(T); + const reg_t reg_group_size = (VLEN >> 3) * vflmul; + assert(((n + 1) * elt_group_size) <= reg_group_size); + + const reg_t start_byte = n * elt_group_size; + const reg_t bytes_per_reg = VLEN >> 3; + + // Inclusive first/last register indices. + const reg_t reg_first = vReg + start_byte / bytes_per_reg; + const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg; + + // Element groups per register groups + for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) { + reg_referenced[vidx] = 1; + + if (unlikely(p->get_log_commits_enabled() && is_write)) { + p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0}; + } + } + + return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte); +} + template signed char& vectorUnit_t::elt(reg_t, reg_t, bool); template short& vectorUnit_t::elt(reg_t, reg_t, bool); template int& vectorUnit_t::elt(reg_t, reg_t, bool); @@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float16_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float32_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float64_t& vectorUnit_t::elt(reg_t, reg_t, bool); + +template EGU32x4_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU32x8_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU64x4_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU8x16_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h index b9f706c..a057c62 100644 --- a/riscv/vector_unit.h +++ b/riscv/vector_unit.h @@ -2,6 +2,9 @@ #ifndef _RISCV_VECTOR_UNIT_H 
#define _RISCV_VECTOR_UNIT_H +#include +#include + #include "decode.h" #include "csrs.h" @@ -69,6 +72,17 @@ struct type_sew_t<64> using type=int64_t; }; +// Element Group of 4 32 bits elements (128b total). +using EGU32x4_t = std::array; + +// Element Group of 8 32 bits elements (256b total). +using EGU32x8_t = std::array; + +// Element Group of 4 64 bits elements (256b total). +using EGU64x4_t = std::array; + +// Element Group of 16 8 bits elements (128b total). +using EGU8x16_t = std::array; class vectorUnit_t { @@ -88,8 +102,11 @@ public: bool vill; bool vstart_alu; - // vector element for varies SEW + // vector element for various SEW template T& elt(reg_t vReg, reg_t n, bool is_write = false); + // vector element group access, where EG is a std::array. + template EG& + elt_group(reg_t vReg, reg_t n, bool is_write = false); public: diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h new file mode 100644 index 0000000..7efbac8 --- /dev/null +++ b/riscv/zvk_ext_macros.h @@ -0,0 +1,1023 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvk extension (vector cryptography). + +// Note that a good deal of code here would be cleaner/simpler +// if exposed as C++ functions (including templated ones), however +// this is not possible in the contexts where those headers are +// included. + +#ifndef RISCV_ZVK_EXT_MACROS_H_ +#define RISCV_ZVK_EXT_MACROS_H_ + +// +// Predicate Macros +// + +// Ensures that the ZVBB extension (vector crypto bitmanip) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvbb \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBB); \ + } while (0) + +// Ensures that the ZVBC extension (vector carryless multiplication) +// is present, and the vector unit is enabled and in a valid state. 
+#define require_zvbc \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBC); \ + } while (0) + +// Ensures that the ZVKG extension (vector Galois Field Multiplication) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvkg \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKG); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-256 is present. +// For SHA-256, this support is present in either Zvknha or Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_256 \ + do { \ + require_vector(true); \ + require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKNED extension (vector AES single round) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvkned \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNED); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-512 is present. +// For SHA-512, this support is only present in Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_512 \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKSED extension (vector SM4 block cipher) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvksed \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSED); \ + } while (0) + +// Ensures that the ZVKSH extension (vector SM3 hash) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvksh \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSH); \ + } while (0) + +// Ensures that the vector instruction is not using a mask. +#define require_no_vmask require(insn.v_vm() == 1) + +// Ensures that an element group can fit in a register group.
 That is, +// EGW <= (LMUL * VLEN) +#define require_egw_fits(EGW) require((EGW) <= (P.VU.VLEN * P.VU.vflmul)) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=4 (four 32-bits elements per group), +// for an effective element group width of EGW=128 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_32x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=8 (eight 32-bits elements per group), +// for an effective element group width of EGW=256 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_32x8 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 8 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 8 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=64, EGS=4 (four 64-bits elements per group), +// for an effective element group width of EGW=256 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). 
+#define require_element_groups_64x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// +// Loop Parameters Macros +// + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU32x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU32x8_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). +#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). 
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 64b*4 element group as a EGU64x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU64x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU64x4_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU64x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1', +// as part of a widening operation where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'vs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto vs1 = P.VU.elt::type>(rs1_num, i); \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the scalar register 'rs1', as part of a widening operation where +// 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'rs1', unsigned, SEW width, by value, constant. 
+#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto rs1 = (type_usew_t::type)RS1; \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the 5-bit immediate field 'zimm5', as part of a widening operation +// where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'zimm5', unsigned, SEW width, by value, constant. +#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto zimm5 = (type_usew_t::type)insn.v_zimm5(); \ + +// +// Loop Macros +// + +// NOTES: +// - Each of the element-group loop macros DOES contain an invocation +// of the corresponding 'require_element_groups_<bits>x<#elements>;', +// because the macro correctness requires proper VL/VSTART values. +// - Each of the loop macros named "_NOVM_" DOES contain an invocation +// of the 'require_no_vmask;' macro. Those macros (all of them +// at this time) do not support masking (i.e., no skipping +// of elements/element groups is performed). + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. 
+// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x4_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. 
+// +#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*8 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of the macro 'require_element_groups_32x8;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. 
+// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x8_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x8_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x8_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x8;; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 8; \ + const reg_t vl_eg = P.VU.vl->read() / 8; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group(...)'. 
+// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in all statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. 
+// +#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro is meant to be used for "op vd, vs2" instructions, +// whether vd is output only, or input and output. +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group(...)'. +// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". 
This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// +#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2. This interprets the vectors as containing element groups +// of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. 
+// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as +// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. 
+// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'zimm5': 5 bits unsigned immediate +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. 
//
#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
  do { \
    require_element_groups_32x4; \
    require_no_vmask; \
    const reg_t vd_num = insn.rd(); \
    const reg_t vs2_num = insn.rs2(); \
    const reg_t zimm5 = insn.v_zimm5(); \
    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
    const reg_t vl_eg = P.VU.vl->read() / 4; \
    do { PRELUDE } while (0); \
    if (vstart_eg < vl_eg) { \
      PRELOOP \
      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
        VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
        EG_BODY \
      } \
    } \
    P.VU.vstart->write(0); \
  } while (0)

// Processes all 32b*8 element groups available in the vector registers
// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as
// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8),
// *ignoring* the current SEW that applies to the vectors.
//
// IMPORTANT
//  - This macro contains an invocation of 'require_element_groups_32x8;',
//    since the "loop" macro correctness depends on invariants that
//    are checked by the "require" macro.
//  - This macro does not support masking, and contains an invocation
//    of 'require_no_vmask;'.
//
// Invokes three statement blocks:
//  - PRELUDE, invoked once, before any element group. It is executed even
//    if the vector is empty. It is placed in a "do { } while (0);", hence
//    any variable declared there is not visible outside.
//  - PRELOOP, invoked once IF there is at least one element group to process.
//    It is NOT placed in its own scope, variables declared in PRELOOP are
//    visible when EG_BODY executes.
//    Pass {} when there is no need for such a pre-loop block.
//  - EG_BODY, once per element group.
//
// Declares the following variables available for use in all three
// statement blocks:
// 'vd_num': register index of vd
// 'vs2_num': register index of vs2
// 'zimm5': unsigned 5 bits immediate
// 'vstart_eg': index of the first element group, *in EG units*
// 'vl_eg': length of the vector, *in EG units*
//
// The following variables are available in the EG_BODY block:
// 'idx_eg': index of the current element group.
// 'vd': EGU32x8_t reference, mutable, content of the current
//       element group in the 'vd' vector register / register group.
// 'vs2': EGU32x8_t, content of the current element group
//       in the 'vs2' vector register / register group.
//
#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
  do { \
    require_element_groups_32x8; \
    require_no_vmask; \
    const reg_t vd_num = insn.rd(); \
    const reg_t vs2_num = insn.rs2(); \
    const reg_t zimm5 = insn.v_zimm5(); \
    const reg_t vstart_eg = P.VU.vstart->read() / 8; \
    const reg_t vl_eg = P.VU.vl->read() / 8; \
    do { PRELUDE } while (0); \
    if (vstart_eg < vl_eg) { \
      PRELOOP \
      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
        VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \
        EG_BODY \
      } \
    } \
    P.VU.vstart->write(0); \
  } while (0)

// Processes all 64b*4 element groups available in the vector registers
// vd, vs1, and vs2. This interprets the vectors as containing element groups
// of 4 uint64_t values (EGW=256, EEW=64, EGS=4), *ignoring* the current
// SEW that applies to the vectors.
//
// IMPORTANT
//  - This macro contains an invocation of 'require_element_groups_64x4;',
//    since the "loop" macro correctness depends on invariants that
//    are checked by the "require" macro.
//  - This macro does not support masking, and contains an invocation
//    of 'require_no_vmask;'.
//  - While the name states "VD_VS1_VS2", many vector instructions
//    are specified as "op vd, vs2, vs1".
//    This macro does not imply
//    a specific operand order and can be used with both "op vd, vs2, vs1"
//    and "op vd, vs1, vs2" instructions.
//
// Invokes two statement blocks:
//  - PRELUDE, invoked once, before any element group. It is executed even
//    if the vector is empty. It is placed in a "do { } while (0);", hence
//    any variable declared there is not visible outside.
//  - EG_BODY, once per element group.
//
// Declares the following variables available for use in both statement blocks:
// 'vd_num': register index of vd
// 'vs1_num': register index of vs1
// 'vs2_num': register index of vs2
// 'vstart_eg': index of the first element group, *in EG units*
// 'vl_eg': length of the vector, *in EG units*
//
// The following variables are available in the EG_BODY block:
// 'idx_eg': index of the current element group.
// 'vd': EGU64x4_t reference, content of the current element group
//       in the 'vd' vector register / vector register group.
// 'vs1': EGU64x4_t, content of the current element group
//       in the 'vs1' vector register / vector register group.
// 'vs2': EGU64x4_t, content of the current element group
//       in the 'vs2' vector register / vector register group.
#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \
  do { \
    require_element_groups_64x4; \
    require_no_vmask; \
    const reg_t vd_num = insn.rd(); \
    const reg_t vs1_num = insn.rs1(); \
    const reg_t vs2_num = insn.rs2(); \
    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
    const reg_t vl_eg = P.VU.vl->read() / 4; \
    do { PRELUDE } while (0); \
    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
      VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
      EG_BODY \
    } \
    P.VU.vstart->write(0); \
  } while (0)


// Loop macro for widening instructions taking parameters 'vd, vs2, vs1',
// with logic processing elements one-at-a-time in those register groups
// and treating the elements as unsigned integers.
//
// Invokes the BODY statement block once per element.
// As a widening instruction, it is defined for SEW in {8, 16, 32}.
// A separate copy of BODY is instantiated for each SEW value.
//
// Declares the following variables available for use in BODY:
//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
//  - 'vs2', unsigned, SEW width, by value, constant.
//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
//    a widened copy of 'vs2'.
//  - 'vs1', unsigned, SEW width, by value, constant.
#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \
  do { \
    VI_CHECK_DSS(true); \
    VI_LOOP_BASE \
    switch (sew) { \
      case e8: { \
        VI_ZVK_VV_WIDENING_U_PARAMS(e8); \
        BODY \
        break; \
      } \
      case e16: { \
        VI_ZVK_VV_WIDENING_U_PARAMS(e16); \
        BODY \
        break; \
      } \
      case e32: { \
        VI_ZVK_VV_WIDENING_U_PARAMS(e32); \
        BODY \
        break; \
      } \
    } \
    VI_LOOP_END \
  } while (0)

// Loop macro for widening instructions taking parameters 'vd, vs2, rs1',
// with logic processing elements one-at-a-time in those register groups
// and treating the elements as unsigned integers.
//
// Invokes the BODY statement block once per element.
// As a widening instruction, it is defined for SEW in {8, 16, 32}.
// A separate copy of BODY is instantiated for each SEW value.
//
// Declares the following variables available for use in BODY:
//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
//  - 'vs2', unsigned, SEW width, by value, constant.
//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
//    a widened copy of 'vs2'.
//  - 'rs1', unsigned, SEW width, by value, constant.
#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \
  do { \
    VI_CHECK_DSS(true); \
    VI_LOOP_BASE \
    switch (sew) { \
      case e8: { \
        VI_ZVK_VX_WIDENING_U_PARAMS(e8); \
        BODY \
        break; \
      } \
      case e16: { \
        VI_ZVK_VX_WIDENING_U_PARAMS(e16); \
        BODY \
        break; \
      } \
      case e32: { \
        VI_ZVK_VX_WIDENING_U_PARAMS(e32); \
        BODY \
        break; \
      } \
    } \
    VI_LOOP_END \
  } while (0)

// Widening loop macro for instructions taking the operands 'vd, vs2, zimm5'.
// The register groups are walked one element at a time and the elements
// are treated as unsigned integers.
//
// BODY is run once per element. Because the operation widens, only SEW
// values 8, 16 and 32 are handled, and BODY is expanded separately for
// each of them.
//
// Variables BODY may use:
//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
//  - 'vs2', unsigned, SEW width, by value, constant.
//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
//    a widened copy of 'vs2'.
//  - 'zimm5', unsigned, SEW width, by value, constant.
#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \
  do { \
    VI_CHECK_DSS(true); \
    VI_LOOP_BASE \
    switch (sew) { \
      case e8: { \
        VI_ZVK_VI_WIDENING_U_PARAMS(e8); \
        BODY \
        break; \
      } \
      case e16: { \
        VI_ZVK_VI_WIDENING_U_PARAMS(e16); \
        BODY \
        break; \
      } \
      case e32: { \
        VI_ZVK_VI_WIDENING_U_PARAMS(e32); \
        BODY \
        break; \
      } \
    } \
    VI_LOOP_END \
  } while (0)

//
// Element Group Manipulation Macros
//

// Extracts 4 uint32_t words from the input EGU32x4_t value
// into the (mutable) variables named by the W arguments, provided in
// "Little Endian" (LE) order, i.e., from the least significant (W0)
// to the most significant (W3).
#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \
  uint32_t W0 = (X)[0]; \
  uint32_t W1 = (X)[1]; \
  uint32_t W2 = (X)[2]; \
  uint32_t W3 = (X)[3]; \
  (void)(0)

// Stores four uint32_t values into the words of the EGU32x4_t
// variable 'X'. The values are listed in "Little Endian" (LE) order:
// W0 goes to the least significant word (index 0), W3 to the most
// significant word (index 3).
#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \
  do { \
    (X)[0] = (W0); \
    (X)[1] = (W1); \
    (X)[2] = (W2); \
    (X)[3] = (W3); \
  } while (0)

// Declares four (mutable) uint32_t variables, named by the W arguments,
// and loads them from the words of the EGU32x4_t value 'X'. The names
// are listed in "Big Endian" (BE) order: the first name (W3) receives
// the most significant word (index 3), the last (W0) the least
// significant word (index 0).
// The expansion ends in an expression statement so that the invocation
// must be followed by a semicolon.
#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \
  uint32_t W0 = (X)[0]; \
  uint32_t W1 = (X)[1]; \
  uint32_t W2 = (X)[2]; \
  uint32_t W3 = (X)[3]; \
  (void)(0)

// Stores four uint32_t values into the words of the EGU32x4_t
// variable 'X'. The values are listed in "Big Endian" (BE) order:
// the first value (W3) goes to the most significant word (index 3),
// the last (W0) to the least significant word (index 0).
#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \
  do { \
    (X)[0] = (W0); \
    (X)[1] = (W1); \
    (X)[2] = (W2); \
    (X)[3] = (W3); \
  } while (0)

// Reverses the order of the four bytes of a uint32_t.
// Note that the argument is evaluated four times.
#define ZVK_BSWAP32(x) \
  (((((uint32_t)((x) >> 24)) & 0xFF) << 0) | \
   ((((uint32_t)((x) >> 16)) & 0xFF) << 8) | \
   ((((uint32_t)((x) >> 8)) & 0xFF) << 16) | \
   ((((uint32_t)((x) >> 0)) & 0xFF) << 24))

// Extracts 8 uint32_t words from the input EGU32x8_t value
// into the (mutable) variables named by the W arguments, provided in
// "Big Endian" (BE) order, i.e., from the most significant (W7)
// to the least significant (W0). Each of the words is byte-swapped,
// from a big-endian representation in the EGU32x8_t to a native/little-endian
// ordering in the variables.
#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
  uint32_t W0 = ZVK_BSWAP32((X)[0]); \
  uint32_t W1 = ZVK_BSWAP32((X)[1]); \
  uint32_t W2 = ZVK_BSWAP32((X)[2]); \
  uint32_t W3 = ZVK_BSWAP32((X)[3]); \
  uint32_t W4 = ZVK_BSWAP32((X)[4]); \
  uint32_t W5 = ZVK_BSWAP32((X)[5]); \
  uint32_t W6 = ZVK_BSWAP32((X)[6]); \
  uint32_t W7 = ZVK_BSWAP32((X)[7]); \
  (void)(0)

// Stores eight uint32_t values into the words of the EGU32x8_t
// variable 'X'. The values are listed in "Big Endian" (BE) order:
// the first value (W7) goes to the most significant word (index 7),
// the last (W0) to the least significant word (index 0).
// Every value goes through ZVK_BSWAP32 on the way in, converting from
// the native/little-endian ordering of the variables to the big-endian
// representation kept in the EGU32x8_t.
#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
  do { \
    (X)[0] = ZVK_BSWAP32(W0); \
    (X)[1] = ZVK_BSWAP32(W1); \
    (X)[2] = ZVK_BSWAP32(W2); \
    (X)[3] = ZVK_BSWAP32(W3); \
    (X)[4] = ZVK_BSWAP32(W4); \
    (X)[5] = ZVK_BSWAP32(W5); \
    (X)[6] = ZVK_BSWAP32(W6); \
    (X)[7] = ZVK_BSWAP32(W7); \
  } while (0)

// Declares four (mutable) uint64_t variables, named by the W arguments,
// and loads them from the words of the EGU64x4_t value 'X'. The names
// are listed in "Big Endian" (BE) order: the first name (W3) receives
// the most significant word (index 3), the last (W0) the least
// significant word (index 0).
// The expansion ends in an expression statement so that the invocation
// must be followed by a semicolon.
#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \
  uint64_t W0 = (X)[0]; \
  uint64_t W1 = (X)[1]; \
  uint64_t W2 = (X)[2]; \
  uint64_t W3 = (X)[3]; \
  (void)(0)

// Stores four uint64_t values into the words of the EGU64x4_t
// variable 'X'. The values are listed in "Big Endian" (BE) order:
// the first value (W3) goes to the most significant word (index 3),
// the last (W0) to the least significant word (index 0).
#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \
  do { \
    (X)[0] = (W0); \
    (X)[1] = (W1); \
    (X)[2] = (W2); \
    (X)[3] = (W3); \
  } while (0)

// Copies a EGU8x16_t value from 'SRC' into 'DST'.
#define EGU8x16_COPY(DST, SRC) \
  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
    (DST)[bidx] = (SRC)[bidx]; \
  }

// Performs "MUT_A ^= CONST_B;" on two EGU8x16_t values, i.e., each of
// the 16 bytes of A (mutated) is xor-ed with the matching byte of B
// (unchanged).
#define EGU8x16_XOREQ(MUT_A, CONST_B) \
  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
  }

// Performs "MUT_A ^= CONST_B;" on two EGU32x4_t values, i.e., each of
// the 4 32-bit words of A (mutated) is xor-ed with the matching word
// of B (unchanged).
#define EGU32x4_XOREQ(MUT_A, CONST_B) \
  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
  }

// Performs "DST = A ^ B;" on EGU8x16_t values, i.e., each of the
// 16 bytes of DST is overwritten with the xor of the matching bytes
// of A and B. The macro itself does not modify A or B; passing the
// same object as DST and as A or B is fine since bytes are processed
// independently.
#define EGU8x16_XOR(DST, A, B) \
  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
    (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
  }

//
// Common bit manipulations logic.
//

// Forms a 64 bit (unsigned long long) integer with only bit X set.
// X must be in [0, 63].
#define ZVK_BIT(X) (1ULL << (X))

// Reverses the order of bits within each byte of a 32 bit word,
// in place ('X' must be an lvalue and is evaluated several times).
// This is used to match the data interpretation in NIST SP 800-38D
// a.k.a the GCM specification.
#define ZVK_BREV8_32(X) \
  do { \
    (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \
    (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \
    (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \
  } while (0)

// The rotate macros below name the template argument explicitly so that
// the rotation width is fixed by the macro (32 or 64 bits) rather than
// deduced from the type of the expression passed as X.
// 'rotate_right' / 'rotate_left' are the helper templates added to arith.h.

// Rotates right a uint32_t value by N bits.
//   uint32_t ROR32(uint32_t X, std::size_t N);
#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N))

// Rotates right a uint64_t value by N bits.
//   uint64_t ROR64(uint64_t X, std::size_t N);
#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N))

// Rotates left a uint32_t value by N bits.
//   uint32_t ROL32(uint32_t X, std::size_t N);
#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N))

//
// Element Group Bit Manipulation Macros
//

// Performs bit reversal in a EGU32x4_t group.
#define EGU32x4_BREV8(X) \
  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
    ZVK_BREV8_32((X)[bidx]); \
  }

// Checks if a given bit is set within a EGU32x4_t group.
// Assumes LE ordering: bit BIDX lives in word BIDX / 32, at bit
// position BIDX % 32 of that word. BIDX must be in [0, 127].
#define EGU32x4_ISSET(X, BIDX) \
  (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0)

// Shifts a EGU32x4_t group left by one bit, in place.
//
// Since the entire 128 bit value is shifted we need to handle carry bits.
// In order to limit the amount of carry check logic the elements are copied to
// a 64 bit temporary variable: the upper pair of words (3, 2) is processed
// first, pulling in the top bit of word 1 as carry, then the lower pair
// (1, 0). The bit shifted out of word 3 is dropped.
//
// Every use of the argument is parenthesized so that expressions such as
// '*ptr' can be passed safely. 'X' is evaluated multiple times.
#define EGU32x4_LSHIFT(X) \
  do { \
    uint64_t dword; \
    dword = ((uint64_t)(X)[3]) << 32; \
    dword |= (X)[2]; \
    dword <<= 1; \
    if ((X)[1] & ZVK_BIT(31)) { \
      dword |= ZVK_BIT(0); \
    } \
    (X)[2] = dword & UINT32_MAX; \
    (X)[3] = dword >> 32; \
    dword = ((uint64_t)(X)[1]) << 32; \
    dword |= (X)[0]; \
    dword <<= 1; \
    (X)[0] = dword & UINT32_MAX; \
    (X)[1] = dword >> 32; \
  } while (0)

#endif  // RISCV_ZVK_EXT_MACROS_H_