author     Andrew Waterman <andrew@sifive.com>   2023-06-19 20:18:09 -0700
committer  GitHub <noreply@github.com>           2023-06-19 20:18:09 -0700
commit     5731a478ea2b7cf639a383498eb114a9dc7d64df (patch)
tree       43f28cdc046246deb9275b71ecaaacdce45ecaa9
parent     8b10de64dd2048e813438dbb5e4ed24d09feb8eb (diff)
parent     a55f96ae9380d5cc9bef05e8b9e82e54d5d6ec35 (diff)
Merge pull request #1303 from rivosinc/zvk-vector-crypto
Zvk vector crypto support (v5)
56 files changed, 3171 insertions, 19 deletions
diff --git a/riscv/arith.h b/riscv/arith.h index 3b807e9..20b1504 100644 --- a/riscv/arith.h +++ b/riscv/arith.h @@ -7,6 +7,7 @@ #include <cstdint> #include <climits> #include <cstddef> +#include <type_traits> inline uint64_t mulhu(uint64_t a, uint64_t b) { @@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t return r; } +// Rotates right an unsigned integer by the given number of bits. +template <typename T> +static inline T rotate_right(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned<T>::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t rshift = shiftamt & mask; + const std::size_t lshift = (-rshift) & mask; + return (x << lshift) | (x >> rshift); +} + +// Rotates left an unsigned integer by the given number of bits. +template <typename T> +static inline T rotate_left(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned<T>::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t lshift = shiftamt & mask; + const std::size_t rshift = (-lshift) & mask; + return (x << lshift) | (x >> rshift); +} + #endif diff --git a/riscv/decode.h b/riscv/decode.h index dad32a1..cd1c0a1 100644 --- a/riscv/decode.h +++ b/riscv/decode.h @@ -140,6 +140,7 @@ public: uint64_t v_vta() { return x(26, 1); } uint64_t v_vma() { return x(27, 1); } uint64_t v_mew() { return x(28, 1); } + uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); } uint64_t p_imm2() { return x(20, 2); } uint64_t p_imm3() { return x(20, 3); } diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h index 17f129f..24d6ce1 100644 --- a/riscv/insns/sm4_common.h +++ b/riscv/insns/sm4_common.h @@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = { 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 }; - diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h new file mode 100644 index 0000000..a124278 --- /dev/null +++ b/riscv/insns/vaesdf_vs.h @@ -0,0 +1,43 @@ +// vaesdf.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
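A brief aside on the two rotate helpers added to riscv/arith.h above: masking the shift amount, and deriving the complementary shift as (-shift) & mask rather than width - shift, keeps both shifts strictly below the type width, so a rotate by 0 (or by a multiple of the width) never triggers an undefined full-width shift. A minimal sketch, assuming riscv/arith.h from this commit is on the include path:

#include <cassert>
#include <cstdint>
#include "arith.h"  // rotate_right / rotate_left as defined above

int main() {
  // Rotate by 0: rshift = 0 and lshift = (-0) & mask = 0, so the result is x, no UB.
  assert(rotate_right<uint32_t>(0x12345678u, 0) == 0x12345678u);
  // Rotate by the full width wraps to 0 thanks to the mask (32 & 31 == 0).
  assert(rotate_right<uint32_t>(0x12345678u, 32) == 0x12345678u);
  // An 8-bit rotate right by 4 swaps the nibbles.
  assert(rotate_right<uint8_t>(0xAB, 4) == 0xBA);
  // rotate_left is the mirror image.
  assert(rotate_left<uint32_t>(0x80000000u, 1) == 0x00000001u);
  return 0;
}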
+ VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, scalar_key); + // InvMixColumns is not performed in the final round. + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h new file mode 100644 index 0000000..9fca572 --- /dev/null +++ b/riscv/insns/vaesdf_vv.h @@ -0,0 +1,37 @@ +// vaesdf.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd in contains the input state, + // - vs2 contains the input round key, + // - vd out receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, round_key); + // InvMixColumns is not performed in the final round. + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h new file mode 100644 index 0000000..3c23e69 --- /dev/null +++ b/riscv/insns/vaesdm_vs.h @@ -0,0 +1,44 @@ +// vaesdm.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd in contains the input state, + // - vs2 contains the input round key, + // - vd out receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. 
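As the comments in these AES handlers note, the implementations freely switch between the EGU32x4 view called for by the spec and an EGU8x16 view of the same 128-bit element group. This works because the register file is accessed as little-endian bytes (the diff explicitly rejects WORDS_BIGENDIAN builds), so word i, byte j of the 4x32 view is byte 4*i + j of the 16x8 view. A small self-contained sketch of that equivalence, using the type aliases this commit adds to vector_unit.h:

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

using EGU32x4_t = std::array<uint32_t, 4>;
using EGU8x16_t = std::array<uint8_t, 16>;

int main() {
  // On a little-endian host the two views see the same byte sequence.
  const EGU32x4_t words = {0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu};
  EGU8x16_t bytes;
  std::memcpy(bytes.data(), words.data(), sizeof(bytes));
  for (int i = 0; i < 16; ++i)
    assert(bytes[i] == i);
  return 0;
}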
+ VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, scalar_key); + // InvMixColumns + VAES_INV_MIX_COLUMNS(aes_state); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h new file mode 100644 index 0000000..9c29cd9 --- /dev/null +++ b/riscv/insns/vaesdm_vv.h @@ -0,0 +1,38 @@ +// vaesdm.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd does receive the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, round_key); + // InvMixColumns + VAES_INV_MIX_COLUMNS(aes_state); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h new file mode 100644 index 0000000..2d32653 --- /dev/null +++ b/riscv/insns/vaesef_vs.h @@ -0,0 +1,43 @@ +// vaesef.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. 
+ VAES_SHIFT_ROWS(aes_state); + // MixColumns is not performed for the final round. + // AddRoundKey + EGU8x16_XOREQ(aes_state, scalar_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h new file mode 100644 index 0000000..9b43a6d --- /dev/null +++ b/riscv/insns/vaesef_vv.h @@ -0,0 +1,37 @@ +// vaesef.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns is not performed for the final round. + // AddRoundKey + EGU8x16_XOREQ(aes_state, round_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h new file mode 100644 index 0000000..348cd9f --- /dev/null +++ b/riscv/insns/vaesem_vs.h @@ -0,0 +1,44 @@ +// vaesem.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns + VAES_MIX_COLUMNS(aes_state); + // AddRoundKey + EGU8x16_XOREQ(aes_state, scalar_key); + + // Update the destination register. 
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h new file mode 100644 index 0000000..34f0056 --- /dev/null +++ b/riscv/insns/vaesem_vv.h @@ -0,0 +1,38 @@ +// vaesem.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns + VAES_MIX_COLUMNS(aes_state); + // AddRoundKey + EGU8x16_XOREQ(aes_state, round_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaeskf1_vi.h b/riscv/insns/vaeskf1_vi.h new file mode 100644 index 0000000..28d03d0 --- /dev/null +++ b/riscv/insns/vaeskf1_vi.h @@ -0,0 +1,65 @@ +// vaeskf1.vi vd, vs2, rnd + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaeskf_vi_constraints; + +// There is one round constant for each round number +// between 1 and 10. We index using 'round# -1'. +static constexpr uint8_t kRoundConstants[10] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 +}; + +// For AES128, AES192, or AES256, keys (and state) are handled as +// 128b/16B values. +// +// The Zvkned spec calls for handling the vector as made of EGU32x4 +// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification +// describes the key expansion in terms of manipulations of 32 bit +// words, so using the EGU32x4 is natural. +// +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here to be visible in the loop block. + // Only consider the bottom 4 bits of the immediate. + const reg_t zimm4 = zimm5 & 0xF; + // Normalize the round value to be in [1, 10] by toggling bit 3 + // if outside the range (i.e., +8 or -8). + const reg_t round = ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8); + const uint32_t rcon = kRoundConstants[round - 1];, + // Per Element Group body. + { + // vaeskf1_vi produces key[i+1] in vd, it receives key[i] in vs2, + // i.e., 4x32b values (4 words). + // + // The logic is fairly similar between vaeskf1/vaeskf2, with the following + // differences: + // - in AES-128 (vaeskf1), we get both the 'temp' word and + // the "previous words" w0..w3 from key[i]/vs2. + // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and + // the "previous words" w0..w3 from key[i-1]/vd.
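The round normalization in the vaeskf1 prologue above is compact enough to deserve a spot check: a 4-bit immediate outside the legal range [1, 10] has bit 3 toggled, which is the same as adding or subtracting 8. A hedged sketch of just that mapping:

#include <cassert>
#include <cstdint>

// Same normalization as vaeskf1_vi above: keep zimm4 if in [1, 10],
// otherwise flip bit 3 (equivalent to +8 or -8).
static uint64_t normalize_vaeskf1_round(uint64_t zimm4) {
  return ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8);
}

int main() {
  assert(normalize_vaeskf1_round(0) == 8);    // 0 + 8
  assert(normalize_vaeskf1_round(10) == 10);  // already in range
  assert(normalize_vaeskf1_round(11) == 3);   // 11 - 8
  assert(normalize_vaeskf1_round(15) == 7);   // 15 - 8
  return 0;
}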
+ + // 'temp' is extracted from the last (most significant) word of key[i]. + uint32_t temp = vs2[3]; + temp = (temp >> 8) | (temp << 24); // Rotate right by 8 + temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) | + ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) | + ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) | + ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0)); + temp = temp ^ rcon; + + // "old" words are the w[i-Nk] of FIPS-197. They are extracted + // from vs2, which contains key[i] in AES-128 where Nk=4. + const uint32_t w0 = vs2[0] ^ temp; + const uint32_t w1 = vs2[1] ^ w0; + const uint32_t w2 = vs2[2] ^ w1; + const uint32_t w3 = vs2[3] ^ w2; + + // Overwrite vd with k[i+1] from the new words. + SET_EGU32x4_LE(vd, w0, w1, w2, w3); + } +); diff --git a/riscv/insns/vaeskf2_vi.h b/riscv/insns/vaeskf2_vi.h new file mode 100644 index 0000000..49c2a2d --- /dev/null +++ b/riscv/insns/vaeskf2_vi.h @@ -0,0 +1,89 @@ +// vaeskf2.vi vd, vs2, rnd + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaeskf_vi_constraints; + +// Round Constants +// +// Only the even rounds need to be encoded, the odd ones can use 0 +// or skip the rcon handling. We can use '(round# / 2) - 1' +// (or "(round# >> 1) - 1") to index into the array. +// +// Round# Constant +// [ 2] -> kRoundConstants[0] +// [ 3] -> 0 / Nothing +// [ 4] -> kRoundConstants[1] +// [ 5] -> 0 / Nothing +// [ 6] -> kRoundConstants[2] +// [ 7] -> 0 / Nothing +// ... +// [13] -> 0 / Nothing +// [14] -> kRoundConstants[6] +static constexpr uint8_t kRoundConstants[7] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, +}; + +// For AES128, AES192, or AES256, keys (and state) are handled as +// 128b/16B values. +// +// The Zvkned spec calls for handling the vector as made of EGU32x4 +// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification +// describes the key expansion in terms of manipulations of 32 bit +// words, so using the EGU32x4 is natural. +// +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here to be visible in the loop block. + // Only consider the bottom 4 bits of the immediate. + const reg_t zimm4 = zimm5 & 0xF; + // Normalize the round value to be in [2, 14] by toggling bit 3 + // if outside the range (i.e., +8 or -8). + const reg_t round = ((2 <= zimm4) && (zimm4 <= 14)) ? zimm4 : (zimm4 ^ 0x8);, + // Per Element Group body. + { + // vaeskf2_vi produces key[i+1] in vd, it receives key[i] in vs2, + // i.e., 4x32b values (4 words). + // + // The logic is fairly similar between vaeskf1/vaeskf2, with the following + // differences: + // - in AES-128 (vaeskf1), we get both the 'temp' word and + // the "previous words" w0..w3 from key[i]/vs2. + // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and + // the "previous words" w0..w3 from key[i-1]/vd. + + // 'temp' is extracted from the last (most significant) word of key[i]. + uint32_t temp = vs2[3]; + // With AES-256, when we have an odd round number, we hit the + // Nk > 6 and i mod Nk = 4 + // condition in the FIPS-197 key expansion pseudo-code (Figure 11). + // In those cases we skip RotWord and the round constant is 0.
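To make the even/odd round split of vaeskf2 concrete: even rounds take the FIPS-197 'i mod Nk = 0' path (RotWord, SubWord, and a round constant), odd rounds take the 'Nk > 6 and i mod Nk = 4' path (SubWord only, rcon = 0). A small sketch of the constant selection, using the same (round >> 1) - 1 indexing as the code that follows:

#include <cassert>
#include <cstdint>

static constexpr uint8_t kRoundConstants[7] = {
  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40,
};

// rcon for a vaeskf2 round in [2, 14]: non-zero only on even rounds.
static uint8_t vaeskf2_rcon(unsigned round) {
  const bool is_even_round = (round & 0x1) == 0;
  return is_even_round ? kRoundConstants[(round >> 1) - 1] : 0;
}

int main() {
  assert(vaeskf2_rcon(2) == 0x01);   // first AES-256 rcon
  assert(vaeskf2_rcon(3) == 0);      // odd round: SubWord only
  assert(vaeskf2_rcon(14) == 0x40);  // last rcon
  return 0;
}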
+ const bool is_even_round = (round & 0x1) == 0; + if (is_even_round) { + temp = (temp >> 8) | (temp << 24); // Rotate right by 8 + } + temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) | + ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) | + ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) | + ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0)); + + if (is_even_round) { + const uint32_t rcon = kRoundConstants[(round >> 1) - 1]; + temp = temp ^ rcon; + } + + // "old" words are the w[i-Nk] of FIPS-197. For AES-256, where Nk=8, + // they are extracted from vd which contains key[i-1]. + const uint32_t w0 = vd[0] ^ temp; + const uint32_t w1 = vd[1] ^ w0; + const uint32_t w2 = vd[2] ^ w1; + const uint32_t w3 = vd[3] ^ w2; + + // Overwrite vd with k[i+1] from the new words. + SET_EGU32x4_LE(vd, w0, w1, w2, w3); + } +); diff --git a/riscv/insns/vaesz_vs.h b/riscv/insns/vaesz_vs.h new file mode 100644 index 0000000..c3dc931 --- /dev/null +++ b/riscv/insns/vaesz_vs.h @@ -0,0 +1,24 @@ +// vaesz.vs vd, vs2 + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);, + // Per Element Group body. + { + EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true); + // Produce vd = vd ^ "common key from vs2". 
+ EGU8x16_XOR(vd, vd, scalar_key); + } +); diff --git a/riscv/insns/vandn_vv.h b/riscv/insns/vandn_vv.h new file mode 100644 index 0000000..d85e47d --- /dev/null +++ b/riscv/insns/vandn_vv.h @@ -0,0 +1,10 @@ +// vandn.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_VV_LOOP +({ + vd = vs2 & (~vs1); +}) diff --git a/riscv/insns/vandn_vx.h b/riscv/insns/vandn_vx.h new file mode 100644 index 0000000..1c66a40 --- /dev/null +++ b/riscv/insns/vandn_vx.h @@ -0,0 +1,10 @@ +// vandn.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_VX_LOOP +({ + vd = vs2 & (~rs1); +}) diff --git a/riscv/insns/vbrev8_v.h b/riscv/insns/vbrev8_v.h new file mode 100644 index 0000000..a6d3cda --- /dev/null +++ b/riscv/insns/vbrev8_v.h @@ -0,0 +1,13 @@ +// vbrev8.v vd, vs2, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + vd = vs2; + vd = ((vd & 0x5555555555555555llu) << 1) | ((vd & 0xAAAAAAAAAAAAAAAAllu) >> 1); + vd = ((vd & 0x3333333333333333llu) << 2) | ((vd & 0xCCCCCCCCCCCCCCCCllu) >> 2); + vd = ((vd & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((vd & 0xF0F0F0F0F0F0F0F0llu) >> 4); +}) diff --git a/riscv/insns/vbrev_v.h b/riscv/insns/vbrev_v.h new file mode 100644 index 0000000..7f784c2 --- /dev/null +++ b/riscv/insns/vbrev_v.h @@ -0,0 +1,24 @@ +// vbrev.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + reg_t x = vs2; + + // Reverse bits in bytes (vbrev8) + x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1); + x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2); + x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4); + // Re-order bytes (vrev8) + if (P.VU.vsew > 8) + x = ((x & 0x00FF00FF00FF00FFllu) << 8) | ((x & 0xFF00FF00FF00FF00llu) >> 8); + if (P.VU.vsew > 16) + x = ((x & 0x0000FFFF0000FFFFllu) << 16) | ((x & 0xFFFF0000FFFF0000llu) >> 16); + if (P.VU.vsew > 32) + x = ((x & 0x00000000FFFFFFFFllu) << 32) | ((x & 0xFFFFFFFF00000000llu) >> 32); + + vd = x; +}) diff --git a/riscv/insns/vclmul_vv.h b/riscv/insns/vclmul_vv.h new file mode 100644 index 0000000..8957738 --- /dev/null +++ b/riscv/insns/vclmul_vv.h @@ -0,0 +1,20 @@ +// vclmul.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VV_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the low 64b of the 128b product. + // <https://en.wikipedia.org/wiki/Carry-less_product> + vd = 0; + for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((vs1 & mask) != 0) { + vd ^= vs2 << bit_idx; + } + } +}) diff --git a/riscv/insns/vclmul_vx.h b/riscv/insns/vclmul_vx.h new file mode 100644 index 0000000..1df7a3a --- /dev/null +++ b/riscv/insns/vclmul_vx.h @@ -0,0 +1,20 @@ +// vclmul.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VX_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the low 64b of the 128b product. 
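The four vclmul/vclmulh variants in this area are the two halves of one 64x64 -> 128 carryless product. A hedged standalone model of that product, mirroring the bit-serial loops above (low half = vclmul, high half = vclmulh) without the simulator's VI_*_ULOOP plumbing:

#include <cassert>
#include <cstdint>

// Carryless 64x64 -> 128 multiply; 'lo' matches vclmul, 'hi' matches vclmulh.
static void clmul128(uint64_t a, uint64_t b, uint64_t &hi, uint64_t &lo) {
  hi = 0;
  lo = 0;
  for (unsigned i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0)
        hi ^= a >> (64 - i);  // bits of a<<i that spill past bit 63
    }
  }
}

int main() {
  uint64_t hi, lo;
  clmul128(0x3, 0x3, hi, lo);         // (x+1)*(x+1) = x^2+1, no carries
  assert(lo == 0x5 && hi == 0);
  clmul128(1ull << 63, 0x2, hi, lo);  // x^63 * x = x^64
  assert(lo == 0 && hi == 1);
  return 0;
}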
+ // <https://en.wikipedia.org/wiki/Carry-less_product> + vd = 0; + for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((rs1 & mask) != 0) { + vd ^= vs2 << bit_idx; + } + } +}) diff --git a/riscv/insns/vclmulh_vv.h b/riscv/insns/vclmulh_vv.h new file mode 100644 index 0000000..6a54bcf --- /dev/null +++ b/riscv/insns/vclmulh_vv.h @@ -0,0 +1,20 @@ +// vclmulh.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VV_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the high 64b of the 128b product. + // <https://en.wikipedia.org/wiki/Carry-less_product> + vd = 0; + for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((vs1 & mask) != 0) { + vd ^= ((reg_t)vs2) >> (sew - bit_idx); + } + } +}) diff --git a/riscv/insns/vclmulh_vx.h b/riscv/insns/vclmulh_vx.h new file mode 100644 index 0000000..e874d1d --- /dev/null +++ b/riscv/insns/vclmulh_vx.h @@ -0,0 +1,20 @@ +// vclmulh.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VX_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the high 64b of the 128b product. + // <https://en.wikipedia.org/wiki/Carry-less_product> + vd = 0; + for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((rs1 & mask) != 0) { + vd ^= ((reg_t)vs2) >> (sew - bit_idx); + } + } +}) diff --git a/riscv/insns/vclz_v.h b/riscv/insns/vclz_v.h new file mode 100644 index 0000000..5f7f03c --- /dev/null +++ b/riscv/insns/vclz_v.h @@ -0,0 +1,16 @@ +// vclz.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + unsigned int i = 0; + for (; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> (P.VU.vsew - 1 - i))) { + break; + } + } + vd = i; +}) diff --git a/riscv/insns/vcpop_v.h b/riscv/insns/vcpop_v.h new file mode 100644 index 0000000..52b29c6 --- /dev/null +++ b/riscv/insns/vcpop_v.h @@ -0,0 +1,16 @@ +// vcpop.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + reg_t count = 0; + for (std::size_t i = 0; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> i)) { + count++; + } + } + vd = count; +}) diff --git a/riscv/insns/vctz_v.h b/riscv/insns/vctz_v.h new file mode 100644 index 0000000..b63dd01 --- /dev/null +++ b/riscv/insns/vctz_v.h @@ -0,0 +1,16 @@ +// vctz.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + unsigned int i = 0; + for (; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> i)) { + break; + } + } + vd = i; +}) diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h new file mode 100644 index 0000000..bcbfe74 --- /dev/null +++ b/riscv/insns/vghsh_vv.h @@ -0,0 +1,38 @@ +// vghsh.vv vd, vs2, vs1 + +#include "zvk_ext_macros.h" + +require_zvkg; +require(P.VU.vsew == 32); +require_egw_fits(128); + +VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + EGU32x4_t Y = vd; // Current partial hash + EGU32x4_t X = vs1; // Block cipher output + EGU32x4_t H = vs2; // Hash subkey + + EGU32x4_BREV8(H); + EGU32x4_t Z = {}; + + // S = brev8(Y ^ X) + EGU32x4_t S; + EGU32x4_XOR(S, Y, X); + EGU32x4_BREV8(S); + + for (int bit = 0; bit < 128; bit++) { + if (EGU32x4_ISSET(S, bit)) { + EGU32x4_XOREQ(Z, H); + } + + const bool reduce = EGU32x4_ISSET(H, 127); + EGU32x4_LSHIFT(H); // Left shift by 1.
+ if (reduce) { + H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial + } + } + EGU32x4_BREV8(Z); + vd = Z; + } +); diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h new file mode 100644 index 0000000..820b396 --- /dev/null +++ b/riscv/insns/vgmul_vv.h @@ -0,0 +1,32 @@ +// vgmul.vv vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvkg; +require(P.VU.vsew == 32); +require_egw_fits(128); + +VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP( + {}, + { + EGU32x4_t Y = vd; // Multiplier + EGU32x4_BREV8(Y); + EGU32x4_t H = vs2; // Multiplicand + EGU32x4_BREV8(H); + EGU32x4_t Z = {}; + + for (int bit = 0; bit < 128; bit++) { + if (EGU32x4_ISSET(Y, bit)) { + EGU32x4_XOREQ(Z, H); + } + + bool reduce = EGU32x4_ISSET(H, 127); + EGU32x4_LSHIFT(H); // Left shift by 1 + if (reduce) { + H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial + } + } + EGU32x4_BREV8(Z); + vd = Z; + } +); diff --git a/riscv/insns/vrev8_v.h b/riscv/insns/vrev8_v.h new file mode 100644 index 0000000..f26c5a0 --- /dev/null +++ b/riscv/insns/vrev8_v.h @@ -0,0 +1,16 @@ +// vrev8.v vd, vs2, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + vd = vs2; + if (P.VU.vsew > 8) + vd = ((vd & 0x00FF00FF00FF00FFllu) << 8) | ((vd & 0xFF00FF00FF00FF00llu) >> 8); + if (P.VU.vsew > 16) + vd = ((vd & 0x0000FFFF0000FFFFllu) << 16) | ((vd & 0xFFFF0000FFFF0000llu) >> 16); + if (P.VU.vsew > 32) + vd = ((vd & 0x00000000FFFFFFFFllu) << 32) | ((vd & 0xFFFFFFFF00000000llu) >> 32); +}) diff --git a/riscv/insns/vrol_vv.h b/riscv/insns/vrol_vv.h new file mode 100644 index 0000000..fb2e483 --- /dev/null +++ b/riscv/insns/vrol_vv.h @@ -0,0 +1,17 @@ +// vrol.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +VI_VV_ULOOP +({ + // For .vv, the shift amount comes from the vs1 element. + const reg_t lshift = vs1 & mask; + const reg_t rshift = (-lshift) & mask; + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vrol_vx.h b/riscv/insns/vrol_vx.h new file mode 100644 index 0000000..b0c89a2 --- /dev/null +++ b/riscv/insns/vrol_vx.h @@ -0,0 +1,18 @@ +// vrol.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vx, the shift amount comes from rs1. +const reg_t lshift = ((reg_t)RS1) & mask; +const reg_t rshift = (-lshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vi.h b/riscv/insns/vror_vi.h new file mode 100644 index 0000000..1269c3d --- /dev/null +++ b/riscv/insns/vror_vi.h @@ -0,0 +1,18 @@ +// vror.vi vd, vs2, zimm6, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vi, the shift amount comes from bits [26, 19:15].
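The zimm6 mentioned in the comment above is split across the encoding: bits 19:15 hold zimm6[4:0] and bit 26 holds zimm6[5], which is exactly what the v_zimm6() accessor added to decode.h earlier in this diff reassembles. A small sketch of that extraction (x(pos, len) is decode.h's bit-field helper):

#include <cassert>
#include <cstdint>

// Same extraction as insn_t::v_zimm6() above: zimm6 = {bit 26, bits 19:15}.
static uint64_t v_zimm6(uint64_t insn_bits) {
  const uint64_t lo5 = (insn_bits >> 15) & 0x1F;  // x(15, 5)
  const uint64_t hi1 = (insn_bits >> 26) & 0x1;   // x(26, 1)
  return lo5 | (hi1 << 5);
}

int main() {
  // An instruction word with bit 26 set and 0b01010 in bits 19:15
  // encodes a rotate amount of 32 + 10 = 42.
  const uint64_t word = (1ull << 26) | (0x0Aull << 15);
  assert(v_zimm6(word) == 42);
  return 0;
}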
+const reg_t rshift = insn.v_zimm6() & mask; +const reg_t lshift = (-rshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vv.h b/riscv/insns/vror_vv.h new file mode 100644 index 0000000..c649c6d --- /dev/null +++ b/riscv/insns/vror_vv.h @@ -0,0 +1,17 @@ +// vror.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +VI_VV_ULOOP +({ + // For .vv, the shift amount comes from the vs1 element. + const reg_t rshift = vs1 & mask; + const reg_t lshift = (-rshift) & mask; + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vx.h b/riscv/insns/vror_vx.h new file mode 100644 index 0000000..50c8e5c --- /dev/null +++ b/riscv/insns/vror_vx.h @@ -0,0 +1,18 @@ +// vror.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vx, the shift amount comes from rs1. +const reg_t rshift = ((reg_t)RS1) & mask; +const reg_t lshift = (-rshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h new file mode 100644 index 0000000..34c6e05 --- /dev/null +++ b/riscv/insns/vsha2ch_vv.h @@ -0,0 +1,61 @@ +// vsha2ch.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. +require_vsha2_common_constraints; + +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2, + UNUSED _unused_kw1, UNUSED _unused_kw0); + + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2); + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU32x4_BE(vd, a, b, e, f); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2, + UNUSED _unused_kw1, UNUSED _unused_kw0); + + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2); + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU64x4_BE(vd, a, b, e, f); + } + ); + break; + } + + // 'require_vsha2_common_constraints' ensures that + // VSEW is either 32 or 64. + default: + require(false); +} diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h new file mode 100644 index 0000000..4a1df09 --- /dev/null +++ b/riscv/insns/vsha2cl_vv.h @@ -0,0 +1,62 @@ +// vsha2cl.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. 
+require_vsha2_common_constraints; +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2, + kw1, kw0); + + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0); + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU32x4_BE(vd, a, b, e, f); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2, + kw1, kw0); + + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0); + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU64x4_BE(vd, a, b, e, f); + } + ); + break; + } + + // 'require_vsha2_common_constraints' ensures that + // VSEW is either 32 or 64. + default: + require(false); +} + diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h new file mode 100644 index 0000000..8f1ca08 --- /dev/null +++ b/riscv/insns/vsha2ms_vv.h @@ -0,0 +1,63 @@ +// vsha2ms.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. +require_vsha2_common_constraints; +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {w3, w2, w1, w0} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0); + // {w11, w10, w9, w4} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4); + // {w15, w14, w13, w12} <- vs1 + EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12); + + const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14, w9, w1, w0); + const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1); + const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2); + const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3); + + // Update the destination register. + SET_EGU32x4_BE(vd, w19, w18, w17, w16); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {w3, w2, w1, w0} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0); + // {w11, w10, w9, w4} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4); + // {w15, w14, w13, w12} <- vs1 + EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12); + + const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14, w9, w1, w0); + const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1); + const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2); + const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3); + + // Update the destination register. + SET_EGU64x4_BE(vd, w19, w18, w17, w16); + } + ); + break; + }
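The ZVK_SHA256_SCHEDULE(w14, w9, w1, w0) steps above compute the standard FIPS-180-4 schedule recurrence W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]; the macro itself lives in zvknh_ext_macros.h, which is not visible in this truncated view, so the following scalar equivalent for the 32-bit case is a hedged reconstruction:

#include <cassert>
#include <cstdint>

static inline uint32_t rotr32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

// FIPS-180-4 small sigma functions and one message-schedule step;
// assumed to match what ZVK_SHA256_SCHEDULE expands to.
static inline uint32_t sha256_sigma0(uint32_t x) {
  return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}
static inline uint32_t sha256_sigma1(uint32_t x) {
  return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}
static inline uint32_t sha256_schedule(uint32_t w2, uint32_t w7,
                                       uint32_t w15, uint32_t w16) {
  // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
  return sha256_sigma1(w2) + w7 + sha256_sigma0(w15) + w16;
}

int main() {
  assert(sha256_sigma0(1) == 0x02004000u);   // rotr(1,7) ^ rotr(1,18)
  assert(sha256_schedule(0, 0, 0, 1) == 1);  // sigmas of 0 vanish
  return 0;
}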
+ default: + require(false); +} diff --git a/riscv/insns/vsm3c_vi.h b/riscv/insns/vsm3c_vi.h new file mode 100644 index 0000000..b3e8121 --- /dev/null +++ b/riscv/insns/vsm3c_vi.h @@ -0,0 +1,60 @@ +// vsm3c.vi vd, vs2, rnd + +#include "zvksh_ext_macros.h" + +require_vsm3_constraints; + +VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP( + {}, + // No need to validate or normalize 'zimm5' here as this is a 5 bits value + // and all values in 0-31 are valid. + const reg_t round = zimm5;, + { + // {H, G, F, E, D, C, B, A} <- vd + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A); + // {_, _, w5, w4, _, _, w1, w0} <- vs2 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, + UNUSED _unused_w7, UNUSED _unused_w6, w5, w4, + UNUSED _unused_w3, UNUSED _unused_w2, w1, w0); + const uint32_t x0 = w0 ^ w4; // W'[0] in spec documentation. + const uint32_t x1 = w1 ^ w5; // W'[1] + + // Two rounds of compression. + uint32_t ss1; + uint32_t ss2; + uint32_t tt1; + uint32_t tt2; + uint32_t j; + + j = 2 * round; + ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7); + ss2 = ss1 ^ ZVK_ROL32(A, 12); + tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0; + tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0; + D = C; + const uint32_t C1 = ZVK_ROL32(B, 9); + B = A; + const uint32_t A1 = tt1; + H = G; + const uint32_t G1 = ZVK_ROL32(F, 19); + F = E; + const uint32_t E1 = ZVKSH_P0(tt2); + + j = 2 * round + 1; + ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7); + ss2 = ss1 ^ ZVK_ROL32(A1, 12); + tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1; + tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1; + D = C1; + const uint32_t C2 = ZVK_ROL32(B, 9); + B = A1; + const uint32_t A2 = tt1; + H = G1; + const uint32_t G2 = ZVK_ROL32(F, 19); + F = E1; + const uint32_t E2 = ZVKSH_P0(tt2); + + // Update the destination register. + SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2); + } +); diff --git a/riscv/insns/vsm3me_vv.h b/riscv/insns/vsm3me_vv.h new file mode 100644 index 0000000..dd6cb52 --- /dev/null +++ b/riscv/insns/vsm3me_vv.h @@ -0,0 +1,39 @@ +// vsm3me.vv vd, vs2, vs1 + +#include "zvk_ext_macros.h" +#include "zvksh_ext_macros.h" + +// Per the SM3 spec, the message expansion computes new words Wi as: +// W[i] = ( P_1( W[i-16] xor W[i-9] xor ( W[i-3] <<< 15 ) ) +// xor ( W[i-13] <<< 7 ) +// xor W[i-6] ) +// Using arguments M16 = W[i-16], M9 = W[i-9], etc., +// where Mk stands for "W[i Minus k]", we define the "W function": +#define ZVKSH_W(M16, M9, M3, M13, M6) \ + (ZVKSH_P1((M16) ^ (M9) ^ ZVK_ROL32((M3), 15)) ^ ZVK_ROL32((M13), 7) ^ (M6)) + +require_vsm3_constraints; + +VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP( + {}, + { + // {w7, w6, w5, w4, w3, w2, w1, w0} <- vs1 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs1, w7, w6, w5, w4, w3, w2, w1, w0); + // {w15, w14, w13, w12, w11, w10, w9, w8} <- vs2 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, w15, w14, w13, w12, w11, w10, w9, w8); + + // Arguments are W[i-16], W[i-9], W[i-3], W[i-13], W[i-6]. + // Note that some of the newly computed words are used in later invocations. + const uint32_t w16 = ZVKSH_W(w0, w7, w13, w3, w10); + const uint32_t w17 = ZVKSH_W(w1, w8, w14, w4, w11); + const uint32_t w18 = ZVKSH_W(w2, w9, w15, w5, w12); + const uint32_t w19 = ZVKSH_W(w3, w10, w16, w6, w13); + const uint32_t w20 = ZVKSH_W(w4, w11, w17, w7, w14); + const uint32_t w21 = ZVKSH_W(w5, w12, w18, w8, w15); + const uint32_t w22 = ZVKSH_W(w6, w13, w19, w9, w16); + const uint32_t w23 = ZVKSH_W(w7, w14, w20, w10, w17); + + // Update the destination register.
+ SET_EGU32x8_WORDS_BE_BSWAP(vd, w23, w22, w21, w20, w19, w18, w17, w16); + } +); diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h new file mode 100644 index 0000000..8f52e68 --- /dev/null +++ b/riscv/insns/vsm4k_vi.h @@ -0,0 +1,52 @@ +// vsm4k.vi vd, vs2, round# + +#include "zvksed_ext_macros.h" + +// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft. +static constexpr uint32_t zvksed_ck[32] = { + 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269, + 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9, + 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249, + 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9, + 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229, + 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299, + 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209, + 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +}; + +require_vsm4_constraints; + +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here to be visible in the loop block. + // Only consider the bottom 3 bits of the immediate, ensuring that + // 'round' is in the valid range [0, 7]. + const reg_t round = zimm5 & 0x7;, + // Per Element Group body. + { + // {rk0, rk1, rk2, rk3} <- vs2 + EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3); + + uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round]; + uint32_t S = ZVKSED_SUB_BYTES(B); + uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S); + + B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S); + + B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S); + + B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S); + + // Update the destination register. + SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7); + } +); diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h new file mode 100644 index 0000000..44011eb --- /dev/null +++ b/riscv/insns/vsm4r_vs.h @@ -0,0 +1,51 @@ +// vsm4r.vs vd, vs2 + +#include "zvksed_ext_macros.h" + +require_vsm4_constraints; +// No overlap of vd and vs2. +require(insn.rd() != insn.rs2()); + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop.
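ZVKSED_SUB_BYTES and ZVKSED_ROUND_KEY are defined in zvksed_ext_macros.h, which this view does not show. Per the SM4 specification, the key-schedule step is rk[i] = rk[i-4] ^ L'(tau(B)) with L'(B) = B ^ (B <<< 13) ^ (B <<< 23); the sketch below is a hedged guess at what ZVKSED_ROUND_KEY(X, S) reduces to once tau (the per-byte S-box pass) has produced S:

#include <cassert>
#include <cstdint>

static inline uint32_t rol32(uint32_t x, unsigned n) {
  return (x << n) | (x >> (32 - n));
}

// SM4 key-schedule linear transform L' applied to the S-box output,
// XORed into the word four positions back; assumed equivalent to
// ZVKSED_ROUND_KEY(X, S) above.
static inline uint32_t sm4_round_key(uint32_t x, uint32_t s) {
  const uint32_t lprime = s ^ rol32(s, 13) ^ rol32(s, 23);
  return x ^ lprime;
}

int main() {
  // Spot check: L'(1) = 1 ^ (1 << 13) ^ (1 << 23) = 0x00802001.
  assert(sm4_round_key(0, 1) == 0x00802001u);
  return 0;
}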
+ const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0); + const uint32_t rk0 = scalar_key[0]; + const uint32_t rk1 = scalar_key[1]; + const uint32_t rk2 = scalar_key[2]; + const uint32_t rk3 = scalar_key[3];, + { + EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true); + + // {x0, x1,x2, x3} <- vd + EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3); + + uint32_t B; + uint32_t S; + + B = x1 ^ x2 ^ x3 ^ rk0; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x4 = ZVKSED_ROUND(x0, S); + + B = x2 ^ x3 ^ x4 ^ rk1; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x5 = ZVKSED_ROUND(x1, S); + + B = x3 ^ x4 ^ x5 ^ rk2; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x6 = ZVKSED_ROUND(x2, S); + + B = x4 ^ x5 ^ x6 ^ rk3; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x7 = ZVKSED_ROUND(x3, S); + + // Update the destination register. + SET_EGU32x4_LE(state, x4, x5, x6, x7); + } +); diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h new file mode 100644 index 0000000..9a18cec --- /dev/null +++ b/riscv/insns/vsm4r_vv.h @@ -0,0 +1,37 @@ +// vsm4r.vv vd, vs2 + +#include "zvksed_ext_macros.h" + +require_vsm4_constraints; + +VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // vd = {x0, x1,x2, x3} <- vd + EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3); + // {rk0, rk1, rk2, rk3} <- vs2 + EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3); + + uint32_t B; + uint32_t S; + + B = x1 ^ x2 ^ x3 ^ rk0; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x4 = ZVKSED_ROUND(x0, S); + + B = x2 ^ x3 ^ x4 ^ rk1; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x5 = ZVKSED_ROUND(x1, S); + + B = x3 ^ x4 ^ x5 ^ rk2; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x6 = ZVKSED_ROUND(x2, S); + + B = x4 ^ x5 ^ x6 ^ rk3; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x7 = ZVKSED_ROUND(x3, S); + + // Update the destination register. 
+ SET_EGU32x4_LE(vd, x4, x5, x6, x7); + } +); diff --git a/riscv/insns/vwsll_vi.h b/riscv/insns/vwsll_vi.h new file mode 100644 index 0000000..13b5eb4 --- /dev/null +++ b/riscv/insns/vwsll_vi.h @@ -0,0 +1,10 @@ +// vwsll.vi vd, vs2, zimm5, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VI_WIDENING_ULOOP({ + const reg_t shift = zimm5 & ((2 * sew) - 1); + vd_w = vs2_w << shift; +}); diff --git a/riscv/insns/vwsll_vv.h b/riscv/insns/vwsll_vv.h new file mode 100644 index 0000000..5a64c6c --- /dev/null +++ b/riscv/insns/vwsll_vv.h @@ -0,0 +1,10 @@ +// vwsll.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VV_WIDENING_ULOOP({ + const reg_t shift = (vs1 & ((2 * sew) - 1)); + vd_w = vs2_w << shift; +}); diff --git a/riscv/insns/vwsll_vx.h b/riscv/insns/vwsll_vx.h new file mode 100644 index 0000000..5264e80 --- /dev/null +++ b/riscv/insns/vwsll_vx.h @@ -0,0 +1,10 @@ +// vwsll.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VX_WIDENING_ULOOP({ + const reg_t shift = (rs1 & ((2 * sew) - 1)); + vd_w = vs2_w << shift; +}); diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc index 1c4300c..59472a4 100644 --- a/riscv/isa_parser.cc +++ b/riscv/isa_parser.cc @@ -236,10 +236,55 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) extension_table[EXT_ZICOND] = true; } else if (ext_str == "zihpm") { extension_table[EXT_ZIHPM] = true; + } else if (ext_str == "zvbb") { + extension_table[EXT_ZVBB] = true; + } else if (ext_str == "zvbc") { + extension_table[EXT_ZVBC] = true; } else if (ext_str == "zvfbfmin") { extension_table[EXT_ZVFBFMIN] = true; } else if (ext_str == "zvfbfwma") { extension_table[EXT_ZVFBFWMA] = true; + } else if (ext_str == "zvkg") { + extension_table[EXT_ZVKG] = true; + } else if (ext_str == "zvkn") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvknc") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVBC] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvkng") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKG] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvkned") { + extension_table[EXT_ZVKNED] = true; + } else if (ext_str == "zvknha") { + extension_table[EXT_ZVKNHA] = true; + } else if (ext_str == "zvknhb") { + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvks") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksc") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVBC] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksg") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKG] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksed") { + extension_table[EXT_ZVKSED] = true; + } else if (ext_str == "zvksh") { + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvkt") { } else if (ext_str == "sstc") { extension_table[EXT_SSTC] = true; } else if (ext_str[0] == 'x') { @@ -295,7 +340,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) } if ((extension_table[EXT_ZCMP] || extension_table[EXT_ZCMT]) && extension_table[EXT_ZCD]) { - bad_isa_string(str, "Zcmp' and 'Zcmt' exensions are incompatible with
'Zcd' extension"); + bad_isa_string(str, "Zcmp' and 'Zcmt' extensions are incompatible with 'Zcd' extension"); } if ((extension_table[EXT_ZCF] || extension_table[EXT_ZCD] || extension_table[EXT_ZCB] || @@ -307,6 +352,24 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) bad_isa_string(str, "'Zacas' extension requires 'A' extension"); } + // Zpn conflicts with Zvknha/Zvknhb in both rv32 and rv64 + if (extension_table[EXT_ZPN] && (extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB])) { + bad_isa_string(str, "'Zvkna' and 'Zvknhb' extensions are incompatible with 'Zpn' extension"); + } + // In rv64 only, Zpn (rv64_zpn) conflicts with Zvkg/Zvkned/Zvksh + if (max_xlen == 64 && extension_table[EXT_ZPN] && + (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) { + bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64"); + } +#ifdef WORDS_BIGENDIAN + // Access to the vector registers as element groups is unimplemented on big-endian setups. + if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB] || + extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) { + bad_isa_string(str, + "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' " + "extensions are incompatible with WORDS_BIGENDIAN setups."); + } +#endif std::string lowercase = strtolower(priv); bool user = false, supervisor = false; diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h index 3cbee7d..5b04347 100644 --- a/riscv/isa_parser.h +++ b/riscv/isa_parser.h @@ -58,8 +58,24 @@ typedef enum { EXT_ZICNTR, EXT_ZICOND, EXT_ZIHPM, + EXT_ZVBB, + EXT_ZVBC, EXT_ZVFBFMIN, EXT_ZVFBFWMA, + EXT_ZVKG, + EXT_ZVKNED, + EXT_ZVKNHA, + EXT_ZVKNHB, + EXT_ZVKSED, + EXT_ZVKSH, + EXT_XZBP, + EXT_XZBS, + EXT_XZBE, + EXT_XZBF, + EXT_XZBC, + EXT_XZBM, + EXT_XZBR, + EXT_XZBT, EXT_SSTC, EXT_ZACAS, EXT_INTERNAL_ZFH_MOVE, diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h index a30c770..2214be4 100644 --- a/riscv/overlap_list.h +++ b/riscv/overlap_list.h @@ -12,3 +12,12 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD) DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA) DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA) DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA) +DECLARE_OVERLAP_INSN(vaesdf_vv, EXT_ZVKNED) +DECLARE_OVERLAP_INSN(vghsh_vv, EXT_ZVKG) +DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHA) +DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHB) +DECLARE_OVERLAP_INSN(vsm3me_vv, EXT_ZVKSH) +DECLARE_OVERLAP_INSN(rstsa16, EXT_ZPN) +DECLARE_OVERLAP_INSN(rstsa32, EXT_ZPN) +DECLARE_OVERLAP_INSN(srli32_u, EXT_ZPN) +DECLARE_OVERLAP_INSN(umax32, EXT_ZPN) diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 6472982..a3e125f 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1340,32 +1340,98 @@ riscv_insn_ext_zacas = \ amocas_d \ $(if $(HAVE_INT128),amocas_q) +riscv_insn_ext_zvbb = \ + vandn_vv \ + vandn_vx \ + vbrev8_v \ + vbrev_v \ + vclz_v \ + vcpop_v \ + vctz_v \ + vrev8_v \ + vrol_vv \ + vrol_vx \ + vror_vi \ + vror_vv \ + vror_vx \ + vwsll_vi \ + vwsll_vv \ + vwsll_vx \ + +riscv_insn_ext_zvbc = \ + vclmul_vv \ + vclmul_vx \ + vclmulh_vv \ + vclmulh_vx \ + +riscv_insn_ext_zvkg= \ + vghsh_vv \ + vgmul_vv \ + +riscv_insn_ext_zvkned = \ + vaesdf_vs \ + vaesdf_vv \ + vaesdm_vs \ + vaesdm_vv \ + vaesef_vs \ + vaesef_vv \ + vaesem_vs \ + vaesem_vv \ + vaeskf1_vi \ + vaeskf2_vi \ + vaesz_vs \ + +# Covers both Zvknha and Zvkhnb. 
+riscv_insn_ext_zvknh = \ + vsha2cl_vv \ + vsha2ch_vv \ + vsha2ms_vv \ + +riscv_insn_ext_zvksed = \ + vsm4k_vi \ + vsm4r_vs \ + vsm4r_vv \ + +riscv_insn_ext_zvksh = \ + vsm3c_vi \ + vsm3me_vv \ + +riscv_insn_ext_zvk = \ + $(riscv_insn_ext_zvbb) \ + $(riscv_insn_ext_zvbc) \ + $(riscv_insn_ext_zvkg) \ + $(riscv_insn_ext_zvkned) \ + $(riscv_insn_ext_zvknh) \ + $(riscv_insn_ext_zvksed) \ + $(riscv_insn_ext_zvksh) \ + riscv_insn_list = \ + $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ $(riscv_insn_ext_a) \ + $(riscv_insn_ext_b) \ + $(riscv_insn_ext_bf16) \ $(riscv_insn_ext_c) \ - $(riscv_insn_ext_i) \ - $(riscv_insn_ext_m) \ - $(riscv_insn_ext_f) \ - $(riscv_insn_ext_f_zfa) \ + $(riscv_insn_ext_cmo) \ $(riscv_insn_ext_d) \ $(riscv_insn_ext_d_zfa) \ - $(riscv_insn_ext_zfh) \ - $(riscv_insn_ext_zfh_zfa) \ + $(riscv_insn_ext_f) \ + $(riscv_insn_ext_f_zfa) \ + $(riscv_insn_ext_h) \ + $(riscv_insn_ext_i) \ + $(riscv_insn_ext_k) \ + $(riscv_insn_ext_m) \ + $(riscv_insn_ext_p) \ $(riscv_insn_ext_q) \ $(riscv_insn_ext_q_zfa) \ - $(riscv_insn_ext_b) \ - $(riscv_insn_ext_k) \ - $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ + $(riscv_insn_ext_zacas) \ $(riscv_insn_ext_zce) \ - $(riscv_insn_ext_h) \ - $(riscv_insn_ext_p) \ + $(riscv_insn_ext_zfh) \ + $(riscv_insn_ext_zfh_zfa) \ + $(riscv_insn_ext_zicond) \ + $(riscv_insn_ext_zvk) \ $(riscv_insn_priv) \ - $(riscv_insn_svinval) \ $(riscv_insn_smrnmi) \ - $(riscv_insn_ext_cmo) \ - $(riscv_insn_ext_zicond) \ - $(riscv_insn_ext_bf16) \ - $(riscv_insn_ext_zacas) \ + $(riscv_insn_svinval) \ riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list)) diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h index 41256c7..908ff16 100644 --- a/riscv/v_ext_macros.h +++ b/riscv/v_ext_macros.h @@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize, type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i); +#define V_U_PARAMS(x) \ + type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \ + type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i); + #define VX_U_PARAMS(x) \ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \ type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \ @@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize, } \ VI_LOOP_END +#define VI_V_ULOOP(BODY) \ + VI_CHECK_SSS(false) \ + VI_LOOP_BASE \ + if (sew == e8) { \ + V_U_PARAMS(e8); \ + BODY; \ + } else if (sew == e16) { \ + V_U_PARAMS(e16); \ + BODY; \ + } else if (sew == e32) { \ + V_U_PARAMS(e32); \ + BODY; \ + } else if (sew == e64) { \ + V_U_PARAMS(e64); \ + BODY; \ + } \ + VI_LOOP_END + #define VI_VX_ULOOP(BODY) \ VI_CHECK_SSS(false) \ VI_LOOP_BASE \ diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc index 9128df6..08adc61 100644 --- a/riscv/vector_unit.cc +++ b/riscv/vector_unit.cc @@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write return regStart[n]; } +// The logic differences between 'elt()' and 'elt_group()' come from +// the fact that, while 'elt()' requires that the element is fully +// contained in a single vector register, the element group may span +// multiple registers in a single register group (LMUL>1). +// +// Notes: +// - We do NOT check that a single element - i.e., the T in the element +// group type std::array<T, N> - fits within a single register, or that +// T is smaller or equal to VSEW. 
Implementations of the instructions +// sometimes use a different T than what the specification suggests. +// Instructon implementations should 'require()' what the specification +// dictates. +// - We do NOT check that 'vReg' is a valid register group, or that +// 'n+1' element groups fit in the register group 'vReg'. It is +// the responsibility of the caller to validate those preconditions. +template<typename EG> EG& +vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) { +#ifdef WORDS_BIGENDIAN + fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n", + stderr); + abort(); +#endif + using T = typename EG::value_type; + constexpr std::size_t N = std::tuple_size<EG>::value; + assert(N > 0); + + assert(vsew != 0); + constexpr reg_t elt_group_size = N * sizeof(T); + const reg_t reg_group_size = (VLEN >> 3) * vflmul; + assert(((n + 1) * elt_group_size) <= reg_group_size); + + const reg_t start_byte = n * elt_group_size; + const reg_t bytes_per_reg = VLEN >> 3; + + // Inclusive first/last register indices. + const reg_t reg_first = vReg + start_byte / bytes_per_reg; + const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg; + + // Element groups per register groups + for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) { + reg_referenced[vidx] = 1; + + if (unlikely(p->get_log_commits_enabled() && is_write)) { + p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0}; + } + } + + return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte); +} + template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool); template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool); template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool); @@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool); template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool); template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool); template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool); + +template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool); +template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool); +template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool); +template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool); diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h index b9f706c..a057c62 100644 --- a/riscv/vector_unit.h +++ b/riscv/vector_unit.h @@ -2,6 +2,9 @@ #ifndef _RISCV_VECTOR_UNIT_H #define _RISCV_VECTOR_UNIT_H +#include <array> +#include <cstdint> + #include "decode.h" #include "csrs.h" @@ -69,6 +72,17 @@ struct type_sew_t<64> using type=int64_t; }; +// Element Group of 4 32 bits elements (128b total). +using EGU32x4_t = std::array<uint32_t, 4>; + +// Element Group of 8 32 bits elements (256b total). +using EGU32x8_t = std::array<uint32_t, 8>; + +// Element Group of 4 64 bits elements (256b total). +using EGU64x4_t = std::array<uint64_t, 4>; + +// Element Group of 16 8 bits elements (128b total). +using EGU8x16_t = std::array<uint8_t, 16>; class vectorUnit_t { @@ -88,8 +102,11 @@ public: bool vill; bool vstart_alu; - // vector element for varies SEW + // vector element for various SEW template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false); + // vector element group access, where EG is a std::array<T, N>. 
+ template<typename EG> EG& + elt_group(reg_t vReg, reg_t n, bool is_write = false); public: diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h new file mode 100644 index 0000000..bf893f9 --- /dev/null +++ b/riscv/zvk_ext_macros.h @@ -0,0 +1,1035 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvk extension (vector cryptography). + +// Note that a good deal of code here would be cleaner/simpler +// if exposed as C++ functions (including templated ones), however +// this is not possible in the contexts where those headers are +// included. + +#ifndef RISCV_ZVK_EXT_MACROS_H_ +#define RISCV_ZVK_EXT_MACROS_H_ + +// +// Predicate Macros +// + +// Ensures that the ZVBB extension (vector crypto bitmanip) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvbb \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBB); \ + } while (0) + +// Ensures that the ZVBC extension (vector carryless multiplication) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvbc \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBC); \ + } while (0) + +// Ensures that the ZVKG extension (vector Gallois Field Multiplication) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvkg \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKG); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-256 is present. +// For SHA-256, this support is present in either Zvknha or Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_256 \ + do { \ + require_vector(true); \ + require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKNED extension (vector AES single round) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvkned \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNED); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-512 is present. +// For SHA-512, this support is only present in Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_512 \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKSED extension (vector SM4 block cipher) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvksed \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSED); \ + } while (0) + +// Ensures that the ZVKSH extension (vector SM3 hash) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvksh \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSH); \ + } while (0) + +// Ensures that the vector instruction is not using a mask. +#define require_no_vmask require(insn.v_vm() == 1) + +// Ensures that an element group can fit in a register group. That is, +// (LMUL * VLEN) <= EGW +#define require_egw_fits(EGW) require((EGW) <= (P.VU.VLEN * P.VU.vflmul)) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=4 (four 32-bits elements per group), +// for an effective element group width of EGW=128 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). 
+#define require_element_groups_32x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=8 (eight 32-bits elements per group), +// for an effective element group width of EGW=256 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_32x8 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 8 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 8 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=64, EGS=4 (four 64-bits elements per group), +// for an effective element group width of EGW=128 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_64x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// +// Loop Parameters Macros +// + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs1 = P.VU.elt_group<EGU32x4_t>((VS1_NUM), (EG_IDX)); \ + const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs1 = P.VU.elt_group<EGU32x8_t>((VS1_NUM), (EG_IDX)); \ + const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). +#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). 
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX)) + +// Extracts a 64b*4 element group as a EGU64x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU64x4_t &vd = P.VU.elt_group<EGU64x4_t>((VD_NUM), (EG_IDX), true); \ + const EGU64x4_t vs1 = P.VU.elt_group<EGU64x4_t>((VS1_NUM), (EG_IDX)); \ + const EGU64x4_t vs2 = P.VU.elt_group<EGU64x4_t>((VS2_NUM), (EG_IDX)) + +// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1', +// as part of a widening operation where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'vs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto vs1 = P.VU.elt<type_usew_t<SEW>::type>(rs1_num, i); \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the scalar register 'rs1', as part of a widening operation where +// 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'rs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto rs1 = (type_usew_t<SEW>::type)RS1; \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the 5-bit immediate field 'zimm5', as part of a widening operation +// where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'zimm5', unsigned, SEW width, by value, constant. +#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto zimm5 = (type_usew_t<SEW>::type)insn.v_zimm5(); \ + +// +// Loop Macros +// + +// NOTES: +// - Each of the element-group loop macros DO contain an invocation +// of the corresponding 'require_element_groups_<bits>x<#elements>;', +// because the macro correctness requires proper VL/VSTART values. +// - Each of the loop macros named "_NOVM_" DO contain an invocation +// of the 'require_no_vmask>;' macro. Those macros (all of them +// at this time) do not support masking (i.e., no skipping +// of elements/element groups is performed). + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. 
This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x4_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*8 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of the macro 'require_element_groups_32x8;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. 
+// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x8_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x8_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x8_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x8;; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 8; \ + const reg_t vl_eg = P.VU.vl->read() / 8; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group<Type>(...)'. +// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. 
+// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// +#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro is meant to be used for "op vd, vs2" instructions, +// whether vd is output only, or input and output. +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group<Type>(...)'. +// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. 
+// +#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2. This interprets the vectors as containing element groups +// of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as +// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. 
+// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'zimm5': 5 bits unsigned immediate +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t zimm5 = insn.v_zimm5(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*8 element groups available in the vector registers +// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as +// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x8;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'zimm5': unsigned 5 bits immediate +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x8_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x8_t, content of the current element group +// in the 'vs2' vector register / register group. 
+// +#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \ + do { \ + require_element_groups_32x8; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t zimm5 = insn.v_zimm5(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 8; \ + const reg_t vl_eg = P.VU.vl->read() / 8; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 64b*4 element groups available in the vector registers +// vd, vs1, and vs2. This interprets the vectors as containing element groups +// of 4 uint64_t values (EGW=128, EEW=64, EGS=4), *ignoring* the current +// SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_64x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU64x4_t reference, content of the current element group +// in the 'vd' vector register / vector register group. +// 'vs1': EGU64x4_t, content of the current element group +// in the 'vs1' vector register / vector register group. +// 'vs2': EGU64x4_t, content of the current element group +// in the 'vs2' vector register / vector register group. +#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_64x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + + +// Loop macro for widening instructions taking parameters 'vd, vs2, v1', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. 
+// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'vs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// Loop macro for widening instructions taking parameters 'vd, vs2, rs1', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. +// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'rs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. +// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'zimm5', unsigned, SEW width, by value, constant. +#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// +// Element Group Manipulation Macros +// + +// Extracts 4 uint32_t words from the input EGU32x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Little Endian" (LE) order, i.e., from the least significant (W0) +// to the most significant (W3). 
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \ + uint32_t W0 = (X)[0]; \ + uint32_t W1 = (X)[1]; \ + uint32_t W2 = (X)[2]; \ + uint32_t W3 = (X)[3]; \ + (void)(0) + +// Sets the elements words of given EGU32x4_t variable 'X' to +// the given 4 uint32_t values privided in "Little Endian" (LE) +// order, i.e., from the least significant (W0) to the most +// significant (W3). +#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Extracts 4 uint32_t words from the input EGU32x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W3) +// to the least significant (W0). +#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \ + uint32_t W0 = (X)[0]; \ + uint32_t W1 = (X)[1]; \ + uint32_t W2 = (X)[2]; \ + uint32_t W3 = (X)[3]; \ + (void)(0) + +// Sets the elements words of given EGU32x4_t variable 'X' to +// the given 4 uint32_t values privided in "Big Endian" (BE) +// order, i.e., from the most significant (W3) to the least +// significant (W0). +#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Byte-swap the bytes of a uin32_t such that the order of bytes +// is reversed. +#define ZVK_BSWAP32(x) \ + ((((uint32_t)((x) >> 24)) & 0xFF) << 0 | \ + (((uint32_t)((x) >> 16)) & 0xFF) << 8 | \ + (((uint32_t)((x) >> 8)) & 0xFF) << 16 | \ + (((uint32_t)((x) >> 0)) & 0xFF) << 24) + +// Extracts 8 uint32_t words from the input EGU32x8_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W7) +// to the least significant (W0). Each of the words is byte-swapped, +// from a big-endian representation in the EGU32x8_t to a native/little-endian +// ordering in the variables. +#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \ + uint32_t W0 = ZVK_BSWAP32((X)[0]); \ + uint32_t W1 = ZVK_BSWAP32((X)[1]); \ + uint32_t W2 = ZVK_BSWAP32((X)[2]); \ + uint32_t W3 = ZVK_BSWAP32((X)[3]); \ + uint32_t W4 = ZVK_BSWAP32((X)[4]); \ + uint32_t W5 = ZVK_BSWAP32((X)[5]); \ + uint32_t W6 = ZVK_BSWAP32((X)[6]); \ + uint32_t W7 = ZVK_BSWAP32((X)[7]); \ + (void)(0) + +// Sets the elements words of given EGU32x8_t variable 'X' to +// the given 8 uint32_t values privided in "Big Endian" (BE) +// order, i.e., from the most significant (W7) to the least +// significant (W0). Each of the words is byte-swapped, +// from a native/little-endian ordering in the variables to +// a big-endian representation in the EGU32x8_t. +#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \ + do { \ + (X)[0] = ZVK_BSWAP32(W0); \ + (X)[1] = ZVK_BSWAP32(W1); \ + (X)[2] = ZVK_BSWAP32(W2); \ + (X)[3] = ZVK_BSWAP32(W3); \ + (X)[4] = ZVK_BSWAP32(W4); \ + (X)[5] = ZVK_BSWAP32(W5); \ + (X)[6] = ZVK_BSWAP32(W6); \ + (X)[7] = ZVK_BSWAP32(W7); \ + } while (0) + +// Extracts 4 uint64_t words from the input EGU64x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W3) +// to the least significant (W0). 
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \ + uint64_t W0 = (X)[0]; \ + uint64_t W1 = (X)[1]; \ + uint64_t W2 = (X)[2]; \ + uint64_t W3 = (X)[3]; \ + (void)(0) + +// Sets the elements words of given EGU64x4_t variable 'X' to +// the given 4 uint64_t values privided in "Big Endian" (BE) +// order, i.e., from the most significant (W3) to the least +// significant (W0). +#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Copies a EGU8x16_t value from 'SRC' into 'DST'. +#define EGU8x16_COPY(DST, SRC) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (DST)[bidx] = (SRC)[bidx]; \ + } + +// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes +// in A (mutated) with the bytes in B (unchanged). +#define EGU8x16_XOREQ(MUT_A, CONST_B) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (MUT_A)[bidx] ^= (CONST_B)[bidx]; \ + } + +// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes +// in A (mutated) with the bytes in B (unchanged). +#define EGU32x4_XOREQ(MUT_A, CONST_B) \ + for (std::size_t idx = 0; idx < 4; ++idx) { \ + (MUT_A)[idx] ^= (CONST_B)[idx]; \ + } + +// Performs "DST = A ^ B;", i.e., DST (overwritten) receives +// the xor of the bytes in A and B (both unchanged). +#define EGU8x16_XOR(DST, A, B) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \ + } + +// Performs "DST = A ^ B;", i.e., DST (overwritten) receives +// the xor of the bytes in A and B (both unchanged). +#define EGU32x4_XOR(DST, A, B) \ + do { \ + static_assert(std::is_same<EGU32x4_t, decltype(A)>::value); \ + static_assert(std::is_same<EGU32x4_t, decltype(B)>::value); \ + static_assert(std::is_same<EGU32x4_t, decltype(DST)>::value); \ + for (std::size_t idx = 0; idx < 4; ++idx) { \ + (DST)[idx] = (A)[idx] ^ (B)[idx]; \ + } \ + } while (0) + +// +// Common bit manipulations logic. +// + +// Form a 64 bit integer with bit X set +#define ZVK_BIT(X) (1ULL << (X)) + +// Reverse the order of bits within bytes of a word. +// This is used to match the data interpretation in NIST SP 800-38D +// a.k.a the GCM specification. +#define ZVK_BREV8_32(X) \ + do { \ + (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \ + (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \ + (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \ + } while (0) + +// Rotates right a uint32_t value by N bits. +// uint32_t ROR32(uint32_t X, std::size_t N); +#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N)) + +// Rotates right a uint64_t value by N bits. +// uint64_t ROR64(uint64_t X, std::size_t N); +#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N)) + +// Rotates left a uint32_t value by N bits. +// uint32_t ROL32(uint32_t X, std::size_t N); +#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N)) + +// +// Element Group Bit Manipulation Macros +// + +// Performs bit reversal in a EGU32x4_t group. +#define EGU32x4_BREV8(X) \ + for (std::size_t bidx = 0; bidx < 4; ++bidx) { \ + ZVK_BREV8_32((X)[bidx]); \ + } + +// Checks if a given bit is set within a EGU32x4_t group. +// Assumes LE ordering. +#define EGU32x4_ISSET(X, BIDX) \ + (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0) + +// Shfts a EGU32x4_t group left by one bit. +// +// Since the entire 128 bit value is shifted we need to handle carry bits. +// In order to limit the amount of carry check logic the elements are copied to +// a 64 bit temporary variable. 
+#define EGU32x4_LSHIFT(X) \ + do { \ + uint64_t dword; \ + dword = ((uint64_t)(X)[3]) << 32; \ + dword |= X[2]; \ + dword <<= 1; \ + if (X[1] & ZVK_BIT(31)) { \ + dword |= ZVK_BIT(0); \ + } \ + X[2] = dword & UINT32_MAX; \ + X[3] = dword >> 32; \ + dword = ((uint64_t)(X)[1]) << 32; \ + dword |= X[0]; \ + dword <<= 1; \ + X[0] = dword & UINT32_MAX; \ + X[1] = dword >> 32; \ + } while (0) + +#endif // RISCV_ZVK_EXT_MACROS_H_ diff --git a/riscv/zvkned_ext_macros.h b/riscv/zvkned_ext_macros.h new file mode 100644 index 0000000..db705c7 --- /dev/null +++ b/riscv/zvkned_ext_macros.h @@ -0,0 +1,270 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvkned extension (vector AES single round). + +#include "insns/aes_common.h" + +#ifndef RISCV_ZVKNED_EXT_MACROS_H_ +#define RISCV_ZVKNED_EXT_MACROS_H_ + +// vaes*.vs instruction constraints: +// - Zvkned is enabled +// - EGW (128) <= LMUL * VLEN +// - vd and vs2 cannot overlap +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vaes_vs_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + require(insn.rd() != insn.rs2()); \ + } while (false) + +// vaes*.vv instruction constraints. Those are the same as the .vs ones, +// except for the overlap constraint that is not present for .vv variants. +// - Zvkned is enabled +// - EGW (128) <= LMUL * VLEN +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vaes_vv_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + } while (false) + +// vaeskf*.vi instruction constraints. Those are the same as the .vv ones. +#define require_vaeskf_vi_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + } while (false) + +#define VAES_XTIME(A) (((A) << 1) ^ (((A) & 0x80) ? 0x1b : 0)) + +#define VAES_GFMUL(A, B) \ + ((((B) & 0x1) ? (A) : 0) ^ \ + (((B) & 0x2) ? VAES_XTIME(A) : 0) ^ \ + (((B) & 0x4) ? VAES_XTIME(VAES_XTIME(A)) : 0) ^ \ + (((B) & 0x8) ? 
VAES_XTIME(VAES_XTIME(VAES_XTIME(A))) : 0)) + +// Apply the S-box transform to every byte in the VAESState 'state' +#define VAES_SUB_BYTES(STATE) \ + do { \ + static constexpr uint8_t kVAESXEncSBox[256]= { \ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, \ + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, \ + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, \ + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, \ + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, \ + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, \ + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, \ + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, \ + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, \ + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, \ + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, \ + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, \ + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, \ + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, \ + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, \ + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, \ + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, \ + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, \ + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, \ + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, \ + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, \ + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, \ + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, \ + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, \ + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, \ + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, \ + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, \ + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, \ + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, \ + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, \ + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, \ + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, \ + }; \ + for (uint8_t& byte : (STATE)) { \ + byte = kVAESXEncSBox[byte]; \ + } \ + } while (0) + +// Applies the S-box inverse (decode) transform to every byte +// in the VAESState 'state'. 
+#define VAES_INV_SUB_BYTES(STATE) \ + do { \ + static constexpr uint8_t kVAESXDecSBox[256] = { \ + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, \ + 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, \ + 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, \ + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, \ + 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, \ + 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, \ + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, \ + 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, \ + 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, \ + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, \ + 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, \ + 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, \ + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, \ + 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, \ + 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, \ + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, \ + 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, \ + 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, \ + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, \ + 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, \ + 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, \ + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, \ + 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, \ + 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, \ + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, \ + 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, \ + 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, \ + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, \ + 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, \ + 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, \ + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, \ + 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D, \ + }; \ + for (uint8_t &byte : (STATE)) { \ + byte = kVAESXDecSBox[byte]; \ + } \ + } while (0) + +// Shift the state rows, as specified in ShiftRows. +// 'STATE' is a VAESState value. +#define VAES_SHIFT_ROWS(STATE) \ + do { \ + uint8_t temp; \ + /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \ + /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \ + temp = (STATE)[1]; \ + (STATE)[ 1] = (STATE)[ 5]; \ + (STATE)[ 5] = (STATE)[ 9]; \ + (STATE)[ 9] = (STATE)[13]; \ + (STATE)[13] = temp; \ + /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \ + temp = (STATE)[2]; \ + (STATE)[ 2] = (STATE)[10]; \ + (STATE)[10] = temp; \ + temp = (STATE)[6]; \ + (STATE)[ 6] = (STATE)[14]; \ + (STATE)[14] = temp; \ + /* Row 3 (byte indices 3, 7, 11, 15) rotates by 3 position (or -1). */ \ + temp = (STATE)[3]; \ + (STATE)[ 3] = (STATE)[15]; \ + (STATE)[15] = (STATE)[11]; \ + (STATE)[11] = (STATE)[ 7]; \ + (STATE)[ 7] = temp; \ + } while (0) + +// Shifts the state rows, as specified in InvShiftRows. +// 'STATE' is a VAESState value. +#define VAES_INV_SHIFT_ROWS(STATE) \ + do { \ + uint8_t temp; \ + /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \ + /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \ + temp = (STATE)[1]; \ + (STATE)[ 1] = (STATE)[13]; \ + (STATE)[13] = (STATE)[ 9]; \ + (STATE)[ 9] = (STATE)[ 5]; \ + (STATE)[ 5] = temp; \ + /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \ + temp = (STATE)[2]; \ + (STATE)[ 2] = (STATE)[10]; \ + (STATE)[10] = temp; \ + temp = (STATE)[6]; \ + (STATE)[ 6] = (STATE)[14]; \ + (STATE)[14] = temp; \ + /* Row 3 (byte indices 3, 7, 11, 15) rotates by 3 position (or -1). 
*/ \ + temp = (STATE)[3]; \ + (STATE)[ 3] = (STATE)[ 7]; \ + (STATE)[ 7] = (STATE)[11]; \ + (STATE)[11] = (STATE)[15]; \ + (STATE)[15] = temp; \ + } while (0) + +// Implements the function producing one byte, one-fourth of the column +// transformation MixColumns() specified in FIPS-197 5.1.3 . +// +// The arguments are all bytes (i.e., uint8_t). The function implemented +// is +// F(A, B, C, D) = (2 . A) xor (3 . B) xor C xor D +// where '.' denotes the Galois Field multiplication over 2**8. +// +#define VAES_MIX_COLUMN_BYTE(A, B, C, D) \ + (VAES_GFMUL((A), 0x2) ^ VAES_GFMUL((B), 0x3) ^ (C) ^ (D)) + +// Implements the function producing one byte, one-fourth of the column +// transformation InvMixColumns() specified in FIPS-197 5.3.3 . +// +// The arguments are all bytes (i.e., uint8_t). The function implemented +// is +// F(A, B, C, D) = (0xE . A) xor (0xB . B) xor (0xD . C) xor (0x9 . D) +// where '.' denotes the Galois Field multiplication over 2**8. +// +#define VAES_INV_MIX_COLUMN_BYTE(A, B, C, D) \ + (VAES_GFMUL((A), 0xE) ^ \ + VAES_GFMUL((B), 0xB) ^ \ + VAES_GFMUL((C), 0xD) ^ \ + VAES_GFMUL((D), 0x9)) + +// Given a column as a uin32_t (4 Bytes), produces the mixed column +// as a uin32_t. +#define VAES_MIX_COLUMN(STATE, COL_IDX) \ + do { \ + uint8_t *column = &(STATE)[(COL_IDX) * 4]; \ + /* Extract the bytes, before we start overwriting them */ \ + const uint8_t b0 = column[0]; \ + const uint8_t b1 = column[1]; \ + const uint8_t b2 = column[2]; \ + const uint8_t b3 = column[3]; \ + /* Every iteration rotates the byte indices by 1 */ \ + column[0] = VAES_MIX_COLUMN_BYTE(b0, b1, b2, b3); \ + column[1] = VAES_MIX_COLUMN_BYTE(b1, b2, b3, b0); \ + column[2] = VAES_MIX_COLUMN_BYTE(b2, b3, b0, b1); \ + column[3] = VAES_MIX_COLUMN_BYTE(b3, b0, b1, b2); \ + } while (0) + +// Given a column as a uin32_t (4 Bytes), produces the inverse +// mixed column as a uin32_t. +#define VAES_INV_MIX_COLUMN(STATE, COL_IDX) \ + do { \ + uint8_t *column = &(STATE)[(COL_IDX) * 4]; \ + /* Extract the bytes, before we start overwriting them */ \ + const uint8_t b0 = column[0]; \ + const uint8_t b1 = column[1]; \ + const uint8_t b2 = column[2]; \ + const uint8_t b3 = column[3]; \ + /* Every iteration rotates the byte indices by 1 */ \ + column[0] = VAES_INV_MIX_COLUMN_BYTE(b0, b1, b2, b3); \ + column[1] = VAES_INV_MIX_COLUMN_BYTE(b1, b2, b3, b0); \ + column[2] = VAES_INV_MIX_COLUMN_BYTE(b2, b3, b0, b1); \ + column[3] = VAES_INV_MIX_COLUMN_BYTE(b3, b0, b1, b2); \ + } while (0) + +// Implements MixColumns as defined in FIPS-197 5.1.3. +#define VAES_MIX_COLUMNS(STATE) \ + do { \ + VAES_MIX_COLUMN((STATE), 0); \ + VAES_MIX_COLUMN((STATE), 1); \ + VAES_MIX_COLUMN((STATE), 2); \ + VAES_MIX_COLUMN((STATE), 3); \ + } while (0) + +// Implements InvMixColumns as defined in FIPS-197 5.3.3. +#define VAES_INV_MIX_COLUMNS(STATE) \ + do { \ + VAES_INV_MIX_COLUMN((STATE), 0); \ + VAES_INV_MIX_COLUMN((STATE), 1); \ + VAES_INV_MIX_COLUMN((STATE), 2); \ + VAES_INV_MIX_COLUMN((STATE), 3); \ + } while (0) + +#endif // RISCV_ZVKNED_EXT_MACROS_H_ diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h new file mode 100644 index 0000000..b50818b --- /dev/null +++ b/riscv/zvknh_ext_macros.h @@ -0,0 +1,155 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography). 
+ +#include "zvk_ext_macros.h" + +#ifndef RISCV_ZVKNH_EXT_MACROS_H_ +#define RISCV_ZVKNH_EXT_MACROS_H_ + +// Constraints common to all vsha* instructions, across all VSEW: +// - VSEW is 32 (SHA-256) or 64 (SHA-512) +// - No overlap of vd with vs1 or vs2. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP +// macros. +#define require_vsha2_common_constraints \ + do { \ + require(P.VU.vsew == 32 || P.VU.vsew == 64); \ + require(insn.rd() != insn.rs1()); \ + require(insn.rd() != insn.rs2()); \ + } while (false) + +// Constraints on vsha2 instructions that must be verified when VSEW==32. +// Those are *IN ADDITION* to the constraints checked by +// 'require_vsha2_common_constraints', which is meant to be run earlier. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vsha2_vsew32_constraints \ + do { \ + require_zvknh_256; \ + require_egw_fits(128); \ + } while (false) + +// Constraints on vsha2 instructions that must be verified when VSEW==32. +// Those are *IN ADDITION* to the constraints checked by +// 'require_vsha2_common_constraints', which is meant to be run earlier. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros. +#define require_vsha2_vsew64_constraints \ + do { \ + require_zvknh_512; \ + require_egw_fits(256); \ + } while (false) + +// +// SHA-256 and SHA-512 common logic +// + +// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz +#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z))) + +// Maj(x,y,z) = (xy) ⊕ (xz) ⊕(yz) = xy | xz | yz +#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z))) + +// +// SHA-256 +// + +// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x) +#define ZVK_SHA256_SUM0(X) \ + (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22)) + +// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x) +#define ZVK_SHA256_SUM1(X) \ + (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25)) + +// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3 (x) +#define ZVK_SHA256_SIG0(X) \ + (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3)) + +// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x) +#define ZVK_SHA256_SIG1(X) \ + (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10)) + +// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes +// W[t+16]. +#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \ + (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0)) + +// Performs one round of compression (out of the 64 rounds), given the state +// temporaries A,B,C,...,H, and KW, the sum Kt+Wt. +// Updates A,B,C,...,H to their new values. KW is not modified. +// +// Note that some of the logic could be omitted in vshac[ab] since +// some of the variables are dropped in each of those. However removing +// those unnecessary updates reduces the opportunities to share this single +// per-round logic and forces us to move further away from the how the logic +// is expressed in FIPS PUB 180-4. 
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+//
+// SHA-512
+//
+
+// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x)
+#define ZVK_SHA512_SUM0(X) \
+  (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39))
+
+// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x)
+#define ZVK_SHA512_SUM1(X) \
+  (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41))
+
+// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x)
+#define ZVK_SHA512_SIG0(X) \
+  (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7))
+
+// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x)
+#define ZVK_SHA512_SIG1(X) \
+  (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \
+  (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 80 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+#endif  // RISCV_ZVKNH_EXT_MACROS_H_
diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
new file mode 100644
index 0000000..46e399b
--- /dev/null
+++ b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions used to implement instructions defined as
+// part of the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints common to all vsm4* instructions:
+//  - Zvksed is enabled
+//  - VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+  do { \
+    require_zvksed; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t)
+// provided in "Little Endian" (LE) order, i.e., from least significant (B0)
+// to most significant (B3).
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+  (((uint32_t)(B0)) <<  0 | \
+   ((uint32_t)(B1)) <<  8 | \
+   ((uint32_t)(B2)) << 16 | \
+   ((uint32_t)(B3)) << 24)
+
+// Returns the SBox entry for byte BYTE.
+#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer value 'X' and a byte index,
+// returns a uint8_t value for the byte at the given index.
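+// For example, ZVKSED_EXTRACT_U8(0xAABBCCDD, 1) yields 0xCC.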
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
+
+// Applies the nonlinear transformation tau to a 32-bit word B - section
+// 6.2.1 of the IETF draft.
+#define ZVKSED_SUB_BYTES(B) \
+  ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Applies the linear transformation L to a 32-bit word S and XORs the
+// result with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND(X, S) \
+  ((X) ^ \
+   ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+    ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Applies the linear transformation L' to a 32-bit word S and XORs the
+// result with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND_KEY(X, S) \
+  ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
+
+#endif  // RISCV_ZVKSED_MACROS_H_
diff --git a/riscv/zvksh_ext_macros.h b/riscv/zvksh_ext_macros.h
new file mode 100644
index 0000000..71c5a09
--- /dev/null
+++ b/riscv/zvksh_ext_macros.h
@@ -0,0 +1,47 @@
+// Helper macros and functions used to implement instructions defined as
+// part of the RISC-V Zvksh extension (vectorized SM3).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_INSNS_ZVKSH_COMMON_H_
+#define RISCV_INSNS_ZVKSH_COMMON_H_
+
+// Constraints common to all vsm3* instructions:
+//  - Zvksh is enabled
+//  - VSEW == 32
+//  - EGW (256) <= LMUL * VLEN
+//  - No overlap of vd and vs2.
+//
+// The constraint that vstart and vl are both EGS (8) aligned
+// is checked in the VI_ZVK_..._EGU32x8_..._LOOP macros.
+#define require_vsm3_constraints \
+  do { \
+    require_zvksh; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(256); \
+    require(insn.rd() != insn.rs2()); \
+  } while (false)
+
+#define FF1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define FF2(X, Y, Z) (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z)))
+
+// Boolean function FF_j - section 4.3 of the IETF draft.
+#define ZVKSH_FF(X, Y, Z, J) (((J) <= 15) ? FF1(X, Y, Z) : FF2(X, Y, Z))
+
+#define GG1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define GG2(X, Y, Z) (((X) & (Y)) | ((~(X)) & (Z)))
+
+// Boolean function GG_j - section 4.3 of the IETF draft.
+#define ZVKSH_GG(X, Y, Z, J) (((J) <= 15) ? GG1(X, Y, Z) : GG2(X, Y, Z))
+
+#define T1 0x79CC4519
+#define T2 0x7A879D8A
+
+// T_j constant - section 4.2 of the IETF draft.
+#define ZVKSH_T(J) (((J) <= 15) ? (T1) : (T2))
+
+// Permutation functions P_0 and P_1 - section 4.4 of the IETF draft.
+#define ZVKSH_P0(X) ((X) ^ ZVK_ROL32((X), 9) ^ ZVK_ROL32((X), 17))
+#define ZVKSH_P1(X) ((X) ^ ZVK_ROL32((X), 15) ^ ZVK_ROL32((X), 23))
+
+#endif  // RISCV_INSNS_ZVKSH_COMMON_H_
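+
+// Non-normative examples of the macros above (ZVK_ROL32 is the 32-bit
+// rotate-left helper from zvk_ext_macros.h):
+//   ZVKSH_T(0)  == 0x79CC4519 and ZVKSH_T(16) == 0x7A879D8A
+//   ZVKSH_P0(0x00000001) == 0x00020201, i.e. 1 ^ (1 << 9) ^ (1 << 17)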