author     Andrew Waterman <andrew@sifive.com>    2023-06-19 20:18:09 -0700
committer  GitHub <noreply@github.com>            2023-06-19 20:18:09 -0700
commit     5731a478ea2b7cf639a383498eb114a9dc7d64df (patch)
tree       43f28cdc046246deb9275b71ecaaacdce45ecaa9
parent     8b10de64dd2048e813438dbb5e4ed24d09feb8eb (diff)
parent     a55f96ae9380d5cc9bef05e8b9e82e54d5d6ec35 (diff)
Merge pull request #1303 from rivosinc/zvk-vector-crypto
Zvk vector crypto support (v5)
-rw-r--r--  riscv/arith.h  21
-rw-r--r--  riscv/decode.h  1
-rw-r--r--  riscv/insns/sm4_common.h  1
-rw-r--r--  riscv/insns/vaesdf_vs.h  43
-rw-r--r--  riscv/insns/vaesdf_vv.h  37
-rw-r--r--  riscv/insns/vaesdm_vs.h  44
-rw-r--r--  riscv/insns/vaesdm_vv.h  38
-rw-r--r--  riscv/insns/vaesef_vs.h  43
-rw-r--r--  riscv/insns/vaesef_vv.h  37
-rw-r--r--  riscv/insns/vaesem_vs.h  44
-rw-r--r--  riscv/insns/vaesem_vv.h  38
-rw-r--r--  riscv/insns/vaeskf1_vi.h  65
-rw-r--r--  riscv/insns/vaeskf2_vi.h  89
-rw-r--r--  riscv/insns/vaesz_vs.h  24
-rw-r--r--  riscv/insns/vandn_vv.h  10
-rw-r--r--  riscv/insns/vandn_vx.h  10
-rw-r--r--  riscv/insns/vbrev8_v.h  13
-rw-r--r--  riscv/insns/vbrev_v.h  24
-rw-r--r--  riscv/insns/vclmul_vv.h  20
-rw-r--r--  riscv/insns/vclmul_vx.h  20
-rw-r--r--  riscv/insns/vclmulh_vv.h  20
-rw-r--r--  riscv/insns/vclmulh_vx.h  20
-rw-r--r--  riscv/insns/vclz_v.h  16
-rw-r--r--  riscv/insns/vcpop_v.h  16
-rw-r--r--  riscv/insns/vctz_v.h  16
-rw-r--r--  riscv/insns/vghsh_vv.h  38
-rw-r--r--  riscv/insns/vgmul_vv.h  32
-rw-r--r--  riscv/insns/vrev8_v.h  16
-rw-r--r--  riscv/insns/vrol_vv.h  17
-rw-r--r--  riscv/insns/vrol_vx.h  18
-rw-r--r--  riscv/insns/vror_vi.h  18
-rw-r--r--  riscv/insns/vror_vv.h  17
-rw-r--r--  riscv/insns/vror_vx.h  18
-rw-r--r--  riscv/insns/vsha2ch_vv.h  61
-rw-r--r--  riscv/insns/vsha2cl_vv.h  62
-rw-r--r--  riscv/insns/vsha2ms_vv.h  63
-rw-r--r--  riscv/insns/vsm3c_vi.h  60
-rw-r--r--  riscv/insns/vsm3me_vv.h  39
-rw-r--r--  riscv/insns/vsm4k_vi.h  52
-rw-r--r--  riscv/insns/vsm4r_vs.h  51
-rw-r--r--  riscv/insns/vsm4r_vv.h  37
-rw-r--r--  riscv/insns/vwsll_vi.h  10
-rw-r--r--  riscv/insns/vwsll_vv.h  10
-rw-r--r--  riscv/insns/vwsll_vx.h  10
-rw-r--r--  riscv/isa_parser.cc  65
-rw-r--r--  riscv/isa_parser.h  16
-rw-r--r--  riscv/overlap_list.h  9
-rw-r--r--  riscv/riscv.mk.in  98
-rw-r--r--  riscv/v_ext_macros.h  22
-rw-r--r--  riscv/vector_unit.cc  55
-rw-r--r--  riscv/vector_unit.h  19
-rw-r--r--  riscv/zvk_ext_macros.h  1035
-rw-r--r--  riscv/zvkned_ext_macros.h  270
-rw-r--r--  riscv/zvknh_ext_macros.h  155
-rw-r--r--  riscv/zvksed_ext_macros.h  60
-rw-r--r--  riscv/zvksh_ext_macros.h  47
56 files changed, 3171 insertions(+), 19 deletions(-)
diff --git a/riscv/arith.h b/riscv/arith.h
index 3b807e9..20b1504 100644
--- a/riscv/arith.h
+++ b/riscv/arith.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <climits>
#include <cstddef>
+#include <type_traits>
inline uint64_t mulhu(uint64_t a, uint64_t b)
{
@@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
return r;
}
+// Rotates right an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+ static_assert(std::is_unsigned<T>::value);
+ static constexpr T mask = (8 * sizeof(T)) - 1;
+ const std::size_t rshift = shiftamt & mask;
+ const std::size_t lshift = (-rshift) & mask;
+ return (x << lshift) | (x >> rshift);
+}
+
+// Rotates left an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+ static_assert(std::is_unsigned<T>::value);
+ static constexpr T mask = (8 * sizeof(T)) - 1;
+ const std::size_t lshift = shiftamt & mask;
+ const std::size_t rshift = (-lshift) & mask;
+ return (x << lshift) | (x >> rshift);
+}
+
#endif
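The masked shift amounts above are what keep the rotates well defined: when the rotate amount is 0 (or a multiple of the element width), both partial shifts collapse to 0 instead of becoming an undefined full-width shift. A minimal standalone sketch, not part of the patch (the include path is assumed; alternatively paste the two templates above):

#include <cstdint>
#include <cstdio>
#include "riscv/arith.h"  // assumed include path for rotate_right/rotate_left

int main() {
  // shiftamt == 0 forces both partial shifts to 0, so no shift by the full width occurs.
  std::printf("%08x\n", rotate_right<uint32_t>(0x12345678u, 0));   // 12345678
  std::printf("%08x\n", rotate_right<uint32_t>(0x12345678u, 8));   // 78123456
  std::printf("%08x\n", rotate_left<uint32_t>(0x12345678u, 40));   // 40 & 31 == 8 -> 34567812
  return 0;
}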
diff --git a/riscv/decode.h b/riscv/decode.h
index dad32a1..cd1c0a1 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -140,6 +140,7 @@ public:
uint64_t v_vta() { return x(26, 1); }
uint64_t v_vma() { return x(27, 1); }
uint64_t v_mew() { return x(28, 1); }
+ uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); }
uint64_t p_imm2() { return x(20, 2); }
uint64_t p_imm3() { return x(20, 3); }
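The new v_zimm6() accessor assembles the 6-bit rotate immediate used by vror.vi from two fields: bits 19:15 give zimm6[4:0] and bit 26 gives zimm6[5]. A small sketch with free-standing stand-ins for insn_t::x() and v_zimm6() (names and encoding value below are illustrative only):

#include <cstdint>
#include <cstdio>

// Stand-in for insn_t::x(lo, len): extract 'len' bits starting at bit 'lo'.
static uint64_t x_bits(uint64_t insn, int lo, int len) {
  return (insn >> lo) & ((uint64_t(1) << len) - 1);
}

// Stand-in for insn_t::v_zimm6() as defined above.
static uint64_t v_zimm6(uint64_t insn) {
  return x_bits(insn, 15, 5) + (x_bits(insn, 26, 1) << 5);
}

int main() {
  // Hypothetical encoding: bit 26 set (zimm6[5]) and 0b01010 in bits 19:15 (zimm6[4:0]).
  const uint64_t insn = (uint64_t(1) << 26) | (uint64_t(0b01010) << 15);
  std::printf("%u\n", (unsigned)v_zimm6(insn));  // 42
  return 0;
}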
diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
index 17f129f..24d6ce1 100644
--- a/riscv/insns/sm4_common.h
+++ b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
0xD7, 0xCB, 0x39, 0x48
};
-
diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h
new file mode 100644
index 0000000..a124278
--- /dev/null
+++ b/riscv/insns/vaesdf_vs.h
@@ -0,0 +1,43 @@
+// vaesdf.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, scalar_key);
+ // InvMixColumns is not performed in the final round.
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h
new file mode 100644
index 0000000..9fca572
--- /dev/null
+++ b/riscv/insns/vaesdf_vv.h
@@ -0,0 +1,37 @@
+// vaesdf.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd in contains the input state,
+ // - vs2 contains the input round key,
+ // - vd out receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, round_key);
+ // InvMixColumns is not performed in the final round.
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h
new file mode 100644
index 0000000..3c23e69
--- /dev/null
+++ b/riscv/insns/vaesdm_vs.h
@@ -0,0 +1,44 @@
+// vaesdm.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd in contains the input state,
+ // - vs2 contains the input round key,
+ // - vd out receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, scalar_key);
+ // InvMixColumns
+ VAES_INV_MIX_COLUMNS(aes_state);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h
new file mode 100644
index 0000000..9c29cd9
--- /dev/null
+++ b/riscv/insns/vaesdm_vv.h
@@ -0,0 +1,38 @@
+// vaesdm.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, round_key);
+ // InvMixColumns
+ VAES_INV_MIX_COLUMNS(aes_state);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h
new file mode 100644
index 0000000..2d32653
--- /dev/null
+++ b/riscv/insns/vaesef_vs.h
@@ -0,0 +1,43 @@
+// vaesef.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns is not performed for the final round.
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, scalar_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h
new file mode 100644
index 0000000..9b43a6d
--- /dev/null
+++ b/riscv/insns/vaesef_vv.h
@@ -0,0 +1,37 @@
+// vaesef.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns is not performed for the final round.
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, round_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h
new file mode 100644
index 0000000..348cd9f
--- /dev/null
+++ b/riscv/insns/vaesem_vs.h
@@ -0,0 +1,44 @@
+// vaesem.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns
+ VAES_MIX_COLUMNS(aes_state);
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, scalar_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h
new file mode 100644
index 0000000..34f0056
--- /dev/null
+++ b/riscv/insns/vaesem_vv.h
@@ -0,0 +1,38 @@
+// vaesem.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns
+ VAES_MIX_COLUMNS(aes_state);
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, round_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
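The VAES_SUB_BYTES/SHIFT_ROWS/MIX_COLUMNS macros used by the files above are defined in zvkned_ext_macros.h, which appears in the diffstat but is not shown in this portion of the diff. As a rough, hedged illustration of the GF(2^8) arithmetic MixColumns is assumed to perform (the xtime/mix_column names are local to this sketch, not the patch's macro names):

#include <cstdint>
#include <cstdio>

// Multiply by x in GF(2^8) modulo the AES polynomial x^8 + x^4 + x^3 + x + 1.
static uint8_t xtime(uint8_t b) {
  return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}

// One MixColumns column: multiply (s0 s1 s2 s3)^T by the circulant matrix {02 03 01 01}.
static void mix_column(uint8_t s[4]) {
  const uint8_t t = s[0] ^ s[1] ^ s[2] ^ s[3];
  const uint8_t s0 = s[0];
  s[0] ^= t ^ xtime(s[0] ^ s[1]);
  s[1] ^= t ^ xtime(s[1] ^ s[2]);
  s[2] ^= t ^ xtime(s[2] ^ s[3]);
  s[3] ^= t ^ xtime(s[3] ^ s0);
}

int main() {
  // Well-known MixColumns test column: db 13 53 45 -> 8e 4d a1 bc.
  uint8_t col[4] = {0xdb, 0x13, 0x53, 0x45};
  mix_column(col);
  std::printf("%02x %02x %02x %02x\n", col[0], col[1], col[2], col[3]);
  return 0;
}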
diff --git a/riscv/insns/vaeskf1_vi.h b/riscv/insns/vaeskf1_vi.h
new file mode 100644
index 0000000..28d03d0
--- /dev/null
+++ b/riscv/insns/vaeskf1_vi.h
@@ -0,0 +1,65 @@
+// vaeskf1.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// There is one round constant for each round number
+// between 1 and 10. We index using 'round# -1'.
+static constexpr uint8_t kRoundConstants[10] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 4 bits of the immediate.
+ const reg_t zimm4 = zimm5 & 0xF;
+ // Normalize the round value to be in [1, 10] by toggling bit 3
+ // if outside the range (i.e., +8 or -8).
+ const reg_t round = ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8);
+ const uint32_t rcon = kRoundConstants[round - 1];,
+ // Per Element Group body.
+ {
+ // vaeskf1_vi produces key[i+1] in vd, it receives key[i] in vs2,
+ // i.e., 4x32b values (4 words).
+ //
+ // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+ // differences:
+ // - in AES-128 (vaeskf1), we get both the 'temp' word and
+ // the "previous words" w0..w3 from key[i]/vs2.
+ // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+ // the "previous words" w0..w3 from key[i-1]/vd.
+
+ // 'temp' is extracted from the last (most significant) word of key[i].
+ uint32_t temp = vs2[3];
+ temp = (temp >> 8) | (temp << 24); // Rotate right by 8
+ temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0));
+ temp = temp ^ rcon;
+
+ // "old" words are the w[i-Nk] of FIPS-197. They are extracted
+ // from vs2, which contains key[i] in AES-128 where Nk=4.
+ const uint32_t w0 = vs2[0] ^ temp;
+ const uint32_t w1 = vs2[1] ^ w0;
+ const uint32_t w2 = vs2[2] ^ w1;
+ const uint32_t w3 = vs2[3] ^ w2;
+
+ // Overwrite vd with k[i+1] from the new words.
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+ }
+);
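A small standalone sketch (not part of the patch) of the preloop logic above: the bottom four immediate bits are range-checked against [1, 10], out-of-range values get bit 3 toggled, and the result indexes the round-constant table.

#include <cstdint>
#include <cstdio>

int main() {
  static const uint8_t kRoundConstants[10] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
  };
  for (unsigned zimm5 = 0; zimm5 < 32; ++zimm5) {
    const unsigned zimm4 = zimm5 & 0xF;
    // Same normalization rule as the instruction: toggle bit 3 when outside [1, 10].
    const unsigned round = (1 <= zimm4 && zimm4 <= 10) ? zimm4 : (zimm4 ^ 0x8);
    std::printf("zimm5=%2u -> round=%2u rcon=0x%02x\n",
                zimm5, round, kRoundConstants[round - 1]);
  }
  return 0;
}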
diff --git a/riscv/insns/vaeskf2_vi.h b/riscv/insns/vaeskf2_vi.h
new file mode 100644
index 0000000..49c2a2d
--- /dev/null
+++ b/riscv/insns/vaeskf2_vi.h
@@ -0,0 +1,89 @@
+// vaeskf2.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// Round Constants
+//
+// Only the odd rounds need to be encoded, the even ones can use 0
+// or skip the rcon handling. We can use '(round# / 2) - 1'
+// (or "(round# >> 1) - 1") to index into the array.
+//
+// Round# Constant
+// [ 2] -> kRoundConstants[0]
+// [ 3] -> 0 / Nothing
+// [ 4] -> kRoundConstants[1]
+// [ 5] -> 0 / Nothing
+// [ 6] -> kRoundConstants[2]
+// [ 7] -> 0 / Nothing
+// ...
+// [13] -> 0 / Nothing
+// [14] -> kRoundConstants[6]
+static constexpr uint8_t kRoundConstants[7] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40,
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 4 bits of the immediate.
+ const reg_t zimm4 = zimm5 & 0xF;
+ // Normalize the round value to be in [2, 14] by toggling bit 3
+ // if outside the range (i.e., +8 or -8).
+ const reg_t round = ((2 <= zimm4) && (zimm4 <= 14)) ? zimm4 : (zimm4 ^ 0x8);,
+ // Per Element Group body.
+ {
+ // vaeskf2_vi produces key[i+1] in vd, it receives key[i] in vs2,
+ // i.e., 4x32b values (4 words).
+ //
+ // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+ // differences:
+ // - in AES-128 (vaeskf1), we get both the 'temp' word and
+ // the "previous words" w0..w3 from key[i]/vs2.
+ // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+ // the "previous words" w0..w3 from key[i-1]/vd.
+
+ // 'temp' is extracted from the last (most significant) word of key[i].
+ uint32_t temp = vs2[3];
+ // With AES-256, when we have an odd round number, we hit the
+ // Nk > 6 and i mod Nk = 4
+ // condition in the FIPS-197 key expansion pseudo-code (Figure 11).
+ // In those cases we skip RotWord and the round constant is 0.
+ const bool is_even_round = (round & 0x1) == 0;
+ if (is_even_round) {
+ temp = (temp >> 8) | (temp << 24); // Rotate right by 8
+ }
+ temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0));
+
+ if (is_even_round) {
+ const uint32_t rcon = kRoundConstants[(round >> 1) - 1];
+ temp = temp ^ rcon;
+ }
+
+ // "old" words are the w[i-Nk] of FIPS-197. For AES-256, where Nk=8,
+ // they are extracted from vd which contains key[i-1].
+ const uint32_t w0 = vd[0] ^ temp;
+ const uint32_t w1 = vd[1] ^ w0;
+ const uint32_t w2 = vd[2] ^ w1;
+ const uint32_t w3 = vd[3] ^ w2;
+
+ // Overwrite vd with k[i+1] from the new words.
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+ }
+);
diff --git a/riscv/insns/vaesz_vs.h b/riscv/insns/vaesz_vs.h
new file mode 100644
index 0000000..c3dc931
--- /dev/null
+++ b/riscv/insns/vaesz_vs.h
@@ -0,0 +1,24 @@
+// vaesz.vs vd, vs2
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ // Per Element Group body.
+ {
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ // Produce vd = vd ^ "common key from vs2".
+ EGU8x16_XOR(vd, vd, scalar_key);
+ }
+);
diff --git a/riscv/insns/vandn_vv.h b/riscv/insns/vandn_vv.h
new file mode 100644
index 0000000..d85e47d
--- /dev/null
+++ b/riscv/insns/vandn_vv.h
@@ -0,0 +1,10 @@
+// vandn.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VV_LOOP
+({
+ vd = vs2 & (~vs1);
+})
diff --git a/riscv/insns/vandn_vx.h b/riscv/insns/vandn_vx.h
new file mode 100644
index 0000000..1c66a40
--- /dev/null
+++ b/riscv/insns/vandn_vx.h
@@ -0,0 +1,10 @@
+// vandn.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VX_LOOP
+({
+ vd = vs2 & (~rs1);
+})
diff --git a/riscv/insns/vbrev8_v.h b/riscv/insns/vbrev8_v.h
new file mode 100644
index 0000000..a6d3cda
--- /dev/null
+++ b/riscv/insns/vbrev8_v.h
@@ -0,0 +1,13 @@
+// vbrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ vd = vs2;
+ vd = ((vd & 0x5555555555555555llu) << 1) | ((vd & 0xAAAAAAAAAAAAAAAAllu) >> 1);
+ vd = ((vd & 0x3333333333333333llu) << 2) | ((vd & 0xCCCCCCCCCCCCCCCCllu) >> 2);
+ vd = ((vd & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((vd & 0xF0F0F0F0F0F0F0F0llu) >> 4);
+})
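The three mask/shift steps above are the classic swap ladder that reverses the bit order inside every byte. A scalar sketch (not part of the patch) applied to a single example value:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t x = 0x01;  // bit 0 set in the low byte
  x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1);  // swap adjacent bits
  x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2);  // swap bit pairs
  x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4);  // swap nibbles
  std::printf("0x%02llx\n", (unsigned long long)x);  // 0x80: bit 0 ended up at bit 7 of its byte
  return 0;
}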
diff --git a/riscv/insns/vbrev_v.h b/riscv/insns/vbrev_v.h
new file mode 100644
index 0000000..7f784c2
--- /dev/null
+++ b/riscv/insns/vbrev_v.h
@@ -0,0 +1,24 @@
+// vbrev.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ reg_t x = vs2;
+
+ // Reverse bits in bytes (vbrev8)
+ x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1);
+ x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2);
+ x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4);
+ // Re-order bytes (vrev8)
+ if (P.VU.vsew > 8)
+ x = ((x & 0x00FF00FF00FF00FFllu) << 8) | ((x & 0xFF00FF00FF00FF00llu) >> 8);
+ if (P.VU.vsew > 16)
+ x = ((x & 0x0000FFFF0000FFFFllu) << 16) | ((x & 0xFFFF0000FFFF0000llu) >> 16);
+ if (P.VU.vsew > 32)
+ x = ((x & 0x00000000FFFFFFFFllu) << 32) | ((x & 0xFFFFFFFF00000000llu) >> 32);
+
+ vd = x;
+})
diff --git a/riscv/insns/vclmul_vv.h b/riscv/insns/vclmul_vv.h
new file mode 100644
index 0000000..8957738
--- /dev/null
+++ b/riscv/insns/vclmul_vv.h
@@ -0,0 +1,20 @@
+// vclmul.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the low 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((vs1 & mask) != 0) {
+ vd ^= vs2 << bit_idx;
+ }
+ }
+})
diff --git a/riscv/insns/vclmul_vx.h b/riscv/insns/vclmul_vx.h
new file mode 100644
index 0000000..1df7a3a
--- /dev/null
+++ b/riscv/insns/vclmul_vx.h
@@ -0,0 +1,20 @@
+// vclmul.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the low 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((rs1 & mask) != 0) {
+ vd ^= vs2 << bit_idx;
+ }
+ }
+})
diff --git a/riscv/insns/vclmulh_vv.h b/riscv/insns/vclmulh_vv.h
new file mode 100644
index 0000000..6a54bcf
--- /dev/null
+++ b/riscv/insns/vclmulh_vv.h
@@ -0,0 +1,20 @@
+// vclmulh.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the high 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((vs1 & mask) != 0) {
+ vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+ }
+ }
+})
diff --git a/riscv/insns/vclmulh_vx.h b/riscv/insns/vclmulh_vx.h
new file mode 100644
index 0000000..e874d1d
--- /dev/null
+++ b/riscv/insns/vclmulh_vx.h
@@ -0,0 +1,20 @@
+// vclmulh.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the high 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((rs1 & mask) != 0) {
+ vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+ }
+ }
+})
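vclmul and vclmulh return, respectively, the low and the high half of the same 128-bit carry-less product. A reference sketch (not part of the patch) that computes both halves with the same shift-and-xor loop, plus a tiny polynomial example:

#include <cstdint>
#include <cstdio>

// Carry-less multiply of two 64-bit values; the 128-bit product is returned as hi:lo.
// vclmul corresponds to 'lo', vclmulh to 'hi'.
static void clmul128(uint64_t a, uint64_t b, uint64_t &hi, uint64_t &lo) {
  hi = 0;
  lo = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0)
        hi ^= a >> (64 - i);
    }
  }
}

int main() {
  uint64_t hi, lo;
  // (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1, i.e. 0x5 clmul 0x3 = 0xf.
  clmul128(0x5, 0x3, hi, lo);
  std::printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
  return 0;
}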
diff --git a/riscv/insns/vclz_v.h b/riscv/insns/vclz_v.h
new file mode 100644
index 0000000..5f7f03c
--- /dev/null
+++ b/riscv/insns/vclz_v.h
@@ -0,0 +1,16 @@
+// vclz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ unsigned int i = 0;
+ for (; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> (P.VU.vsew - 1 - i))) {
+ break;
+ }
+ }
+ vd = i;
+})
diff --git a/riscv/insns/vcpop_v.h b/riscv/insns/vcpop_v.h
new file mode 100644
index 0000000..52b29c6
--- /dev/null
+++ b/riscv/insns/vcpop_v.h
@@ -0,0 +1,16 @@
+// vcpop.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ reg_t count = 0;
+ for (std::size_t i = 0; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> i)) {
+ count++;
+ }
+ }
+ vd = count;
+})
diff --git a/riscv/insns/vctz_v.h b/riscv/insns/vctz_v.h
new file mode 100644
index 0000000..b63dd01
--- /dev/null
+++ b/riscv/insns/vctz_v.h
@@ -0,0 +1,16 @@
+// vctz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ unsigned int i = 0;
+ for (; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> i)) {
+ break;
+ }
+ }
+ vd = i;
+})
diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h
new file mode 100644
index 0000000..bcbfe74
--- /dev/null
+++ b/riscv/insns/vghsh_vv.h
@@ -0,0 +1,38 @@
+// vghsh.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ EGU32x4_t Y = vd; // Current partial hash
+ EGU32x4_t X = vs1; // Block cipher output
+ EGU32x4_t H = vs2; // Hash subkey
+
+ EGU32x4_BREV8(H);
+ EGU32x4_t Z = {};
+
+ // S = brev8(Y ^ X)
+ EGU32x4_t S;
+ EGU32x4_XOR(S, Y, X);
+ EGU32x4_BREV8(S);
+
+ for (int bit = 0; bit < 128; bit++) {
+ if (EGU32x4_ISSET(S, bit)) {
+ EGU32x4_XOREQ(Z, H);
+ }
+
+ const bool reduce = EGU32x4_ISSET(H, 127);
+ EGU32x4_LSHIFT(H); // Left shift by 1.
+ if (reduce) {
+ H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+ }
+ }
+ EGU32x4_BREV8(Z);
+ vd = Z;
+ }
+);
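As a rough, standalone illustration of the multiply/reduce loop above (the brev8 bit reflection is deliberately omitted), here is a sketch of a GF(2^128) multiply using the same 0x87 fold; the type and helper names are local to this sketch:

#include <cstdint>
#include <cstdio>

struct u128 { uint64_t hi, lo; };

// Shift-and-xor multiply in GF(2^128), reducing with x^128 + x^7 + x^2 + x + 1
// (the 0x87 constant used above). Mirrors the per-bit loop of vghsh/vgmul.
static u128 gf128_mul(u128 y, u128 h) {
  u128 z = {0, 0};
  for (int bit = 0; bit < 128; ++bit) {
    const bool y_bit = (bit < 64) ? ((y.lo >> bit) & 1) : ((y.hi >> (bit - 64)) & 1);
    if (y_bit) {
      z.hi ^= h.hi;
      z.lo ^= h.lo;
    }
    const bool reduce = (h.hi >> 63) & 1;  // the x^127 term is about to shift out
    h.hi = (h.hi << 1) | (h.lo >> 63);
    h.lo <<= 1;
    if (reduce)
      h.lo ^= 0x87;                        // fold x^128 back into the low bits
  }
  return z;
}

int main() {
  // x * x = x^2: {hi=0, lo=2} squared gives {hi=0, lo=4}.
  const u128 z = gf128_mul({0, 2}, {0, 2});
  std::printf("hi=%016llx lo=%016llx\n", (unsigned long long)z.hi, (unsigned long long)z.lo);
  return 0;
}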
diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h
new file mode 100644
index 0000000..820b396
--- /dev/null
+++ b/riscv/insns/vgmul_vv.h
@@ -0,0 +1,32 @@
+// vgmul.vv vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ EGU32x4_t Y = vd; // Multiplier
+ EGU32x4_BREV8(Y);
+ EGU32x4_t H = vs2; // Multiplicand
+ EGU32x4_BREV8(H);
+ EGU32x4_t Z = {};
+
+ for (int bit = 0; bit < 128; bit++) {
+ if (EGU32x4_ISSET(Y, bit)) {
+ EGU32x4_XOREQ(Z, H);
+ }
+
+ bool reduce = EGU32x4_ISSET(H, 127);
+ EGU32x4_LSHIFT(H); // Left shift by 1.
+ if (reduce) {
+ H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+ }
+ }
+ EGU32x4_BREV8(Z);
+ vd = Z;
+ }
+);
diff --git a/riscv/insns/vrev8_v.h b/riscv/insns/vrev8_v.h
new file mode 100644
index 0000000..f26c5a0
--- /dev/null
+++ b/riscv/insns/vrev8_v.h
@@ -0,0 +1,16 @@
+// vrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ vd = vs2;
+ if (P.VU.vsew > 8)
+ vd = ((vd & 0x00FF00FF00FF00FFllu) << 8) | ((vd & 0xFF00FF00FF00FF00llu) >> 8);
+ if (P.VU.vsew > 16)
+ vd = ((vd & 0x0000FFFF0000FFFFllu) << 16) | ((vd & 0xFFFF0000FFFF0000llu) >> 16);
+ if (P.VU.vsew > 32)
+ vd = ((vd & 0x00000000FFFFFFFFllu) << 32) | ((vd & 0xFFFFFFFF00000000llu) >> 32);
+})
diff --git a/riscv/insns/vrol_vv.h b/riscv/insns/vrol_vv.h
new file mode 100644
index 0000000..fb2e483
--- /dev/null
+++ b/riscv/insns/vrol_vv.h
@@ -0,0 +1,17 @@
+// vrol.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+ // For .vv, the shift amount comes from the vs1 element.
+ const reg_t lshift = vs1 & mask;
+ const reg_t rshift = (-lshift) & mask;
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vrol_vx.h b/riscv/insns/vrol_vx.h
new file mode 100644
index 0000000..b0c89a2
--- /dev/null
+++ b/riscv/insns/vrol_vx.h
@@ -0,0 +1,18 @@
+// vrol.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t lshift = ((reg_t)RS1) & mask;
+const reg_t rshift = (-lshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vi.h b/riscv/insns/vror_vi.h
new file mode 100644
index 0000000..1269c3d
--- /dev/null
+++ b/riscv/insns/vror_vi.h
@@ -0,0 +1,18 @@
+// vror.vi vd, vs2, zimm6, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vi, the shift amount is the zimm6 immediate, encoded in instruction bits 26 and 19:15.
+const reg_t rshift = insn.v_zimm6() & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vv.h b/riscv/insns/vror_vv.h
new file mode 100644
index 0000000..c649c6d
--- /dev/null
+++ b/riscv/insns/vror_vv.h
@@ -0,0 +1,17 @@
+// vror.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+ // For .vv, the shift amount comes from the vs1 element.
+ const reg_t rshift = vs1 & mask;
+ const reg_t lshift = (-rshift) & mask;
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vx.h b/riscv/insns/vror_vx.h
new file mode 100644
index 0000000..50c8e5c
--- /dev/null
+++ b/riscv/insns/vror_vx.h
@@ -0,0 +1,18 @@
+// vror.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t rshift = ((reg_t)RS1) & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h
new file mode 100644
index 0000000..34c6e05
--- /dev/null
+++ b/riscv/insns/vsha2ch_vv.h
@@ -0,0 +1,61 @@
+// vsha2ch.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2,
+ UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU32x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2,
+ UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU64x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h
new file mode 100644
index 0000000..4a1df09
--- /dev/null
+++ b/riscv/insns/vsha2cl_vv.h
@@ -0,0 +1,62 @@
+// vsha2cl.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+ kw1, kw0);
+
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU32x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+ kw1, kw0);
+
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU64x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
+
diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h
new file mode 100644
index 0000000..8f1ca08
--- /dev/null
+++ b/riscv/insns/vsha2ms_vv.h
@@ -0,0 +1,63 @@
+// vsha2ms.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {w3, w2, w1, w0} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0);
+ // {w11, w10, w9, w4} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4);
+ // {w15, w14, w13, w12} <- vs1
+ EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+ const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14, w9, w1, w0);
+ const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1);
+ const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2);
+ const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3);
+
+ // Update the destination register.
+ SET_EGU32x4_BE(vd, w19, w18, w17, w16);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {w3, w2, w1, w0} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0);
+ // {w11, w10, w9, w4} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4);
+ // {w15, w14, w13, w12} <- vs1
+ EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+ const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14, w9, w1, w0);
+ const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1);
+ const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2);
+ const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3);
+
+ // Update the destination register.
+ SET_EGU64x4_BE(vd, w19, w18, w17, w16);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
diff --git a/riscv/insns/vsm3c_vi.h b/riscv/insns/vsm3c_vi.h
new file mode 100644
index 0000000..b3e8121
--- /dev/null
+++ b/riscv/insns/vsm3c_vi.h
@@ -0,0 +1,60 @@
+// vsm3c.vi vd, vs2, rnd
+
+#include "zvksh_ext_macros.h"
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(
+ {},
+ // No need to validate or normalize 'zimm5' here as this is a 5-bit value
+ // and all values in 0-31 are valid.
+ const reg_t round = zimm5;,
+ {
+ // {H, G, F, E, D, C, B, A} <- vd
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A);
+ // {_, _, w5, w4, _, _, w1, w0} <- vs2
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2,
+ UNUSED _unused_w7, UNUSED _unused_w6, w5, w4,
+ UNUSED _unused_w3, UNUSED _unused_w2, w1, w0);
+ const uint32_t x0 = w0 ^ w4; // W'[0] in spec documentation.
+ const uint32_t x1 = w1 ^ w5; // W'[1]
+
+ // Two rounds of compression.
+ uint32_t ss1;
+ uint32_t ss2;
+ uint32_t tt1;
+ uint32_t tt2;
+ uint32_t j;
+
+ j = 2 * round;
+ ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+ ss2 = ss1 ^ ZVK_ROL32(A, 12);
+ tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0;
+ tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0;
+ D = C;
+ const uint32_t C1 = ZVK_ROL32(B, 9);
+ B = A;
+ const uint32_t A1 = tt1;
+ H = G;
+ const uint32_t G1 = ZVK_ROL32(F, 19);
+ F = E;
+ const uint32_t E1 = ZVKSH_P0(tt2);
+
+ j = 2 * round + 1;
+ ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+ ss2 = ss1 ^ ZVK_ROL32(A1, 12);
+ tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1;
+ tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1;
+ D = C1;
+ const uint32_t C2 = ZVK_ROL32(B, 9);
+ B = A1;
+ const uint32_t A2 = tt1;
+ H = G1;
+ const uint32_t G2 = ZVK_ROL32(F, 19);
+ F = E1;
+ const uint32_t E2 = ZVKSH_P0(tt2);
+
+ // Update the destination register.
+ SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2);
+ }
+);
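The ZVKSH_* helpers used above are defined in zvksh_ext_macros.h, not shown in this portion of the diff. For reference, a sketch of the standard SM3 (GB/T 32905-2016) building blocks they are assumed to wrap; the function names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

// Round constant T_j, boolean functions FF_j/GG_j, and permutation P0 from the SM3 spec.
static uint32_t sm3_t(unsigned j) { return j < 16 ? 0x79cc4519u : 0x7a879d8au; }
static uint32_t sm3_ff(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
  return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
}
static uint32_t sm3_gg(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
  return j < 16 ? (x ^ y ^ z) : ((x & y) | (~x & z));
}
static uint32_t sm3_p0(uint32_t x) { return x ^ rol32(x, 9) ^ rol32(x, 17); }

int main() {
  std::printf("T(0)=%08x T(16)=%08x FF16(1,2,3)=%08x GG16(1,2,3)=%08x P0(1)=%08x\n",
              sm3_t(0), sm3_t(16), sm3_ff(1, 2, 3, 16), sm3_gg(1, 2, 3, 16), sm3_p0(1));
  return 0;
}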
diff --git a/riscv/insns/vsm3me_vv.h b/riscv/insns/vsm3me_vv.h
new file mode 100644
index 0000000..dd6cb52
--- /dev/null
+++ b/riscv/insns/vsm3me_vv.h
@@ -0,0 +1,39 @@
+// vsm3me.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+#include "zvksh_ext_macros.h"
+
+// Per the SM3 spec, the message expansion computes new words Wi as:
+// W[i] = ( P_1( W[i-16] xor W[i-9] xor ( W[i-3] <<< 15 ) )
+// xor ( W[i-13] <<< 7 )
+// xor W[i-6] )
+// Using arguments M16 = W[i-16], M9 = W[i-9], etc.,
+// where Mk stands for "W[i Minus k]", we define the "W function":
+#define ZVKSH_W(M16, M9, M3, M13, M6) \
+ (ZVKSH_P1((M16) ^ (M9) ^ ZVK_ROL32((M3), 15)) ^ ZVK_ROL32((M13), 7) ^ (M6))
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(
+ {},
+ {
+ // {w7, w6, w5, w4, w3, w2, w1, w0} <- vs1
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs1, w7, w6, w5, w4, w3, w2, w1, w0);
+ // {w15, w14, w13, w12, w11, w10, w9, w8} <- vs2
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, w15, w14, w13, w12, w11, w10, w9, w8);
+
+ // Arguments are W[i-16], W[i-9], W[i-3], W[i-13], W[i-6].
+ // Note that some of the newly computed words are used in later invocations.
+ const uint32_t w16 = ZVKSH_W(w0, w7, w13, w3, w10);
+ const uint32_t w17 = ZVKSH_W(w1, w8, w14, w4, w11);
+ const uint32_t w18 = ZVKSH_W(w2, w9, w15, w5, w12);
+ const uint32_t w19 = ZVKSH_W(w3, w10, w16, w6, w13);
+ const uint32_t w20 = ZVKSH_W(w4, w11, w17, w7, w14);
+ const uint32_t w21 = ZVKSH_W(w5, w12, w18, w8, w15);
+ const uint32_t w22 = ZVKSH_W(w6, w13, w19, w9, w16);
+ const uint32_t w23 = ZVKSH_W(w7, w14, w20, w10, w17);
+
+ // Update the destination register.
+ SET_EGU32x8_WORDS_BE_BSWAP(vd, w23, w22, w21, w20, w19, w18, w17, w16);
+ }
+);
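A matching scalar sketch (not part of the patch) of the expansion formula quoted in the comment above, with P_1(x) = x xor (x <<< 15) xor (x <<< 23) written out; names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
static uint32_t sm3_p1(uint32_t x) { return x ^ rol32(x, 15) ^ rol32(x, 23); }

// W[i] from W[i-16], W[i-9], W[i-3], W[i-13], W[i-6], matching ZVKSH_W's argument order.
static uint32_t sm3_w(uint32_t m16, uint32_t m9, uint32_t m3, uint32_t m13, uint32_t m6) {
  return sm3_p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

int main() {
  // Arbitrary inputs, for illustration only.
  std::printf("%08x\n", sm3_w(1, 2, 3, 4, 5));
  return 0;
}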
diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h
new file mode 100644
index 0000000..8f52e68
--- /dev/null
+++ b/riscv/insns/vsm4k_vi.h
@@ -0,0 +1,52 @@
+// vsm4k.vi vd, vs2, round#
+
+#include "zvksed_ext_macros.h"
+
+// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft.
+static constexpr uint32_t zvksed_ck[32] = {
+ 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+ 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+ 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+ 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+ 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+ 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+ 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+ 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 3 bits of the immediate, ensuring that
+ // 'round' is in the valid range [0, 7].
+ const reg_t round = zimm5 & 0x7;,
+ // Per Element Group body.
+ {
+ // {rk0, rk1, rk2, rk3} <- vs2
+ EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+ uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
+ uint32_t S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);
+
+ B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);
+
+ B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);
+
+ B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
+ }
+);
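The ZVKSED_* helpers are defined in zvksed_ext_macros.h, not shown in this portion of the diff. As a rough guide, a sketch of the two SM4 linear transforms they are assumed to apply after the S-box step: L'(B) in the key schedule (vsm4k above) and L(B) in the cipher rounds (vsm4r, next files). Names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

// Key-schedule linear transform L'(B) = B ^ (B <<< 13) ^ (B <<< 23).
static uint32_t sm4_key_linear(uint32_t b) { return b ^ rol32(b, 13) ^ rol32(b, 23); }

// Round linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
static uint32_t sm4_round_linear(uint32_t b) {
  return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
}

int main() {
  // ZVKSED_ROUND_KEY(X, S) is assumed to compute X ^ L'(S);
  // ZVKSED_ROUND(X, S) is assumed to compute X ^ L(S).
  std::printf("%08x %08x\n", sm4_key_linear(1), sm4_round_linear(1));
  return 0;
}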
diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h
new file mode 100644
index 0000000..44011eb
--- /dev/null
+++ b/riscv/insns/vsm4r_vs.h
@@ -0,0 +1,51 @@
+// vsm4r.vs vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+// No overlap of vd and vs2.
+require(insn.rd() != insn.rs2());
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
+ const uint32_t rk0 = scalar_key[0];
+ const uint32_t rk1 = scalar_key[1];
+ const uint32_t rk2 = scalar_key[2];
+ const uint32_t rk3 = scalar_key[3];,
+ {
+ EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
+
+ // {x0, x1, x2, x3} <- vd
+ EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);
+
+ uint32_t B;
+ uint32_t S;
+
+ B = x1 ^ x2 ^ x3 ^ rk0;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+ B = x2 ^ x3 ^ x4 ^ rk1;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+ B = x3 ^ x4 ^ x5 ^ rk2;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+ B = x4 ^ x5 ^ x6 ^ rk3;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(state, x4, x5, x6, x7);
+ }
+);
diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h
new file mode 100644
index 0000000..9a18cec
--- /dev/null
+++ b/riscv/insns/vsm4r_vv.h
@@ -0,0 +1,37 @@
+// vsm4r.vv vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {x0, x1, x2, x3} <- vd
+ EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
+ // {rk0, rk1, rk2, rk3} <- vs2
+ EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+ uint32_t B;
+ uint32_t S;
+
+ B = x1 ^ x2 ^ x3 ^ rk0;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+ B = x2 ^ x3 ^ x4 ^ rk1;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+ B = x3 ^ x4 ^ x5 ^ rk2;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+ B = x4 ^ x5 ^ x6 ^ rk3;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(vd, x4, x5, x6, x7);
+ }
+);
diff --git a/riscv/insns/vwsll_vi.h b/riscv/insns/vwsll_vi.h
new file mode 100644
index 0000000..13b5eb4
--- /dev/null
+++ b/riscv/insns/vwsll_vi.h
@@ -0,0 +1,10 @@
+// vwsll.vi vd, vs2, zimm5, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VI_WIDENING_ULOOP({
+ const reg_t shift = zimm5 & ((2 * sew) - 1);
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vv.h b/riscv/insns/vwsll_vv.h
new file mode 100644
index 0000000..5a64c6c
--- /dev/null
+++ b/riscv/insns/vwsll_vv.h
@@ -0,0 +1,10 @@
+// vwsll.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VV_WIDENING_ULOOP({
+ const reg_t shift = (vs1 & ((2 * sew) - 1));
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vx.h b/riscv/insns/vwsll_vx.h
new file mode 100644
index 0000000..5264e80
--- /dev/null
+++ b/riscv/insns/vwsll_vx.h
@@ -0,0 +1,10 @@
+// vwsll.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VX_WIDENING_ULOOP({
+ const reg_t shift = (rs1 & ((2 * sew) - 1));
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 1c4300c..59472a4 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -236,10 +236,55 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
extension_table[EXT_ZICOND] = true;
} else if (ext_str == "zihpm") {
extension_table[EXT_ZIHPM] = true;
+ } else if (ext_str == "zvbb") {
+ extension_table[EXT_ZVBB] = true;
+ } else if (ext_str == "zvbc") {
+ extension_table[EXT_ZVBC] = true;
} else if (ext_str == "zvfbfmin") {
extension_table[EXT_ZVFBFMIN] = true;
} else if (ext_str == "zvfbfwma") {
extension_table[EXT_ZVFBFWMA] = true;
+ } else if (ext_str == "zvkg") {
+ extension_table[EXT_ZVKG] = true;
+ } else if (ext_str == "zvkn") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvknc") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVBC] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvkng") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKG] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvkned") {
+ extension_table[EXT_ZVKNED] = true;
+ } else if (ext_str == "zvknha") {
+ extension_table[EXT_ZVKNHA] = true;
+ } else if (ext_str == "zvknhb") {
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvks") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksc") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVBC] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksg") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKG] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksed") {
+ extension_table[EXT_ZVKSED] = true;
+ } else if (ext_str == "zvksh") {
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvkt") {
} else if (ext_str == "sstc") {
extension_table[EXT_SSTC] = true;
} else if (ext_str[0] == 'x') {
@@ -295,7 +340,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
}
if ((extension_table[EXT_ZCMP] || extension_table[EXT_ZCMT]) && extension_table[EXT_ZCD]) {
- bad_isa_string(str, "Zcmp' and 'Zcmt' exensions are incompatible with 'Zcd' extension");
+ bad_isa_string(str, "Zcmp' and 'Zcmt' extensions are incompatible with 'Zcd' extension");
}
if ((extension_table[EXT_ZCF] || extension_table[EXT_ZCD] || extension_table[EXT_ZCB] ||
@@ -307,6 +352,24 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
bad_isa_string(str, "'Zacas' extension requires 'A' extension");
}
+ // Zpn conflicts with Zvknha/Zvknhb in both rv32 and rv64
+ if (extension_table[EXT_ZPN] && (extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB])) {
+ bad_isa_string(str, "'Zvkna' and 'Zvknhb' extensions are incompatible with 'Zpn' extension");
+ }
+ // In rv64 only, Zpn (rv64_zpn) conflicts with Zvkg/Zvkned/Zvksh
+ if (max_xlen == 64 && extension_table[EXT_ZPN] &&
+ (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
+ bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
+ }
+#ifdef WORDS_BIGENDIAN
+ // Access to the vector registers as element groups is unimplemented on big-endian setups.
+ if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKNHA] ||
+ extension_table[EXT_ZVKNHB] || extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) {
+ bad_isa_string(str,
+ "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' "
+ "extensions are incompatible with WORDS_BIGENDIAN setups.");
+ }
+#endif
std::string lowercase = strtolower(priv);
bool user = false, supervisor = false;
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 3cbee7d..5b04347 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -58,8 +58,24 @@ typedef enum {
EXT_ZICNTR,
EXT_ZICOND,
EXT_ZIHPM,
+ EXT_ZVBB,
+ EXT_ZVBC,
EXT_ZVFBFMIN,
EXT_ZVFBFWMA,
+ EXT_ZVKG,
+ EXT_ZVKNED,
+ EXT_ZVKNHA,
+ EXT_ZVKNHB,
+ EXT_ZVKSED,
+ EXT_ZVKSH,
+ EXT_XZBP,
+ EXT_XZBS,
+ EXT_XZBE,
+ EXT_XZBF,
+ EXT_XZBC,
+ EXT_XZBM,
+ EXT_XZBR,
+ EXT_XZBT,
EXT_SSTC,
EXT_ZACAS,
EXT_INTERNAL_ZFH_MOVE,
diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index a30c770..2214be4 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -12,3 +12,12 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)
+DECLARE_OVERLAP_INSN(vaesdf_vv, EXT_ZVKNED)
+DECLARE_OVERLAP_INSN(vghsh_vv, EXT_ZVKG)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHA)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHB)
+DECLARE_OVERLAP_INSN(vsm3me_vv, EXT_ZVKSH)
+DECLARE_OVERLAP_INSN(rstsa16, EXT_ZPN)
+DECLARE_OVERLAP_INSN(rstsa32, EXT_ZPN)
+DECLARE_OVERLAP_INSN(srli32_u, EXT_ZPN)
+DECLARE_OVERLAP_INSN(umax32, EXT_ZPN)
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 6472982..a3e125f 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1340,32 +1340,98 @@ riscv_insn_ext_zacas = \
amocas_d \
$(if $(HAVE_INT128),amocas_q)
+riscv_insn_ext_zvbb = \
+ vandn_vv \
+ vandn_vx \
+ vbrev8_v \
+ vbrev_v \
+ vclz_v \
+ vcpop_v \
+ vctz_v \
+ vrev8_v \
+ vrol_vv \
+ vrol_vx \
+ vror_vi \
+ vror_vv \
+ vror_vx \
+ vwsll_vi \
+ vwsll_vv \
+ vwsll_vx \
+
+riscv_insn_ext_zvbc = \
+ vclmul_vv \
+ vclmul_vx \
+ vclmulh_vv \
+ vclmulh_vx \
+
+riscv_insn_ext_zvkg = \
+ vghsh_vv \
+ vgmul_vv \
+
+riscv_insn_ext_zvkned = \
+ vaesdf_vs \
+ vaesdf_vv \
+ vaesdm_vs \
+ vaesdm_vv \
+ vaesef_vs \
+ vaesef_vv \
+ vaesem_vs \
+ vaesem_vv \
+ vaeskf1_vi \
+ vaeskf2_vi \
+ vaesz_vs \
+
+# Covers both Zvknha and Zvknhb.
+riscv_insn_ext_zvknh = \
+ vsha2cl_vv \
+ vsha2ch_vv \
+ vsha2ms_vv \
+
+riscv_insn_ext_zvksed = \
+ vsm4k_vi \
+ vsm4r_vs \
+ vsm4r_vv \
+
+riscv_insn_ext_zvksh = \
+ vsm3c_vi \
+ vsm3me_vv \
+
+riscv_insn_ext_zvk = \
+ $(riscv_insn_ext_zvbb) \
+ $(riscv_insn_ext_zvbc) \
+ $(riscv_insn_ext_zvkg) \
+ $(riscv_insn_ext_zvkned) \
+ $(riscv_insn_ext_zvknh) \
+ $(riscv_insn_ext_zvksed) \
+ $(riscv_insn_ext_zvksh) \
+
riscv_insn_list = \
+ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
$(riscv_insn_ext_a) \
+ $(riscv_insn_ext_b) \
+ $(riscv_insn_ext_bf16) \
$(riscv_insn_ext_c) \
- $(riscv_insn_ext_i) \
- $(riscv_insn_ext_m) \
- $(riscv_insn_ext_f) \
- $(riscv_insn_ext_f_zfa) \
+ $(riscv_insn_ext_cmo) \
$(riscv_insn_ext_d) \
$(riscv_insn_ext_d_zfa) \
- $(riscv_insn_ext_zfh) \
- $(riscv_insn_ext_zfh_zfa) \
+ $(riscv_insn_ext_f) \
+ $(riscv_insn_ext_f_zfa) \
+ $(riscv_insn_ext_h) \
+ $(riscv_insn_ext_i) \
+ $(riscv_insn_ext_k) \
+ $(riscv_insn_ext_m) \
+ $(riscv_insn_ext_p) \
$(riscv_insn_ext_q) \
$(riscv_insn_ext_q_zfa) \
- $(riscv_insn_ext_b) \
- $(riscv_insn_ext_k) \
- $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
+ $(riscv_insn_ext_zacas) \
$(riscv_insn_ext_zce) \
- $(riscv_insn_ext_h) \
- $(riscv_insn_ext_p) \
+ $(riscv_insn_ext_zfh) \
+ $(riscv_insn_ext_zfh_zfa) \
+ $(riscv_insn_ext_zicond) \
+ $(riscv_insn_ext_zvk) \
$(riscv_insn_priv) \
- $(riscv_insn_svinval) \
$(riscv_insn_smrnmi) \
- $(riscv_insn_ext_cmo) \
- $(riscv_insn_ext_zicond) \
- $(riscv_insn_ext_bf16) \
- $(riscv_insn_ext_zacas) \
+ $(riscv_insn_svinval) \
riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list))
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 41256c7..908ff16 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize,
type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+#define V_U_PARAMS(x) \
+ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
#define VX_U_PARAMS(x) \
type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
@@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize,
} \
VI_LOOP_END
+#define VI_V_ULOOP(BODY) \
+ VI_CHECK_SSS(false) \
+ VI_LOOP_BASE \
+ if (sew == e8) { \
+ V_U_PARAMS(e8); \
+ BODY; \
+ } else if (sew == e16) { \
+ V_U_PARAMS(e16); \
+ BODY; \
+ } else if (sew == e32) { \
+ V_U_PARAMS(e32); \
+ BODY; \
+ } else if (sew == e64) { \
+ V_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
#define VI_VX_ULOOP(BODY) \
VI_CHECK_SSS(false) \
VI_LOOP_BASE \
diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc
index 9128df6..08adc61 100644
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write
return regStart[n];
}
+// The logic differences between 'elt()' and 'elt_group()' come from
+// the fact that, while 'elt()' requires that the element is fully
+// contained in a single vector register, the element group may span
+// multiple registers in a single register group (LMUL>1).
+//
+// Notes:
+// - We do NOT check that a single element - i.e., the T in the element
+// group type std::array<T, N> - fits within a single register, or that
+// T is no wider than VSEW. Implementations of the instructions
+// sometimes use a different T than what the specification suggests.
+// Instruction implementations should 'require()' what the specification
+// dictates.
+// - We do NOT check that 'vReg' is a valid register group, or that
+// 'n+1' element groups fit in the register group 'vReg'. It is
+// the responsibility of the caller to validate those preconditions.
+template<typename EG> EG&
+vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) {
+#ifdef WORDS_BIGENDIAN
+ fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n",
+ stderr);
+ abort();
+#endif
+ using T = typename EG::value_type;
+ constexpr std::size_t N = std::tuple_size<EG>::value;
+ assert(N > 0);
+
+ assert(vsew != 0);
+ constexpr reg_t elt_group_size = N * sizeof(T);
+ const reg_t reg_group_size = (VLEN >> 3) * vflmul;
+ assert(((n + 1) * elt_group_size) <= reg_group_size);
+
+ const reg_t start_byte = n * elt_group_size;
+ const reg_t bytes_per_reg = VLEN >> 3;
+
+ // Inclusive first/last register indices.
+ const reg_t reg_first = vReg + start_byte / bytes_per_reg;
+ const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg;
+
+ // Mark every register spanned by this element group as referenced.
+ for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) {
+ reg_referenced[vidx] = 1;
+
+ if (unlikely(p->get_log_commits_enabled() && is_write)) {
+ p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0};
+ }
+ }
+
+ return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte);
+}
+
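// [Editorial illustration — not part of this patch] A minimal sketch of how
// an instruction body might use elt_group<>() as defined above; the register
// numbers and the element-group index 'idx_eg' are hypothetical:
//
//   EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
//   const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>(vs2_num, idx_eg);
//   for (std::size_t i = 0; i < 4; ++i)
//     vd[i] ^= vs2[i];  // e.g., XOR the whole 128-bit group into vd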
template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool);
template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool);
template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool);
@@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool);
template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool);
template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool);
template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool);
+
+template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool);
+template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool);
+template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool);
+template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool);
diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h
index b9f706c..a057c62 100644
--- a/riscv/vector_unit.h
+++ b/riscv/vector_unit.h
@@ -2,6 +2,9 @@
#ifndef _RISCV_VECTOR_UNIT_H
#define _RISCV_VECTOR_UNIT_H
+#include <array>
+#include <cstdint>
+
#include "decode.h"
#include "csrs.h"
@@ -69,6 +72,17 @@ struct type_sew_t<64>
using type=int64_t;
};
+// Element Group of 4 32-bit elements (128b total).
+using EGU32x4_t = std::array<uint32_t, 4>;
+
+// Element Group of 8 32-bit elements (256b total).
+using EGU32x8_t = std::array<uint32_t, 8>;
+
+// Element Group of 4 64-bit elements (256b total).
+using EGU64x4_t = std::array<uint64_t, 4>;
+
+// Element Group of 16 8-bit elements (128b total).
+using EGU8x16_t = std::array<uint8_t, 16>;
class vectorUnit_t
{
@@ -88,8 +102,11 @@ public:
bool vill;
bool vstart_alu;
- // vector element for varies SEW
+ // vector element for various SEW
template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false);
+ // vector element group access, where EG is a std::array<T, N>.
+ template<typename EG> EG&
+ elt_group(reg_t vReg, reg_t n, bool is_write = false);
public:
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
new file mode 100644
index 0000000..bf893f9
--- /dev/null
+++ b/riscv/zvk_ext_macros.h
@@ -0,0 +1,1035 @@
+// Helper macros used to implement instructions defined as part of
+// the RISC-V Zvk extension (vector cryptography).
+
+// Note that a good deal of code here would be cleaner/simpler
+// if exposed as C++ functions (including templated ones), however
+// this is not possible in the contexts where those headers are
+// included.
+
+#ifndef RISCV_ZVK_EXT_MACROS_H_
+#define RISCV_ZVK_EXT_MACROS_H_
+
+//
+// Predicate Macros
+//
+
+// Ensures that the ZVBB extension (vector crypto bitmanip) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvbb \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVBB); \
+ } while (0)
+
+// Ensures that the ZVBC extension (vector carryless multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvbc \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVBC); \
+ } while (0)
+
+// Ensures that the ZVKG extension (vector Galois field multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvkg \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKG); \
+ } while (0)
+
+// Ensures that a ZVK extension supporting SHA-256 is present.
+// For SHA-256, this support is present in either Zvknha or Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_256 \
+ do { \
+ require_vector(true); \
+ require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \
+ } while (0)
+
+// Ensures that the ZVKNED extension (vector AES single round) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvkned \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKNED); \
+ } while (0)
+
+// Ensures that a ZVK extension supporting SHA-512 is present.
+// For SHA-512, this support is only present in Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_512 \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKNHB); \
+ } while (0)
+
+// Ensures that the ZVKSED extension (vector SM4 block cipher)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvksed \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKSED); \
+ } while (0)
+
+// Ensures that the ZVKSH extension (vector SM3 hash) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvksh \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKSH); \
+ } while (0)
+
+// Ensures that the vector instruction is not using a mask.
+#define require_no_vmask require(insn.v_vm() == 1)
+
+// Ensures that an element group can fit in a register group. That is,
+// EGW <= (LMUL * VLEN)
+#define require_egw_fits(EGW) require((EGW) <= (P.VU.VLEN * P.VU.vflmul))
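// [Editorial note — not part of this patch] Worked example: with VLEN=128,
// require_egw_fits(128) passes for LMUL >= 1, while require_egw_fits(256)
// passes only for LMUL >= 2, since the check is EGW <= VLEN * LMUL.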
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=4 (four 32-bit elements per group),
+// for an effective element group width of EGW=128 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x4 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 4 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 4 == 0); \
+ } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=8 (eight 32-bit elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x8 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 8 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 8 == 0); \
+ } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=64, EGS=4 (four 64-bit elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_64x4 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 4 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 4 == 0); \
+ } while (0)
+
+//
+// Loop Parameters Macros
+//
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x4_t vs1 = P.VU.elt_group<EGU32x4_t>((VS1_NUM), (EG_IDX)); \
+ const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x8_t vs1 = P.VU.elt_group<EGU32x8_t>((VS1_NUM), (EG_IDX)); \
+ const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+ EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+ EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 64b*4 element groups as EGU64x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU64x4_t &vd = P.VU.elt_group<EGU64x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU64x4_t vs1 = P.VU.elt_group<EGU64x4_t>((VS1_NUM), (EG_IDX)); \
+ const EGU64x4_t vs2 = P.VU.elt_group<EGU64x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1',
+// as part of a widening operation where 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto vs1 = P.VU.elt<type_usew_t<SEW>::type>(rs1_num, i); \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the scalar register 'rs1', as part of a widening operation where
+// 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto rs1 = (type_usew_t<SEW>::type)RS1; \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the 5-bit immediate field 'zimm5', as part of a widening operation
+// where 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto zimm5 = (type_usew_t<SEW>::type)insn.v_zimm5(); \
+
+//
+// Loop Macros
+//
+
+// NOTES:
+// - Each of the element-group loop macros DOES contain an invocation
+// of the corresponding 'require_element_groups_<bits>x<#elements>;',
+// because the macro correctness requires proper VL/VSTART values.
+// - Each of the loop macros named "_NOVM_" DOES contain an invocation
+// of the 'require_no_vmask;' macro. Those macros (all of them
+// at this time) do not support masking (i.e., no skipping
+// of elements/element groups is performed).
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs1': EGU32x4_t, content of the current element group
+// in the 'vs1' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
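// [Editorial illustration — not part of this patch] A hypothetical use of the
// loop macro above; the element-group body simply XORs vs1 and vs2 into vd,
// purely to show the shape an instruction implementation takes:
//
//   VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
//     {},  // no PRELUDE needed
//     {
//       for (std::size_t i = 0; i < 4; ++i)
//         vd[i] ^= vs1[i] ^ vs2[i];
//     }
//   );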
+
+// Processes all 32b*8 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+// - This macro contains an invocation of the macro 'require_element_groups_32x8;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x8_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs1': EGU32x8_t, content of the current element group
+// in the 'vs1' vector register / register group.
+// 'vs2': EGU32x8_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x8; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+ const reg_t vl_eg = P.VU.vl->read() / 8; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+// - this macro does NOT extract the element groups into EGU32x4_t
+// variables. It is intended for uses where there is a more natural
+// type to use (e.g., EGU8x16_t). The type should still be 128 bits
+// wide if extracted via 'P.VU.elt_group<Type>(...)'.
+// - this macro offers the additional PRELOOP code block argument,
+// that is executed once if the loop is going to be entered.
+// This is intended for use with "vector scalar" instructions where
+// we extract the first element group from one of the operands and
+// use it for all loop iterations.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+ PRELOOP, \
+ EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+// - this macro is meant to be used for "op vd, vs2" instructions,
+// whether vd is output only, or input and output.
+// - this macro does NOT extract the element groups into EGU32x4_t
+// variables. It is intended for uses where there is a more natural
+// type to use (e.g., EGU8x16_t). The type should still be 128 bits
+// wide if extracted via 'P.VU.elt_group<Type>(...)'.
+// - this macro offers the additional PRELOOP code block argument,
+// that is executed once if the loop is going to be entered.
+// This is intended for use with "vector scalar" instructions where
+// we extract the first element group from one of the operands and
+// use it for all loop iterations.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+ PRELOOP, \
+ EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2. This interprets the vectors as containing element groups
+// of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as
+// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'zimm5': 5 bits unsigned immediate
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t zimm5 = insn.v_zimm5(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*8 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as
+// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x8;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'zimm5': unsigned 5 bits immediate
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x8_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x8_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+ do { \
+ require_element_groups_32x8; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t zimm5 = insn.v_zimm5(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+ const reg_t vl_eg = P.VU.vl->read() / 8; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 64b*4 element groups available in the vector registers
+// vd, vs1, and vs2. This interprets the vectors as containing element groups
+// of 4 uint64_t values (EGW=256, EEW=64, EGS=4), *ignoring* the current
+// SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_64x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU64x4_t reference, content of the current element group
+// in the 'vd' vector register / vector register group.
+// 'vs1': EGU64x4_t, content of the current element group
+// in the 'vs1' vector register / vector register group.
+// 'vs2': EGU64x4_t, content of the current element group
+// in the 'vs2' vector register / vector register group.
+#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_64x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, v1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
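// [Editorial illustration — not part of this patch] A hypothetical body for
// the widening loop above, in the style of a vwsll.vv-like shift: the
// zero-extended 'vs2_w' is shifted left by 'vs1', masked to the widened
// element width (2 * SEW):
//
//   VI_ZVK_VV_WIDENING_ULOOP({
//     vd_w = vs2_w << (vs1 & ((2 * sew) - 1));
//   });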
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, rs1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
+
+//
+// Element Group Manipulation Macros
+//
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Little Endian" (LE) order, i.e., from the least significant (W0)
+// to the most significant (W3).
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \
+ uint32_t W0 = (X)[0]; \
+ uint32_t W1 = (X)[1]; \
+ uint32_t W2 = (X)[2]; \
+ uint32_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Little Endian" (LE)
+// order, i.e., from the least significant (W0) to the most
+// significant (W3).
+#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \
+ uint32_t W0 = (X)[0]; \
+ uint32_t W1 = (X)[1]; \
+ uint32_t W2 = (X)[2]; \
+ uint32_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Byte-swaps a uint32_t so that the order of its bytes
+// is reversed.
+#define ZVK_BSWAP32(x) \
+ ((((uint32_t)((x) >> 24)) & 0xFF) << 0 | \
+ (((uint32_t)((x) >> 16)) & 0xFF) << 8 | \
+ (((uint32_t)((x) >> 8)) & 0xFF) << 16 | \
+ (((uint32_t)((x) >> 0)) & 0xFF) << 24)
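// [Editorial note — not part of this patch] Worked example:
//   ZVK_BSWAP32(0x01020304) == 0x04030201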
+
+// Extracts 8 uint32_t words from the input EGU32x8_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W7)
+// to the least significant (W0). Each of the words is byte-swapped,
+// from a big-endian representation in the EGU32x8_t to a native/little-endian
+// ordering in the variables.
+#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+ uint32_t W0 = ZVK_BSWAP32((X)[0]); \
+ uint32_t W1 = ZVK_BSWAP32((X)[1]); \
+ uint32_t W2 = ZVK_BSWAP32((X)[2]); \
+ uint32_t W3 = ZVK_BSWAP32((X)[3]); \
+ uint32_t W4 = ZVK_BSWAP32((X)[4]); \
+ uint32_t W5 = ZVK_BSWAP32((X)[5]); \
+ uint32_t W6 = ZVK_BSWAP32((X)[6]); \
+ uint32_t W7 = ZVK_BSWAP32((X)[7]); \
+ (void)(0)
+
+// Sets the element words of the given EGU32x8_t variable 'X' to
+// the given 8 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W7) to the least
+// significant (W0). Each of the words is byte-swapped,
+// from a native/little-endian ordering in the variables to
+// a big-endian representation in the EGU32x8_t.
+#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = ZVK_BSWAP32(W0); \
+ (X)[1] = ZVK_BSWAP32(W1); \
+ (X)[2] = ZVK_BSWAP32(W2); \
+ (X)[3] = ZVK_BSWAP32(W3); \
+ (X)[4] = ZVK_BSWAP32(W4); \
+ (X)[5] = ZVK_BSWAP32(W5); \
+ (X)[6] = ZVK_BSWAP32(W6); \
+ (X)[7] = ZVK_BSWAP32(W7); \
+ } while (0)
+
+// Extracts 4 uint64_t words from the input EGU64x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \
+ uint64_t W0 = (X)[0]; \
+ uint64_t W1 = (X)[1]; \
+ uint64_t W2 = (X)[2]; \
+ uint64_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU64x4_t variable 'X' to
+// the given 4 uint64_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Copies a EGU8x16_t value from 'SRC' into 'DST'.
+#define EGU8x16_COPY(DST, SRC) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (DST)[bidx] = (SRC)[bidx]; \
+ }
+
+// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU8x16_XOREQ(MUT_A, CONST_B) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+ }
+
+// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU32x4_XOREQ(MUT_A, CONST_B) \
+ for (std::size_t idx = 0; idx < 4; ++idx) { \
+ (MUT_A)[idx] ^= (CONST_B)[idx]; \
+ }
+
+// Performs "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU8x16_XOR(DST, A, B) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
+ }
+
+// Performs "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU32x4_XOR(DST, A, B) \
+ do { \
+ static_assert(std::is_same<EGU32x4_t, decltype(A)>::value); \
+ static_assert(std::is_same<EGU32x4_t, decltype(B)>::value); \
+ static_assert(std::is_same<EGU32x4_t, decltype(DST)>::value); \
+ for (std::size_t idx = 0; idx < 4; ++idx) { \
+ (DST)[idx] = (A)[idx] ^ (B)[idx]; \
+ } \
+ } while (0)
+
+//
+// Common bit manipulations logic.
+//
+
+// Form a 64 bit integer with bit X set
+#define ZVK_BIT(X) (1ULL << (X))
+
+// Reverse the order of bits within bytes of a word.
+// This is used to match the data interpretation in NIST SP 800-38D
+// a.k.a the GCM specification.
+#define ZVK_BREV8_32(X) \
+ do { \
+ (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \
+ (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \
+ (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \
+ } while (0)
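// [Editorial note — not part of this patch] Worked example: ZVK_BREV8_32
// reverses the bit order within each byte, so an input of 0x00000001
// becomes 0x00000080 (bit 0 of the low byte moves to bit 7).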
+
+// Rotates right a uint32_t value by N bits.
+// uint32_t ROR32(uint32_t X, std::size_t N);
+#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N))
+
+// Rotates right a uint64_t value by N bits.
+// uint64_t ROR64(uint64_t X, std::size_t N);
+#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N))
+
+// Rotates left a uint32_t value by N bits.
+// uint32_t ROL32(uint32_t X, std::size_t N);
+#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N))
+
+//
+// Element Group Bit Manipulation Macros
+//
+
+// Performs bit reversal in a EGU32x4_t group.
+#define EGU32x4_BREV8(X) \
+ for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+ ZVK_BREV8_32((X)[bidx]); \
+ }
+
+// Checks if a given bit is set within an EGU32x4_t group.
+// Assumes LE ordering.
+#define EGU32x4_ISSET(X, BIDX) \
+ (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0)
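// [Editorial note — not part of this patch] Worked example:
// EGU32x4_ISSET(X, 37) tests bit 5 of X[1], since 37 / 32 == 1 and
// 37 % 32 == 5.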
+
+// Shifts an EGU32x4_t group left by one bit.
+//
+// Since the entire 128-bit value is shifted, we need to handle carry bits.
+// To limit the amount of carry-check logic, the elements are copied into
+// a 64-bit temporary variable.
+#define EGU32x4_LSHIFT(X) \
+ do { \
+ uint64_t dword; \
+ dword = ((uint64_t)(X)[3]) << 32; \
+ dword |= X[2]; \
+ dword <<= 1; \
+ if (X[1] & ZVK_BIT(31)) { \
+ dword |= ZVK_BIT(0); \
+ } \
+ X[2] = dword & UINT32_MAX; \
+ X[3] = dword >> 32; \
+ dword = ((uint64_t)(X)[1]) << 32; \
+ dword |= X[0]; \
+ dword <<= 1; \
+ X[0] = dword & UINT32_MAX; \
+ X[1] = dword >> 32; \
+ } while (0)
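// [Editorial note — not part of this patch] Worked example, with X given in
// LE word order {X[0], X[1], X[2], X[3]}:
//   X = {0x80000000, 0x00000000, 0xFFFFFFFF, 0x00000001}
// becomes, after EGU32x4_LSHIFT(X),
//   X = {0x00000000, 0x00000001, 0xFFFFFFFE, 0x00000003}
// The top bit of X[0] carries into bit 0 of X[1], the top bit of X[2]
// carries into X[3], and the top bit of X[3] is shifted out and discarded.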
+
+#endif // RISCV_ZVK_EXT_MACROS_H_
diff --git a/riscv/zvkned_ext_macros.h b/riscv/zvkned_ext_macros.h
new file mode 100644
index 0000000..db705c7
--- /dev/null
+++ b/riscv/zvkned_ext_macros.h
@@ -0,0 +1,270 @@
+// Helper macros used to implement instructions defined as part of
+// the RISC-V Zvkned extension (vector AES single round).
+
+#include "insns/aes_common.h"
+
+#ifndef RISCV_ZVKNED_EXT_MACROS_H_
+#define RISCV_ZVKNED_EXT_MACROS_H_
+
+// vaes*.vs instruction constraints:
+// - Zvkned is enabled
+// - EGW (128) <= LMUL * VLEN
+// - vd and vs2 cannot overlap
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vs_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+// vaes*.vv instruction constraints. Those are the same as the .vs ones,
+// except for the overlap constraint that is not present for .vv variants.
+// - Zvkned is enabled
+// - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vv_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+// vaeskf*.vi instruction constraints. Those are the same as the .vv ones.
+#define require_vaeskf_vi_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+#define VAES_XTIME(A) (((A) << 1) ^ (((A) & 0x80) ? 0x1b : 0))
+
+#define VAES_GFMUL(A, B) \
+ ((((B) & 0x1) ? (A) : 0) ^ \
+ (((B) & 0x2) ? VAES_XTIME(A) : 0) ^ \
+ (((B) & 0x4) ? VAES_XTIME(VAES_XTIME(A)) : 0) ^ \
+ (((B) & 0x8) ? VAES_XTIME(VAES_XTIME(VAES_XTIME(A))) : 0))
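// [Editorial note — not part of this patch] Worked example, matching the
// GF(2^8) arithmetic in FIPS-197:
//   VAES_XTIME(0x57)       == 0xAE   // {57} . {02}
//   VAES_GFMUL(0x57, 0x03) == 0xF9   // ({57} . {02}) ^ {57}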
+
+// Apply the S-box transform to every byte in the VAESState 'state'
+#define VAES_SUB_BYTES(STATE) \
+ do { \
+ static constexpr uint8_t kVAESXEncSBox[256]= { \
+ 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, \
+ 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, \
+ 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, \
+ 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, \
+ 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, \
+ 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, \
+ 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, \
+ 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, \
+ 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, \
+ 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, \
+ 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, \
+ 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, \
+ 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, \
+ 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, \
+ 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, \
+ 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, \
+ 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, \
+ 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, \
+ 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, \
+ 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, \
+ 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, \
+ 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, \
+ 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, \
+ 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, \
+ 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, \
+ 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, \
+ 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, \
+ 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, \
+ 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, \
+ 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, \
+ 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, \
+ 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, \
+ }; \
+ for (uint8_t& byte : (STATE)) { \
+ byte = kVAESXEncSBox[byte]; \
+ } \
+ } while (0)
+
+// Applies the S-box inverse (decode) transform to every byte
+// in the VAESState 'state'.
+#define VAES_INV_SUB_BYTES(STATE) \
+ do { \
+ static constexpr uint8_t kVAESXDecSBox[256] = { \
+ 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, \
+ 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, \
+ 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, \
+ 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, \
+ 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, \
+ 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, \
+ 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, \
+ 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, \
+ 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, \
+ 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, \
+ 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, \
+ 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, \
+ 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, \
+ 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, \
+ 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, \
+ 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, \
+ 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, \
+ 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, \
+ 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, \
+ 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, \
+ 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, \
+ 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, \
+ 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, \
+ 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, \
+ 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, \
+ 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, \
+ 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, \
+ 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, \
+ 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, \
+ 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, \
+ 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, \
+ 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D, \
+ }; \
+ for (uint8_t &byte : (STATE)) { \
+ byte = kVAESXDecSBox[byte]; \
+ } \
+ } while (0)
+
+// Shift the state rows, as specified in ShiftRows.
+// 'STATE' is a VAESState value.
+#define VAES_SHIFT_ROWS(STATE) \
+ do { \
+ uint8_t temp; \
+ /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+ /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \
+ temp = (STATE)[1]; \
+ (STATE)[ 1] = (STATE)[ 5]; \
+ (STATE)[ 5] = (STATE)[ 9]; \
+ (STATE)[ 9] = (STATE)[13]; \
+ (STATE)[13] = temp; \
+ /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+ temp = (STATE)[2]; \
+ (STATE)[ 2] = (STATE)[10]; \
+ (STATE)[10] = temp; \
+ temp = (STATE)[6]; \
+ (STATE)[ 6] = (STATE)[14]; \
+ (STATE)[14] = temp; \
+ /* Row 3 (byte indices 3, 7, 11, 15) rotates left by 3 positions (i.e., right by 1). */ \
+ temp = (STATE)[3]; \
+ (STATE)[ 3] = (STATE)[15]; \
+ (STATE)[15] = (STATE)[11]; \
+ (STATE)[11] = (STATE)[ 7]; \
+ (STATE)[ 7] = temp; \
+ } while (0)
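// [Editorial note — not part of this patch] The VAESState bytes are laid
// out column-major (byte index = 4*column + row), which is why row 1 is
// the index set {1, 5, 9, 13} and its left rotation moves STATE[5] into
// STATE[1], STATE[9] into STATE[5], and so on.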
+
+// Shifts the state rows, as specified in InvShiftRows.
+// 'STATE' is a VAESState value.
+#define VAES_INV_SHIFT_ROWS(STATE) \
+ do { \
+ uint8_t temp; \
+ /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+ /* Row 1 (byte indices 1, 5, 9, 13) rotates right by 1 position. */ \
+ temp = (STATE)[1]; \
+ (STATE)[ 1] = (STATE)[13]; \
+ (STATE)[13] = (STATE)[ 9]; \
+ (STATE)[ 9] = (STATE)[ 5]; \
+ (STATE)[ 5] = temp; \
+ /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+ temp = (STATE)[2]; \
+ (STATE)[ 2] = (STATE)[10]; \
+ (STATE)[10] = temp; \
+ temp = (STATE)[6]; \
+ (STATE)[ 6] = (STATE)[14]; \
+ (STATE)[14] = temp; \
+ /* Row 3 (byte indices 3, 7, 11, 15) rotates right by 3 positions (i.e., left by 1). */ \
+ temp = (STATE)[3]; \
+ (STATE)[ 3] = (STATE)[ 7]; \
+ (STATE)[ 7] = (STATE)[11]; \
+ (STATE)[11] = (STATE)[15]; \
+ (STATE)[15] = temp; \
+ } while (0)
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation MixColumns() specified in FIPS-197 5.1.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (2 . A) xor (3 . B) xor C xor D
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0x2) ^ VAES_GFMUL((B), 0x3) ^ (C) ^ (D))
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation InvMixColumns() specified in FIPS-197 5.3.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (0xE . A) xor (0xB . B) xor (0xD . C) xor (0x9 . D)
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_INV_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0xE) ^ \
+ VAES_GFMUL((B), 0xB) ^ \
+ VAES_GFMUL((C), 0xD) ^ \
+ VAES_GFMUL((D), 0x9))
+
+// Given the index of a 4-byte column within 'STATE', overwrites that
+// column in place with its MixColumns() transform.
+#define VAES_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Given the index of a 4-byte column within 'STATE', overwrites that
+// column in place with its InvMixColumns() transform.
+#define VAES_INV_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_INV_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_INV_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_INV_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_INV_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Implements MixColumns as defined in FIPS-197 5.1.3.
+#define VAES_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_MIX_COLUMN((STATE), 0); \
+ VAES_MIX_COLUMN((STATE), 1); \
+ VAES_MIX_COLUMN((STATE), 2); \
+ VAES_MIX_COLUMN((STATE), 3); \
+ } while (0)
+
+// Implements InvMixColumns as defined in FIPS-197 5.3.3.
+#define VAES_INV_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_INV_MIX_COLUMN((STATE), 0); \
+ VAES_INV_MIX_COLUMN((STATE), 1); \
+ VAES_INV_MIX_COLUMN((STATE), 2); \
+ VAES_INV_MIX_COLUMN((STATE), 3); \
+ } while (0)
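+
+// Illustrative sketch only (not used by the emulator): round-trips the
+// commonly quoted MixColumns test column {0xdb, 0x13, 0x53, 0x45}, whose
+// mixed form is {0x8e, 0x4d, 0xa1, 0xbc}, through VAES_MIX_COLUMNS and
+// VAES_INV_MIX_COLUMNS. The function name is an assumption made for this
+// example, and <cstdint> is assumed to be visible via the existing includes.
+static inline bool vaes_mix_columns_selftest_sketch()
+{
+  uint8_t state[16];
+  // Fill all four columns with the same test column.
+  for (int i = 0; i < 16; i += 4) {
+    state[i + 0] = 0xdb;
+    state[i + 1] = 0x13;
+    state[i + 2] = 0x53;
+    state[i + 3] = 0x45;
+  }
+  VAES_MIX_COLUMNS(state);
+  const bool mixed_ok = state[0] == 0x8e && state[1] == 0x4d &&
+                        state[2] == 0xa1 && state[3] == 0xbc;
+  VAES_INV_MIX_COLUMNS(state);
+  const bool restored_ok = state[0] == 0xdb && state[1] == 0x13 &&
+                           state[2] == 0x53 && state[3] == 0x45;
+  return mixed_ok && restored_ok;
+}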
+
+#endif // RISCV_ZVKNED_EXT_MACROS_H_
diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h
new file mode 100644
index 0000000..b50818b
--- /dev/null
+++ b/riscv/zvknh_ext_macros.h
@@ -0,0 +1,155 @@
+// Helper macros for implementing instructions defined as part of
+// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKNH_EXT_MACROS_H_
+#define RISCV_ZVKNH_EXT_MACROS_H_
+
+// Constraints common to all vsha* instructions, across all VSEW:
+// - VSEW is 32 (SHA-256) or 64 (SHA-512)
+// - No overlap of vd with vs1 or vs2.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP
+// macros.
+#define require_vsha2_common_constraints \
+ do { \
+ require(P.VU.vsew == 32 || P.VU.vsew == 64); \
+ require(insn.rd() != insn.rs1()); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==32.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsha2_vsew32_constraints \
+ do { \
+ require_zvknh_256; \
+ require_egw_fits(128); \
+ } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==64.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros.
+#define require_vsha2_vsew64_constraints \
+ do { \
+ require_zvknh_512; \
+ require_egw_fits(256); \
+ } while (false)
+
+//
+// SHA-256 and SHA-512 common logic
+//
+
+// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz
+#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z)))
+
+// Maj(x, y, z) = (xy) ⊕ (xz) ⊕ (yz) = xy | xz | yz
+#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))
+
+//
+// SHA-256
+//
+
+// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x)
+#define ZVK_SHA256_SUM0(X) \
+ (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22))
+
+// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x)
+#define ZVK_SHA256_SUM1(X) \
+ (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25))
+
+// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3 (x)
+#define ZVK_SHA256_SIG0(X) \
+ (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3))
+
+// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x)
+#define ZVK_SHA256_SIG1(X) \
+ (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \
+ (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 64 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+ { \
+ const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \
+ ZVK_SHA_CH((E), (F), (G)) + (KW); \
+ const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+ (H) = (G); \
+ (G) = (F); \
+ (F) = (E); \
+ (E) = (D) + t1; \
+ (D) = (C); \
+ (C) = (B); \
+ (B) = (A); \
+ (A) = t1 + t2; \
+ }
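+
+// Illustrative sketch only: the scalar FIPS PUB 180-4 SHA-256 block
+// compression expressed with the two macros above. The function name and its
+// parameters ('w_in' holding the 16 message words of one block, 'k' the 64
+// standard round constants, both caller-provided) are assumptions made for
+// this example; the vsha2* instructions instead operate on 4-element groups
+// of the schedule and working state.
+static inline void zvk_sha256_block_sketch(uint32_t state[8],
+                                           const uint32_t w_in[16],
+                                           const uint32_t k[64])
+{
+  // Expand the 16 message words into the full 64-entry schedule.
+  uint32_t w[64];
+  for (int t = 0; t < 16; t++)
+    w[t] = w_in[t];
+  for (int t = 16; t < 64; t++)
+    w[t] = ZVK_SHA256_SCHEDULE(w[t - 2], w[t - 7], w[t - 15], w[t - 16]);
+  // Run the 64 compression rounds, then add the result into the state.
+  uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
+  uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
+  for (int t = 0; t < 64; t++) {
+    ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, k[t] + w[t]);
+  }
+  state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+  state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+}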
+
+//
+// SHA-512
+//
+
+// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x)
+#define ZVK_SHA512_SUM0(X) \
+ (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39))
+
+// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x)
+#define ZVK_SHA512_SUM1(X) \
+ (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41))
+
+// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x)
+#define ZVK_SHA512_SIG0(X) \
+ (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7))
+
+// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x)
+#define ZVK_SHA512_SIG1(X) \
+ (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \
+ (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 80 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+ { \
+ const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \
+ ZVK_SHA_CH((E), (F), (G)) + (KW); \
+ const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+ (H) = (G); \
+ (G) = (F); \
+ (F) = (E); \
+ (E) = (D) + t1; \
+ (D) = (C); \
+ (C) = (B); \
+ (B) = (A); \
+ (A) = t1 + t2; \
+ }
+
+#endif // RISCV_ZVKNH_EXT_MACROS_H_
diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
new file mode 100644
index 0000000..46e399b
--- /dev/null
+++ b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions for implementing instructions defined as part of
+// the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints common to all vsm4* instructions:
+// - Zvksed is enabled
+// - VSEW == 32
+// - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+ do { \
+ require_zvksed; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t)
+// provided in "Little Endian" (LE) order, i.e., from least significant (B0)
+// to most significant (B3).
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+ (((uint32_t)(B0)) << 0 | \
+ ((uint32_t)(B1)) << 8 | \
+ ((uint32_t)(B2)) << 16 | \
+ ((uint32_t)(B3)) << 24)
+
+// Looks up the SM4 S-Box entry for byte BYTE (i.e., applies the S-Box to it).
+#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer value 'X' and a byte index,
+// returns a uint8_t value for the byte at the given index.
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
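+
+// For instance (illustrative values only):
+//   ZVKSED_U32_FROM_U8_LE(0x44, 0x33, 0x22, 0x11) == 0x11223344
+//   ZVKSED_EXTRACT_U8(0x11223344, 1) == 0x33
+// i.e., extracting byte BYTE_IDX recovers the byte passed at that position
+// to the LE composition above.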
+
+// Applies the nonlinear substitution tau to a 32-bit word B - section 6.2.1
+// of the IETF draft.
+#define ZVKSED_SUB_BYTES(B) \
+ ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Applies the linear transformation L to a 32-bit word S and XORs the result
+// with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND(X, S) \
+ ((X) ^ \
+ ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+ ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Applies the linear transformation L' to a 32-bit word S and XORs the result
+// with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND_KEY(X, S) \
+ ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
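+
+// Illustrative sketch only: one SM4 encryption round,
+//   X4 = X0 ^ L(tau(X1 ^ X2 ^ X3 ^ rk)),
+// expressed with the macros above. The function name and its parameters
+// (a caller-provided round key 'rk') are assumptions made for this example;
+// the vsm4r/vsm4k instructions apply this logic per 4-word element group
+// rather than through a scalar helper like this.
+static inline uint32_t zvksed_round_sketch(const uint32_t x[4], uint32_t rk)
+{
+  const uint32_t b = x[1] ^ x[2] ^ x[3] ^ rk;  // mix the other three words with the round key
+  const uint32_t s = ZVKSED_SUB_BYTES(b);      // nonlinear substitution tau
+  return ZVKSED_ROUND(x[0], s);                // X0 ^ L(S)
+}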
+
+#endif // RISCV_ZVKSED_MACROS_H_
diff --git a/riscv/zvksh_ext_macros.h b/riscv/zvksh_ext_macros.h
new file mode 100644
index 0000000..71c5a09
--- /dev/null
+++ b/riscv/zvksh_ext_macros.h
@@ -0,0 +1,47 @@
+// Helper macros and functions for implementing instructions defined as part of
+// the RISC-V Zvksh extension (vectorized SM3).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_INSNS_ZVKSH_COMMON_H_
+#define RISCV_INSNS_ZVKSH_COMMON_H_
+
+// Constraints common to all vsm3* instructions:
+// - Zvksh is enabled
+// - VSEW == 32
+// - EGW (256) <= LMUL * VLEN
+// - No overlap of vd and vs2.
+//
+// The constraint that vstart and vl are both EGS (8) aligned
+// is checked in the VI_ZVK_..._EGU32x8_..._LOOP macros.
+#define require_vsm3_constraints \
+ do { \
+ require_zvksh; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(256); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+#define FF1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define FF2(X, Y, Z) (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z)))
+
+// Boolean function FF_j - section 4.3. of the IETF draft.
+#define ZVKSH_FF(X, Y, Z, J) (((J) <= 15) ? FF1(X, Y, Z) : FF2(X, Y, Z))
+
+#define GG1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define GG2(X, Y, Z) (((X) & (Y)) | ((~(X)) & (Z)))
+
+// Boolean function GG_j - section 4.3. of the IETF draft.
+#define ZVKSH_GG(X, Y, Z, J) (((J) <= 15) ? GG1(X, Y, Z) : GG2(X, Y, Z))
+
+#define T1 0x79CC4519
+#define T2 0x7A879D8A
+
+// T_j constant - section 4.2. of the IETF draft.
+#define ZVKSH_T(J) (((J) <= 15) ? (T1) : (T2))
+
+// Permutation functions P_0 and P_1 - section 4.4 of the IETF draft.
+#define ZVKSH_P0(X) ((X) ^ ZVK_ROL32((X), 9) ^ ZVK_ROL32((X), 17))
+#define ZVKSH_P1(X) ((X) ^ ZVK_ROL32((X), 15) ^ ZVK_ROL32((X), 23))
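+
+// For instance (illustrative value only):
+//   ZVKSH_P0(0x1) == 0x1 ^ (0x1 << 9) ^ (0x1 << 17) == 0x00020201
+// In SM3, P_1 is used in the message expansion and P_0 in the compression
+// function.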
+
+#endif // RISCV_INSNS_ZVKSH_COMMON_H_