author     Andrew Waterman <andrew@sifive.com>    2023-06-19 20:18:09 -0700
committer  GitHub <noreply@github.com>            2023-06-19 20:18:09 -0700
commit     5731a478ea2b7cf639a383498eb114a9dc7d64df (patch)
tree       43f28cdc046246deb9275b71ecaaacdce45ecaa9
parent     8b10de64dd2048e813438dbb5e4ed24d09feb8eb (diff)
parent     a55f96ae9380d5cc9bef05e8b9e82e54d5d6ec35 (diff)
Merge pull request #1303 from rivosinc/zvk-vector-crypto
Zvk vector crypto support (v5)
-rw-r--r--  riscv/arith.h  21
-rw-r--r--  riscv/decode.h  1
-rw-r--r--  riscv/insns/sm4_common.h  1
-rw-r--r--  riscv/insns/vaesdf_vs.h  43
-rw-r--r--  riscv/insns/vaesdf_vv.h  37
-rw-r--r--  riscv/insns/vaesdm_vs.h  44
-rw-r--r--  riscv/insns/vaesdm_vv.h  38
-rw-r--r--  riscv/insns/vaesef_vs.h  43
-rw-r--r--  riscv/insns/vaesef_vv.h  37
-rw-r--r--  riscv/insns/vaesem_vs.h  44
-rw-r--r--  riscv/insns/vaesem_vv.h  38
-rw-r--r--  riscv/insns/vaeskf1_vi.h  65
-rw-r--r--  riscv/insns/vaeskf2_vi.h  89
-rw-r--r--  riscv/insns/vaesz_vs.h  24
-rw-r--r--  riscv/insns/vandn_vv.h  10
-rw-r--r--  riscv/insns/vandn_vx.h  10
-rw-r--r--  riscv/insns/vbrev8_v.h  13
-rw-r--r--  riscv/insns/vbrev_v.h  24
-rw-r--r--  riscv/insns/vclmul_vv.h  20
-rw-r--r--  riscv/insns/vclmul_vx.h  20
-rw-r--r--  riscv/insns/vclmulh_vv.h  20
-rw-r--r--  riscv/insns/vclmulh_vx.h  20
-rw-r--r--  riscv/insns/vclz_v.h  16
-rw-r--r--  riscv/insns/vcpop_v.h  16
-rw-r--r--  riscv/insns/vctz_v.h  16
-rw-r--r--  riscv/insns/vghsh_vv.h  38
-rw-r--r--  riscv/insns/vgmul_vv.h  32
-rw-r--r--  riscv/insns/vrev8_v.h  16
-rw-r--r--  riscv/insns/vrol_vv.h  17
-rw-r--r--  riscv/insns/vrol_vx.h  18
-rw-r--r--  riscv/insns/vror_vi.h  18
-rw-r--r--  riscv/insns/vror_vv.h  17
-rw-r--r--  riscv/insns/vror_vx.h  18
-rw-r--r--  riscv/insns/vsha2ch_vv.h  61
-rw-r--r--  riscv/insns/vsha2cl_vv.h  62
-rw-r--r--  riscv/insns/vsha2ms_vv.h  63
-rw-r--r--  riscv/insns/vsm3c_vi.h  60
-rw-r--r--  riscv/insns/vsm3me_vv.h  39
-rw-r--r--  riscv/insns/vsm4k_vi.h  52
-rw-r--r--  riscv/insns/vsm4r_vs.h  51
-rw-r--r--  riscv/insns/vsm4r_vv.h  37
-rw-r--r--  riscv/insns/vwsll_vi.h  10
-rw-r--r--  riscv/insns/vwsll_vv.h  10
-rw-r--r--  riscv/insns/vwsll_vx.h  10
-rw-r--r--  riscv/isa_parser.cc  65
-rw-r--r--  riscv/isa_parser.h  16
-rw-r--r--  riscv/overlap_list.h  9
-rw-r--r--  riscv/riscv.mk.in  98
-rw-r--r--  riscv/v_ext_macros.h  22
-rw-r--r--  riscv/vector_unit.cc  55
-rw-r--r--  riscv/vector_unit.h  19
-rw-r--r--  riscv/zvk_ext_macros.h  1035
-rw-r--r--  riscv/zvkned_ext_macros.h  270
-rw-r--r--  riscv/zvknh_ext_macros.h  155
-rw-r--r--  riscv/zvksed_ext_macros.h  60
-rw-r--r--  riscv/zvksh_ext_macros.h  47
56 files changed, 3171 insertions(+), 19 deletions(-)
diff --git a/riscv/arith.h b/riscv/arith.h
index 3b807e9..20b1504 100644
--- a/riscv/arith.h
+++ b/riscv/arith.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <climits>
#include <cstddef>
+#include <type_traits>
inline uint64_t mulhu(uint64_t a, uint64_t b)
{
@@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
return r;
}
+// Rotates right an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+ static_assert(std::is_unsigned<T>::value);
+ static constexpr T mask = (8 * sizeof(T)) - 1;
+ const std::size_t rshift = shiftamt & mask;
+ const std::size_t lshift = (-rshift) & mask;
+ return (x << lshift) | (x >> rshift);
+}
+
+// Rotates left an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+ static_assert(std::is_unsigned<T>::value);
+ static constexpr T mask = (8 * sizeof(T)) - 1;
+ const std::size_t lshift = shiftamt & mask;
+ const std::size_t rshift = (-lshift) & mask;
+ return (x << lshift) | (x >> rshift);
+}
+
#endif
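The masked shift amounts above are what keep the rotates well defined: when the rotate amount is 0 (or a multiple of the element width), both partial shifts collapse to 0 instead of becoming an undefined full-width shift. A minimal standalone sketch, not part of the patch (the include path is assumed; alternatively paste the two templates above):

#include <cstdint>
#include <cstdio>
#include "riscv/arith.h"  // assumed include path for rotate_right/rotate_left

int main() {
  // shiftamt == 0 forces both partial shifts to 0, so no shift by the full width occurs.
  std::printf("%08x\n", rotate_right<uint32_t>(0x12345678u, 0));   // 12345678
  std::printf("%08x\n", rotate_right<uint32_t>(0x12345678u, 8));   // 78123456
  std::printf("%08x\n", rotate_left<uint32_t>(0x12345678u, 40));   // 40 & 31 == 8 -> 34567812
  return 0;
}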
diff --git a/riscv/decode.h b/riscv/decode.h
index dad32a1..cd1c0a1 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -140,6 +140,7 @@ public:
uint64_t v_vta() { return x(26, 1); }
uint64_t v_vma() { return x(27, 1); }
uint64_t v_mew() { return x(28, 1); }
+ uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); }
uint64_t p_imm2() { return x(20, 2); }
uint64_t p_imm3() { return x(20, 3); }
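The new v_zimm6() accessor assembles the 6-bit rotate immediate used by vror.vi from two fields: bits 19:15 give zimm6[4:0] and bit 26 gives zimm6[5]. A small sketch with free-standing stand-ins for insn_t::x() and v_zimm6() (names and encoding value below are illustrative only):

#include <cstdint>
#include <cstdio>

// Stand-in for insn_t::x(lo, len): extract 'len' bits starting at bit 'lo'.
static uint64_t x_bits(uint64_t insn, int lo, int len) {
  return (insn >> lo) & ((uint64_t(1) << len) - 1);
}

// Stand-in for insn_t::v_zimm6() as defined above.
static uint64_t v_zimm6(uint64_t insn) {
  return x_bits(insn, 15, 5) + (x_bits(insn, 26, 1) << 5);
}

int main() {
  // Hypothetical encoding: bit 26 set (zimm6[5]) and 0b01010 in bits 19:15 (zimm6[4:0]).
  const uint64_t insn = (uint64_t(1) << 26) | (uint64_t(0b01010) << 15);
  std::printf("%u\n", (unsigned)v_zimm6(insn));  // 42
  return 0;
}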
diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
index 17f129f..24d6ce1 100644
--- a/riscv/insns/sm4_common.h
+++ b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
0xD7, 0xCB, 0x39, 0x48
};
-
diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h
new file mode 100644
index 0000000..a124278
--- /dev/null
+++ b/riscv/insns/vaesdf_vs.h
@@ -0,0 +1,43 @@
+// vaesdf.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, scalar_key);
+ // InvMixColumns is not performed in the final round.
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h
new file mode 100644
index 0000000..9fca572
--- /dev/null
+++ b/riscv/insns/vaesdf_vv.h
@@ -0,0 +1,37 @@
+// vaesdf.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd in contains the input state,
+ // - vs2 contains the input round key,
+ // - vd out receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, round_key);
+ // InvMixColumns is not performed in the final round.
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h
new file mode 100644
index 0000000..3c23e69
--- /dev/null
+++ b/riscv/insns/vaesdm_vs.h
@@ -0,0 +1,44 @@
+// vaesdm.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd in contains the input state,
+ // - vs2 contains the input round key,
+ // - vd out receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, scalar_key);
+ // InvMixColumns
+ VAES_INV_MIX_COLUMNS(aes_state);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h
new file mode 100644
index 0000000..9c29cd9
--- /dev/null
+++ b/riscv/insns/vaesdm_vv.h
@@ -0,0 +1,38 @@
+// vaesdm.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // InvShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_INV_SHIFT_ROWS(aes_state);
+ // InvSubBytes - Apply S-box to every byte in the state
+ VAES_INV_SUB_BYTES(aes_state);
+ // AddRoundKey (which is also InvAddRoundKey as it's xor)
+ EGU8x16_XOREQ(aes_state, round_key);
+ // InvMixColumns
+ VAES_INV_MIX_COLUMNS(aes_state);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h
new file mode 100644
index 0000000..2d32653
--- /dev/null
+++ b/riscv/insns/vaesef_vs.h
@@ -0,0 +1,43 @@
+// vaesef.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns is not performed for the final round.
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, scalar_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h
new file mode 100644
index 0000000..9b43a6d
--- /dev/null
+++ b/riscv/insns/vaesef_vv.h
@@ -0,0 +1,37 @@
+// vaesef.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns is not performed for the final round.
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, round_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h
new file mode 100644
index 0000000..348cd9f
--- /dev/null
+++ b/riscv/insns/vaesem_vs.h
@@ -0,0 +1,44 @@
+// vaesem.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns
+ VAES_MIX_COLUMNS(aes_state);
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, scalar_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h
new file mode 100644
index 0000000..34f0056
--- /dev/null
+++ b/riscv/insns/vaesem_vv.h
@@ -0,0 +1,38 @@
+// vaesem.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ {}, // No PRELOOP.
+ {
+ // For AES128, AES192, or AES256, state and key are 128b/16B values:
+ // - vd contains the input state,
+ // - vs2 contains the round key,
+ // - vd receives the output state.
+ //
+ // While the spec calls for handling the vector as made of EGU32x4
+ // element groups (i.e., 4 uint32_t), it is convenient to treat
+ // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+ // we extract the operands here instead of using the existing LOOP
+ // macro that defines/extracts the operand variables as EGU32x4.
+ EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+ const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+ // SubBytes - Apply S-box to every byte in the state
+ VAES_SUB_BYTES(aes_state);
+ // ShiftRows - Rotate each row's bytes by 0, 1, 2, 3 positions.
+ VAES_SHIFT_ROWS(aes_state);
+ // MixColumns
+ VAES_MIX_COLUMNS(aes_state);
+ // AddRoundKey
+ EGU8x16_XOREQ(aes_state, round_key);
+
+ // Update the destination register.
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ EGU8x16_COPY(vd, aes_state);
+ }
+);
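The VAES_SUB_BYTES/SHIFT_ROWS/MIX_COLUMNS macros used by the files above are defined in zvkned_ext_macros.h, which appears in the diffstat but is not shown in this portion of the diff. As a rough, hedged illustration of the GF(2^8) arithmetic MixColumns is assumed to perform (the xtime/mix_column names are local to this sketch, not the patch's macro names):

#include <cstdint>
#include <cstdio>

// Multiply by x in GF(2^8) modulo the AES polynomial x^8 + x^4 + x^3 + x + 1.
static uint8_t xtime(uint8_t b) {
  return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}

// One MixColumns column: multiply (s0 s1 s2 s3)^T by the circulant matrix {02 03 01 01}.
static void mix_column(uint8_t s[4]) {
  const uint8_t t = s[0] ^ s[1] ^ s[2] ^ s[3];
  const uint8_t s0 = s[0];
  s[0] ^= t ^ xtime(s[0] ^ s[1]);
  s[1] ^= t ^ xtime(s[1] ^ s[2]);
  s[2] ^= t ^ xtime(s[2] ^ s[3]);
  s[3] ^= t ^ xtime(s[3] ^ s0);
}

int main() {
  // Well-known MixColumns test column: db 13 53 45 -> 8e 4d a1 bc.
  uint8_t col[4] = {0xdb, 0x13, 0x53, 0x45};
  mix_column(col);
  std::printf("%02x %02x %02x %02x\n", col[0], col[1], col[2], col[3]);
  return 0;
}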
diff --git a/riscv/insns/vaeskf1_vi.h b/riscv/insns/vaeskf1_vi.h
new file mode 100644
index 0000000..28d03d0
--- /dev/null
+++ b/riscv/insns/vaeskf1_vi.h
@@ -0,0 +1,65 @@
+// vaeskf1.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// There is one round constant for each round number
+// between 1 and 10. We index using 'round# -1'.
+static constexpr uint8_t kRoundConstants[10] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 4 bits of the immediate.
+ const reg_t zimm4 = zimm5 & 0xF;
+ // Normalize the round value to be in [1, 10] by toggling bit 3
+ // if outside the range (i.e., +8 or -8).
+ const reg_t round = ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8);
+ const uint32_t rcon = kRoundConstants[round - 1];,
+ // Per Element Group body.
+ {
+ // vaeskf1_vi produces key[i+1] in vd, it receives key[i] in vs2,
+ // i.e., 4x32b values (4 words).
+ //
+ // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+ // differences:
+ // - in AES-128 (vaeskf1), we get both the 'temp' word and
+ // the "previous words" w0..w3 from key[i]/vs2.
+ // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+ // the "previous words" w0..w3 from key[i-1]/vd.
+
+ // 'temp' is extracted from the last (most significant) word of key[i].
+ uint32_t temp = vs2[3];
+ temp = (temp >> 8) | (temp << 24); // Rotate right by 8
+ temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0));
+ temp = temp ^ rcon;
+
+ // "old" words are the w[i-Nk] of FIPS-197. They are extracted
+ // from vs2, which contains key[i] in AES-128 where Nk=4.
+ const uint32_t w0 = vs2[0] ^ temp;
+ const uint32_t w1 = vs2[1] ^ w0;
+ const uint32_t w2 = vs2[2] ^ w1;
+ const uint32_t w3 = vs2[3] ^ w2;
+
+ // Overwrite vd with k[i+1] from the new words.
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+ }
+);
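A small standalone sketch (not part of the patch) of the preloop logic above: the bottom four immediate bits are range-checked against [1, 10], out-of-range values get bit 3 toggled, and the result indexes the round-constant table.

#include <cstdint>
#include <cstdio>

int main() {
  static const uint8_t kRoundConstants[10] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
  };
  for (unsigned zimm5 = 0; zimm5 < 32; ++zimm5) {
    const unsigned zimm4 = zimm5 & 0xF;
    // Same normalization rule as the instruction: toggle bit 3 when outside [1, 10].
    const unsigned round = (1 <= zimm4 && zimm4 <= 10) ? zimm4 : (zimm4 ^ 0x8);
    std::printf("zimm5=%2u -> round=%2u rcon=0x%02x\n",
                zimm5, round, kRoundConstants[round - 1]);
  }
  return 0;
}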
diff --git a/riscv/insns/vaeskf2_vi.h b/riscv/insns/vaeskf2_vi.h
new file mode 100644
index 0000000..49c2a2d
--- /dev/null
+++ b/riscv/insns/vaeskf2_vi.h
@@ -0,0 +1,89 @@
+// vaeskf2.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// Round Constants
+//
+// Only the odd rounds need to be encoded, the even ones can use 0
+// or skip the rcon handling. We can use '(round# / 2) - 1'
+// (or "(round# >> 1) - 1") to index into the array.
+//
+// Round# Constant
+// [ 2] -> kRoundConstants[0]
+// [ 3] -> 0 / Nothing
+// [ 4] -> kRoundConstants[1]
+// [ 5] -> 0 / Nothing
+// [ 6] -> kRoundConstants[2]
+// [ 7] -> 0 / Nothing
+// ...
+// [13] -> 0 / Nothing
+// [14] -> kRoundConstants[6]
+static constexpr uint8_t kRoundConstants[7] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40,
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 4 bits of the immediate.
+ const reg_t zimm4 = zimm5 & 0xF;
+ // Normalize the round value to be in [2, 14] by toggling bit 3
+ // if outside the range (i.e., +8 or -8).
+ const reg_t round = ((2 <= zimm4) && (zimm4 <= 14)) ? zimm4 : (zimm4 ^ 0x8);,
+ // Per Element Group body.
+ {
+ // vaeskf2_vi produces key[i+1] in vd, it receives key[i] in vs2,
+ // i.e., 4x32b values (4 words).
+ //
+ // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+ // differences:
+ // - in AES-128 (vaeskf1), we get both the 'temp' word and
+ // the "previous words" w0..w3 from key[i]/vs2.
+ // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+ // the "previous words" w0..w3 from key[i-1]/vd.
+
+ // 'temp' is extracted from the last (most significant) word of key[i].
+ uint32_t temp = vs2[3];
+ // With AES-256, when we have an odd round number, we hit the
+ // Nk > 6 and i mod Nk = 4
+ // condition in the FIPS-197 key expansion pseudo-code (Figure 11).
+ // In those cases we skip RotWord and the round constant is 0.
+ const bool is_even_round = (round & 0x1) == 0;
+ if (is_even_round) {
+ temp = (temp >> 8) | (temp << 24); // Rotate right by 8
+ }
+ temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) |
+ ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0));
+
+ if (is_even_round) {
+ const uint32_t rcon = kRoundConstants[(round >> 1) - 1];
+ temp = temp ^ rcon;
+ }
+
+ // "old" words are the w[i-Nk] of FIPS-197. For AES-256, where Nk=8,
+ // they are extracted from vd which contains key[i-1].
+ const uint32_t w0 = vd[0] ^ temp;
+ const uint32_t w1 = vd[1] ^ w0;
+ const uint32_t w2 = vd[2] ^ w1;
+ const uint32_t w3 = vd[3] ^ w2;
+
+ // Overwrite vd with k[i+1] from the new words.
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+ }
+);
diff --git a/riscv/insns/vaesz_vs.h b/riscv/insns/vaesz_vs.h
new file mode 100644
index 0000000..c3dc931
--- /dev/null
+++ b/riscv/insns/vaesz_vs.h
@@ -0,0 +1,24 @@
+// vaesz.vs vd, vs2
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+ // Per Element Group body.
+ {
+ EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+ // Produce vd = vd ^ "common key from vs2".
+ EGU8x16_XOR(vd, vd, scalar_key);
+ }
+);
diff --git a/riscv/insns/vandn_vv.h b/riscv/insns/vandn_vv.h
new file mode 100644
index 0000000..d85e47d
--- /dev/null
+++ b/riscv/insns/vandn_vv.h
@@ -0,0 +1,10 @@
+// vandn.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VV_LOOP
+({
+ vd = vs2 & (~vs1);
+})
diff --git a/riscv/insns/vandn_vx.h b/riscv/insns/vandn_vx.h
new file mode 100644
index 0000000..1c66a40
--- /dev/null
+++ b/riscv/insns/vandn_vx.h
@@ -0,0 +1,10 @@
+// vandn.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VX_LOOP
+({
+ vd = vs2 & (~rs1);
+})
diff --git a/riscv/insns/vbrev8_v.h b/riscv/insns/vbrev8_v.h
new file mode 100644
index 0000000..a6d3cda
--- /dev/null
+++ b/riscv/insns/vbrev8_v.h
@@ -0,0 +1,13 @@
+// vbrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ vd = vs2;
+ vd = ((vd & 0x5555555555555555llu) << 1) | ((vd & 0xAAAAAAAAAAAAAAAAllu) >> 1);
+ vd = ((vd & 0x3333333333333333llu) << 2) | ((vd & 0xCCCCCCCCCCCCCCCCllu) >> 2);
+ vd = ((vd & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((vd & 0xF0F0F0F0F0F0F0F0llu) >> 4);
+})
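The three mask/shift steps above are the classic swap ladder that reverses the bit order inside every byte. A scalar sketch (not part of the patch) applied to a single example value:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t x = 0x01;  // bit 0 set in the low byte
  x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1);  // swap adjacent bits
  x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2);  // swap bit pairs
  x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4);  // swap nibbles
  std::printf("0x%02llx\n", (unsigned long long)x);  // 0x80: bit 0 ended up at bit 7 of its byte
  return 0;
}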
diff --git a/riscv/insns/vbrev_v.h b/riscv/insns/vbrev_v.h
new file mode 100644
index 0000000..7f784c2
--- /dev/null
+++ b/riscv/insns/vbrev_v.h
@@ -0,0 +1,24 @@
+// vbrev.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ reg_t x = vs2;
+
+ // Reverse bits in bytes (vbrev8)
+ x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1);
+ x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2);
+ x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4);
+ // Re-order bytes (vrev8)
+ if (P.VU.vsew > 8)
+ x = ((x & 0x00FF00FF00FF00FFllu) << 8) | ((x & 0xFF00FF00FF00FF00llu) >> 8);
+ if (P.VU.vsew > 16)
+ x = ((x & 0x0000FFFF0000FFFFllu) << 16) | ((x & 0xFFFF0000FFFF0000llu) >> 16);
+ if (P.VU.vsew > 32)
+ x = ((x & 0x00000000FFFFFFFFllu) << 32) | ((x & 0xFFFFFFFF00000000llu) >> 32);
+
+ vd = x;
+})
diff --git a/riscv/insns/vclmul_vv.h b/riscv/insns/vclmul_vv.h
new file mode 100644
index 0000000..8957738
--- /dev/null
+++ b/riscv/insns/vclmul_vv.h
@@ -0,0 +1,20 @@
+// vclmul.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the low 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((vs1 & mask) != 0) {
+ vd ^= vs2 << bit_idx;
+ }
+ }
+})
diff --git a/riscv/insns/vclmul_vx.h b/riscv/insns/vclmul_vx.h
new file mode 100644
index 0000000..1df7a3a
--- /dev/null
+++ b/riscv/insns/vclmul_vx.h
@@ -0,0 +1,20 @@
+// vclmul.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the low 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((rs1 & mask) != 0) {
+ vd ^= vs2 << bit_idx;
+ }
+ }
+})
diff --git a/riscv/insns/vclmulh_vv.h b/riscv/insns/vclmulh_vv.h
new file mode 100644
index 0000000..6a54bcf
--- /dev/null
+++ b/riscv/insns/vclmulh_vv.h
@@ -0,0 +1,20 @@
+// vclmulh.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the high 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((vs1 & mask) != 0) {
+ vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+ }
+ }
+})
diff --git a/riscv/insns/vclmulh_vx.h b/riscv/insns/vclmulh_vx.h
new file mode 100644
index 0000000..e874d1d
--- /dev/null
+++ b/riscv/insns/vclmulh_vx.h
@@ -0,0 +1,20 @@
+// vclmulh.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+ // Perform a carryless multiplication 64bx64b on each 64b element,
+ // return the high 64b of the 128b product.
+ // <https://en.wikipedia.org/wiki/Carry-less_product>
+ vd = 0;
+ for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+ const reg_t mask = ((reg_t) 1) << bit_idx;
+ if ((rs1 & mask) != 0) {
+ vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+ }
+ }
+})
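vclmul and vclmulh return, respectively, the low and the high half of the same 128-bit carry-less product. A reference sketch (not part of the patch) that computes both halves with the same shift-and-xor loop, plus a tiny polynomial example:

#include <cstdint>
#include <cstdio>

// Carry-less multiply of two 64-bit values; the 128-bit product is returned as hi:lo.
// vclmul corresponds to 'lo', vclmulh to 'hi'.
static void clmul128(uint64_t a, uint64_t b, uint64_t &hi, uint64_t &lo) {
  hi = 0;
  lo = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0)
        hi ^= a >> (64 - i);
    }
  }
}

int main() {
  uint64_t hi, lo;
  // (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1, i.e. 0x5 clmul 0x3 = 0xf.
  clmul128(0x5, 0x3, hi, lo);
  std::printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
  return 0;
}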
diff --git a/riscv/insns/vclz_v.h b/riscv/insns/vclz_v.h
new file mode 100644
index 0000000..5f7f03c
--- /dev/null
+++ b/riscv/insns/vclz_v.h
@@ -0,0 +1,16 @@
+// vclz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ unsigned int i = 0;
+ for (; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> (P.VU.vsew - 1 - i))) {
+ break;
+ }
+ }
+ vd = i;
+})
diff --git a/riscv/insns/vcpop_v.h b/riscv/insns/vcpop_v.h
new file mode 100644
index 0000000..52b29c6
--- /dev/null
+++ b/riscv/insns/vcpop_v.h
@@ -0,0 +1,16 @@
+// vcpop.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ reg_t count = 0;
+ for (std::size_t i = 0; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> i)) {
+ count++;
+ }
+ }
+ vd = count;
+})
diff --git a/riscv/insns/vctz_v.h b/riscv/insns/vctz_v.h
new file mode 100644
index 0000000..b63dd01
--- /dev/null
+++ b/riscv/insns/vctz_v.h
@@ -0,0 +1,16 @@
+// vctz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ unsigned int i = 0;
+ for (; i < P.VU.vsew; ++i) {
+ if (1 & (vs2 >> i)) {
+ break;
+ }
+ }
+ vd = i;
+})
diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h
new file mode 100644
index 0000000..bcbfe74
--- /dev/null
+++ b/riscv/insns/vghsh_vv.h
@@ -0,0 +1,38 @@
+// vghsh.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ EGU32x4_t Y = vd; // Current partial hash
+ EGU32x4_t X = vs1; // Block cipher output
+ EGU32x4_t H = vs2; // Hash subkey
+
+ EGU32x4_BREV8(H);
+ EGU32x4_t Z = {};
+
+ // S = brev8(Y ^ X)
+ EGU32x4_t S;
+ EGU32x4_XOR(S, Y, X);
+ EGU32x4_BREV8(S);
+
+ for (int bit = 0; bit < 128; bit++) {
+ if (EGU32x4_ISSET(S, bit)) {
+ EGU32x4_XOREQ(Z, H);
+ }
+
+ const bool reduce = EGU32x4_ISSET(H, 127);
+ EGU32x4_LSHIFT(H); // Left shift by 1.
+ if (reduce) {
+ H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+ }
+ }
+ EGU32x4_BREV8(Z);
+ vd = Z;
+ }
+);
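As a rough, standalone illustration of the multiply/reduce loop above (the brev8 bit reflection is deliberately omitted), here is a sketch of a GF(2^128) multiply using the same 0x87 fold; the type and helper names are local to this sketch:

#include <cstdint>
#include <cstdio>

struct u128 { uint64_t hi, lo; };

// Shift-and-xor multiply in GF(2^128), reducing with x^128 + x^7 + x^2 + x + 1
// (the 0x87 constant used above). Mirrors the per-bit loop of vghsh/vgmul.
static u128 gf128_mul(u128 y, u128 h) {
  u128 z = {0, 0};
  for (int bit = 0; bit < 128; ++bit) {
    const bool y_bit = (bit < 64) ? ((y.lo >> bit) & 1) : ((y.hi >> (bit - 64)) & 1);
    if (y_bit) {
      z.hi ^= h.hi;
      z.lo ^= h.lo;
    }
    const bool reduce = (h.hi >> 63) & 1;  // the x^127 term is about to shift out
    h.hi = (h.hi << 1) | (h.lo >> 63);
    h.lo <<= 1;
    if (reduce)
      h.lo ^= 0x87;                        // fold x^128 back into the low bits
  }
  return z;
}

int main() {
  // x * x = x^2: {hi=0, lo=2} squared gives {hi=0, lo=4}.
  const u128 z = gf128_mul({0, 2}, {0, 2});
  std::printf("hi=%016llx lo=%016llx\n", (unsigned long long)z.hi, (unsigned long long)z.lo);
  return 0;
}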
diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h
new file mode 100644
index 0000000..820b396
--- /dev/null
+++ b/riscv/insns/vgmul_vv.h
@@ -0,0 +1,32 @@
+// vgmul.vv vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ EGU32x4_t Y = vd; // Multiplier
+ EGU32x4_BREV8(Y);
+ EGU32x4_t H = vs2; // Multiplicand
+ EGU32x4_BREV8(H);
+ EGU32x4_t Z = {};
+
+ for (int bit = 0; bit < 128; bit++) {
+ if (EGU32x4_ISSET(Y, bit)) {
+ EGU32x4_XOREQ(Z, H);
+ }
+
+ bool reduce = EGU32x4_ISSET(H, 127);
+ EGU32x4_LSHIFT(H); // Left shift by 1.
+ if (reduce) {
+ H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+ }
+ }
+ EGU32x4_BREV8(Z);
+ vd = Z;
+ }
+);
diff --git a/riscv/insns/vrev8_v.h b/riscv/insns/vrev8_v.h
new file mode 100644
index 0000000..f26c5a0
--- /dev/null
+++ b/riscv/insns/vrev8_v.h
@@ -0,0 +1,16 @@
+// vrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+ vd = vs2;
+ if (P.VU.vsew > 8)
+ vd = ((vd & 0x00FF00FF00FF00FFllu) << 8) | ((vd & 0xFF00FF00FF00FF00llu) >> 8);
+ if (P.VU.vsew > 16)
+ vd = ((vd & 0x0000FFFF0000FFFFllu) << 16) | ((vd & 0xFFFF0000FFFF0000llu) >> 16);
+ if (P.VU.vsew > 32)
+ vd = ((vd & 0x00000000FFFFFFFFllu) << 32) | ((vd & 0xFFFFFFFF00000000llu) >> 32);
+})
diff --git a/riscv/insns/vrol_vv.h b/riscv/insns/vrol_vv.h
new file mode 100644
index 0000000..fb2e483
--- /dev/null
+++ b/riscv/insns/vrol_vv.h
@@ -0,0 +1,17 @@
+// vrol.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+ // For .vv, the shift amount comes from the vs1 element.
+ const reg_t lshift = vs1 & mask;
+ const reg_t rshift = (-lshift) & mask;
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vrol_vx.h b/riscv/insns/vrol_vx.h
new file mode 100644
index 0000000..b0c89a2
--- /dev/null
+++ b/riscv/insns/vrol_vx.h
@@ -0,0 +1,18 @@
+// vrol.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t lshift = ((reg_t)RS1) & mask;
+const reg_t rshift = (-lshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vi.h b/riscv/insns/vror_vi.h
new file mode 100644
index 0000000..1269c3d
--- /dev/null
+++ b/riscv/insns/vror_vi.h
@@ -0,0 +1,18 @@
+// vror.vi vd, vs2, zimm6, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vi, the shift amount is the zimm6 immediate, encoded in instruction bits 26 and 19:15.
+const reg_t rshift = insn.v_zimm6() & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vv.h b/riscv/insns/vror_vv.h
new file mode 100644
index 0000000..c649c6d
--- /dev/null
+++ b/riscv/insns/vror_vv.h
@@ -0,0 +1,17 @@
+// vror.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+ // For .vv, the shift amount comes from the vs1 element.
+ const reg_t rshift = vs1 & mask;
+ const reg_t lshift = (-rshift) & mask;
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vx.h b/riscv/insns/vror_vx.h
new file mode 100644
index 0000000..50c8e5c
--- /dev/null
+++ b/riscv/insns/vror_vx.h
@@ -0,0 +1,18 @@
+// vror.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t rshift = ((reg_t)RS1) & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+ vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h
new file mode 100644
index 0000000..34c6e05
--- /dev/null
+++ b/riscv/insns/vsha2ch_vv.h
@@ -0,0 +1,61 @@
+// vsha2ch.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2,
+ UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU32x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2,
+ UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU64x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h
new file mode 100644
index 0000000..4a1df09
--- /dev/null
+++ b/riscv/insns/vsha2cl_vv.h
@@ -0,0 +1,62 @@
+// vsha2cl.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+ kw1, kw0);
+
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+ ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU32x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {c, d, g, h} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+ // {a, b, e, f} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+ // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+ EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+ kw1, kw0);
+
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+ ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+ // Update the destination register, vd <- {a, b, e, f}.
+ SET_EGU64x4_BE(vd, a, b, e, f);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
+
diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h
new file mode 100644
index 0000000..8f1ca08
--- /dev/null
+++ b/riscv/insns/vsha2ms_vv.h
@@ -0,0 +1,63 @@
+// vsha2ms.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+ case e32: {
+ require_vsha2_vsew32_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {w3, w2, w1, w0} <- vd
+ EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0);
+ // {w11, w10, w9, w4} <- vs2
+ EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4);
+ // {w15, w14, w13, w12} <- vs1
+ EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+ const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14, w9, w1, w0);
+ const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1);
+ const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2);
+ const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3);
+
+ // Update the destination register.
+ SET_EGU32x4_BE(vd, w19, w18, w17, w16);
+ }
+ );
+ break;
+ }
+
+ case e64: {
+ require_vsha2_vsew64_constraints;
+
+ VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+ {},
+ {
+ // {w3, w2, w1, w0} <- vd
+ EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0);
+ // {w11, w10, w9, w4} <- vs2
+ EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4);
+ // {w15, w14, w13, w12} <- vs1
+ EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+ const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14, w9, w1, w0);
+ const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1);
+ const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2);
+ const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3);
+
+ // Update the destination register.
+ SET_EGU64x4_BE(vd, w19, w18, w17, w16);
+ }
+ );
+ break;
+ }
+
+ // 'require_vsha2_common_constraints' ensures that
+ // VSEW is either 32 or 64.
+ default:
+ require(false);
+}
diff --git a/riscv/insns/vsm3c_vi.h b/riscv/insns/vsm3c_vi.h
new file mode 100644
index 0000000..b3e8121
--- /dev/null
+++ b/riscv/insns/vsm3c_vi.h
@@ -0,0 +1,60 @@
+// vsm3c.vi vd, vs2, rnd
+
+#include "zvksh_ext_macros.h"
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(
+ {},
+ // No need to validate or normalize 'zimm5' here as this is a 5-bit value
+ // and all values in 0-31 are valid.
+ const reg_t round = zimm5;,
+ {
+ // {H, G, F, E, D, C, B, A} <- vd
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A);
+ // {_, _, w5, w4, _, _, w1, w0} <- vs2
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2,
+ UNUSED _unused_w7, UNUSED _unused_w6, w5, w4,
+ UNUSED _unused_w3, UNUSED _unused_w2, w1, w0);
+ const uint32_t x0 = w0 ^ w4; // W'[0] in spec documentation.
+ const uint32_t x1 = w1 ^ w5; // W'[1]
+
+ // Two rounds of compression.
+ uint32_t ss1;
+ uint32_t ss2;
+ uint32_t tt1;
+ uint32_t tt2;
+ uint32_t j;
+
+ j = 2 * round;
+ ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+ ss2 = ss1 ^ ZVK_ROL32(A, 12);
+ tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0;
+ tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0;
+ D = C;
+ const uint32_t C1 = ZVK_ROL32(B, 9);
+ B = A;
+ const uint32_t A1 = tt1;
+ H = G;
+ const uint32_t G1 = ZVK_ROL32(F, 19);
+ F = E;
+ const uint32_t E1 = ZVKSH_P0(tt2);
+
+ j = 2 * round + 1;
+ ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+ ss2 = ss1 ^ ZVK_ROL32(A1, 12);
+ tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1;
+ tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1;
+ D = C1;
+ const uint32_t C2 = ZVK_ROL32(B, 9);
+ B = A1;
+ const uint32_t A2 = tt1;
+ H = G1;
+ const uint32_t G2 = ZVK_ROL32(F, 19);
+ F = E1;
+ const uint32_t E2 = ZVKSH_P0(tt2);
+
+ // Update the destination register.
+ SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2);
+ }
+);
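The ZVKSH_* helpers used above are defined in zvksh_ext_macros.h, not shown in this portion of the diff. For reference, a sketch of the standard SM3 (GB/T 32905-2016) building blocks they are assumed to wrap; the function names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

// Round constant T_j, boolean functions FF_j/GG_j, and permutation P0 from the SM3 spec.
static uint32_t sm3_t(unsigned j) { return j < 16 ? 0x79cc4519u : 0x7a879d8au; }
static uint32_t sm3_ff(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
  return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
}
static uint32_t sm3_gg(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
  return j < 16 ? (x ^ y ^ z) : ((x & y) | (~x & z));
}
static uint32_t sm3_p0(uint32_t x) { return x ^ rol32(x, 9) ^ rol32(x, 17); }

int main() {
  std::printf("T(0)=%08x T(16)=%08x FF16(1,2,3)=%08x GG16(1,2,3)=%08x P0(1)=%08x\n",
              sm3_t(0), sm3_t(16), sm3_ff(1, 2, 3, 16), sm3_gg(1, 2, 3, 16), sm3_p0(1));
  return 0;
}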
diff --git a/riscv/insns/vsm3me_vv.h b/riscv/insns/vsm3me_vv.h
new file mode 100644
index 0000000..dd6cb52
--- /dev/null
+++ b/riscv/insns/vsm3me_vv.h
@@ -0,0 +1,39 @@
+// vsm3me.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+#include "zvksh_ext_macros.h"
+
+// Per the SM3 spec, the message expansion computes new words Wi as:
+// W[i] = ( P_1( W[i-16] xor W[i-9] xor ( W[i-3] <<< 15 ) )
+// xor ( W[i-13] <<< 7 )
+// xor W[i-6] )
+// Using arguments M16 = W[i-16], M9 = W[i-9], etc.,
+// where Mk stands for "W[i Minus k]", we define the "W function":
+#define ZVKSH_W(M16, M9, M3, M13, M6) \
+ (ZVKSH_P1((M16) ^ (M9) ^ ZVK_ROL32((M3), 15)) ^ ZVK_ROL32((M13), 7) ^ (M6))
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(
+ {},
+ {
+ // {w7, w6, w5, w4, w3, w2, w1, w0} <- vs1
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs1, w7, w6, w5, w4, w3, w2, w1, w0);
+ // {w15, w14, w13, w12, w11, w10, w9, w8} <- vs2
+ EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, w15, w14, w13, w12, w11, w10, w9, w8);
+
+ // Arguments are W[i-16], W[i-9], W[i-3], W[i-13], W[i-6].
+ // Note that some of the newly computed words are used in later invocations.
+ const uint32_t w16 = ZVKSH_W(w0, w7, w13, w3, w10);
+ const uint32_t w17 = ZVKSH_W(w1, w8, w14, w4, w11);
+ const uint32_t w18 = ZVKSH_W(w2, w9, w15, w5, w12);
+ const uint32_t w19 = ZVKSH_W(w3, w10, w16, w6, w13);
+ const uint32_t w20 = ZVKSH_W(w4, w11, w17, w7, w14);
+ const uint32_t w21 = ZVKSH_W(w5, w12, w18, w8, w15);
+ const uint32_t w22 = ZVKSH_W(w6, w13, w19, w9, w16);
+ const uint32_t w23 = ZVKSH_W(w7, w14, w20, w10, w17);
+
+ // Update the destination register.
+ SET_EGU32x8_WORDS_BE_BSWAP(vd, w23, w22, w21, w20, w19, w18, w17, w16);
+ }
+);
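A matching scalar sketch (not part of the patch) of the expansion formula quoted in the comment above, with P_1(x) = x xor (x <<< 15) xor (x <<< 23) written out; names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
static uint32_t sm3_p1(uint32_t x) { return x ^ rol32(x, 15) ^ rol32(x, 23); }

// W[i] from W[i-16], W[i-9], W[i-3], W[i-13], W[i-6], matching ZVKSH_W's argument order.
static uint32_t sm3_w(uint32_t m16, uint32_t m9, uint32_t m3, uint32_t m13, uint32_t m6) {
  return sm3_p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

int main() {
  // Arbitrary inputs, for illustration only.
  std::printf("%08x\n", sm3_w(1, 2, 3, 4, 5));
  return 0;
}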
diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h
new file mode 100644
index 0000000..8f52e68
--- /dev/null
+++ b/riscv/insns/vsm4k_vi.h
@@ -0,0 +1,52 @@
+// vsm4k.vi vd, vs2, round#
+
+#include "zvksed_ext_macros.h"
+
+// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft.
+static constexpr uint32_t zvksed_ck[32] = {
+ 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+ 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+ 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+ 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+ 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+ 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+ 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+ 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+ {},
+ // The following statements will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the 'round' variable
+ // declared and defined here to be visible in the loop block.
+ // Only consider the bottom 3 bits of the immediate, ensuring that
+ // 'round' is in the valid range [0, 7].
+ const reg_t round = zimm5 & 0x7;,
+ // Per Element Group body.
+ {
+ // {rk0, rk1, rk2, rk3} <- vs2
+ EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+ uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
+ uint32_t S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);
+
+ B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);
+
+ B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);
+
+ B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
+ S = ZVKSED_SUB_BYTES(B);
+ uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
+ }
+);
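The ZVKSED_* helpers are defined in zvksed_ext_macros.h, not shown in this portion of the diff. As a rough guide, a sketch of the two SM4 linear transforms they are assumed to apply after the S-box step: L'(B) in the key schedule (vsm4k above) and L(B) in the cipher rounds (vsm4r, next files). Names are local to this sketch:

#include <cstdint>
#include <cstdio>

static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

// Key-schedule linear transform L'(B) = B ^ (B <<< 13) ^ (B <<< 23).
static uint32_t sm4_key_linear(uint32_t b) { return b ^ rol32(b, 13) ^ rol32(b, 23); }

// Round linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
static uint32_t sm4_round_linear(uint32_t b) {
  return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
}

int main() {
  // ZVKSED_ROUND_KEY(X, S) is assumed to compute X ^ L'(S);
  // ZVKSED_ROUND(X, S) is assumed to compute X ^ L(S).
  std::printf("%08x %08x\n", sm4_key_linear(1), sm4_round_linear(1));
  return 0;
}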
diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h
new file mode 100644
index 0000000..44011eb
--- /dev/null
+++ b/riscv/insns/vsm4r_vs.h
@@ -0,0 +1,51 @@
+// vsm4r.vs vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+// No overlap of vd and vs2.
+require(insn.rd() != insn.rs2());
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+ {},
+ // This statement will be executed before the first execution
+ // of the loop, and only if the loop is going to be entered.
+ // We cannot use a block ( { ... } ) since we want the variables declared
+ // here to be visible in the loop block.
+ // We capture the "scalar", vs2's first element, by copy, even though
+ // the "no overlap" constraint means that vs2 should remain constant
+ // during the loop.
+ const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
+ const uint32_t rk0 = scalar_key[0];
+ const uint32_t rk1 = scalar_key[1];
+ const uint32_t rk2 = scalar_key[2];
+ const uint32_t rk3 = scalar_key[3];,
+ {
+ EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
+
+ // {x0, x1, x2, x3} <- vd
+ EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);
+
+ uint32_t B;
+ uint32_t S;
+
+ B = x1 ^ x2 ^ x3 ^ rk0;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+ B = x2 ^ x3 ^ x4 ^ rk1;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+ B = x3 ^ x4 ^ x5 ^ rk2;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+ B = x4 ^ x5 ^ x6 ^ rk3;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(state, x4, x5, x6, x7);
+ }
+);
diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h
new file mode 100644
index 0000000..9a18cec
--- /dev/null
+++ b/riscv/insns/vsm4r_vv.h
@@ -0,0 +1,37 @@
+// vsm4r.vv vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+ {},
+ {
+ // {x0, x1, x2, x3} <- vd
+ EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
+ // {rk0, rk1, rk2, rk3} <- vs2
+ EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+ uint32_t B;
+ uint32_t S;
+
+ B = x1 ^ x2 ^ x3 ^ rk0;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+ B = x2 ^ x3 ^ x4 ^ rk1;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+ B = x3 ^ x4 ^ x5 ^ rk2;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+ B = x4 ^ x5 ^ x6 ^ rk3;
+ S = ZVKSED_SUB_BYTES(B);
+ const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+ // Update the destination register.
+ SET_EGU32x4_LE(vd, x4, x5, x6, x7);
+ }
+);
diff --git a/riscv/insns/vwsll_vi.h b/riscv/insns/vwsll_vi.h
new file mode 100644
index 0000000..13b5eb4
--- /dev/null
+++ b/riscv/insns/vwsll_vi.h
@@ -0,0 +1,10 @@
+// vwsll.vi vd, vs2, zimm5, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VI_WIDENING_ULOOP({
+ const reg_t shift = zimm5 & ((2 * sew) - 1);
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vv.h b/riscv/insns/vwsll_vv.h
new file mode 100644
index 0000000..5a64c6c
--- /dev/null
+++ b/riscv/insns/vwsll_vv.h
@@ -0,0 +1,10 @@
+// vwsll.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VV_WIDENING_ULOOP({
+ const reg_t shift = (vs1 & ((2 * sew) - 1));
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vx.h b/riscv/insns/vwsll_vx.h
new file mode 100644
index 0000000..5264e80
--- /dev/null
+++ b/riscv/insns/vwsll_vx.h
@@ -0,0 +1,10 @@
+// vwsll.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VX_WIDENING_ULOOP({
+ const reg_t shift = (rs1 & ((2 * sew) - 1));
+ vd_w = vs2_w << shift;
+});
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 1c4300c..59472a4 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -236,10 +236,55 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
extension_table[EXT_ZICOND] = true;
} else if (ext_str == "zihpm") {
extension_table[EXT_ZIHPM] = true;
+ } else if (ext_str == "zvbb") {
+ extension_table[EXT_ZVBB] = true;
+ } else if (ext_str == "zvbc") {
+ extension_table[EXT_ZVBC] = true;
} else if (ext_str == "zvfbfmin") {
extension_table[EXT_ZVFBFMIN] = true;
} else if (ext_str == "zvfbfwma") {
extension_table[EXT_ZVFBFWMA] = true;
+ } else if (ext_str == "zvkg") {
+ extension_table[EXT_ZVKG] = true;
+ } else if (ext_str == "zvkn") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvknc") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVBC] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvkng") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKG] = true;
+ extension_table[EXT_ZVKNED] = true;
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvkned") {
+ extension_table[EXT_ZVKNED] = true;
+ } else if (ext_str == "zvknha") {
+ extension_table[EXT_ZVKNHA] = true;
+ } else if (ext_str == "zvknhb") {
+ extension_table[EXT_ZVKNHB] = true;
+ } else if (ext_str == "zvks") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksc") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVBC] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksg") {
+ extension_table[EXT_ZVBB] = true;
+ extension_table[EXT_ZVKG] = true;
+ extension_table[EXT_ZVKSED] = true;
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvksed") {
+ extension_table[EXT_ZVKSED] = true;
+ } else if (ext_str == "zvksh") {
+ extension_table[EXT_ZVKSH] = true;
+ } else if (ext_str == "zvkt") {
} else if (ext_str == "sstc") {
extension_table[EXT_SSTC] = true;
} else if (ext_str[0] == 'x') {
@@ -295,7 +340,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
}
if ((extension_table[EXT_ZCMP] || extension_table[EXT_ZCMT]) && extension_table[EXT_ZCD]) {
- bad_isa_string(str, "Zcmp' and 'Zcmt' exensions are incompatible with 'Zcd' extension");
+ bad_isa_string(str, "Zcmp' and 'Zcmt' extensions are incompatible with 'Zcd' extension");
}
if ((extension_table[EXT_ZCF] || extension_table[EXT_ZCD] || extension_table[EXT_ZCB] ||
@@ -307,6 +352,24 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
bad_isa_string(str, "'Zacas' extension requires 'A' extension");
}
+ // Zpn conflicts with Zvknha/Zvknhb in both rv32 and rv64
+ if (extension_table[EXT_ZPN] && (extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB])) {
+ bad_isa_string(str, "'Zvkna' and 'Zvknhb' extensions are incompatible with 'Zpn' extension");
+ }
+ // In rv64 only, Zpn (rv64_zpn) conflicts with Zvkg/Zvkned/Zvksh
+ if (max_xlen == 64 && extension_table[EXT_ZPN] &&
+ (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
+ bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
+ }
+#ifdef WORDS_BIGENDIAN
+ // Access to the vector registers as element groups is unimplemented on big-endian setups.
+ if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKNHA] ||
+ extension_table[EXT_ZVKNHB] || extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) {
+ bad_isa_string(str,
+ "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' "
+ "extensions are incompatible with WORDS_BIGENDIAN setups.");
+ }
+#endif
std::string lowercase = strtolower(priv);
bool user = false, supervisor = false;
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 3cbee7d..5b04347 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -58,8 +58,24 @@ typedef enum {
EXT_ZICNTR,
EXT_ZICOND,
EXT_ZIHPM,
+ EXT_ZVBB,
+ EXT_ZVBC,
EXT_ZVFBFMIN,
EXT_ZVFBFWMA,
+ EXT_ZVKG,
+ EXT_ZVKNED,
+ EXT_ZVKNHA,
+ EXT_ZVKNHB,
+ EXT_ZVKSED,
+ EXT_ZVKSH,
+ EXT_XZBP,
+ EXT_XZBS,
+ EXT_XZBE,
+ EXT_XZBF,
+ EXT_XZBC,
+ EXT_XZBM,
+ EXT_XZBR,
+ EXT_XZBT,
EXT_SSTC,
EXT_ZACAS,
EXT_INTERNAL_ZFH_MOVE,
diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index a30c770..2214be4 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -12,3 +12,12 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)
+DECLARE_OVERLAP_INSN(vaesdf_vv, EXT_ZVKNED)
+DECLARE_OVERLAP_INSN(vghsh_vv, EXT_ZVKG)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHA)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHB)
+DECLARE_OVERLAP_INSN(vsm3me_vv, EXT_ZVKSH)
+DECLARE_OVERLAP_INSN(rstsa16, EXT_ZPN)
+DECLARE_OVERLAP_INSN(rstsa32, EXT_ZPN)
+DECLARE_OVERLAP_INSN(srli32_u, EXT_ZPN)
+DECLARE_OVERLAP_INSN(umax32, EXT_ZPN)
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 6472982..a3e125f 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1340,32 +1340,98 @@ riscv_insn_ext_zacas = \
amocas_d \
$(if $(HAVE_INT128),amocas_q)
+riscv_insn_ext_zvbb = \
+ vandn_vv \
+ vandn_vx \
+ vbrev8_v \
+ vbrev_v \
+ vclz_v \
+ vcpop_v \
+ vctz_v \
+ vrev8_v \
+ vrol_vv \
+ vrol_vx \
+ vror_vi \
+ vror_vv \
+ vror_vx \
+ vwsll_vi \
+ vwsll_vv \
+ vwsll_vx \
+
+riscv_insn_ext_zvbc = \
+ vclmul_vv \
+ vclmul_vx \
+ vclmulh_vv \
+ vclmulh_vx \
+
+riscv_insn_ext_zvkg = \
+ vghsh_vv \
+ vgmul_vv \
+
+riscv_insn_ext_zvkned = \
+ vaesdf_vs \
+ vaesdf_vv \
+ vaesdm_vs \
+ vaesdm_vv \
+ vaesef_vs \
+ vaesef_vv \
+ vaesem_vs \
+ vaesem_vv \
+ vaeskf1_vi \
+ vaeskf2_vi \
+ vaesz_vs \
+
+# Covers both Zvknha and Zvknhb.
+riscv_insn_ext_zvknh = \
+ vsha2cl_vv \
+ vsha2ch_vv \
+ vsha2ms_vv \
+
+riscv_insn_ext_zvksed = \
+ vsm4k_vi \
+ vsm4r_vs \
+ vsm4r_vv \
+
+riscv_insn_ext_zvksh = \
+ vsm3c_vi \
+ vsm3me_vv \
+
+riscv_insn_ext_zvk = \
+ $(riscv_insn_ext_zvbb) \
+ $(riscv_insn_ext_zvbc) \
+ $(riscv_insn_ext_zvkg) \
+ $(riscv_insn_ext_zvkned) \
+ $(riscv_insn_ext_zvknh) \
+ $(riscv_insn_ext_zvksed) \
+ $(riscv_insn_ext_zvksh) \
+
riscv_insn_list = \
+ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
$(riscv_insn_ext_a) \
+ $(riscv_insn_ext_b) \
+ $(riscv_insn_ext_bf16) \
$(riscv_insn_ext_c) \
- $(riscv_insn_ext_i) \
- $(riscv_insn_ext_m) \
- $(riscv_insn_ext_f) \
- $(riscv_insn_ext_f_zfa) \
+ $(riscv_insn_ext_cmo) \
$(riscv_insn_ext_d) \
$(riscv_insn_ext_d_zfa) \
- $(riscv_insn_ext_zfh) \
- $(riscv_insn_ext_zfh_zfa) \
+ $(riscv_insn_ext_f) \
+ $(riscv_insn_ext_f_zfa) \
+ $(riscv_insn_ext_h) \
+ $(riscv_insn_ext_i) \
+ $(riscv_insn_ext_k) \
+ $(riscv_insn_ext_m) \
+ $(riscv_insn_ext_p) \
$(riscv_insn_ext_q) \
$(riscv_insn_ext_q_zfa) \
- $(riscv_insn_ext_b) \
- $(riscv_insn_ext_k) \
- $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
+ $(riscv_insn_ext_zacas) \
$(riscv_insn_ext_zce) \
- $(riscv_insn_ext_h) \
- $(riscv_insn_ext_p) \
+ $(riscv_insn_ext_zfh) \
+ $(riscv_insn_ext_zfh_zfa) \
+ $(riscv_insn_ext_zicond) \
+ $(riscv_insn_ext_zvk) \
$(riscv_insn_priv) \
- $(riscv_insn_svinval) \
$(riscv_insn_smrnmi) \
- $(riscv_insn_ext_cmo) \
- $(riscv_insn_ext_zicond) \
- $(riscv_insn_ext_bf16) \
- $(riscv_insn_ext_zacas) \
+ $(riscv_insn_svinval) \
riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list))
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 41256c7..908ff16 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize,
type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+#define V_U_PARAMS(x) \
+ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
#define VX_U_PARAMS(x) \
type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
@@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize,
} \
VI_LOOP_END
+#define VI_V_ULOOP(BODY) \
+ VI_CHECK_SSS(false) \
+ VI_LOOP_BASE \
+ if (sew == e8) { \
+ V_U_PARAMS(e8); \
+ BODY; \
+ } else if (sew == e16) { \
+ V_U_PARAMS(e16); \
+ BODY; \
+ } else if (sew == e32) { \
+ V_U_PARAMS(e32); \
+ BODY; \
+ } else if (sew == e64) { \
+ V_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
#define VI_VX_ULOOP(BODY) \
VI_CHECK_SSS(false) \
VI_LOOP_BASE \
diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc
index 9128df6..08adc61 100644
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write
return regStart[n];
}
+// The logic differences between 'elt()' and 'elt_group()' come from
+// the fact that, while 'elt()' requires that the element is fully
+// contained in a single vector register, the element group may span
+// multiple registers in a single register group (LMUL>1).
+//
+// Notes:
+// - We do NOT check that a single element - i.e., the T in the element
+// group type std::array<T, N> - fits within a single register, or that
+// T is no wider than VSEW. Implementations of the instructions
+// sometimes use a different T than what the specification suggests.
+// Instruction implementations should 'require()' what the specification
+// dictates.
+// - We do NOT check that 'vReg' is a valid register group, or that
+// 'n+1' element groups fit in the register group 'vReg'. It is
+// the responsibility of the caller to validate those preconditions.
+template<typename EG> EG&
+vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) {
+#ifdef WORDS_BIGENDIAN
+ fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n",
+ stderr);
+ abort();
+#endif
+ using T = typename EG::value_type;
+ constexpr std::size_t N = std::tuple_size<EG>::value;
+ assert(N > 0);
+
+ assert(vsew != 0);
+ constexpr reg_t elt_group_size = N * sizeof(T);
+ const reg_t reg_group_size = (VLEN >> 3) * vflmul;
+ assert(((n + 1) * elt_group_size) <= reg_group_size);
+
+ const reg_t start_byte = n * elt_group_size;
+ const reg_t bytes_per_reg = VLEN >> 3;
+
+ // Inclusive first/last register indices.
+ const reg_t reg_first = vReg + start_byte / bytes_per_reg;
+ const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg;
+
+ // Mark every register spanned by this element group as referenced.
+ for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) {
+ reg_referenced[vidx] = 1;
+
+ if (unlikely(p->get_log_commits_enabled() && is_write)) {
+ p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0};
+ }
+ }
+
+ return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte);
+}
+
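// [Editorial illustration — not part of this patch] A minimal sketch of how
// an instruction body might use elt_group<>() as defined above; the register
// numbers and the element-group index 'idx_eg' are hypothetical:
//
//   EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
//   const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>(vs2_num, idx_eg);
//   for (std::size_t i = 0; i < 4; ++i)
//     vd[i] ^= vs2[i];  // e.g., XOR the whole 128-bit group into vd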
template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool);
template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool);
template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool);
@@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool);
template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool);
template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool);
template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool);
+
+template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool);
+template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool);
+template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool);
+template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool);
diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h
index b9f706c..a057c62 100644
--- a/riscv/vector_unit.h
+++ b/riscv/vector_unit.h
@@ -2,6 +2,9 @@
#ifndef _RISCV_VECTOR_UNIT_H
#define _RISCV_VECTOR_UNIT_H
+#include <array>
+#include <cstdint>
+
#include "decode.h"
#include "csrs.h"
@@ -69,6 +72,17 @@ struct type_sew_t<64>
using type=int64_t;
};
+// Element Group of 4 32-bit elements (128b total).
+using EGU32x4_t = std::array<uint32_t, 4>;
+
+// Element Group of 8 32-bit elements (256b total).
+using EGU32x8_t = std::array<uint32_t, 8>;
+
+// Element Group of 4 64-bit elements (256b total).
+using EGU64x4_t = std::array<uint64_t, 4>;
+
+// Element Group of 16 8-bit elements (128b total).
+using EGU8x16_t = std::array<uint8_t, 16>;
class vectorUnit_t
{
@@ -88,8 +102,11 @@ public:
bool vill;
bool vstart_alu;
- // vector element for varies SEW
+ // vector element for various SEW
template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false);
+ // vector element group access, where EG is a std::array<T, N>.
+ template<typename EG> EG&
+ elt_group(reg_t vReg, reg_t n, bool is_write = false);
public:
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
new file mode 100644
index 0000000..bf893f9
--- /dev/null
+++ b/riscv/zvk_ext_macros.h
@@ -0,0 +1,1035 @@
+// Helper macros used to implement instructions defined as part of
+// the RISC-V Zvk extension (vector cryptography).
+
+// Note that a good deal of code here would be cleaner/simpler
+// if exposed as C++ functions (including templated ones), however
+// this is not possible in the contexts where those headers are
+// included.
+
+#ifndef RISCV_ZVK_EXT_MACROS_H_
+#define RISCV_ZVK_EXT_MACROS_H_
+
+//
+// Predicate Macros
+//
+
+// Ensures that the ZVBB extension (vector crypto bitmanip) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvbb \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVBB); \
+ } while (0)
+
+// Ensures that the ZVBC extension (vector carryless multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvbc \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVBC); \
+ } while (0)
+
+// Ensures that the ZVKG extension (vector Galois field multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvkg \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKG); \
+ } while (0)
+
+// Ensures that a ZVK extension supporting SHA-256 is present.
+// For SHA-256, this support is present in either Zvknha or Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_256 \
+ do { \
+ require_vector(true); \
+ require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \
+ } while (0)
+
+// Ensures that the ZVKNED extension (vector AES single round) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvkned \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKNED); \
+ } while (0)
+
+// Ensures that a ZVK extension supporting SHA-512 is present.
+// For SHA-512, this support is only present in Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_512 \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKNHB); \
+ } while (0)
+
+// Ensures that the ZVKSED extension (vector SM4 block cipher)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvksed \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKSED); \
+ } while (0)
+
+// Ensures that the ZVKSH extension (vector SM3 hash) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvksh \
+ do { \
+ require_vector(true); \
+ require_extension(EXT_ZVKSH); \
+ } while (0)
+
+// Ensures that the vector instruction is not using a mask.
+#define require_no_vmask require(insn.v_vm() == 1)
+
+// Ensures that an element group can fit in a register group. That is,
+// EGW <= (LMUL * VLEN)
+#define require_egw_fits(EGW) require((EGW) <= (P.VU.VLEN * P.VU.vflmul))
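// [Editorial note — not part of this patch] Worked example: with VLEN=128,
// require_egw_fits(128) passes for LMUL >= 1, while require_egw_fits(256)
// passes only for LMUL >= 2, since the check is EGW <= VLEN * LMUL.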
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=4 (four 32-bit elements per group),
+// for an effective element group width of EGW=128 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x4 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 4 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 4 == 0); \
+ } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=8 (eight 32-bit elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x8 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 8 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 8 == 0); \
+ } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=64, EGS=4 (four 64-bit elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_64x4 \
+ do { \
+ /* 'vstart' must be a multiple of EGS */ \
+ const reg_t vstart = P.VU.vstart->read(); \
+ require(vstart % 4 == 0); \
+ /* 'vl' must be a multiple of EGS */ \
+ const reg_t vl = P.VU.vl->read(); \
+ require(vl % 4 == 0); \
+ } while (0)
+
+//
+// Loop Parameters Macros
+//
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x4_t vs1 = P.VU.elt_group<EGU32x4_t>((VS1_NUM), (EG_IDX)); \
+ const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x8_t vs1 = P.VU.elt_group<EGU32x8_t>((VS1_NUM), (EG_IDX)); \
+ const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+ EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+ EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+ const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 64b*4 element groups as EGU64x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+ EGU64x4_t &vd = P.VU.elt_group<EGU64x4_t>((VD_NUM), (EG_IDX), true); \
+ const EGU64x4_t vs1 = P.VU.elt_group<EGU64x4_t>((VS1_NUM), (EG_IDX)); \
+ const EGU64x4_t vs2 = P.VU.elt_group<EGU64x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1',
+// as part of a widening operation where 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto vs1 = P.VU.elt<type_usew_t<SEW>::type>(rs1_num, i); \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the scalar register 'rs1', as part of a widening operation where
+// 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto rs1 = (type_usew_t<SEW>::type)RS1; \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the 5-bit immediate field 'zimm5', as part of a widening operation
+// where 'vd' has EEW = 2 * SEW.
+// Defines
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \
+ auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+ const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+ const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+ const auto zimm5 = (type_usew_t<SEW>::type)insn.v_zimm5(); \
+
+//
+// Loop Macros
+//
+
+// NOTES:
+// - Each of the element-group loop macros DOES contain an invocation
+// of the corresponding 'require_element_groups_<bits>x<#elements>;',
+// because the macro correctness requires proper VL/VSTART values.
+// - Each of the loop macros named "_NOVM_" DOES contain an invocation
+// of the 'require_no_vmask;' macro. Those macros (all of them
+// at this time) do not support masking (i.e., no skipping
+// of elements/element groups is performed).
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs1': EGU32x4_t, content of the current element group
+// in the 'vs1' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
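// [Editorial illustration — not part of this patch] A hypothetical use of the
// loop macro above; the element-group body simply XORs vs1 and vs2 into vd,
// purely to show the shape an instruction implementation takes:
//
//   VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
//     {},  // no PRELUDE needed
//     {
//       for (std::size_t i = 0; i < 4; ++i)
//         vd[i] ^= vs1[i] ^ vs2[i];
//     }
//   );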
+
+// Processes all 32b*8 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+// - This macro contains an invocation of the macro 'require_element_groups_32x8;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x8_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs1': EGU32x8_t, content of the current element group
+// in the 'vs1' vector register / register group.
+// 'vs2': EGU32x8_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x8; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+ const reg_t vl_eg = P.VU.vl->read() / 8; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+// - this macro does NOT extract the element groups into EGU32x4_t
+// variables. It is intended for uses where there is a more natural
+// type to use (e.g., EGU8x16_t). The type should still be 128 bits
+// wide if extracted via 'P.VU.elt_group<Type>(...)'.
+// - this macro offers the additional PRELOOP code block argument,
+// that is executed once if the loop is going to be entered.
+// This is intended for use with "vector scalar" instructions where
+// we extract the first element group from one of the operands and
+// use it for all loop iterations.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+ PRELOOP, \
+ EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd and vs2. This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+// - this macro is meant to be used for "op vd, vs2" instructions,
+// whether vd is output only, or input and output.
+// - this macro does NOT extract the element groups into EGU32x4_t
+// variables. It is intended for uses where there is a more natural
+// type to use (e.g., EGU8x16_t). The type should still be 128 bits
+// wide if extracted via 'P.VU.elt_group<Type>(...)'.
+// - this macro offers the additional PRELOOP code block argument,
+// that is executed once if the loop is going to be entered.
+// This is intended for use with "vector scalar" instructions where
+// we extract the first element group from one of the operands and
+// use it for all loop iterations.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+ PRELOOP, \
+ EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2. This interprets the vectors as containing element groups
+// of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as
+// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'zimm5': 5 bits unsigned immediate
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x4_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x4_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+ do { \
+ require_element_groups_32x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t zimm5 = insn.v_zimm5(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 32b*8 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as
+// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_32x8;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - PRELOOP, invoked once IF there is at least one element group to process.
+// It is NOT placed in its own scope, variables declared in PRELOOP are
+// visible when EG_BODY executes.
+// Pass {} when there is no need for such a pre-loop block.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs2_num': register index of vs2
+// 'zimm5': unsigned 5 bits immediate
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU32x8_t reference, mutable, content of the current
+// element group in the 'vd' vector register / register group.
+// 'vs2': EGU32x8_t, content of the current element group
+// in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+ do { \
+ require_element_groups_32x8; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t zimm5 = insn.v_zimm5(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+ const reg_t vl_eg = P.VU.vl->read() / 8; \
+ do { PRELUDE } while (0); \
+ if (vstart_eg < vl_eg) { \
+ PRELOOP \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+// Processes all 64b*4 element groups available in the vector registers
+// vd, vs1, and vs2. This interprets the vectors as containing element groups
+// of 4 uint64_t values (EGW=256, EEW=64, EGS=4), *ignoring* the current
+// SEW that applies to the vectors.
+//
+// IMPORTANT
+// - This macro contains an invocation of 'require_element_groups_64x4;',
+// since the "loop" macro correctness depends on invariants that
+// are checked by the "require" macro.
+// - This macro does not support masking, and contains an invocation
+// of 'require_no_vmask;'.
+// - While the name states "VD_VS1_VS2", many vector instructions
+// are specified as "op vd, vs2, vs1". This macro does not imply
+// a specific operand order and can be used with both "op vd, vs2, vs1"
+// and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+// - PRELUDE, invoked once, before any element group. It is executed even
+// if the vector is empty. It is placed in a "do { } while (0);", hence
+// any variable declared there is not visible outside.
+// - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+// 'vd_num': register index of vd
+// 'vs1_num': register index of vs1
+// 'vs2_num': register index of vs2
+// 'vstart_eg': index of the first element group, *in EG units*
+// 'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+// 'idx_eg': index of the current element group.
+// 'vd': EGU64x4_t reference, content of the current element group
+// in the 'vd' vector register / vector register group.
+// 'vs1': EGU64x4_t, content of the current element group
+// in the 'vs1' vector register / vector register group.
+// 'vs2': EGU64x4_t, content of the current element group
+// in the 'vs2' vector register / vector register group.
+#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+ do { \
+ require_element_groups_64x4; \
+ require_no_vmask; \
+ const reg_t vd_num = insn.rd(); \
+ const reg_t vs1_num = insn.rs1(); \
+ const reg_t vs2_num = insn.rs2(); \
+ const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+ const reg_t vl_eg = P.VU.vl->read() / 4; \
+ do { PRELUDE } while (0); \
+ for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+ VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+ EG_BODY \
+ } \
+ P.VU.vstart->write(0); \
+ } while (0)
+
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, v1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VV_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
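// [Editorial illustration — not part of this patch] A hypothetical body for
// the widening loop above, in the style of a vwsll.vv-like shift: the
// zero-extended 'vs2_w' is shifted left by 'vs1', masked to the widened
// element width (2 * SEW):
//
//   VI_ZVK_VV_WIDENING_ULOOP({
//     vd_w = vs2_w << (vs1 & ((2 * sew) - 1));
//   });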
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, rs1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VX_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+// - 'vs2', unsigned, SEW width, by value, constant.
+// - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+// a widened copy of 'vs2'.
+// - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \
+ do { \
+ VI_CHECK_DSS(true); \
+ VI_LOOP_BASE \
+ switch (sew) { \
+ case e8: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e8); \
+ BODY \
+ break; \
+ } \
+ case e16: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e16); \
+ BODY \
+ break; \
+ } \
+ case e32: { \
+ VI_ZVK_VI_WIDENING_U_PARAMS(e32); \
+ BODY \
+ break; \
+ } \
+ } \
+ VI_LOOP_END \
+ } while (0)
+
+//
+// Element Group Manipulation Macros
+//
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Little Endian" (LE) order, i.e., from the least significant (W0)
+// to the most significant (W3).
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \
+ uint32_t W0 = (X)[0]; \
+ uint32_t W1 = (X)[1]; \
+ uint32_t W2 = (X)[2]; \
+ uint32_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Little Endian" (LE)
+// order, i.e., from the least significant (W0) to the most
+// significant (W3).
+#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \
+ uint32_t W0 = (X)[0]; \
+ uint32_t W1 = (X)[1]; \
+ uint32_t W2 = (X)[2]; \
+ uint32_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Byte-swaps a uint32_t so that the order of its bytes
+// is reversed.
+#define ZVK_BSWAP32(x) \
+ ((((uint32_t)((x) >> 24)) & 0xFF) << 0 | \
+ (((uint32_t)((x) >> 16)) & 0xFF) << 8 | \
+ (((uint32_t)((x) >> 8)) & 0xFF) << 16 | \
+ (((uint32_t)((x) >> 0)) & 0xFF) << 24)
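// [Editorial note — not part of this patch] Worked example:
//   ZVK_BSWAP32(0x01020304) == 0x04030201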
+
+// Extracts 8 uint32_t words from the input EGU32x8_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W7)
+// to the least significant (W0). Each of the words is byte-swapped,
+// from a big-endian representation in the EGU32x8_t to a native/little-endian
+// ordering in the variables.
+#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+ uint32_t W0 = ZVK_BSWAP32((X)[0]); \
+ uint32_t W1 = ZVK_BSWAP32((X)[1]); \
+ uint32_t W2 = ZVK_BSWAP32((X)[2]); \
+ uint32_t W3 = ZVK_BSWAP32((X)[3]); \
+ uint32_t W4 = ZVK_BSWAP32((X)[4]); \
+ uint32_t W5 = ZVK_BSWAP32((X)[5]); \
+ uint32_t W6 = ZVK_BSWAP32((X)[6]); \
+ uint32_t W7 = ZVK_BSWAP32((X)[7]); \
+ (void)(0)
+
+// Sets the element words of the given EGU32x8_t variable 'X' to
+// the given 8 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W7) to the least
+// significant (W0). Each of the words is byte-swapped,
+// from a native/little-endian ordering in the variables to
+// a big-endian representation in the EGU32x8_t.
+#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = ZVK_BSWAP32(W0); \
+ (X)[1] = ZVK_BSWAP32(W1); \
+ (X)[2] = ZVK_BSWAP32(W2); \
+ (X)[3] = ZVK_BSWAP32(W3); \
+ (X)[4] = ZVK_BSWAP32(W4); \
+ (X)[5] = ZVK_BSWAP32(W5); \
+ (X)[6] = ZVK_BSWAP32(W6); \
+ (X)[7] = ZVK_BSWAP32(W7); \
+ } while (0)
+
+// Extracts 4 uint64_t words from the input EGU64x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \
+ uint64_t W0 = (X)[0]; \
+ uint64_t W1 = (X)[1]; \
+ uint64_t W2 = (X)[2]; \
+ uint64_t W3 = (X)[3]; \
+ (void)(0)
+
+// Sets the element words of the given EGU64x4_t variable 'X' to
+// the given 4 uint64_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \
+ do { \
+ (X)[0] = (W0); \
+ (X)[1] = (W1); \
+ (X)[2] = (W2); \
+ (X)[3] = (W3); \
+ } while (0)
+
+// Copies a EGU8x16_t value from 'SRC' into 'DST'.
+#define EGU8x16_COPY(DST, SRC) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (DST)[bidx] = (SRC)[bidx]; \
+ }
+
+// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU8x16_XOREQ(MUT_A, CONST_B) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+ }
+
+// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU32x4_XOREQ(MUT_A, CONST_B) \
+ for (std::size_t idx = 0; idx < 4; ++idx) { \
+ (MUT_A)[idx] ^= (CONST_B)[idx]; \
+ }
+
+// Performs "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU8x16_XOR(DST, A, B) \
+ for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+ (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
+ }
+
+// Performs "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU32x4_XOR(DST, A, B) \
+ do { \
+ static_assert(std::is_same<EGU32x4_t, decltype(A)>::value); \
+ static_assert(std::is_same<EGU32x4_t, decltype(B)>::value); \
+ static_assert(std::is_same<EGU32x4_t, decltype(DST)>::value); \
+ for (std::size_t idx = 0; idx < 4; ++idx) { \
+ (DST)[idx] = (A)[idx] ^ (B)[idx]; \
+ } \
+ } while (0)
+
+//
+// Common bit manipulations logic.
+//
+
+// Form a 64 bit integer with bit X set
+#define ZVK_BIT(X) (1ULL << (X))
+
+// Reverse the order of bits within bytes of a word.
+// This is used to match the data interpretation in NIST SP 800-38D
+// a.k.a the GCM specification.
+#define ZVK_BREV8_32(X) \
+ do { \
+ (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \
+ (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \
+ (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \
+ } while (0)
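// [Editorial note — not part of this patch] Worked example: ZVK_BREV8_32
// reverses the bit order within each byte, so an input of 0x00000001
// becomes 0x00000080 (bit 0 of the low byte moves to bit 7).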
+
+// Rotates right a uint32_t value by N bits.
+// uint32_t ROR32(uint32_t X, std::size_t N);
+#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N))
+
+// Rotates right a uint64_t value by N bits.
+// uint64_t ROR64(uint64_t X, std::size_t N);
+#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N))
+
+// Rotates left a uint32_t value by N bits.
+// uint32_t ROL32(uint32_t X, std::size_t N);
+#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N))
+
+//
+// Element Group Bit Manipulation Macros
+//
+
+// Performs bit reversal in a EGU32x4_t group.
+#define EGU32x4_BREV8(X) \
+ for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+ ZVK_BREV8_32((X)[bidx]); \
+ }
+
+// Checks if a given bit is set within an EGU32x4_t group.
+// Assumes LE ordering.
+#define EGU32x4_ISSET(X, BIDX) \
+ (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0)
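// [Editorial note — not part of this patch] Worked example:
// EGU32x4_ISSET(X, 37) tests bit 5 of X[1], since 37 / 32 == 1 and
// 37 % 32 == 5.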
+
+// Shifts an EGU32x4_t group left by one bit.
+//
+// Since the entire 128-bit value is shifted, we need to handle carry bits.
+// To limit the amount of carry-check logic, the elements are copied into
+// a 64-bit temporary variable.
+#define EGU32x4_LSHIFT(X) \
+ do { \
+ uint64_t dword; \
+ dword = ((uint64_t)(X)[3]) << 32; \
+ dword |= X[2]; \
+ dword <<= 1; \
+ if (X[1] & ZVK_BIT(31)) { \
+ dword |= ZVK_BIT(0); \
+ } \
+ X[2] = dword & UINT32_MAX; \
+ X[3] = dword >> 32; \
+ dword = ((uint64_t)(X)[1]) << 32; \
+ dword |= X[0]; \
+ dword <<= 1; \
+ X[0] = dword & UINT32_MAX; \
+ X[1] = dword >> 32; \
+ } while (0)
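// [Editorial note — not part of this patch] Worked example, with X given in
// LE word order {X[0], X[1], X[2], X[3]}:
//   X = {0x80000000, 0x00000000, 0xFFFFFFFF, 0x00000001}
// becomes, after EGU32x4_LSHIFT(X),
//   X = {0x00000000, 0x00000001, 0xFFFFFFFE, 0x00000003}
// The top bit of X[0] carries into bit 0 of X[1], the top bit of X[2]
// carries into X[3], and the top bit of X[3] is shifted out and discarded.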
+
+#endif // RISCV_ZVK_EXT_MACROS_H_
diff --git a/riscv/zvkned_ext_macros.h b/riscv/zvkned_ext_macros.h
new file mode 100644
index 0000000..db705c7
--- /dev/null
+++ b/riscv/zvkned_ext_macros.h
@@ -0,0 +1,270 @@
+// Helper macros used to implement instructions defined as part of
+// the RISC-V Zvkned extension (vector AES single round).
+
+#include "insns/aes_common.h"
+
+#ifndef RISCV_ZVKNED_EXT_MACROS_H_
+#define RISCV_ZVKNED_EXT_MACROS_H_
+
+// vaes*.vs instruction constraints:
+// - Zvkned is enabled
+// - EGW (128) <= LMUL * VLEN
+// - vd and vs2 cannot overlap
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vs_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+// vaes*.vv instruction constraints. Those are the same as the .vs ones,
+// except for the overlap constraint that is not present for .vv variants.
+// - Zvkned is enabled
+// - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vv_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+// vaeskf*.vi instruction constraints. Those are the same as the .vv ones.
+#define require_vaeskf_vi_constraints \
+ do { \
+ require_zvkned; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+#define VAES_XTIME(A) (((A) << 1) ^ (((A) & 0x80) ? 0x1b : 0))
+
+#define VAES_GFMUL(A, B) \
+ ((((B) & 0x1) ? (A) : 0) ^ \
+ (((B) & 0x2) ? VAES_XTIME(A) : 0) ^ \
+ (((B) & 0x4) ? VAES_XTIME(VAES_XTIME(A)) : 0) ^ \
+ (((B) & 0x8) ? VAES_XTIME(VAES_XTIME(VAES_XTIME(A))) : 0))
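// [Editorial note — not part of this patch] Worked example, matching the
// GF(2^8) arithmetic in FIPS-197:
//   VAES_XTIME(0x57)       == 0xAE   // {57} . {02}
//   VAES_GFMUL(0x57, 0x03) == 0xF9   // ({57} . {02}) ^ {57}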
+
+// Apply the S-box transform to every byte in the VAESState 'state'
+#define VAES_SUB_BYTES(STATE) \
+ do { \
+ static constexpr uint8_t kVAESXEncSBox[256]= { \
+ 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, \
+ 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, \
+ 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, \
+ 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, \
+ 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, \
+ 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, \
+ 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, \
+ 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, \
+ 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, \
+ 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, \
+ 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, \
+ 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, \
+ 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, \
+ 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, \
+ 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, \
+ 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, \
+ 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, \
+ 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, \
+ 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, \
+ 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, \
+ 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, \
+ 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, \
+ 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, \
+ 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, \
+ 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, \
+ 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, \
+ 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, \
+ 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, \
+ 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, \
+ 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, \
+ 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, \
+ 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, \
+ }; \
+ for (uint8_t& byte : (STATE)) { \
+ byte = kVAESXEncSBox[byte]; \
+ } \
+ } while (0)
+
+// Applies the S-box inverse (decode) transform to every byte
+// in the VAESState 'state'.
+#define VAES_INV_SUB_BYTES(STATE) \
+ do { \
+ static constexpr uint8_t kVAESXDecSBox[256] = { \
+ 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, \
+ 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, \
+ 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, \
+ 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, \
+ 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, \
+ 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, \
+ 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, \
+ 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, \
+ 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, \
+ 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, \
+ 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, \
+ 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, \
+ 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, \
+ 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, \
+ 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, \
+ 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, \
+ 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, \
+ 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, \
+ 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, \
+ 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, \
+ 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, \
+ 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, \
+ 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, \
+ 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, \
+ 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, \
+ 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, \
+ 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, \
+ 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, \
+ 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, \
+ 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, \
+ 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, \
+ 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D, \
+ }; \
+ for (uint8_t &byte : (STATE)) { \
+ byte = kVAESXDecSBox[byte]; \
+ } \
+ } while (0)
+
+// Shift the state rows, as specified in ShiftRows.
+// 'STATE' is a VAESState value.
+#define VAES_SHIFT_ROWS(STATE) \
+ do { \
+ uint8_t temp; \
+ /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+ /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \
+ temp = (STATE)[1]; \
+ (STATE)[ 1] = (STATE)[ 5]; \
+ (STATE)[ 5] = (STATE)[ 9]; \
+ (STATE)[ 9] = (STATE)[13]; \
+ (STATE)[13] = temp; \
+ /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+ temp = (STATE)[2]; \
+ (STATE)[ 2] = (STATE)[10]; \
+ (STATE)[10] = temp; \
+ temp = (STATE)[6]; \
+ (STATE)[ 6] = (STATE)[14]; \
+ (STATE)[14] = temp; \
+ /* Row 3 (byte indices 3, 7, 11, 15) rotates left by 3 positions (i.e., right by 1). */ \
+ temp = (STATE)[3]; \
+ (STATE)[ 3] = (STATE)[15]; \
+ (STATE)[15] = (STATE)[11]; \
+ (STATE)[11] = (STATE)[ 7]; \
+ (STATE)[ 7] = temp; \
+ } while (0)
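// [Editorial note — not part of this patch] The VAESState bytes are laid
// out column-major (byte index = 4*column + row), which is why row 1 is
// the index set {1, 5, 9, 13} and its left rotation moves STATE[5] into
// STATE[1], STATE[9] into STATE[5], and so on.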
+
+// Shifts the state rows, as specified in InvShiftRows.
+// 'STATE' is a VAESState value.
+#define VAES_INV_SHIFT_ROWS(STATE) \
+ do { \
+ uint8_t temp; \
+ /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+ /* Row 1 (byte indices 1, 5, 9, 13) rotates right by 1 position. */ \
+ temp = (STATE)[1]; \
+ (STATE)[ 1] = (STATE)[13]; \
+ (STATE)[13] = (STATE)[ 9]; \
+ (STATE)[ 9] = (STATE)[ 5]; \
+ (STATE)[ 5] = temp; \
+ /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+ temp = (STATE)[2]; \
+ (STATE)[ 2] = (STATE)[10]; \
+ (STATE)[10] = temp; \
+ temp = (STATE)[6]; \
+ (STATE)[ 6] = (STATE)[14]; \
+ (STATE)[14] = temp; \
+ /* Row 3 (byte indices 3, 7, 11, 15) rotates right by 3 positions (i.e., left by 1). */ \
+ temp = (STATE)[3]; \
+ (STATE)[ 3] = (STATE)[ 7]; \
+ (STATE)[ 7] = (STATE)[11]; \
+ (STATE)[11] = (STATE)[15]; \
+ (STATE)[15] = temp; \
+ } while (0)
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation MixColumns() specified in FIPS-197 5.1.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (2 . A) xor (3 . B) xor C xor D
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0x2) ^ VAES_GFMUL((B), 0x3) ^ (C) ^ (D))
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation InvMixColumns() specified in FIPS-197 5.3.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (0xE . A) xor (0xB . B) xor (0xD . C) xor (0x9 . D)
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_INV_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0xE) ^ \
+ VAES_GFMUL((B), 0xB) ^ \
+ VAES_GFMUL((C), 0xD) ^ \
+ VAES_GFMUL((D), 0x9))
+
+// Given the index of a 4-byte column within 'STATE', overwrites that
+// column in place with its MixColumns() transform.
+#define VAES_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Given the index of a 4-byte column within 'STATE', overwrites that
+// column in place with its InvMixColumns() transform.
+#define VAES_INV_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_INV_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_INV_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_INV_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_INV_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Implements MixColumns as defined in FIPS-197 5.1.3.
+#define VAES_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_MIX_COLUMN((STATE), 0); \
+ VAES_MIX_COLUMN((STATE), 1); \
+ VAES_MIX_COLUMN((STATE), 2); \
+ VAES_MIX_COLUMN((STATE), 3); \
+ } while (0)
+
+// Implements InvMixColumns as defined in FIPS-197 5.3.3.
+#define VAES_INV_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_INV_MIX_COLUMN((STATE), 0); \
+ VAES_INV_MIX_COLUMN((STATE), 1); \
+ VAES_INV_MIX_COLUMN((STATE), 2); \
+ VAES_INV_MIX_COLUMN((STATE), 3); \
+ } while (0)
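+
+// Illustrative sketch only (not used by the emulator): round-trips the
+// commonly quoted MixColumns test column {0xdb, 0x13, 0x53, 0x45}, whose
+// mixed form is {0x8e, 0x4d, 0xa1, 0xbc}, through VAES_MIX_COLUMNS and
+// VAES_INV_MIX_COLUMNS. The function name is an assumption made for this
+// example, and <cstdint> is assumed to be visible via the existing includes.
+static inline bool vaes_mix_columns_selftest_sketch()
+{
+  uint8_t state[16];
+  // Fill all four columns with the same test column.
+  for (int i = 0; i < 16; i += 4) {
+    state[i + 0] = 0xdb;
+    state[i + 1] = 0x13;
+    state[i + 2] = 0x53;
+    state[i + 3] = 0x45;
+  }
+  VAES_MIX_COLUMNS(state);
+  const bool mixed_ok = state[0] == 0x8e && state[1] == 0x4d &&
+                        state[2] == 0xa1 && state[3] == 0xbc;
+  VAES_INV_MIX_COLUMNS(state);
+  const bool restored_ok = state[0] == 0xdb && state[1] == 0x13 &&
+                           state[2] == 0x53 && state[3] == 0x45;
+  return mixed_ok && restored_ok;
+}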
+
+#endif // RISCV_ZVKNED_EXT_MACROS_H_
diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h
new file mode 100644
index 0000000..b50818b
--- /dev/null
+++ b/riscv/zvknh_ext_macros.h
@@ -0,0 +1,155 @@
+// Helper macros for implementing instructions defined as part of
+// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKNH_EXT_MACROS_H_
+#define RISCV_ZVKNH_EXT_MACROS_H_
+
+// Constraints common to all vsha* instructions, across all VSEW:
+// - VSEW is 32 (SHA-256) or 64 (SHA-512)
+// - No overlap of vd with vs1 or vs2.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP
+// macros.
+#define require_vsha2_common_constraints \
+ do { \
+ require(P.VU.vsew == 32 || P.VU.vsew == 64); \
+ require(insn.rd() != insn.rs1()); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==32.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsha2_vsew32_constraints \
+ do { \
+ require_zvknh_256; \
+ require_egw_fits(128); \
+ } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==64.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros.
+#define require_vsha2_vsew64_constraints \
+ do { \
+ require_zvknh_512; \
+ require_egw_fits(256); \
+ } while (false)
+
+//
+// SHA-256 and SHA-512 common logic
+//
+
+// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz
+#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z)))
+
+// Maj(x, y, z) = (xy) ⊕ (xz) ⊕ (yz) = xy | xz | yz
+#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))
+
+//
+// SHA-256
+//
+
+// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x)
+#define ZVK_SHA256_SUM0(X) \
+ (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22))
+
+// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x)
+#define ZVK_SHA256_SUM1(X) \
+ (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25))
+
+// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3 (x)
+#define ZVK_SHA256_SIG0(X) \
+ (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3))
+
+// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x)
+#define ZVK_SHA256_SIG1(X) \
+ (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \
+ (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 64 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+ { \
+ const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \
+ ZVK_SHA_CH((E), (F), (G)) + (KW); \
+ const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+ (H) = (G); \
+ (G) = (F); \
+ (F) = (E); \
+ (E) = (D) + t1; \
+ (D) = (C); \
+ (C) = (B); \
+ (B) = (A); \
+ (A) = t1 + t2; \
+ }
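+
+// Illustrative sketch only: the scalar FIPS PUB 180-4 SHA-256 block
+// compression expressed with the two macros above. The function name and its
+// parameters ('w_in' holding the 16 message words of one block, 'k' the 64
+// standard round constants, both caller-provided) are assumptions made for
+// this example; the vsha2* instructions instead operate on 4-element groups
+// of the schedule and working state.
+static inline void zvk_sha256_block_sketch(uint32_t state[8],
+                                           const uint32_t w_in[16],
+                                           const uint32_t k[64])
+{
+  // Expand the 16 message words into the full 64-entry schedule.
+  uint32_t w[64];
+  for (int t = 0; t < 16; t++)
+    w[t] = w_in[t];
+  for (int t = 16; t < 64; t++)
+    w[t] = ZVK_SHA256_SCHEDULE(w[t - 2], w[t - 7], w[t - 15], w[t - 16]);
+  // Run the 64 compression rounds, then add the result into the state.
+  uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
+  uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
+  for (int t = 0; t < 64; t++) {
+    ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, k[t] + w[t]);
+  }
+  state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+  state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+}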
+
+//
+// SHA-512
+//
+
+// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x)
+#define ZVK_SHA512_SUM0(X) \
+ (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39))
+
+// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x)
+#define ZVK_SHA512_SUM1(X) \
+ (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41))
+
+// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x)
+#define ZVK_SHA512_SIG0(X) \
+ (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7))
+
+// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x)
+#define ZVK_SHA512_SIG1(X) \
+ (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \
+ (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 80 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+ { \
+ const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \
+ ZVK_SHA_CH((E), (F), (G)) + (KW); \
+ const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+ (H) = (G); \
+ (G) = (F); \
+ (F) = (E); \
+ (E) = (D) + t1; \
+ (D) = (C); \
+ (C) = (B); \
+ (B) = (A); \
+ (A) = t1 + t2; \
+ }
+
+#endif // RISCV_ZVKNH_EXT_MACROS_H_
diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
new file mode 100644
index 0000000..46e399b
--- /dev/null
+++ b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions for implementing instructions defined as part of
+// the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints common to all vsm4* instructions:
+// - Zvksed is enabled
+// - VSEW == 32
+// - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+ do { \
+ require_zvksed; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(128); \
+ } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t)
+// provided in "Little Endian" (LE) order, i.e., from least significant (B0)
+// to most significant (B3).
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+ (((uint32_t)(B0)) << 0 | \
+ ((uint32_t)(B1)) << 8 | \
+ ((uint32_t)(B2)) << 16 | \
+ ((uint32_t)(B3)) << 24)
+
+// Looks up the SM4 S-Box entry for byte BYTE (i.e., applies the S-Box to it).
+#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer value 'X' and a byte index,
+// returns a uint8_t value for the byte at the given index.
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
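+
+// For instance (illustrative values only):
+//   ZVKSED_U32_FROM_U8_LE(0x44, 0x33, 0x22, 0x11) == 0x11223344
+//   ZVKSED_EXTRACT_U8(0x11223344, 1) == 0x33
+// i.e., extracting byte BYTE_IDX recovers the byte passed at that position
+// to the LE composition above.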
+
+// Applies the nonlinear substitution tau to a 32-bit word B - section 6.2.1
+// of the IETF draft.
+#define ZVKSED_SUB_BYTES(B) \
+ ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+ ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Applies the linear transformation L to a 32-bit word S and XORs the result
+// with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND(X, S) \
+ ((X) ^ \
+ ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+ ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Applies the linear transformation L' to a 32-bit word S and XORs the result
+// with a 32-bit word X - section 6.2.2 of the IETF draft.
+#define ZVKSED_ROUND_KEY(X, S) \
+ ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
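+
+// Illustrative sketch only: one SM4 encryption round,
+//   X4 = X0 ^ L(tau(X1 ^ X2 ^ X3 ^ rk)),
+// expressed with the macros above. The function name and its parameters
+// (a caller-provided round key 'rk') are assumptions made for this example;
+// the vsm4r/vsm4k instructions apply this logic per 4-word element group
+// rather than through a scalar helper like this.
+static inline uint32_t zvksed_round_sketch(const uint32_t x[4], uint32_t rk)
+{
+  const uint32_t b = x[1] ^ x[2] ^ x[3] ^ rk;  // mix the other three words with the round key
+  const uint32_t s = ZVKSED_SUB_BYTES(b);      // nonlinear substitution tau
+  return ZVKSED_ROUND(x[0], s);                // X0 ^ L(S)
+}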
+
+#endif // RISCV_ZVKSED_MACROS_H_
diff --git a/riscv/zvksh_ext_macros.h b/riscv/zvksh_ext_macros.h
new file mode 100644
index 0000000..71c5a09
--- /dev/null
+++ b/riscv/zvksh_ext_macros.h
@@ -0,0 +1,47 @@
+// Helper macros and functions for implementing instructions defined as part of
+// the RISC-V Zvksh extension (vectorized SM3).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_INSNS_ZVKSH_COMMON_H_
+#define RISCV_INSNS_ZVKSH_COMMON_H_
+
+// Constraints common to all vsm3* instructions:
+// - Zvksh is enabled
+// - VSEW == 32
+// - EGW (256) <= LMUL * VLEN
+// - No overlap of vd and vs2.
+//
+// The constraint that vstart and vl are both EGS (8) aligned
+// is checked in the VI_ZVK_..._EGU32x8_..._LOOP macros.
+#define require_vsm3_constraints \
+ do { \
+ require_zvksh; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(256); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+#define FF1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define FF2(X, Y, Z) (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z)))
+
+// Boolean function FF_j - section 4.3. of the IETF draft.
+#define ZVKSH_FF(X, Y, Z, J) (((J) <= 15) ? FF1(X, Y, Z) : FF2(X, Y, Z))
+
+#define GG1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define GG2(X, Y, Z) (((X) & (Y)) | ((~(X)) & (Z)))
+
+// Boolean function GG_j - section 4.3. of the IETF draft.
+#define ZVKSH_GG(X, Y, Z, J) (((J) <= 15) ? GG1(X, Y, Z) : GG2(X, Y, Z))
+
+#define T1 0x79CC4519
+#define T2 0x7A879D8A
+
+// T_j constant - section 4.2. of the IETF draft.
+#define ZVKSH_T(J) (((J) <= 15) ? (T1) : (T2))
+
+// Permutation functions P_0 and P_1 - section 4.4 of the IETF draft.
+#define ZVKSH_P0(X) ((X) ^ ZVK_ROL32((X), 9) ^ ZVK_ROL32((X), 17))
+#define ZVKSH_P1(X) ((X) ^ ZVK_ROL32((X), 15) ^ ZVK_ROL32((X), 23))
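+
+// For instance (illustrative value only):
+//   ZVKSH_P0(0x1) == 0x1 ^ (0x1 << 9) ^ (0x1 << 17) == 0x00020201
+// In SM3, P_1 is used in the message expansion and P_0 in the compression
+// function.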
+
+#endif // RISCV_INSNS_ZVKSH_COMMON_H_