From d5c0339484323b5a9498576d70ec90eab2e13438 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Sun, 18 Jun 2023 17:10:53 -0700
Subject: Zvk: Infrastructure for Zvk extensions, element group handling

Introduce types and macros useful across multiple Zvk sub-extensions,
including Zvbb and Zvbc. Those will be used by upcoming
per-sub-extension commits.

In particular we introduce "Element Group" types and loop macros handling
those element groups. The concept of element group is described in
<https://github.com/riscv/riscv-crypto/blob/master/doc/vector/riscv-crypto-vector-element-groups.adoc>.

Note that the element group access method is not implemented
for WORDS_BIGENDIAN setup. As such, isa_parser.cc is modified to emit
an error when WORDS_BIGENDIAN is defined and extensions using element
groups are enabled.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/arith.h          |   21 +
 riscv/isa_parser.cc    |   10 +-
 riscv/v_ext_macros.h   |   22 ++
 riscv/vector_unit.cc   |   55 +++
 riscv/vector_unit.h    |   19 +-
 riscv/zvk_ext_macros.h | 1023 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1148 insertions(+), 2 deletions(-)
 create mode 100644 riscv/zvk_ext_macros.h
diff --git a/riscv/arith.h b/riscv/arith.h
index 3b807e9..20b1504 100644
--- a/riscv/arith.h
+++ b/riscv/arith.h
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <climits>
 #include <cstddef>
+#include <type_traits>
 
 inline uint64_t mulhu(uint64_t a, uint64_t b)
 {
@@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
   return r;
 }
 
+// Rotates right an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t rshift = shiftamt & mask;
+  const std::size_t lshift = (-rshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
+// Rotates right an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t lshift = shiftamt & mask;
+  const std::size_t rshift = (-lshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
 #endif
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 6fb29ae..59472a4 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -361,7 +361,15 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
       (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
     bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
   }
-
+#ifdef WORDS_BIGENDIAN
+  // Access to the vector registers as element groups is unimplemented on big-endian setups.
+  if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB] ||
+      extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) {
+      bad_isa_string(str,
+		     "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' "
+		     "extensions are incompatible with WORDS_BIGENDIAN setups.");
+  }
+#endif
   std::string lowercase = strtolower(priv);
   bool user = false, supervisor = false;
 
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 41256c7..908ff16 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize,
   type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
 
+#define V_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
 #define VX_U_PARAMS(x) \
   type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
   type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
@@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize,
   } \
   VI_LOOP_END 
 
+#define VI_V_ULOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8) { \
+    V_U_PARAMS(e8); \
+    BODY; \
+  } else if (sew == e16) { \
+    V_U_PARAMS(e16); \
+    BODY; \
+  } else if (sew == e32) { \
+    V_U_PARAMS(e32); \
+    BODY; \
+  } else if (sew == e64) { \
+    V_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
 #define VI_VX_ULOOP(BODY) \
   VI_CHECK_SSS(false) \
   VI_LOOP_BASE \
diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc
index 9128df6..08adc61 100644
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write
   return regStart[n];
 }
 
+// The logic differences between 'elt()' and 'elt_group()' come from
+// the fact that, while 'elt()' requires that the element is fully
+// contained in a single vector register, the element group may span
+// multiple registers in a single register group (LMUL>1).
+//
+// Notes:
+// - We do NOT check that a single element - i.e., the T in the element
+//   group type std::array<T, N> - fits within a single register, or that
+//   T is smaller or equal to VSEW. Implementations of the instructions
+//   sometimes use a different T than what the specification suggests.
+//   Instructon implementations should 'require()' what the specification
+//   dictates.
+// - We do NOT check that 'vReg' is a valid register group, or that
+//   'n+1' element groups fit in the register group 'vReg'. It is
+//   the responsibility of the caller to validate those preconditions.
+template<typename EG> EG&
+vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) {
+#ifdef WORDS_BIGENDIAN
+  fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n",
+          stderr);
+  abort();
+#endif
+  using T = typename EG::value_type;
+  constexpr std::size_t N = std::tuple_size<EG>::value;
+  assert(N > 0);
+
+  assert(vsew != 0);
+  constexpr reg_t elt_group_size = N * sizeof(T);
+  const reg_t reg_group_size = (VLEN >> 3) * vflmul;
+  assert(((n + 1) * elt_group_size) <= reg_group_size);
+
+  const reg_t start_byte = n * elt_group_size;
+  const reg_t bytes_per_reg = VLEN >> 3;
+
+  // Inclusive first/last register indices.
+  const reg_t reg_first = vReg + start_byte / bytes_per_reg;
+  const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg;
+
+  // Element groups per register groups
+  for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) {
+      reg_referenced[vidx] = 1;
+
+      if (unlikely(p->get_log_commits_enabled() && is_write)) {
+          p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0};
+      }
+  }
+
+  return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte);
+}
+
 template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool);
 template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool);
 template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool);
@@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool);
 template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool);
 template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool);
 template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool);
+
+template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool);
+template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool);
+template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool);
+template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool);
diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h
index b9f706c..a057c62 100644
--- a/riscv/vector_unit.h
+++ b/riscv/vector_unit.h
@@ -2,6 +2,9 @@
 #ifndef _RISCV_VECTOR_UNIT_H
 #define _RISCV_VECTOR_UNIT_H
 
+#include <array>
+#include <cstdint>
+
 #include "decode.h"
 #include "csrs.h"
 
@@ -69,6 +72,17 @@ struct type_sew_t<64>
   using type=int64_t;
 };
 
+// Element Group of 4 32 bits elements (128b total).
+using EGU32x4_t = std::array<uint32_t, 4>;
+
+// Element Group of 8 32 bits elements (256b total).
+using EGU32x8_t = std::array<uint32_t, 8>;
+
+// Element Group of 4 64 bits elements (256b total).
+using EGU64x4_t = std::array<uint64_t, 4>;
+
+// Element Group of 16 8 bits elements (128b total).
+using EGU8x16_t = std::array<uint8_t, 16>;
 
 class vectorUnit_t
 {
@@ -88,8 +102,11 @@ public:
   bool vill;
   bool vstart_alu;
 
-  // vector element for varies SEW
+  // vector element for various SEW
   template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false);
+  // vector element group access, where EG is a std::array<T, N>.
+  template<typename EG> EG&
+  elt_group(reg_t vReg, reg_t n, bool is_write = false);
 
 public:
 
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
new file mode 100644
index 0000000..7efbac8
--- /dev/null
+++ b/riscv/zvk_ext_macros.h
@@ -0,0 +1,1023 @@
+// Helper macros to help implement instructions defined as part of
+// the RISC-V Zvk extension (vector cryptography).
+
+// Note that a good deal of code here would be cleaner/simpler
+// if exposed as C++ functions (including templated ones), however
+// this is not possible in the contexts where those headers are
+// included.
+
+#ifndef RISCV_ZVK_EXT_MACROS_H_
+#define RISCV_ZVK_EXT_MACROS_H_
+
+//
+// Predicate Macros
+//
+
+// Ensures that the ZVBB extension (vector crypto bitmanip) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvbb \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVBB); \
+  } while (0)
+
+// Ensures that the ZVBC extension (vector carryless multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvbc \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVBC); \
+  } while (0)
+
+// Ensures that the ZVKG extension (vector Gallois Field Multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvkg \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKG); \
+  } while (0)
+
+// Ensures that a ZVK extension supporting SHA-256 is present.
+// For SHA-256, this support is present in either Zvknha or Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_256 \
+  do { \
+    require_vector(true); \
+    require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \
+  } while (0)
+
+// Ensures that the ZVKNED extension (vector AES single round) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvkned \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKNED); \
+  } while (0)
+
+// Ensures that a ZVK extension supporting SHA-512 is present.
+// For SHA-512, this support is only present in Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_512 \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKNHB); \
+  } while (0)
+
+// Ensures that the ZVKSED extension (vector SM4 block cipher)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvksed \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKSED); \
+  } while (0)
+
+// Ensures that the ZVKSH extension (vector SM3 hash) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvksh \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKSH); \
+  } while (0)
+
+// Ensures that the vector instruction is not using a mask.
+#define require_no_vmask  require(insn.v_vm() == 1)
+
+// Ensures that an element group can fit in a register group. That is,
+//    (LMUL * VLEN) <= EGW
+#define require_egw_fits(EGW)  require((EGW) <= (P.VU.VLEN * P.VU.vflmul))
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=4 (four 32-bits elements per group),
+// for an effective element group width of EGW=128 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x4 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 4 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 4 == 0); \
+  } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=32, EGS=8 (eight 32-bits elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x8 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 8 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 8 == 0); \
+  } while (0)
+
+// Checks that the vector unit state (vtype and vl) can be interpreted
+// as element groups with EEW=64, EGS=4 (four 64-bits elements per group),
+// for an effective element group width of EGW=128 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_64x4 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 4 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 4 == 0); \
+  } while (0)
+
+//
+// Loop Parameters Macros
+//
+
+// Extracts a 32b*4 element group as a EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x4_t vs1 = P.VU.elt_group<EGU32x4_t>((VS1_NUM), (EG_IDX)); \
+  const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts a 32b*8 element group as a EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x8_t vs1 = P.VU.elt_group<EGU32x8_t>((VS1_NUM), (EG_IDX)); \
+  const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts a 32b*4 element group as a EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+  EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts a 32b*8 element group as a EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+  EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts a 64b*4 element group as a EGU64x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU64x4_t &vd = P.VU.elt_group<EGU64x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU64x4_t vs1 = P.VU.elt_group<EGU64x4_t>((VS1_NUM), (EG_IDX)); \
+  const EGU64x4_t vs2 = P.VU.elt_group<EGU64x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1',
+// as part of a widening operation where 'vd' has EEW = 2 * SEW.
+// Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+  const auto vs1 = P.VU.elt<type_usew_t<SEW>::type>(rs1_num, i); \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the scalar register 'rs1', as part of a widening operation where
+// 'vd' has EEW = 2 * SEW.
+// Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+  const auto rs1 = (type_usew_t<SEW>::type)RS1; \
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the 5-bit immediate field 'zimm5', as part of a widening operation
+// where 'vd' has EEW = 2 * SEW.
+// Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * SEW>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * SEW>::type vs2_w = vs2; \
+  const auto zimm5 = (type_usew_t<SEW>::type)insn.v_zimm5(); \
+
+//
+// Loop Macros
+//
+
+// NOTES:
+// - Each of the element-group loop macros DO contain an invocation
+//   of the corresponding 'require_element_groups_<bits>x<#elements>;',
+//   because the macro correctness requires proper VL/VSTART values.
+// - Each of the loop macros named "_NOVM_" DO contain an invocation
+//   of the 'require_no_vmask>;' macro. Those macros (all of them
+//   at this time) do not support masking (i.e., no skipping
+//   of elements/element groups is performed).
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable,, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs1': EGU32x4_t, content of the current element group
+//          in the 'vs1' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*8 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+//  - This macro contains an invocation of the macro 'require_element_groups_32x8;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x8_t reference, mutable,, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs1': EGU32x8_t, content of the current element group
+//          in the 'vs1' vector register / register group.
+//   'vs2': EGU32x8_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x8;; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+    const reg_t vl_eg = P.VU.vl->read() / 8; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+//  - this macro does NOT extract the element groups into EGU32x4_t
+//    variables. It is intended for uses where there is a more natural
+//    type to use (e.g., EGU8x16_t). The type should still be a 128 bits
+//    wide type if extracted via 'P.VU.elt_group<Type>(...)'.
+//  - this macro offers the additional PRELOOP code block argument,
+//    that is executed once if the loop is going to be entered.
+//    This is intended for use with "vector scalar" instructions where
+//    we extract the first element group from one of the operands and
+//    use it for all loop iterations.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+                                                               PRELOOP, \
+                                                               EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+//  - this macro is meant to be used for "op vd, vs2" instructions,
+//    whether vd is output only, or input and output.
+//  - this macro does NOT extract the element groups into EGU32x4_t
+//    variables. It is intended for uses where there is a more natural
+//    type to use (e.g., EGU8x16_t). The type should still be a 128 bits
+//    wide type if extracted via 'P.VU.elt_group<Type>(...)'.
+//  - this macro offers the additional PRELOOP code block argument,
+//    that is executed once if the loop is going to be entered.
+//    This is intended for use with "vector scalar" instructions where
+//    we extract the first element group from one of the operands and
+//    use it for all loop iterations.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+                                                           PRELOOP, \
+                                                           EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2.  This interprets the vectors as containing element groups
+// of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable,, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate.  This interprets the vectors as
+// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'zimm5': 5 bits unsigned immediate
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable,, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t zimm5 = insn.v_zimm5(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*8 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate.  This interprets the vectors as
+// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x8;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'zimm5': unsigned 5 bits immediate
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x8_t reference, mutable,, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x8_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+  do { \
+    require_element_groups_32x8; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t zimm5 = insn.v_zimm5(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+    const reg_t vl_eg = P.VU.vl->read() / 8; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 64b*4 element groups available in the vector registers
+// vd, vs1, and vs2.  This interprets the vectors as containing element groups
+// of 4 uint64_t values (EGW=128, EEW=64, EGS=4), *ignoring* the current
+// SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_64x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU64x4_t reference, content of the current element group
+//         in the 'vd' vector register / vector register group.
+//   'vs1': EGU64x4_t, content of the current element group
+//         in the 'vs1' vector register / vector register group.
+//   'vs2': EGU64x4_t, content of the current element group
+//         in the 'vs2' vector register / vector register group.
+#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_64x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, v1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, rs1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+//
+// Element Group Manipulation Macros
+//
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Little Endian" (LE) order, i.e., from the least significant (W0)
+// to the most significant (W3).
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \
+  uint32_t W0 = (X)[0]; \
+  uint32_t W1 = (X)[1]; \
+  uint32_t W2 = (X)[2]; \
+  uint32_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values privided in "Little Endian" (LE)
+// order, i.e., from the least significant (W0) to the most
+// significant (W3).
+#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \
+  uint32_t W0 = (X)[0]; \
+  uint32_t W1 = (X)[1]; \
+  uint32_t W2 = (X)[2]; \
+  uint32_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values privided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Byte-swap the bytes of a uin32_t such that the order of bytes
+// is reversed.
+#define ZVK_BSWAP32(x) \
+  ((((uint32_t)((x) >> 24)) & 0xFF) <<  0 | \
+   (((uint32_t)((x) >> 16)) & 0xFF) <<  8 | \
+   (((uint32_t)((x) >>  8)) & 0xFF) << 16 | \
+   (((uint32_t)((x) >>  0)) & 0xFF) << 24)
+
+// Extracts 8 uint32_t words from the input EGU32x8_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W7)
+// to the least significant (W0). Each of the words is byte-swapped,
+// from a big-endian representation in the EGU32x8_t to a native/little-endian
+// ordering in the variables.
+#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+  uint32_t W0 = ZVK_BSWAP32((X)[0]); \
+  uint32_t W1 = ZVK_BSWAP32((X)[1]); \
+  uint32_t W2 = ZVK_BSWAP32((X)[2]); \
+  uint32_t W3 = ZVK_BSWAP32((X)[3]); \
+  uint32_t W4 = ZVK_BSWAP32((X)[4]); \
+  uint32_t W5 = ZVK_BSWAP32((X)[5]); \
+  uint32_t W6 = ZVK_BSWAP32((X)[6]); \
+  uint32_t W7 = ZVK_BSWAP32((X)[7]); \
+  (void)(0)
+
+// Sets the elements words of given EGU32x8_t variable 'X' to
+// the given 8 uint32_t values privided in "Big Endian" (BE)
+// order, i.e., from the most significant (W7) to the least
+// significant (W0). Each of the words is byte-swapped,
+// from a native/little-endian ordering in the variables to
+// a big-endian representation in the EGU32x8_t.
+#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = ZVK_BSWAP32(W0); \
+    (X)[1] = ZVK_BSWAP32(W1); \
+    (X)[2] = ZVK_BSWAP32(W2); \
+    (X)[3] = ZVK_BSWAP32(W3); \
+    (X)[4] = ZVK_BSWAP32(W4); \
+    (X)[5] = ZVK_BSWAP32(W5); \
+    (X)[6] = ZVK_BSWAP32(W6); \
+    (X)[7] = ZVK_BSWAP32(W7); \
+  } while (0)
+
+// Extracts 4 uint64_t words from the input EGU64x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \
+  uint64_t W0 = (X)[0]; \
+  uint64_t W1 = (X)[1]; \
+  uint64_t W2 = (X)[2]; \
+  uint64_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU64x4_t variable 'X' to
+// the given 4 uint64_t values privided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Copies a EGU8x16_t value from 'SRC' into 'DST'.
+#define EGU8x16_COPY(DST, SRC) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (DST)[bidx] = (SRC)[bidx]; \
+  }
+
+// Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU8x16_XOREQ(MUT_A, CONST_B) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  }
+
+// Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU32x4_XOREQ(MUT_A, CONST_B) \
+  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  }
+
+// Performs  "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU8x16_XOR(DST, A, B) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
+  }
+
+//
+// Common bit manipulations logic.
+//
+
+// Form a 64 bit integer with bit X set
+#define ZVK_BIT(X) (1ULL << (X))
+
+// Reverse the order of bits within bytes of a word.
+// This is used to match the data interpretation in NIST SP 800-38D
+// a.k.a the GCM specification.
+#define ZVK_BREV8_32(X) \
+  do { \
+    (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \
+    (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \
+    (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \
+  } while (0)
+
+// Rotates right a uint32_t value by N bits.
+//   uint32_t ROR32(uint32_t X, std::size_t N);
+#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N))
+
+// Rotates right a uint64_t value by N bits.
+//   uint64_t ROR64(uint64_t X, std::size_t N);
+#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N))
+
+// Rotates left a uint32_t value by N bits.
+//   uint32_t ROL32(uint32_t X, std::size_t N);
+#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N))
+
+//
+// Element Group Bit Manipulation Macros
+//
+
+// Performs bit reversal in a EGU32x4_t group.
+#define EGU32x4_BREV8(X) \
+  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+    ZVK_BREV8_32((X)[bidx]); \
+  }
+
+// Checks if a given bit is set within a EGU32x4_t group.
+// Assumes LE ordering.
+#define EGU32x4_ISSET(X, BIDX) \
+  (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0)
+
+// Shfts a EGU32x4_t group left by one bit.
+//
+// Since the entire 128 bit value is shifted we need to handle carry bits.
+// In order to limit the amount of carry check logic the elements are copied to
+// a 64 bit temporary variable.
+#define EGU32x4_LSHIFT(X) \
+  do { \
+    uint64_t dword; \
+    dword = ((uint64_t)(X)[3]) << 32; \
+    dword |= X[2]; \
+    dword <<= 1; \
+    if (X[1] & ZVK_BIT(31)) { \
+      dword |= ZVK_BIT(0); \
+    } \
+    X[2] = dword & UINT32_MAX; \
+    X[3] = dword >> 32; \
+    dword = ((uint64_t)(X)[1]) << 32; \
+    dword |= X[0]; \
+    dword <<= 1; \
+    X[0] = dword & UINT32_MAX; \
+    X[1] = dword >> 32; \
+  } while (0)
+
+#endif  // RISCV_ZVK_EXT_MACROS_H_
-- 
cgit v1.1