From 377fb0a11b8ccc28f7d1687523b7d79403e26453 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Wed, 31 May 2023 13:57:31 -0700 Subject: List extensions alphabetically in riscv_insn_list The previous order lacks any obvious logic. Alphabetical order, while making it difficult to create interesting groupings, makes it easy to find which extensions are compiled in. Signed-off-by: Eric Gouriou --- riscv/riscv.mk.in | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 6472982..3b493a0 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1341,31 +1341,31 @@ riscv_insn_ext_zacas = \ $(if $(HAVE_INT128),amocas_q) riscv_insn_list = \ + $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ $(riscv_insn_ext_a) \ + $(riscv_insn_ext_b) \ + $(riscv_insn_ext_bf16) \ $(riscv_insn_ext_c) \ - $(riscv_insn_ext_i) \ - $(riscv_insn_ext_m) \ - $(riscv_insn_ext_f) \ - $(riscv_insn_ext_f_zfa) \ + $(riscv_insn_ext_cmo) \ $(riscv_insn_ext_d) \ $(riscv_insn_ext_d_zfa) \ - $(riscv_insn_ext_zfh) \ - $(riscv_insn_ext_zfh_zfa) \ + $(riscv_insn_ext_f) \ + $(riscv_insn_ext_f_zfa) \ + $(riscv_insn_ext_h) \ + $(riscv_insn_ext_i) \ + $(riscv_insn_ext_k) \ + $(riscv_insn_ext_m) \ + $(riscv_insn_ext_p) \ $(riscv_insn_ext_q) \ $(riscv_insn_ext_q_zfa) \ - $(riscv_insn_ext_b) \ - $(riscv_insn_ext_k) \ - $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ + $(riscv_insn_ext_zacas) \ $(riscv_insn_ext_zce) \ - $(riscv_insn_ext_h) \ - $(riscv_insn_ext_p) \ + $(riscv_insn_ext_zfh) \ + $(riscv_insn_ext_zfh_zfa) \ + $(riscv_insn_ext_zicond) \ $(riscv_insn_priv) \ - $(riscv_insn_svinval) \ $(riscv_insn_smrnmi) \ - $(riscv_insn_ext_cmo) \ - $(riscv_insn_ext_zicond) \ - $(riscv_insn_ext_bf16) \ - $(riscv_insn_ext_zacas) \ + $(riscv_insn_svinval) \ riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list)) -- cgit v1.1 From 1e5a71f99b3432ba9fb543995a466c2d96e96cec Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:04:31 -0700 Subject: Zvk: extensions parsing Zvk is the short name for the Vector Cryptography Instruction Set Extension Specification being defined at . This commit adds support for parsing/enabling the Zvk extensions (Zvbb, Zvbc, Zvkg, Zvkned, Zvknha, Zvknhb, Zvksed, Zvksh, Zvkt) and the combo extensions (Zvkn, Zvknc, Zvkng, Zvks, Zvksc, Zvksg). This is an early commit in a series implementing Zvk. No instructions are actually defined here, only infastructure that will support the coming extensions. The encodings for Zvk instructions have some conflicts with Zpn encodings. This commit marks those Zpn instructions as overlapping, and adds checks to error out if conflicting extensions are enabled. 
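As an illustrative usage sketch (not taken from the patch; the ISA and privilege strings below are example values), any ISA string using the new names goes through the isa_parser_t constructor modified here, and a combo name such as 'zvkng' simply enables its member extensions:

  #include "isa_parser.h"

  int main() {
    // Parsing 'zvkng' sets EXT_ZVBB, EXT_ZVKG, EXT_ZVKNED and EXT_ZVKNHB
    // in the extension table, per the clauses added below.
    isa_parser_t parser("rv64imafdcv_zvkng", "MSU");
    return parser.extension_enabled(EXT_ZVKG) ? 0 : 1;
  }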
Signed-off-by: Eric Gouriou --- riscv/isa_parser.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++- riscv/isa_parser.h | 16 +++++++++++++++ riscv/overlap_list.h | 9 +++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc index 1c4300c..6fb29ae 100644 --- a/riscv/isa_parser.cc +++ b/riscv/isa_parser.cc @@ -236,10 +236,55 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) extension_table[EXT_ZICOND] = true; } else if (ext_str == "zihpm") { extension_table[EXT_ZIHPM] = true; + } else if (ext_str == "zvbb") { + extension_table[EXT_ZVBB] = true; + } else if (ext_str == "zvbc") { + extension_table[EXT_ZVBC] = true; } else if (ext_str == "zvfbfmin") { extension_table[EXT_ZVFBFMIN] = true; } else if (ext_str == "zvfbfwma") { extension_table[EXT_ZVFBFWMA] = true; + } else if (ext_str == "zvkg") { + extension_table[EXT_ZVKG] = true; + } else if (ext_str == "zvkn") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvknc") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVBC] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvkng") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKG] = true; + extension_table[EXT_ZVKNED] = true; + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvkned") { + extension_table[EXT_ZVKNED] = true; + } else if (ext_str == "zvknha") { + extension_table[EXT_ZVKNHA] = true; + } else if (ext_str == "zvknhb") { + extension_table[EXT_ZVKNHB] = true; + } else if (ext_str == "zvks") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksc") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVBC] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksg") { + extension_table[EXT_ZVBB] = true; + extension_table[EXT_ZVKG] = true; + extension_table[EXT_ZVKSED] = true; + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvksed") { + extension_table[EXT_ZVKSED] = true; + } else if (ext_str == "zvksh") { + extension_table[EXT_ZVKSH] = true; + } else if (ext_str == "zvkt") { } else if (ext_str == "sstc") { extension_table[EXT_SSTC] = true; } else if (ext_str[0] == 'x') { @@ -295,7 +340,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) } if ((extension_table[EXT_ZCMP] || extension_table[EXT_ZCMT]) && extension_table[EXT_ZCD]) { - bad_isa_string(str, "Zcmp' and 'Zcmt' exensions are incompatible with 'Zcd' extension"); + bad_isa_string(str, "Zcmp' and 'Zcmt' extensions are incompatible with 'Zcd' extension"); } if ((extension_table[EXT_ZCF] || extension_table[EXT_ZCD] || extension_table[EXT_ZCB] || @@ -307,6 +352,16 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) bad_isa_string(str, "'Zacas' extension requires 'A' extension"); } + // Zpn conflicts with Zvknha/Zvknhb in both rv32 and rv64 + if (extension_table[EXT_ZPN] && (extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB])) { + bad_isa_string(str, "'Zvkna' and 'Zvknhb' extensions are incompatible with 'Zpn' extension"); + } + // In rv64 only, Zpn (rv64_zpn) conflicts with Zvkg/Zvkned/Zvksh + if (max_xlen == 64 && extension_table[EXT_ZPN] && + (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) { + bad_isa_string(str, "'Zvkg', 'Zvkned', and 
'Zvksh' extensions are incompatible with 'Zpn' extension in rv64"); + } + std::string lowercase = strtolower(priv); bool user = false, supervisor = false; diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h index 3cbee7d..5b04347 100644 --- a/riscv/isa_parser.h +++ b/riscv/isa_parser.h @@ -58,8 +58,24 @@ typedef enum { EXT_ZICNTR, EXT_ZICOND, EXT_ZIHPM, + EXT_ZVBB, + EXT_ZVBC, EXT_ZVFBFMIN, EXT_ZVFBFWMA, + EXT_ZVKG, + EXT_ZVKNED, + EXT_ZVKNHA, + EXT_ZVKNHB, + EXT_ZVKSED, + EXT_ZVKSH, + EXT_XZBP, + EXT_XZBS, + EXT_XZBE, + EXT_XZBF, + EXT_XZBC, + EXT_XZBM, + EXT_XZBR, + EXT_XZBT, EXT_SSTC, EXT_ZACAS, EXT_INTERNAL_ZFH_MOVE, diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h index a30c770..2214be4 100644 --- a/riscv/overlap_list.h +++ b/riscv/overlap_list.h @@ -12,3 +12,12 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD) DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA) DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA) DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA) +DECLARE_OVERLAP_INSN(vaesdf_vv, EXT_ZVKNED) +DECLARE_OVERLAP_INSN(vghsh_vv, EXT_ZVKG) +DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHA) +DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHB) +DECLARE_OVERLAP_INSN(vsm3me_vv, EXT_ZVKSH) +DECLARE_OVERLAP_INSN(rstsa16, EXT_ZPN) +DECLARE_OVERLAP_INSN(rstsa32, EXT_ZPN) +DECLARE_OVERLAP_INSN(srli32_u, EXT_ZPN) +DECLARE_OVERLAP_INSN(umax32, EXT_ZPN) -- cgit v1.1 From d5c0339484323b5a9498576d70ec90eab2e13438 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Sun, 18 Jun 2023 17:10:53 -0700 Subject: Zvk: Infrastructure for Zvk extensions, element group handling Introduce types and macros useful across multiple Zvk sub-extensions, including Zvbb and Zvbc. Those will be used by upcoming per-sub-extension commits. In particular we introduce "Element Group" types and loop macros handling those element groups. The concept of element group is described in . Note that the element group access method is not implemented for WORDS_BIGENDIAN setups. As such, isa_parser.cc is modified to emit an error when WORDS_BIGENDIAN is defined and extensions using element groups are enabled. Signed-off-by: Eric Gouriou --- riscv/arith.h | 21 + riscv/isa_parser.cc | 10 +- riscv/v_ext_macros.h | 22 ++ riscv/vector_unit.cc | 55 +++ riscv/vector_unit.h | 19 +- riscv/zvk_ext_macros.h | 1023 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1148 insertions(+), 2 deletions(-) create mode 100644 riscv/zvk_ext_macros.h diff --git a/riscv/arith.h b/riscv/arith.h index 3b807e9..20b1504 100644 --- a/riscv/arith.h +++ b/riscv/arith.h @@ -7,6 +7,7 @@ #include #include #include +#include <type_traits> inline uint64_t mulhu(uint64_t a, uint64_t b) { @@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t return r; } +// Rotates right an unsigned integer by the given number of bits. +template <typename T> +static inline T rotate_right(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned<T>::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t rshift = shiftamt & mask; + const std::size_t lshift = (-rshift) & mask; + return (x << lshift) | (x >> rshift); +} + +// Rotates left an unsigned integer by the given number of bits. 
+template <typename T> +static inline T rotate_left(T x, std::size_t shiftamt) { + static_assert(std::is_unsigned<T>::value); + static constexpr T mask = (8 * sizeof(T)) - 1; + const std::size_t lshift = shiftamt & mask; + const std::size_t rshift = (-lshift) & mask; + return (x << lshift) | (x >> rshift); +} + #endif diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc index 6fb29ae..59472a4 100644 --- a/riscv/isa_parser.cc +++ b/riscv/isa_parser.cc @@ -361,7 +361,15 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv) (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) { bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64"); } - +#ifdef WORDS_BIGENDIAN + // Access to the vector registers as element groups is unimplemented on big-endian setups. + if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB] || + extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) { + bad_isa_string(str, + "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' " + "extensions are incompatible with WORDS_BIGENDIAN setups."); + } +#endif std::string lowercase = strtolower(priv); bool user = false, supervisor = false; diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h index 41256c7..908ff16 100644 --- a/riscv/v_ext_macros.h +++ b/riscv/v_ext_macros.h @@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize, type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i); +#define V_U_PARAMS(x) \ + type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \ + type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i); + #define VX_U_PARAMS(x) \ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \ type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \ @@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize, } \ VI_LOOP_END +#define VI_V_ULOOP(BODY) \ + VI_CHECK_SSS(false) \ + VI_LOOP_BASE \ + if (sew == e8) { \ + V_U_PARAMS(e8); \ + BODY; \ + } else if (sew == e16) { \ + V_U_PARAMS(e16); \ + BODY; \ + } else if (sew == e32) { \ + V_U_PARAMS(e32); \ + BODY; \ + } else if (sew == e64) { \ + V_U_PARAMS(e64); \ + BODY; \ + } \ + VI_LOOP_END + #define VI_VX_ULOOP(BODY) \ VI_CHECK_SSS(false) \ VI_LOOP_BASE \ diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc index 9128df6..08adc61 100644 --- a/riscv/vector_unit.cc +++ b/riscv/vector_unit.cc @@ -86,6 +86,56 @@ template T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write return regStart[n]; } +// The logic differences between 'elt()' and 'elt_group()' come from +// the fact that, while 'elt()' requires that the element is fully +// contained in a single vector register, the element group may span +// multiple registers in a single register group (LMUL>1). +// +// Notes: +// - We do NOT check that a single element - i.e., the T in the element +// group type std::array - fits within a single register, or that +// T is smaller or equal to VSEW. Implementations of the instructions +// sometimes use a different T than what the specification suggests. +// Instruction implementations should 'require()' what the specification +// dictates. +// - We do NOT check that 'vReg' is a valid register group, or that +// 'n+1' element groups fit in the register group 'vReg'. It is +// the responsibility of the caller to validate those preconditions. 
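A minimal caller-side sketch of the accessor documented above (illustrative; it uses the EGU32x4_t element-group type introduced in vector_unit.h below, 'vd_num', 'vs2_num' and 'idx_eg' are placeholder names as used by the Zvk loop macros later in this series, and the instruction is assumed to have already performed the required checks):

  // Read-modify-write of one 4x32b element group: XOR a source group into vd.
  EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
  const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>(vs2_num, idx_eg);
  for (std::size_t i = 0; i < 4; ++i)
    vd[i] ^= vs2[i];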
+template EG& +vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) { +#ifdef WORDS_BIGENDIAN + fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n", + stderr); + abort(); +#endif + using T = typename EG::value_type; + constexpr std::size_t N = std::tuple_size::value; + assert(N > 0); + + assert(vsew != 0); + constexpr reg_t elt_group_size = N * sizeof(T); + const reg_t reg_group_size = (VLEN >> 3) * vflmul; + assert(((n + 1) * elt_group_size) <= reg_group_size); + + const reg_t start_byte = n * elt_group_size; + const reg_t bytes_per_reg = VLEN >> 3; + + // Inclusive first/last register indices. + const reg_t reg_first = vReg + start_byte / bytes_per_reg; + const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg; + + // Element groups per register groups + for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) { + reg_referenced[vidx] = 1; + + if (unlikely(p->get_log_commits_enabled() && is_write)) { + p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0}; + } + } + + return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte); +} + template signed char& vectorUnit_t::elt(reg_t, reg_t, bool); template short& vectorUnit_t::elt(reg_t, reg_t, bool); template int& vectorUnit_t::elt(reg_t, reg_t, bool); @@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float16_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float32_t& vectorUnit_t::elt(reg_t, reg_t, bool); template float64_t& vectorUnit_t::elt(reg_t, reg_t, bool); + +template EGU32x4_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU32x8_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU64x4_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); +template EGU8x16_t& vectorUnit_t::elt_group(reg_t, reg_t, bool); diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h index b9f706c..a057c62 100644 --- a/riscv/vector_unit.h +++ b/riscv/vector_unit.h @@ -2,6 +2,9 @@ #ifndef _RISCV_VECTOR_UNIT_H #define _RISCV_VECTOR_UNIT_H +#include +#include + #include "decode.h" #include "csrs.h" @@ -69,6 +72,17 @@ struct type_sew_t<64> using type=int64_t; }; +// Element Group of 4 32 bits elements (128b total). +using EGU32x4_t = std::array; + +// Element Group of 8 32 bits elements (256b total). +using EGU32x8_t = std::array; + +// Element Group of 4 64 bits elements (256b total). +using EGU64x4_t = std::array; + +// Element Group of 16 8 bits elements (128b total). +using EGU8x16_t = std::array; class vectorUnit_t { @@ -88,8 +102,11 @@ public: bool vill; bool vstart_alu; - // vector element for varies SEW + // vector element for various SEW template T& elt(reg_t vReg, reg_t n, bool is_write = false); + // vector element group access, where EG is a std::array. + template EG& + elt_group(reg_t vReg, reg_t n, bool is_write = false); public: diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h new file mode 100644 index 0000000..7efbac8 --- /dev/null +++ b/riscv/zvk_ext_macros.h @@ -0,0 +1,1023 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvk extension (vector cryptography). + +// Note that a good deal of code here would be cleaner/simpler +// if exposed as C++ functions (including templated ones), however +// this is not possible in the contexts where those headers are +// included. 
+ +#ifndef RISCV_ZVK_EXT_MACROS_H_ +#define RISCV_ZVK_EXT_MACROS_H_ + +// +// Predicate Macros +// + +// Ensures that the ZVBB extension (vector crypto bitmanip) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvbb \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBB); \ + } while (0) + +// Ensures that the ZVBC extension (vector carryless multiplication) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvbc \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVBC); \ + } while (0) + +// Ensures that the ZVKG extension (vector Galois Field Multiplication) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvkg \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKG); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-256 is present. +// For SHA-256, this support is present in either Zvknha or Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_256 \ + do { \ + require_vector(true); \ + require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKNED extension (vector AES single round) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvkned \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNED); \ + } while (0) + +// Ensures that a ZVK extension supporting SHA-512 is present. +// For SHA-512, this support is only present in Zvknhb. +// Also ensures that the vector unit is enabled and in a valid state. +#define require_zvknh_512 \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKNHB); \ + } while (0) + +// Ensures that the ZVKSED extension (vector SM4 block cipher) +// is present, and the vector unit is enabled and in a valid state. +#define require_zvksed \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSED); \ + } while (0) + +// Ensures that the ZVKSH extension (vector SM3 hash) is present, +// and the vector unit is enabled and in a valid state. +#define require_zvksh \ + do { \ + require_vector(true); \ + require_extension(EXT_ZVKSH); \ + } while (0) + +// Ensures that the vector instruction is not using a mask. +#define require_no_vmask require(insn.v_vm() == 1) + +// Ensures that an element group can fit in a register group. That is, +// EGW <= (LMUL * VLEN) +#define require_egw_fits(EGW) require((EGW) <= (P.VU.VLEN * P.VU.vflmul)) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=4 (four 32-bits elements per group), +// for an effective element group width of EGW=128 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_32x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=32, EGS=8 (eight 32-bits elements per group), +// for an effective element group width of EGW=256 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. 
They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_32x8 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 8 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 8 == 0); \ + } while (0) + +// Checks that the vector unit state (vtype and vl) can be interpreted +// as element groups with EEW=64, EGS=4 (four 64-bits elements per group), +// for an effective element group width of EGW=128 bits. +// +// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart' +// are interpreted as a number of EEW-wide elements. They must both +// be multiples of EGS (potentially 0). +#define require_element_groups_64x4 \ + do { \ + /* 'vstart' must be a multiple of EGS */ \ + const reg_t vstart = P.VU.vstart->read(); \ + require(vstart % 4 == 0); \ + /* 'vl' must be a multiple of EGS */ \ + const reg_t vl = P.VU.vl->read(); \ + require(vl % 4 == 0); \ + } while (0) + +// +// Loop Parameters Macros +// + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU32x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU32x8_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*4 element group as a EGU32x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). +#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 32b*8 element group as a EGU32x8_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// and 'vs2' (constant, by value). +#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \ + EGU32x8_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU32x8_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts a 64b*4 element group as a EGU64x4_t variables at the given +// element group index, from register arguments 'vd' (by reference, mutable), +// 'vs1' and 'vs2' (constant, by value). +#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \ + EGU64x4_t &vd = P.VU.elt_group((VD_NUM), (EG_IDX), true); \ + const EGU64x4_t vs1 = P.VU.elt_group((VS1_NUM), (EG_IDX)); \ + const EGU64x4_t vs2 = P.VU.elt_group((VS2_NUM), (EG_IDX)) + +// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1', +// as part of a widening operation where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. 
+// - 'vs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto vs1 = P.VU.elt::type>(rs1_num, i); \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the scalar register 'rs1', as part of a widening operation where +// 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'rs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto rs1 = (type_usew_t::type)RS1; \ + +// Extracts elements from the vector register groups 'vd', 'vs2', +// and the 5-bit immediate field 'zimm5', as part of a widening operation +// where 'vd' has EEW = 2 * SEW. +// Defines +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'zimm5', unsigned, SEW width, by value, constant. +#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \ + auto &vd_w = P.VU.elt::type>(rd_num, i, true); \ + const auto vs2 = P.VU.elt::type>(rs2_num, i); \ + const type_usew_t<2 * SEW>::type vs2_w = vs2; \ + const auto zimm5 = (type_usew_t::type)insn.v_zimm5(); \ + +// +// Loop Macros +// + +// NOTES: +// - Each of the element-group loop macros DO contain an invocation +// of the corresponding 'require_element_groups_x<#elements>;', +// because the macro correctness requires proper VL/VSTART values. +// - Each of the loop macros named "_NOVM_" DO contain an invocation +// of the 'require_no_vmask>;' macro. Those macros (all of them +// at this time) do not support masking (i.e., no skipping +// of elements/element groups is performed). + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. 
+// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x4_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*8 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while +// *ignoring* the current SEW setting of the vector unit. +// +// IMPORTANT +// - This macro contains an invocation of the macro 'require_element_groups_32x8;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x8_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs1': EGU32x8_t, content of the current element group +// in the 'vs1' vector register / register group. +// 'vs2': EGU32x8_t, content of the current element group +// in the 'vs2' vector register / register group. 
+// +#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x8;; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 8; \ + const reg_t vl_eg = P.VU.vl->read() / 8; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd, vs1, and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group(...)'. +// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. 
+// +#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector register +// operands vd and vs2. This interprets the vectors as containing +// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while +// *ignoring* the current SEW setting of the vector unit. +// +// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP: +// - this macro is meant to be used for "op vd, vs2" instructions, +// whether vd is output only, or input and output. +// - this macro does NOT extract the element groups into EGU32x4_t +// variables. It is intended for uses where there is a more natural +// type to use (e.g., EGU8x16_t). The type should still be a 128 bits +// wide type if extracted via 'P.VU.elt_group(...)'. +// - this macro offers the additional PRELOOP code block argument, +// that is executed once if the loop is going to be entered. +// This is intended for use with "vector scalar" instructions where +// we extract the first element group from one of the operands and +// use it for all loop iterations. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. 
+// +#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \ + PRELOOP, \ + EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2. This interprets the vectors as containing element groups +// of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*4 element groups available in the vector registers +// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as +// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. 
+// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'zimm5': 5 bits unsigned immediate +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x4_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x4_t, content of the current element group +// in the 'vs2' vector register / register group. +// +#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \ + do { \ + require_element_groups_32x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t zimm5 = insn.v_zimm5(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 32b*8 element groups available in the vector registers +// vd, vs2, given the 'zimm5' immediate. This interprets the vectors as +// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), +// *ignoring* the current SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_32x8;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// +// Invokes three statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - PRELOOP, invoked once IF there is at least one element group to process. +// It is NOT placed in its own scope, variables declared in PRELOOP are +// visible when EG_BODY executes. +// Pass {} when there is no need for such a pre-loop block. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs2_num': register index of vs2 +// 'zimm5': unsigned 5 bits immediate +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU32x8_t reference, mutable,, content of the current +// element group in the 'vd' vector register / register group. +// 'vs2': EGU32x8_t, content of the current element group +// in the 'vs2' vector register / register group. 
+// +#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \ + do { \ + require_element_groups_32x8; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t zimm5 = insn.v_zimm5(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 8; \ + const reg_t vl_eg = P.VU.vl->read() / 8; \ + do { PRELUDE } while (0); \ + if (vstart_eg < vl_eg) { \ + PRELOOP \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + } \ + P.VU.vstart->write(0); \ + } while (0) + +// Processes all 64b*4 element groups available in the vector registers +// vd, vs1, and vs2. This interprets the vectors as containing element groups +// of 4 uint64_t values (EGW=128, EEW=64, EGS=4), *ignoring* the current +// SEW that applies to the vectors. +// +// IMPORTANT +// - This macro contains an invocation of 'require_element_groups_64x4;', +// since the "loop" macro correctness depends on invariants that +// are checked by the "require" macro. +// - This macro does not support masking, and contains an invocation +// of 'require_no_vmask;'. +// - While the name states "VD_VS1_VS2", many vector instructions +// are specified as "op vd, vs2, vs1". This macro does not imply +// a specific operand order and can be used with both "op vd, vs2, vs1" +// and "op vd, vs1, vs2" instructions. +// +// Invokes two statement blocks: +// - PRELUDE, invoked once, before any element group. It is executed even +// if the vector is empty. It is placed in a "do { } while (0);", hence +// any variable declared there is not visible outside. +// - EG_BODY, once per element group. +// +// Declares the following variables available for use in both statement blocks: +// 'vd_num': register index of vd +// 'vs1_num': register index of vs1 +// 'vs2_num': register index of vs2 +// 'vstart_eg': index of the first element group, *in EG units* +// 'vl_eg': length of the vector, *in EG units* +// +// The following variables are available in the EG_BODY block: +// 'idx_eg': index of the current element group. +// 'vd': EGU64x4_t reference, content of the current element group +// in the 'vd' vector register / vector register group. +// 'vs1': EGU64x4_t, content of the current element group +// in the 'vs1' vector register / vector register group. +// 'vs2': EGU64x4_t, content of the current element group +// in the 'vs2' vector register / vector register group. +#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \ + do { \ + require_element_groups_64x4; \ + require_no_vmask; \ + const reg_t vd_num = insn.rd(); \ + const reg_t vs1_num = insn.rs1(); \ + const reg_t vs2_num = insn.rs2(); \ + const reg_t vstart_eg = P.VU.vstart->read() / 4; \ + const reg_t vl_eg = P.VU.vl->read() / 4; \ + do { PRELUDE } while (0); \ + for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \ + VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \ + EG_BODY \ + } \ + P.VU.vstart->write(0); \ + } while (0) + + +// Loop macro for widening instructions taking parameters 'vd, vs2, v1', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. 
+// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'vs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VV_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// Loop macro for widening instructions taking parameters 'vd, vs2, rs1', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. +// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'rs1', unsigned, SEW width, by value, constant. +#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VX_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5', +// with logic processing elements one-at-a-time in those register groups +// and treating the elements as unsigned integers. +// +// Invokes the BODY statement block once per element. +// As a widening instruction, it is defined for SEW in {8, 16, 32}. +// A separate copy of BODY is instantiated for each SEW value. +// +// Declares the following variables available for use in BODY: +// - 'vd_w', unsigned, 2 * SEW width, by reference, mutable. +// - 'vs2', unsigned, SEW width, by value, constant. +// - 'vs2_w', unsigned, 2 * SEW width, by value, constant, +// a widened copy of 'vs2'. +// - 'zimm5', unsigned, SEW width, by value, constant. +#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \ + do { \ + VI_CHECK_DSS(true); \ + VI_LOOP_BASE \ + switch (sew) { \ + case e8: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e8); \ + BODY \ + break; \ + } \ + case e16: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e16); \ + BODY \ + break; \ + } \ + case e32: { \ + VI_ZVK_VI_WIDENING_U_PARAMS(e32); \ + BODY \ + break; \ + } \ + } \ + VI_LOOP_END \ + } while (0) + +// +// Element Group Manipulation Macros +// + +// Extracts 4 uint32_t words from the input EGU32x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Little Endian" (LE) order, i.e., from the least significant (W0) +// to the most significant (W3). 
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \ + uint32_t W0 = (X)[0]; \ + uint32_t W1 = (X)[1]; \ + uint32_t W2 = (X)[2]; \ + uint32_t W3 = (X)[3]; \ + (void)(0) + +// Sets the element words of the given EGU32x4_t variable 'X' to +// the given 4 uint32_t values provided in "Little Endian" (LE) +// order, i.e., from the least significant (W0) to the most +// significant (W3). +#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Extracts 4 uint32_t words from the input EGU32x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W3) +// to the least significant (W0). +#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \ + uint32_t W0 = (X)[0]; \ + uint32_t W1 = (X)[1]; \ + uint32_t W2 = (X)[2]; \ + uint32_t W3 = (X)[3]; \ + (void)(0) + +// Sets the element words of the given EGU32x4_t variable 'X' to +// the given 4 uint32_t values provided in "Big Endian" (BE) +// order, i.e., from the most significant (W3) to the least +// significant (W0). +#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Byte-swap the bytes of a uint32_t such that the order of bytes +// is reversed. +#define ZVK_BSWAP32(x) \ + ((((uint32_t)((x) >> 24)) & 0xFF) << 0 | \ + (((uint32_t)((x) >> 16)) & 0xFF) << 8 | \ + (((uint32_t)((x) >> 8)) & 0xFF) << 16 | \ + (((uint32_t)((x) >> 0)) & 0xFF) << 24) + +// Extracts 8 uint32_t words from the input EGU32x8_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W7) +// to the least significant (W0). Each of the words is byte-swapped, +// from a big-endian representation in the EGU32x8_t to a native/little-endian +// ordering in the variables. +#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \ + uint32_t W0 = ZVK_BSWAP32((X)[0]); \ + uint32_t W1 = ZVK_BSWAP32((X)[1]); \ + uint32_t W2 = ZVK_BSWAP32((X)[2]); \ + uint32_t W3 = ZVK_BSWAP32((X)[3]); \ + uint32_t W4 = ZVK_BSWAP32((X)[4]); \ + uint32_t W5 = ZVK_BSWAP32((X)[5]); \ + uint32_t W6 = ZVK_BSWAP32((X)[6]); \ + uint32_t W7 = ZVK_BSWAP32((X)[7]); \ + (void)(0) + +// Sets the element words of the given EGU32x8_t variable 'X' to +// the given 8 uint32_t values provided in "Big Endian" (BE) +// order, i.e., from the most significant (W7) to the least +// significant (W0). Each of the words is byte-swapped, +// from a native/little-endian ordering in the variables to +// a big-endian representation in the EGU32x8_t. +#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \ + do { \ + (X)[0] = ZVK_BSWAP32(W0); \ + (X)[1] = ZVK_BSWAP32(W1); \ + (X)[2] = ZVK_BSWAP32(W2); \ + (X)[3] = ZVK_BSWAP32(W3); \ + (X)[4] = ZVK_BSWAP32(W4); \ + (X)[5] = ZVK_BSWAP32(W5); \ + (X)[6] = ZVK_BSWAP32(W6); \ + (X)[7] = ZVK_BSWAP32(W7); \ + } while (0) + +// Extracts 4 uint64_t words from the input EGU64x4_t value +// into the (mutable) variables named by the W arguments, provided in +// "Big Endian" (BE) order, i.e., from the most significant (W3) +// to the least significant (W0). 
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \ + uint64_t W0 = (X)[0]; \ + uint64_t W1 = (X)[1]; \ + uint64_t W2 = (X)[2]; \ + uint64_t W3 = (X)[3]; \ + (void)(0) + +// Sets the element words of the given EGU64x4_t variable 'X' to +// the given 4 uint64_t values provided in "Big Endian" (BE) +// order, i.e., from the most significant (W3) to the least +// significant (W0). +#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \ + do { \ + (X)[0] = (W0); \ + (X)[1] = (W1); \ + (X)[2] = (W2); \ + (X)[3] = (W3); \ + } while (0) + +// Copies an EGU8x16_t value from 'SRC' into 'DST'. +#define EGU8x16_COPY(DST, SRC) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (DST)[bidx] = (SRC)[bidx]; \ + } + +// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes +// in A (mutated) with the bytes in B (unchanged). +#define EGU8x16_XOREQ(MUT_A, CONST_B) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (MUT_A)[bidx] ^= (CONST_B)[bidx]; \ + } + +// Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes +// in A (mutated) with the bytes in B (unchanged). +#define EGU32x4_XOREQ(MUT_A, CONST_B) \ + for (std::size_t bidx = 0; bidx < 4; ++bidx) { \ + (MUT_A)[bidx] ^= (CONST_B)[bidx]; \ + } + +// Performs "DST = A ^ B;", i.e., DST (overwritten) receives +// the xor of the bytes in A and B (both unchanged). +#define EGU8x16_XOR(DST, A, B) \ + for (std::size_t bidx = 0; bidx < 16; ++bidx) { \ + (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \ + } + +// +// Common bit manipulation logic. +// + +// Form a 64 bit integer with bit X set +#define ZVK_BIT(X) (1ULL << (X)) + +// Reverse the order of bits within bytes of a word. +// This is used to match the data interpretation in NIST SP 800-38D +// a.k.a. the GCM specification. +#define ZVK_BREV8_32(X) \ + do { \ + (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \ + (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \ + (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \ + } while (0) + +// Rotates right a uint32_t value by N bits. +// uint32_t ROR32(uint32_t X, std::size_t N); +#define ZVK_ROR32(X, N) rotate_right((X), (N)) + +// Rotates right a uint64_t value by N bits. +// uint64_t ROR64(uint64_t X, std::size_t N); +#define ZVK_ROR64(X, N) rotate_right((X), (N)) + +// Rotates left a uint32_t value by N bits. +// uint32_t ROL32(uint32_t X, std::size_t N); +#define ZVK_ROL32(X, N) rotate_left((X), (N)) + +// +// Element Group Bit Manipulation Macros +// + +// Performs bit reversal in an EGU32x4_t group. +#define EGU32x4_BREV8(X) \ + for (std::size_t bidx = 0; bidx < 4; ++bidx) { \ + ZVK_BREV8_32((X)[bidx]); \ + } + +// Checks if a given bit is set within an EGU32x4_t group. +// Assumes LE ordering. +#define EGU32x4_ISSET(X, BIDX) \ + (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0) + +// Shifts an EGU32x4_t group left by one bit. +// +// Since the entire 128 bit value is shifted we need to handle carry bits. +// In order to limit the amount of carry check logic the elements are copied to +// a 64 bit temporary variable. 
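For illustration (example values): with X = {0x80000000, 0, 0, 0} in little-endian word order, the macro defined next leaves X = {0, 1, 0, 0}; the bit shifted out of X[0] is carried into bit 0 of X[1] through the 64-bit temporary, the explicit ZVK_BIT(31) test propagates a carry from X[1] into X[2], and the first temporary handles the carry from X[2] into X[3].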
+#define EGU32x4_LSHIFT(X) \ + do { \ + uint64_t dword; \ + dword = ((uint64_t)(X)[3]) << 32; \ + dword |= X[2]; \ + dword <<= 1; \ + if (X[1] & ZVK_BIT(31)) { \ + dword |= ZVK_BIT(0); \ + } \ + X[2] = dword & UINT32_MAX; \ + X[3] = dword >> 32; \ + dword = ((uint64_t)(X)[1]) << 32; \ + dword |= X[0]; \ + dword <<= 1; \ + X[0] = dword & UINT32_MAX; \ + X[1] = dword >> 32; \ + } while (0) + +#endif // RISCV_ZVK_EXT_MACROS_H_ -- cgit v1.1 From e87038ee5e6545a5149cdf4334d220f951534f30 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:06:55 -0700 Subject: Zvk: Implement Zvbb, Vector Bit-manipulation for Cryptography Implement the proposed instructions in Zvbb: - vandn.{vv,vx}, vector bitwise and-not - vbrev.v, vector bit reverse in element - vbrev8.v, vector bit reverse in bytes - vrev8.v, vector byte reverse - vctz.v, vector count trailing zeros - vclz.v, vector count leading zeros - vcpop.v, vector population count - vrol.{vv,vx}, vector rotate left - vror.{vi,vv,vx}, vector rotate right - vwsll.{vi,vv,vx} vector widening shift left logical A new instruction field, 'zimm6', is introduced, encoded in bits [15, 19] and [26].. It is used by "vror.vi" to encode a shift immediate in [0, 63]. Co-authored-by: Raghav Gupta Co-authored-by: Stanislaw Kardach Signed-off-by: Eric Gouriou --- riscv/decode.h | 1 + riscv/insns/vandn_vv.h | 10 ++++++++++ riscv/insns/vandn_vx.h | 10 ++++++++++ riscv/insns/vbrev8_v.h | 13 +++++++++++++ riscv/insns/vbrev_v.h | 24 ++++++++++++++++++++++++ riscv/insns/vclz_v.h | 16 ++++++++++++++++ riscv/insns/vcpop_v.h | 16 ++++++++++++++++ riscv/insns/vctz_v.h | 16 ++++++++++++++++ riscv/insns/vrev8_v.h | 16 ++++++++++++++++ riscv/insns/vrol_vv.h | 17 +++++++++++++++++ riscv/insns/vrol_vx.h | 18 ++++++++++++++++++ riscv/insns/vror_vi.h | 18 ++++++++++++++++++ riscv/insns/vror_vv.h | 17 +++++++++++++++++ riscv/insns/vror_vx.h | 18 ++++++++++++++++++ riscv/insns/vwsll_vi.h | 10 ++++++++++ riscv/insns/vwsll_vv.h | 10 ++++++++++ riscv/insns/vwsll_vx.h | 10 ++++++++++ riscv/riscv.mk.in | 22 ++++++++++++++++++++++ 18 files changed, 262 insertions(+) create mode 100644 riscv/insns/vandn_vv.h create mode 100644 riscv/insns/vandn_vx.h create mode 100644 riscv/insns/vbrev8_v.h create mode 100644 riscv/insns/vbrev_v.h create mode 100644 riscv/insns/vclz_v.h create mode 100644 riscv/insns/vcpop_v.h create mode 100644 riscv/insns/vctz_v.h create mode 100644 riscv/insns/vrev8_v.h create mode 100644 riscv/insns/vrol_vv.h create mode 100644 riscv/insns/vrol_vx.h create mode 100644 riscv/insns/vror_vi.h create mode 100644 riscv/insns/vror_vv.h create mode 100644 riscv/insns/vror_vx.h create mode 100644 riscv/insns/vwsll_vi.h create mode 100644 riscv/insns/vwsll_vv.h create mode 100644 riscv/insns/vwsll_vx.h diff --git a/riscv/decode.h b/riscv/decode.h index dad32a1..cd1c0a1 100644 --- a/riscv/decode.h +++ b/riscv/decode.h @@ -140,6 +140,7 @@ public: uint64_t v_vta() { return x(26, 1); } uint64_t v_vma() { return x(27, 1); } uint64_t v_mew() { return x(28, 1); } + uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); } uint64_t p_imm2() { return x(20, 2); } uint64_t p_imm3() { return x(20, 3); } diff --git a/riscv/insns/vandn_vv.h b/riscv/insns/vandn_vv.h new file mode 100644 index 0000000..d85e47d --- /dev/null +++ b/riscv/insns/vandn_vv.h @@ -0,0 +1,10 @@ +// vandn.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_VV_LOOP +({ + vd = vs2 & (~vs1); +}) diff --git a/riscv/insns/vandn_vx.h b/riscv/insns/vandn_vx.h new file mode 100644 index 
0000000..1c66a40 --- /dev/null +++ b/riscv/insns/vandn_vx.h @@ -0,0 +1,10 @@ +// vandn.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_VX_LOOP +({ + vd = vs2 & (~rs1); +}) diff --git a/riscv/insns/vbrev8_v.h b/riscv/insns/vbrev8_v.h new file mode 100644 index 0000000..a6d3cda --- /dev/null +++ b/riscv/insns/vbrev8_v.h @@ -0,0 +1,13 @@ +// vbrev8.v vd, vs2, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + vd = vs2; + vd = ((vd & 0x5555555555555555llu) << 1) | ((vd & 0xAAAAAAAAAAAAAAAAllu) >> 1); + vd = ((vd & 0x3333333333333333llu) << 2) | ((vd & 0xCCCCCCCCCCCCCCCCllu) >> 2); + vd = ((vd & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((vd & 0xF0F0F0F0F0F0F0F0llu) >> 4); +}) diff --git a/riscv/insns/vbrev_v.h b/riscv/insns/vbrev_v.h new file mode 100644 index 0000000..7f784c2 --- /dev/null +++ b/riscv/insns/vbrev_v.h @@ -0,0 +1,24 @@ +// vbrev.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + reg_t x = vs2; + + // Reverse bits in bytes (vbrev8) + x = ((x & 0x5555555555555555llu) << 1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >> 1); + x = ((x & 0x3333333333333333llu) << 2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >> 2); + x = ((x & 0x0F0F0F0F0F0F0F0Fllu) << 4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >> 4); + // Re-order bytes (vrev8) + if (P.VU.vsew > 8) + x = ((x & 0x00FF00FF00FF00FFllu) << 8) | ((x & 0xFF00FF00FF00FF00llu) >> 8); + if (P.VU.vsew > 16) + x = ((x & 0x0000FFFF0000FFFFllu) << 16) | ((x & 0xFFFF0000FFFF0000llu) >> 16); + if (P.VU.vsew > 32) + x = ((x & 0x00000000FFFFFFFFllu) << 32) | ((x & 0xFFFFFFFF00000000llu) >> 32); + + vd = x; +}) diff --git a/riscv/insns/vclz_v.h b/riscv/insns/vclz_v.h new file mode 100644 index 0000000..5f7f03c --- /dev/null +++ b/riscv/insns/vclz_v.h @@ -0,0 +1,16 @@ +// vclz.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + unsigned int i = 0; + for (; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> (P.VU.vsew - 1 - i))) { + break; + } + } + vd = i; +}) diff --git a/riscv/insns/vcpop_v.h b/riscv/insns/vcpop_v.h new file mode 100644 index 0000000..52b29c6 --- /dev/null +++ b/riscv/insns/vcpop_v.h @@ -0,0 +1,16 @@ +// vpopc.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + reg_t count = 0; + for (std::size_t i = 0; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> i)) { + count++; + } + } + vd = count; +}) diff --git a/riscv/insns/vctz_v.h b/riscv/insns/vctz_v.h new file mode 100644 index 0000000..b63dd01 --- /dev/null +++ b/riscv/insns/vctz_v.h @@ -0,0 +1,16 @@ +// vctz.v vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + unsigned int i = 0; + for (; i < P.VU.vsew; ++i) { + if (1 & (vs2 >> i)) { + break; + } + } + vd = i; +}) diff --git a/riscv/insns/vrev8_v.h b/riscv/insns/vrev8_v.h new file mode 100644 index 0000000..f26c5a0 --- /dev/null +++ b/riscv/insns/vrev8_v.h @@ -0,0 +1,16 @@ +// vrev8.v vd, vs2, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_V_ULOOP +({ + vd = vs2; + if (P.VU.vsew > 8) + vd = ((vd & 0x00FF00FF00FF00FFllu) << 8) | ((vd & 0xFF00FF00FF00FF00llu) >> 8); + if (P.VU.vsew > 16) + vd = ((vd & 0x0000FFFF0000FFFFllu) << 16) | ((vd & 0xFFFF0000FFFF0000llu) >> 16); + if (P.VU.vsew > 32) + vd = ((vd & 0x00000000FFFFFFFFllu) << 32) | ((vd & 0xFFFFFFFF00000000llu) >> 32); +}) diff --git a/riscv/insns/vrol_vv.h b/riscv/insns/vrol_vv.h new file mode 100644 index 0000000..fb2e483 --- /dev/null +++ b/riscv/insns/vrol_vv.h @@ -0,0 +1,17 @@ +// vrol.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low 
log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +VI_VV_ULOOP +({ + // For .vv, the shift amount comes from the vs1 element. + const reg_t lshift = vs1 & mask; + const reg_t rshift = (-lshift) & mask; + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vrol_vx.h b/riscv/insns/vrol_vx.h new file mode 100644 index 0000000..b0c89a2 --- /dev/null +++ b/riscv/insns/vrol_vx.h @@ -0,0 +1,18 @@ +// vrol.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vx, the shift amount comes from rs1. +const reg_t lshift = ((reg_t)RS1) & mask; +const reg_t rshift = (-lshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vi.h b/riscv/insns/vror_vi.h new file mode 100644 index 0000000..1269c3d --- /dev/null +++ b/riscv/insns/vror_vi.h @@ -0,0 +1,18 @@ +// vror.vi vd, vs2, zimm6, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vi, the shift amount comes from bits [26,19-15]. +const reg_t rshift = insn.v_zimm6() & mask; +const reg_t lshift = (-rshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vv.h b/riscv/insns/vror_vv.h new file mode 100644 index 0000000..c649c6d --- /dev/null +++ b/riscv/insns/vror_vv.h @@ -0,0 +1,17 @@ +// vror.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +VI_VV_ULOOP +({ + // For .vv, the shift amount comes from the vs1 element. + const reg_t rshift = vs1 & mask; + const reg_t lshift = (-rshift) & mask; + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vror_vx.h b/riscv/insns/vror_vx.h new file mode 100644 index 0000000..50c8e5c --- /dev/null +++ b/riscv/insns/vror_vx.h @@ -0,0 +1,18 @@ +// vror.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +// 'mask' selects the low log2(vsew) bits of the shift amount, +// to limit the maximum shift to "vsew - 1" bits. +const reg_t mask = P.VU.vsew - 1; + +// For .vx, the shift amount comes from rs1. 
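+// For illustration: with VSEW=32 and rs1=40 only the low 5 bits of rs1 are used, +// so rshift=8 and lshift=24, and each element becomes (vs2 << 24) | (vs2 >> 8), +// i.e. a rotate right by 8.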
+const reg_t rshift = ((reg_t)RS1) & mask; +const reg_t lshift = (-rshift) & mask; + +VI_V_ULOOP +({ + vd = (vs2 << lshift) | (vs2 >> rshift); +}) diff --git a/riscv/insns/vwsll_vi.h b/riscv/insns/vwsll_vi.h new file mode 100644 index 0000000..13b5eb4 --- /dev/null +++ b/riscv/insns/vwsll_vi.h @@ -0,0 +1,10 @@ +// vwsll.vi vd, vs2, zimm5, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VI_WIDENING_ULOOP({ + const reg_t shift = zimm5 & ((2 * sew) - 1); + vd_w = vs2_w << shift; +}); diff --git a/riscv/insns/vwsll_vv.h b/riscv/insns/vwsll_vv.h new file mode 100644 index 0000000..5a64c6c --- /dev/null +++ b/riscv/insns/vwsll_vv.h @@ -0,0 +1,10 @@ +// vwsll.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VV_WIDENING_ULOOP({ + const reg_t shift = (vs1 & ((2 * sew) - 1)); + vd_w = vs2_w << shift; +}); diff --git a/riscv/insns/vwsll_vx.h b/riscv/insns/vwsll_vx.h new file mode 100644 index 0000000..5264e80 --- /dev/null +++ b/riscv/insns/vwsll_vx.h @@ -0,0 +1,10 @@ +// vwsll.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbb; + +VI_ZVK_VX_WIDENING_ULOOP({ + const reg_t shift = (rs1 & ((2 * sew) - 1)); + vd_w = vs2_w << shift; +}); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 3b493a0..4aa23e3 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1340,6 +1340,27 @@ riscv_insn_ext_zacas = \ amocas_d \ $(if $(HAVE_INT128),amocas_q) +riscv_insn_ext_zvbb = \ + vandn_vv \ + vandn_vx \ + vbrev8_v \ + vbrev_v \ + vclz_v \ + vcpop_v \ + vctz_v \ + vrev8_v \ + vrol_vv \ + vrol_vx \ + vror_vi \ + vror_vv \ + vror_vx \ + vwsll_vi \ + vwsll_vv \ + vwsll_vx \ + +riscv_insn_ext_zvk = \ + $(riscv_insn_ext_zvbb) \ + riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ $(riscv_insn_ext_a) \ @@ -1363,6 +1384,7 @@ riscv_insn_list = \ $(riscv_insn_ext_zfh) \ $(riscv_insn_ext_zfh_zfa) \ $(riscv_insn_ext_zicond) \ + $(riscv_insn_ext_zvk) \ $(riscv_insn_priv) \ $(riscv_insn_smrnmi) \ $(riscv_insn_svinval) \ -- cgit v1.1 From d633af2b180391b6f73f84f56d8b305a3af7c152 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:07:04 -0700 Subject: Zvk: Implement Zvbc extension, vector carryless multiplication Implement the Zvbc instructions - vclmul.{vv,vx}, vector carryless multiply low - vclmulh.{vv,vx}, vector carryless multiply high Signed-off-by: Eric Gouriou --- riscv/insns/vclmul_vv.h | 20 ++++++++++++++++++++ riscv/insns/vclmul_vx.h | 20 ++++++++++++++++++++ riscv/insns/vclmulh_vv.h | 20 ++++++++++++++++++++ riscv/insns/vclmulh_vx.h | 20 ++++++++++++++++++++ riscv/riscv.mk.in | 7 +++++++ 5 files changed, 87 insertions(+) create mode 100644 riscv/insns/vclmul_vv.h create mode 100644 riscv/insns/vclmul_vx.h create mode 100644 riscv/insns/vclmulh_vv.h create mode 100644 riscv/insns/vclmulh_vx.h diff --git a/riscv/insns/vclmul_vv.h b/riscv/insns/vclmul_vv.h new file mode 100644 index 0000000..8957738 --- /dev/null +++ b/riscv/insns/vclmul_vv.h @@ -0,0 +1,20 @@ +// vclmul.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VV_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the low 64b of the 128b product.
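+  // As a small illustration of carryless multiplication: +  // clmul(0b101, 0b011) = (0b101 << 0) ^ (0b101 << 1) = 0b101 ^ 0b1010 = 0b1111, +  // i.e. the partial products are combined with xor rather than addition.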
+ // + vd = 0; + for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((vs1 & mask) != 0) { + vd ^= vs2 << bit_idx; + } + } +}) diff --git a/riscv/insns/vclmul_vx.h b/riscv/insns/vclmul_vx.h new file mode 100644 index 0000000..1df7a3a --- /dev/null +++ b/riscv/insns/vclmul_vx.h @@ -0,0 +1,20 @@ +// vclmul.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VX_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the low 64b of the 128b product. + // + vd = 0; + for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((rs1 & mask) != 0) { + vd ^= vs2 << bit_idx; + } + } +}) diff --git a/riscv/insns/vclmulh_vv.h b/riscv/insns/vclmulh_vv.h new file mode 100644 index 0000000..6a54bcf --- /dev/null +++ b/riscv/insns/vclmulh_vv.h @@ -0,0 +1,20 @@ +// vclmulh.vv vd, vs2, vs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VV_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the high 64b of the 128b product. + // + vd = 0; + for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((vs1 & mask) != 0) { + vd ^= ((reg_t)vs2) >> (sew - bit_idx); + } + } +}) diff --git a/riscv/insns/vclmulh_vx.h b/riscv/insns/vclmulh_vx.h new file mode 100644 index 0000000..e874d1d --- /dev/null +++ b/riscv/insns/vclmulh_vx.h @@ -0,0 +1,20 @@ +// vclmulh.vx vd, vs2, rs1, vm + +#include "zvk_ext_macros.h" + +require_zvbc; +require(P.VU.vsew == 64); + +VI_VX_ULOOP +({ + // Perform a carryless multiplication 64bx64b on each 64b element, + // return the high 64b of the 128b product. + // + vd = 0; + for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) { + const reg_t mask = ((reg_t) 1) << bit_idx; + if ((rs1 & mask) != 0) { + vd ^= ((reg_t)vs2) >> (sew - bit_idx); + } + } +}) diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 4aa23e3..dcf2640 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1358,8 +1358,15 @@ riscv_insn_ext_zvbb = \ vwsll_vv \ vwsll_vx \ +riscv_insn_ext_zvbc = \ + vclmul_vv \ + vclmul_vx \ + vclmulh_vv \ + vclmulh_vx \ + riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ + $(riscv_insn_ext_zvbc) \ riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ -- cgit v1.1 From fbd4ca2eef884b6835e848d761b3e375a66fc47a Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:07:22 -0700 Subject: Zvk: Implement Zvkg, Vector GCM/GMAC instruction Implement the proposed instruction in Zvkg, vghmac.vv, Vector Carryless Multiply Accumulate over GHASH Galois-Field. The instruction performs one step of GHASH routine as described in "NIST Special Publication 800-38D" a.k.a the AES-GCM specification. The logic was written to closely track the pseudo-code in the Zvk specification. 
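In GHASH terms (NIST SP 800-38D), each application of vghsh.vv computes one step Y_i = (Y_{i-1} xor X_i) · H over GF(2^128), where Y is the partial hash held in vd, X the block input from vs1, and H the hash subkey from vs2, all in the bit-reflected representation used by the specification.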
Signed-off-by: Eric Gouriou Co-authored-by: Kornel Duleba Signed-off-by: Eric Gouriou --- riscv/insns/vghsh_vv.h | 38 ++++++++++++++++++++++++++++++++++++++ riscv/insns/vgmul_vv.h | 32 ++++++++++++++++++++++++++++++++ riscv/riscv.mk.in | 5 +++++ riscv/zvk_ext_macros.h | 16 ++++++++++++++-- 4 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 riscv/insns/vghsh_vv.h create mode 100644 riscv/insns/vgmul_vv.h diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h new file mode 100644 index 0000000..bcbfe74 --- /dev/null +++ b/riscv/insns/vghsh_vv.h @@ -0,0 +1,38 @@ +// vghsh.vv vd, vs2, vs1 + +#include "zvk_ext_macros.h" + +require_zvkg; +require(P.VU.vsew == 32); +require_egw_fits(128); + +VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + EGU32x4_t Y = vd; // Current partial hash + EGU32x4_t X = vs1; // Block cipher output + EGU32x4_t H = vs2; // Hash subkey + + EGU32x4_BREV8(H); + EGU32x4_t Z = {}; + + // S = brev8(Y ^ X) + EGU32x4_t S; + EGU32x4_XOR(S, Y, X); + EGU32x4_BREV8(S); + + for (int bit = 0; bit < 128; bit++) { + if (EGU32x4_ISSET(S, bit)) { + EGU32x4_XOREQ(Z, H); + } + + const bool reduce = EGU32x4_ISSET(H, 127); + EGU32x4_LSHIFT(H); // Left shift by 1. + if (reduce) { + H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial + } + } + EGU32x4_BREV8(Z); + vd = Z; + } +); diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h new file mode 100644 index 0000000..820b396 --- /dev/null +++ b/riscv/insns/vgmul_vv.h @@ -0,0 +1,32 @@ +// vgmul.vv vd, vs2 + +#include "zvk_ext_macros.h" + +require_zvkg; +require(P.VU.vsew == 32); +require_egw_fits(128); + +VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP( + {}, + { + EGU32x4_t Y = vd; // Multiplier + EGU32x4_BREV8(Y); + EGU32x4_t H = vs2; // Multiplicand + EGU32x4_BREV8(H); + EGU32x4_t Z = {}; + + for (int bit = 0; bit < 128; bit++) { + if (EGU32x4_ISSET(Y, bit)) { + EGU32x4_XOREQ(Z, H); + } + + bool reduce = EGU32x4_ISSET(H, 127); + EGU32x4_LSHIFT(H); // Lef shift by 1 + if (reduce) { + H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial + } + } + EGU32x4_BREV8(Z); + vd = Z; + } +); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index dcf2640..5562c09 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1364,9 +1364,14 @@ riscv_insn_ext_zvbc = \ vclmulh_vv \ vclmulh_vx \ +riscv_insn_ext_zvkg= \ + vghsh_vv \ + vgmul_vv \ + riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ $(riscv_insn_ext_zvbc) \ + $(riscv_insn_ext_zvkg) \ riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h index 7efbac8..bf893f9 100644 --- a/riscv/zvk_ext_macros.h +++ b/riscv/zvk_ext_macros.h @@ -942,8 +942,8 @@ // Performs "MUT_A ^= CONST_B;", i.e., xor of the bytes // in A (mutated) with the bytes in B (unchanged). #define EGU32x4_XOREQ(MUT_A, CONST_B) \ - for (std::size_t bidx = 0; bidx < 4; ++bidx) { \ - (MUT_A)[bidx] ^= (CONST_B)[bidx]; \ + for (std::size_t idx = 0; idx < 4; ++idx) { \ + (MUT_A)[idx] ^= (CONST_B)[idx]; \ } // Performs "DST = A ^ B;", i.e., DST (overwritten) receives @@ -953,6 +953,18 @@ (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \ } +// Performs "DST = A ^ B;", i.e., DST (overwritten) receives +// the xor of the bytes in A and B (both unchanged). +#define EGU32x4_XOR(DST, A, B) \ + do { \ + static_assert(std::is_same::value); \ + static_assert(std::is_same::value); \ + static_assert(std::is_same::value); \ + for (std::size_t idx = 0; idx < 4; ++idx) { \ + (DST)[idx] = (A)[idx] ^ (B)[idx]; \ + } \ + } while (0) + // // Common bit manipulations logic. 
// -- cgit v1.1 From 00873aa61acae4a17c1d269cddf1885e83b50102 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:07:32 -0700 Subject: Zvk: Implement Zvknh[ab], NIST Suite: Vector SHA-2 Implement the instructions part of the Zvknha and Zvknhb sub-extensions: - vsha2ms.vv, message schedule - vsha2ch.vv / vsha2cl.vv, compression rounds A header files for common macros is added. Signed-off-by: Eric Gouriou --- riscv/insns/vsha2ch_vv.h | 61 +++++++++++++++++++ riscv/insns/vsha2cl_vv.h | 62 +++++++++++++++++++ riscv/insns/vsha2ms_vv.h | 63 +++++++++++++++++++ riscv/riscv.mk.in | 7 +++ riscv/zvknh_ext_macros.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 348 insertions(+) create mode 100644 riscv/insns/vsha2ch_vv.h create mode 100644 riscv/insns/vsha2cl_vv.h create mode 100644 riscv/insns/vsha2ms_vv.h create mode 100644 riscv/zvknh_ext_macros.h diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h new file mode 100644 index 0000000..34c6e05 --- /dev/null +++ b/riscv/insns/vsha2ch_vv.h @@ -0,0 +1,61 @@ +// vsha2ch.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. +require_vsha2_common_constraints; + +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2, + UNUSED _unused_kw1, UNUSED _unused_kw0); + + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2); + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU32x4_BE(vd, a, b, e, f); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2, + UNUSED _unused_kw1, UNUSED _unused_kw0); + + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2); + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU64x4_BE(vd, a, b, e, f); + } + ); + break; + } + + // 'require_vsha2_common_constraints' ensures that + // VSEW is either 32 or 64. + default: + require(false); +} diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h new file mode 100644 index 0000000..4a1df09 --- /dev/null +++ b/riscv/insns/vsha2cl_vv.h @@ -0,0 +1,62 @@ +// vsha2cl.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. +require_vsha2_common_constraints; + +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2, + kw1, kw0); + + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0); + ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1); + + // Update the destination register, vd <- {a, b, e, f}. 
+ SET_EGU32x4_BE(vd, a, b, e, f); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {c, d, g, h} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h); + // {a, b, e, f} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f); + // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W + EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2, + kw1, kw0); + + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0); + ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1); + + // Update the destination register, vd <- {a, b, e, f}. + SET_EGU64x4_BE(vd, a, b, e, f); + } + ); + break; + } + + // 'require_vsha2_common_constraints' ensures that + // VSEW is either 32 or 64. + default: + require(false); +} + diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h new file mode 100644 index 0000000..8f1ca08 --- /dev/null +++ b/riscv/insns/vsha2ms_vv.h @@ -0,0 +1,63 @@ +// vsha2ms.vv vd, vs2, vs1 + +#include "zvknh_ext_macros.h" + +// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2. +require_vsha2_common_constraints; + +switch (P.VU.vsew) { + case e32: { + require_vsha2_vsew32_constraints; + + VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // {w3, w2, w1, w0} <- vd + EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0); + // {w11, w10, w9, w4} <- vs2 + EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4); + // {w15, w14, w13, w12} <- vs1 + EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12); + + const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14, w9, w1, w0); + const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1); + const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2); + const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3); + + // Update the destination register. + SET_EGU32x4_BE(vd, w19, w18, w17, w16); + } + ); + break; + } + + case e64: { + require_vsha2_vsew64_constraints; + + VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP( + {}, + { + // {w3, w2, w1, w0} <- vd + EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0); + // {w11, w10, w9, w4} <- vs2 + EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4); + // {w15, w14, w13, w12} <- vs1 + EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12); + + const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14, w9, w1, w0); + const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1); + const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2); + const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3); + + // Update the destination register. + SET_EGU64x4_BE(vd, w19, w18, w17, w16); + } + ); + break; + } + + // 'require_vsha2_common_constraints' ensures that + // VSEW is either 32 or 64. + default: + require(false); +} diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 5562c09..4ce088f 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1368,10 +1368,17 @@ riscv_insn_ext_zvkg= \ vghsh_vv \ vgmul_vv \ +# Covers both Zvknha and Zvknhb. +riscv_insn_ext_zvknh = \ + vsha2cl_vv \ + vsha2ch_vv \ + vsha2ms_vv \ + riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ $(riscv_insn_ext_zvbc) \ $(riscv_insn_ext_zvkg) \ + $(riscv_insn_ext_zvknh) \ riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h new file mode 100644 index 0000000..b50818b --- /dev/null +++ b/riscv/zvknh_ext_macros.h @@ -0,0 +1,155 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography).
+ +#include "zvk_ext_macros.h" + +#ifndef RISCV_ZVKNH_EXT_MACROS_H_ +#define RISCV_ZVKNH_EXT_MACROS_H_ + +// Constraints common to all vsha* instructions, across all VSEW: +// - VSEW is 32 (SHA-256) or 64 (SHA-512) +// - No overlap of vd with vs1 or vs2. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP +// macros. +#define require_vsha2_common_constraints \ + do { \ + require(P.VU.vsew == 32 || P.VU.vsew == 64); \ + require(insn.rd() != insn.rs1()); \ + require(insn.rd() != insn.rs2()); \ + } while (false) + +// Constraints on vsha2 instructions that must be verified when VSEW==32. +// Those are *IN ADDITION* to the constraints checked by +// 'require_vsha2_common_constraints', which is meant to be run earlier. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vsha2_vsew32_constraints \ + do { \ + require_zvknh_256; \ + require_egw_fits(128); \ + } while (false) + +// Constraints on vsha2 instructions that must be verified when VSEW==64. +// Those are *IN ADDITION* to the constraints checked by +// 'require_vsha2_common_constraints', which is meant to be run earlier. +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros. +#define require_vsha2_vsew64_constraints \ + do { \ + require_zvknh_512; \ + require_egw_fits(256); \ + } while (false) + +// +// SHA-256 and SHA-512 common logic +// + +// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz +#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z))) + +// Maj(x,y,z) = (xy) ⊕ (xz) ⊕ (yz) = xy | xz | yz +#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z))) + +// +// SHA-256 +// + +// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x) +#define ZVK_SHA256_SUM0(X) \ + (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22)) + +// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x) +#define ZVK_SHA256_SUM1(X) \ + (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25)) + +// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3(x) +#define ZVK_SHA256_SIG0(X) \ + (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3)) + +// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x) +#define ZVK_SHA256_SIG1(X) \ + (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10)) + +// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes +// W[t+16]. +#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \ + (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0)) + +// Performs one round of compression (out of the 64 rounds), given the state +// temporaries A,B,C,...,H, and KW, the sum Kt+Wt. +// Updates A,B,C,...,H to their new values. KW is not modified. +// +// Note that some of the logic could be omitted in vsha2c[hl] since +// some of the variables are dropped in each of those. However removing +// those unnecessary updates reduces the opportunities to share this single +// per-round logic and forces us to move further away from how the logic +// is expressed in FIPS PUB 180-4.
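+// In FIPS PUB 180-4 notation, one compression round computes +//   T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt +//   T2 = Sigma0(a) + Maj(a, b, c) +// and then shifts the working state: h=g, g=f, f=e, e=d+T1, d=c, c=b, b=a, +// a=T1+T2. In the macro below, KW already holds the sum Kt + Wt.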
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \ + { \ + const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \ + ZVK_SHA_CH((E), (F), (G)) + (KW); \ + const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \ + (H) = (G); \ + (G) = (F); \ + (F) = (E); \ + (E) = (D) + t1; \ + (D) = (C); \ + (C) = (B); \ + (B) = (A); \ + (A) = t1 + t2; \ + } + +// +// SHA-512 +// + +// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x) +#define ZVK_SHA512_SUM0(X) \ + (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39)) + +// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x) +#define ZVK_SHA512_SUM1(X) \ + (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41)) + +// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x) +#define ZVK_SHA512_SIG0(X) \ + (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7)) + +// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x) +#define ZVK_SHA512_SIG1(X) \ + (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6)) + +// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes +// W[t+16]. +#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \ + (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0)) + +// Performs one round of compression (out of the 80 rounds), given the state +// temporaries A,B,C,...,H, and KW, the sum Kt+Wt. +// Updates A,B,C,...,H to their new values. KW is not modified. +// +// Note that some of the logic could be omitted in vsha2c[hl] since +// some of the variables are dropped in each of those. However removing +// those unnecessary updates reduces the opportunities to share this single +// per-round logic and forces us to move further away from how the logic +// is expressed in FIPS PUB 180-4. +#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \ + { \ + const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \ + ZVK_SHA_CH((E), (F), (G)) + (KW); \ + const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \ + (H) = (G); \ + (G) = (F); \ + (F) = (E); \ + (E) = (D) + t1; \ + (D) = (C); \ + (C) = (B); \ + (B) = (A); \ + (A) = t1 + t2; \ + } + +#endif // RISCV_ZVKNH_EXT_MACROS_H_ -- cgit v1.1 From eadb0e1129c23e709b0565740f0fc1a3359de7b7 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:07:38 -0700 Subject: Zvk: Implement Zvkned, vector AES single round Implement the Zvkned extension, "NIST Suite: Vector AES Encryption & Decryption (Single Round)". - vaeskf1.vi: AES forward key scheduling, AES-128. - vaeskf2.vi: AES forward key scheduling, AES-256. - vaesz.vs: AES encryption/decryption, 0-th round. - vaesdm.{vs,vv}: AES decryption, middle rounds. - vaesdf.{vs,vv}: AES decryption, final round. - vaesem.{vs,vv}: AES encryption, middle rounds. - vaesef.{vs,vv}: AES encryption, final round. An extension-specific header containing common logic is added.
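For reference, the encryption forms apply SubBytes, ShiftRows, MixColumns (middle rounds only) and AddRoundKey to each 128-bit element group, while the decryption forms apply the corresponding inverse transforms, following FIPS-197.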
Co-authored-by: Stanislaw Kardach Signed-off-by: Eric Gouriou --- riscv/insns/vaesdf_vs.h | 43 ++++++++ riscv/insns/vaesdf_vv.h | 37 +++++++ riscv/insns/vaesdm_vs.h | 44 ++++++++ riscv/insns/vaesdm_vv.h | 38 +++++++ riscv/insns/vaesef_vs.h | 43 ++++++++ riscv/insns/vaesef_vv.h | 37 +++++++ riscv/insns/vaesem_vs.h | 44 ++++++++ riscv/insns/vaesem_vv.h | 38 +++++++ riscv/insns/vaeskf1_vi.h | 65 +++++++++++ riscv/insns/vaeskf2_vi.h | 89 +++++++++++++++ riscv/insns/vaesz_vs.h | 24 +++++ riscv/riscv.mk.in | 14 +++ riscv/zvkned_ext_macros.h | 270 ++++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 786 insertions(+) create mode 100644 riscv/insns/vaesdf_vs.h create mode 100644 riscv/insns/vaesdf_vv.h create mode 100644 riscv/insns/vaesdm_vs.h create mode 100644 riscv/insns/vaesdm_vv.h create mode 100644 riscv/insns/vaesef_vs.h create mode 100644 riscv/insns/vaesef_vv.h create mode 100644 riscv/insns/vaesem_vs.h create mode 100644 riscv/insns/vaesem_vv.h create mode 100644 riscv/insns/vaeskf1_vi.h create mode 100644 riscv/insns/vaeskf2_vi.h create mode 100644 riscv/insns/vaesz_vs.h create mode 100644 riscv/zvkned_ext_macros.h diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h new file mode 100644 index 0000000..a124278 --- /dev/null +++ b/riscv/insns/vaesdf_vs.h @@ -0,0 +1,43 @@ +// vaesdf.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd does receive the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, scalar_key); + // InvMixColumns is not performed in the final round. + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h new file mode 100644 index 0000000..9fca572 --- /dev/null +++ b/riscv/insns/vaesdf_vv.h @@ -0,0 +1,37 @@ +// vaesdf.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. 
+ { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd in contains the input state, + // - vs2 contains the input round key, + // - vd out receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group(vs2_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, round_key); + // InvMixColumns is not performed in the final round. + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h new file mode 100644 index 0000000..3c23e69 --- /dev/null +++ b/riscv/insns/vaesdm_vs.h @@ -0,0 +1,44 @@ +// vaesdm.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd in contains the input state, + // - vs2 contains the input round key, + // - vd out receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, scalar_key); + // InvMixColumns + VAES_INV_MIX_COLUMNS(aes_state); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h new file mode 100644 index 0000000..9c29cd9 --- /dev/null +++ b/riscv/insns/vaesdm_vv.h @@ -0,0 +1,38 @@ +// vaesdm.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. 
+ { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd does receive the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group(vs2_num, idx_eg); + + // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_INV_SHIFT_ROWS(aes_state); + // InvSubBytes - Apply S-box to every byte in the state + VAES_INV_SUB_BYTES(aes_state); + // AddRoundKey (which is also InvAddRoundKey as it's xor) + EGU8x16_XOREQ(aes_state, round_key); + // InvMixColumns + VAES_INV_MIX_COLUMNS(aes_state); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h new file mode 100644 index 0000000..2d32653 --- /dev/null +++ b/riscv/insns/vaesef_vs.h @@ -0,0 +1,43 @@ +// vaesef.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns is not performed for the final round. + // AddRoundKey + EGU8x16_XOREQ(aes_state, scalar_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h new file mode 100644 index 0000000..9b43a6d --- /dev/null +++ b/riscv/insns/vaesef_vv.h @@ -0,0 +1,37 @@ +// vaesef.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. 
+ // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group(vs2_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns is not performed for the final round. + // AddRoundKey + EGU8x16_XOREQ(aes_state, round_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h new file mode 100644 index 0000000..348cd9f --- /dev/null +++ b/riscv/insns/vaesem_vs.h @@ -0,0 +1,44 @@ +// vaesem.vs vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group(vs2_num, 0);, + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns + VAES_MIX_COLUMNS(aes_state); + // AddRoundKey + EGU8x16_XOREQ(aes_state, scalar_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h new file mode 100644 index 0000000..34f0056 --- /dev/null +++ b/riscv/insns/vaesem_vv.h @@ -0,0 +1,38 @@ +// vaesem.vv vd, vs2 + +#include "zvkned_ext_macros.h" +#include "zvk_ext_macros.h" + +require_vaes_vv_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + {}, // No PRELOOP. + { + // For AES128, AES192, or AES256, state and key are 128b/16B values: + // - vd contains the input state, + // - vs2 contains the round key, + // - vd receives the output state. + // + // While the spec calls for handling the vector as made of EGU32x4 + // element groups (i.e., 4 uint32_t), it is convenient to treat + // AES state and key as EGU8x16 (i.e., 16 uint8_t). 
This is why + // we extract the operands here instead of using the existing LOOP + // macro that defines/extracts the operand variables as EGU32x4. + EGU8x16_t aes_state = P.VU.elt_group(vd_num, idx_eg); + const EGU8x16_t round_key = P.VU.elt_group(vs2_num, idx_eg); + + // SubBytes - Apply S-box to every byte in the state + VAES_SUB_BYTES(aes_state); + // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions. + VAES_SHIFT_ROWS(aes_state); + // MixColumns + VAES_MIX_COLUMNS(aes_state); + // AddRoundKey + EGU8x16_XOREQ(aes_state, round_key); + + // Update the destination register. + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + EGU8x16_COPY(vd, aes_state); + } +); diff --git a/riscv/insns/vaeskf1_vi.h b/riscv/insns/vaeskf1_vi.h new file mode 100644 index 0000000..28d03d0 --- /dev/null +++ b/riscv/insns/vaeskf1_vi.h @@ -0,0 +1,65 @@ +// vaeskf1.vi vd, vs2, rnd + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaeskf_vi_constraints; + +// There is one round constant for each round number +// between 1 and 10. We index using 'round# -1'. +static constexpr uint8_t kRoundConstants[10] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 +}; + +// For AES128, AES192, or AES256, keys (and state) are handled as +// 128b/16B values. +// +// The Zvkned spec calls for handling the vector as made of EGU32x4 +// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification +// describes the key expansion in terms of manipulations of 32 bit +// words, so using the EGU32x4 is natural. +// +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here here to be visible in the loop block. + // Only consider the bottom 4 bits of the immediate. + const reg_t zimm4 = zimm5 & 0xF; + // Normalize the round value to be in [2, 14] by toggling bit 3 + // if outside the range (i.e., +8 or -8). + const reg_t round = ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8); + const uint32_t rcon = kRoundConstants[round - 1];, + // Per Element Group body. + { + // vaeskf1_vi produces key[i+1] in vd, it receives key[i] in vs2, + // i.e., 4x32b values (4 words). + // + // The logic is fairly similar between vaeskf1/vaeskf2, with the following + // differences: + // - in AES-128 (vaeskf1), we get both the 'temp' word and + // the "previous words" w0..w3 from key[i]/vs2. + // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and + // the "previous words" w0..w3 from key[i-1]/vd. + + // 'temp' is extracted from the last (most significant) word of key[i]. + uint32_t temp = vs2[3]; + temp = (temp >> 8) | (temp << 24); // Rotate right by 8 + temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) | + ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) | + ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) | + ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0)); + temp = temp ^ rcon; + + // "old" words are the w[i-Nk] of FIPS-197. They are extracted + // from vs2, which contains key[i] in AES-128 where Nk=4. + const uint32_t w0 = vs2[0] ^ temp; + const uint32_t w1 = vs2[1] ^ w0; + const uint32_t w2 = vs2[2] ^ w1; + const uint32_t w3 = vs2[3] ^ w2; + + // Overwrite vd with k[i+1] from the new words. 
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3); + } +); diff --git a/riscv/insns/vaeskf2_vi.h b/riscv/insns/vaeskf2_vi.h new file mode 100644 index 0000000..49c2a2d --- /dev/null +++ b/riscv/insns/vaeskf2_vi.h @@ -0,0 +1,89 @@ +// vaeskf2.vi vd, vs2, rnd + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaeskf_vi_constraints; + +// Round Constants +// +// Only the odd rounds need to be encoded, the even ones can use 0 +// or skip the rcon handling. We can use '(round# / 2) - 1' +// (or "(round# >> 1) - 1") to index into the array. +// +// Round# Constant +// [ 2] -> kRoundConstants[0] +// [ 3] -> 0 / Nothing +// [ 4] -> kRoundConstants[1] +// [ 5] -> 0 / Nothing +// [ 6] -> kRoundConstants[2] +// [ 7] -> 0 / Nothing +// ... +// [13] -> 0 / Nothing +// [14] -> kRoundConstants[6] +static constexpr uint8_t kRoundConstants[7] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, +}; + +// For AES128, AES192, or AES256, keys (and state) are handled as +// 128b/16B values. +// +// The Zvkned spec calls for handling the vector as made of EGU32x4 +// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification +// describes the key expansion in terms of manipulations of 32 bit +// words, so using the EGU32x4 is natural. +// +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here here to be visible in the loop block. + // Only consider the bottom 4 bits of the immediate. + const reg_t zimm4 = zimm5 & 0xF; + // Normalize the round value to be in [2, 14] by toggling bit 3 + // if outside the range (i.e., +8 or -8). + const reg_t round = ((2 <= zimm4) && (zimm4 <= 14)) ? zimm4 : (zimm4 ^ 0x8);, + // Per Element Group body. + { + // vaeskf2_vi produces key[i+1] in vd, it receives key[i] in vs2, + // i.e., 4x32b values (4 words). + // + // The logic is fairly similar between vaeskf2/vaeskf2, with the following + // differences: + // - in AES-128 (vaeskf1), we get both the 'temp' word and + // the "previous words" w0..w3 from key[i]/vs2. + // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and + // the "previous words" w0..w3 from key[i-1]/vd. + + // 'temp' is extracted from the last (most significant) word of key[i]. + uint32_t temp = vs2[3]; + // With AES-256, when we have an even round number, we hit the + // Nk > 6 and i mod Nk = 4 + // condition in the FIPS-197 key expansion pseudo-code (Figure 11). + // In those cases we skip RotWord and the round constant is 0. + const bool is_even_round = (round & 0x1) == 0; + if (is_even_round) { + temp = (temp >> 8) | (temp << 24); // Rotate right by 8 + } + temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) | + ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) | + ((uint32_t)AES_ENC_SBOX[(temp >> 8) & 0xFF] << 8) | + ((uint32_t)AES_ENC_SBOX[(temp >> 0) & 0xFF] << 0)); + + if (is_even_round) { + const uint32_t rcon = kRoundConstants[(round >> 1) - 1]; + temp = temp ^ rcon; + } + + // "old" words are the w[i-Nk] of FIPS-197. For AES-256, where Nk=8, + // they are extracted from vd which contains key[i-1]. + const uint32_t w0 = vd[0] ^ temp; + const uint32_t w1 = vd[1] ^ w0; + const uint32_t w2 = vd[2] ^ w1; + const uint32_t w3 = vd[3] ^ w2; + + // Overwrite vd with k[i+1] from the new words. 
+ SET_EGU32x4_LE(vd, w0, w1, w2, w3); + } +); diff --git a/riscv/insns/vaesz_vs.h b/riscv/insns/vaesz_vs.h new file mode 100644 index 0000000..c3dc931 --- /dev/null +++ b/riscv/insns/vaesz_vs.h @@ -0,0 +1,24 @@ +// vaesz.vs vd, vs2 + +#include "zvk_ext_macros.h" +#include "zvkned_ext_macros.h" + +require_vaes_vs_constraints; + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. + // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU8x16_t scalar_key = P.VU.elt_group(vs2_num, 0);, + // Per Element Group body. + { + EGU8x16_t &vd = P.VU.elt_group(vd_num, idx_eg, true); + // Produce vd = vd ^ "common key from vs2". + EGU8x16_XOR(vd, vd, scalar_key); + } +); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 4ce088f..2d75662 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1368,6 +1368,19 @@ riscv_insn_ext_zvkg= \ vghsh_vv \ vgmul_vv \ +riscv_insn_ext_zvkned = \ + vaesdf_vs \ + vaesdf_vv \ + vaesdm_vs \ + vaesdm_vv \ + vaesef_vs \ + vaesef_vv \ + vaesem_vs \ + vaesem_vv \ + vaeskf1_vi \ + vaeskf2_vi \ + vaesz_vs \ + # Covers both Zvknha and Zvkhnb. riscv_insn_ext_zvknh = \ vsha2cl_vv \ @@ -1378,6 +1391,7 @@ riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ $(riscv_insn_ext_zvbc) \ $(riscv_insn_ext_zvkg) \ + $(riscv_insn_ext_zvkned) \ $(riscv_insn_ext_zvknh) \ riscv_insn_list = \ diff --git a/riscv/zvkned_ext_macros.h b/riscv/zvkned_ext_macros.h new file mode 100644 index 0000000..db705c7 --- /dev/null +++ b/riscv/zvkned_ext_macros.h @@ -0,0 +1,270 @@ +// Helper macros to help implement instructions defined as part of +// the RISC-V Zvkned extension (vector AES single round). + +#include "insns/aes_common.h" + +#ifndef RISCV_ZVKNED_EXT_MACROS_H_ +#define RISCV_ZVKNED_EXT_MACROS_H_ + +// vaes*.vs instruction constraints: +// - Zvkned is enabled +// - EGW (128) <= LMUL * VLEN +// - vd and vs2 cannot overlap +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vaes_vs_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + require(insn.rd() != insn.rs2()); \ + } while (false) + +// vaes*.vv instruction constraints. Those are the same as the .vs ones, +// except for the overlap constraint that is not present for .vv variants. +// - Zvkned is enabled +// - EGW (128) <= LMUL * VLEN +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vaes_vv_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + } while (false) + +// vaeskf*.vi instruction constraints. Those are the same as the .vv ones. +#define require_vaeskf_vi_constraints \ + do { \ + require_zvkned; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + } while (false) + +#define VAES_XTIME(A) (((A) << 1) ^ (((A) & 0x80) ? 0x1b : 0)) + +#define VAES_GFMUL(A, B) \ + ((((B) & 0x1) ? (A) : 0) ^ \ + (((B) & 0x2) ? VAES_XTIME(A) : 0) ^ \ + (((B) & 0x4) ? VAES_XTIME(VAES_XTIME(A)) : 0) ^ \ + (((B) & 0x8) ? 
VAES_XTIME(VAES_XTIME(VAES_XTIME(A))) : 0)) + +// Apply the S-box transform to every byte in the VAESState 'state' +#define VAES_SUB_BYTES(STATE) \ + do { \ + static constexpr uint8_t kVAESXEncSBox[256]= { \ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, \ + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, \ + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, \ + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, \ + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, \ + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, \ + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, \ + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, \ + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, \ + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, \ + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, \ + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, \ + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, \ + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, \ + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, \ + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, \ + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, \ + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, \ + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, \ + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, \ + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, \ + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, \ + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, \ + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, \ + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, \ + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, \ + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, \ + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, \ + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, \ + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, \ + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, \ + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, \ + }; \ + for (uint8_t& byte : (STATE)) { \ + byte = kVAESXEncSBox[byte]; \ + } \ + } while (0) + +// Applies the S-box inverse (decode) transform to every byte +// in the VAESState 'state'. 
+#define VAES_INV_SUB_BYTES(STATE) \ + do { \ + static constexpr uint8_t kVAESXDecSBox[256] = { \ + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, \ + 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, \ + 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, \ + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, \ + 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, \ + 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, \ + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, \ + 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, \ + 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, \ + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, \ + 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, \ + 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, \ + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, \ + 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, \ + 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, \ + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, \ + 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, \ + 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, \ + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, \ + 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, \ + 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, \ + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, \ + 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, \ + 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, \ + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, \ + 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, \ + 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, \ + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, \ + 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, \ + 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, \ + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, \ + 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D, \ + }; \ + for (uint8_t &byte : (STATE)) { \ + byte = kVAESXDecSBox[byte]; \ + } \ + } while (0) + +// Shift the state rows, as specified in ShiftRows. +// 'STATE' is a VAESState value. +#define VAES_SHIFT_ROWS(STATE) \ + do { \ + uint8_t temp; \ + /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \ + /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \ + temp = (STATE)[1]; \ + (STATE)[ 1] = (STATE)[ 5]; \ + (STATE)[ 5] = (STATE)[ 9]; \ + (STATE)[ 9] = (STATE)[13]; \ + (STATE)[13] = temp; \ + /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \ + temp = (STATE)[2]; \ + (STATE)[ 2] = (STATE)[10]; \ + (STATE)[10] = temp; \ + temp = (STATE)[6]; \ + (STATE)[ 6] = (STATE)[14]; \ + (STATE)[14] = temp; \ + /* Row 3 (byte indices 3, 7, 11, 15) rotates by 3 position (or -1). */ \ + temp = (STATE)[3]; \ + (STATE)[ 3] = (STATE)[15]; \ + (STATE)[15] = (STATE)[11]; \ + (STATE)[11] = (STATE)[ 7]; \ + (STATE)[ 7] = temp; \ + } while (0) + +// Shifts the state rows, as specified in InvShiftRows. +// 'STATE' is a VAESState value. +#define VAES_INV_SHIFT_ROWS(STATE) \ + do { \ + uint8_t temp; \ + /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \ + /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \ + temp = (STATE)[1]; \ + (STATE)[ 1] = (STATE)[13]; \ + (STATE)[13] = (STATE)[ 9]; \ + (STATE)[ 9] = (STATE)[ 5]; \ + (STATE)[ 5] = temp; \ + /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \ + temp = (STATE)[2]; \ + (STATE)[ 2] = (STATE)[10]; \ + (STATE)[10] = temp; \ + temp = (STATE)[6]; \ + (STATE)[ 6] = (STATE)[14]; \ + (STATE)[14] = temp; \ + /* Row 3 (byte indices 3, 7, 11, 15) rotates by 3 position (or -1). 
*/ \
+ temp = (STATE)[3]; \
+ (STATE)[ 3] = (STATE)[ 7]; \
+ (STATE)[ 7] = (STATE)[11]; \
+ (STATE)[11] = (STATE)[15]; \
+ (STATE)[15] = temp; \
+ } while (0)
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation MixColumns() specified in FIPS-197 5.1.3.
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (2 . A) xor (3 . B) xor C xor D
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0x2) ^ VAES_GFMUL((B), 0x3) ^ (C) ^ (D))
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation InvMixColumns() specified in FIPS-197 5.3.3.
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+// F(A, B, C, D) = (0xE . A) xor (0xB . B) xor (0xD . C) xor (0x9 . D)
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_INV_MIX_COLUMN_BYTE(A, B, C, D) \
+ (VAES_GFMUL((A), 0xE) ^ \
+ VAES_GFMUL((B), 0xB) ^ \
+ VAES_GFMUL((C), 0xD) ^ \
+ VAES_GFMUL((D), 0x9))
+
+// Mixes in place the 4-byte column at index COL_IDX of the VAESState 'STATE',
+// as specified in MixColumns().
+#define VAES_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Mixes in place the 4-byte column at index COL_IDX of the VAESState 'STATE',
+// as specified in InvMixColumns().
+#define VAES_INV_MIX_COLUMN(STATE, COL_IDX) \
+ do { \
+ uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+ /* Extract the bytes, before we start overwriting them */ \
+ const uint8_t b0 = column[0]; \
+ const uint8_t b1 = column[1]; \
+ const uint8_t b2 = column[2]; \
+ const uint8_t b3 = column[3]; \
+ /* Every iteration rotates the byte indices by 1 */ \
+ column[0] = VAES_INV_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+ column[1] = VAES_INV_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+ column[2] = VAES_INV_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+ column[3] = VAES_INV_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+ } while (0)
+
+// Implements MixColumns as defined in FIPS-197 5.1.3.
+#define VAES_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_MIX_COLUMN((STATE), 0); \
+ VAES_MIX_COLUMN((STATE), 1); \
+ VAES_MIX_COLUMN((STATE), 2); \
+ VAES_MIX_COLUMN((STATE), 3); \
+ } while (0)
+
+// Implements InvMixColumns as defined in FIPS-197 5.3.3.
+#define VAES_INV_MIX_COLUMNS(STATE) \
+ do { \
+ VAES_INV_MIX_COLUMN((STATE), 0); \
+ VAES_INV_MIX_COLUMN((STATE), 1); \
+ VAES_INV_MIX_COLUMN((STATE), 2); \
+ VAES_INV_MIX_COLUMN((STATE), 3); \
+ } while (0)
+
+#endif // RISCV_ZVKNED_EXT_MACROS_H_
--
cgit v1.1


From cbb2b1a224d8922c6d3146da56f5087a3858ced5 Mon Sep 17 00:00:00 2001
From: Eric Gouriou
Date: Thu, 1 Jun 2023 18:07:53 -0700
Subject: Zvk: Implement Zvksed, vector SM4 Block Cipher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the Zvksed sub-extension, "ShangMi Suite: SM4 Block Cipher":
- vsm4k.vi, vector SM4 key expansion,
- vsm4r.{vs,vv}, vector SM4 rounds.

This also introduces a header for common vector SM4 logic.
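For reference, a minimal standalone sketch of the per-word SM4 operations
that vsm4k.vi and vsm4r.{vs,vv} apply to each element group is shown below.
It is illustrative only (not part of the patch) and assumes the standard
SM4 S-box table, available in this tree as sm4_sbox in
riscv/insns/sm4_common.h:

    #include <cstdint>

    // Standard SM4 S-box, assumed to be provided elsewhere
    // (e.g., sm4_sbox from riscv/insns/sm4_common.h).
    extern const uint8_t sm4_sbox[256];

    static inline uint32_t rol32(uint32_t x, unsigned n) {
      n &= 31;
      return n ? ((x << n) | (x >> (32 - n))) : x;
    }

    // tau: apply the S-box to each byte of a 32-bit word.
    static inline uint32_t sm4_sub_bytes(uint32_t b) {
      return (uint32_t)sm4_sbox[b & 0xff]
           | ((uint32_t)sm4_sbox[(b >> 8) & 0xff] << 8)
           | ((uint32_t)sm4_sbox[(b >> 16) & 0xff] << 16)
           | ((uint32_t)sm4_sbox[(b >> 24) & 0xff] << 24);
    }

    // One data round (vsm4r): x4 = x0 ^ L(tau(x1 ^ x2 ^ x3 ^ rk)).
    static inline uint32_t sm4_round(uint32_t x0, uint32_t x1, uint32_t x2,
                                     uint32_t x3, uint32_t rk) {
      const uint32_t s = sm4_sub_bytes(x1 ^ x2 ^ x3 ^ rk);
      return x0 ^ (s ^ rol32(s, 2) ^ rol32(s, 10) ^ rol32(s, 18) ^ rol32(s, 24));
    }

    // One key-expansion step (vsm4k): rk4 = rk0 ^ L'(tau(rk1 ^ rk2 ^ rk3 ^ ck)).
    static inline uint32_t sm4_round_key(uint32_t rk0, uint32_t rk1, uint32_t rk2,
                                         uint32_t rk3, uint32_t ck) {
      const uint32_t s = sm4_sub_bytes(rk1 ^ rk2 ^ rk3 ^ ck);
      return rk0 ^ (s ^ rol32(s, 13) ^ rol32(s, 23));
    }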
Co-authored-by: Raghav Gupta Co-authored-by: Albert Jakieła Signed-off-by: Eric Gouriou --- riscv/insns/sm4_common.h | 1 - riscv/insns/vsm4k_vi.h | 52 ++++++++++++++++++++++++++++++++++++++++ riscv/insns/vsm4r_vs.h | 51 ++++++++++++++++++++++++++++++++++++++++ riscv/insns/vsm4r_vv.h | 37 +++++++++++++++++++++++++++++ riscv/riscv.mk.in | 6 +++++ riscv/zvksed_ext_macros.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 riscv/insns/vsm4k_vi.h create mode 100644 riscv/insns/vsm4r_vs.h create mode 100644 riscv/insns/vsm4r_vv.h create mode 100644 riscv/zvksed_ext_macros.h diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h index 17f129f..24d6ce1 100644 --- a/riscv/insns/sm4_common.h +++ b/riscv/insns/sm4_common.h @@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = { 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 }; - diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h new file mode 100644 index 0000000..8f52e68 --- /dev/null +++ b/riscv/insns/vsm4k_vi.h @@ -0,0 +1,52 @@ +// vsm4k.vi vd, vs2, round# + +#include "zvksed_ext_macros.h" + +// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft. +static constexpr uint32_t zvksed_ck[32] = { + 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269, + 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9, + 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249, + 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9, + 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229, + 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299, + 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209, + 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +}; + +require_vsm4_constraints; + +VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP( + {}, + // The following statements will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the 'round' variable + // declared and defined here here to be visible in the loop block. + // Only consider the bottom 3 bits of the immediate, ensuring that + // 'round' is in the valid range [0, 7]. + const reg_t round = zimm5 & 0x7;, + // Per Element Group body. + { + // {rk0, rk1, rk2, rk3} <- vs2 + EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3); + + uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round]; + uint32_t S = ZVKSED_SUB_BYTES(B); + uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S); + + B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S); + + B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S); + + B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3]; + S = ZVKSED_SUB_BYTES(B); + uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S); + + // Update the destination register. + SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7); + } +); diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h new file mode 100644 index 0000000..44011eb --- /dev/null +++ b/riscv/insns/vsm4r_vs.h @@ -0,0 +1,51 @@ +// vsm4r.vs vd, vs2 + +#include "zvksed_ext_macros.h" + +require_vsm4_constraints; +// No overlap of vd and vs2. +require(insn.rd() != insn.rs2()); + +VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP( + {}, + // This statement will be executed before the first execution + // of the loop, and only if the loop is going to be entered. + // We cannot use a block ( { ... } ) since we want the variables declared + // here to be visible in the loop block. 
+ // We capture the "scalar", vs2's first element, by copy, even though + // the "no overlap" constraint means that vs2 should remain constant + // during the loop. + const EGU32x4_t scalar_key = P.VU.elt_group(vs2_num, 0); + const uint32_t rk0 = scalar_key[0]; + const uint32_t rk1 = scalar_key[1]; + const uint32_t rk2 = scalar_key[2]; + const uint32_t rk3 = scalar_key[3];, + { + EGU32x4_t &state = P.VU.elt_group(vd_num, idx_eg, true); + + // {x0, x1,x2, x3} <- vd + EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3); + + uint32_t B; + uint32_t S; + + B = x1 ^ x2 ^ x3 ^ rk0; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x4 = ZVKSED_ROUND(x0, S); + + B = x2 ^ x3 ^ x4 ^ rk1; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x5 = ZVKSED_ROUND(x1, S); + + B = x3 ^ x4 ^ x5 ^ rk2; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x6 = ZVKSED_ROUND(x2, S); + + B = x4 ^ x5 ^ x6 ^ rk3; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x7 = ZVKSED_ROUND(x3, S); + + // Update the destination register. + SET_EGU32x4_LE(state, x4, x5, x6, x7); + } +); diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h new file mode 100644 index 0000000..9a18cec --- /dev/null +++ b/riscv/insns/vsm4r_vv.h @@ -0,0 +1,37 @@ +// vsm4r.vv vd, vs2 + +#include "zvksed_ext_macros.h" + +require_vsm4_constraints; + +VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP( + {}, + { + // vd = {x0, x1,x2, x3} <- vd + EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3); + // {rk0, rk1, rk2, rk3} <- vs2 + EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3); + + uint32_t B; + uint32_t S; + + B = x1 ^ x2 ^ x3 ^ rk0; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x4 = ZVKSED_ROUND(x0, S); + + B = x2 ^ x3 ^ x4 ^ rk1; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x5 = ZVKSED_ROUND(x1, S); + + B = x3 ^ x4 ^ x5 ^ rk2; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x6 = ZVKSED_ROUND(x2, S); + + B = x4 ^ x5 ^ x6 ^ rk3; + S = ZVKSED_SUB_BYTES(B); + const uint32_t x7 = ZVKSED_ROUND(x3, S); + + // Update the destination register. + SET_EGU32x4_LE(vd, x4, x5, x6, x7); + } +); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index 2d75662..c774e1b 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1387,12 +1387,18 @@ riscv_insn_ext_zvknh = \ vsha2ch_vv \ vsha2ms_vv \ +riscv_insn_ext_zvksed = \ + vsm4k_vi \ + vsm4r_vs \ + vsm4r_vv \ + riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ $(riscv_insn_ext_zvbc) \ $(riscv_insn_ext_zvkg) \ $(riscv_insn_ext_zvkned) \ $(riscv_insn_ext_zvknh) \ + $(riscv_insn_ext_zvksed) \ riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h new file mode 100644 index 0000000..46e399b --- /dev/null +++ b/riscv/zvksed_ext_macros.h @@ -0,0 +1,60 @@ +// Helper macros and functions to help implement instructions defined as part of +// the RISC-V Zvksed extension (vectorized SM4). + +#include "insns/sm4_common.h" +#include "zvk_ext_macros.h" + +#ifndef RISCV_ZVKSED_MACROS_H_ +#define RISCV_ZVKSED_MACROS_H_ + +// Constraints common to all vsm4* instructions: +// - Zvksed is enabled +// - VSEW == 32 +// - EGW (128) <= LMUL * VLEN +// +// The constraint that vstart and vl are both EGS (4) aligned +// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros. +#define require_vsm4_constraints \ + do { \ + require_zvksed; \ + require(P.VU.vsew == 32); \ + require_egw_fits(128); \ + } while (false) + +// Returns a uint32_t value constructed from the 4 bytes (uint8_t) +// provided in "Little Endian" (LE) order, i.e., from least significant (B0) +// to most significant (B3). 
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \ + (((uint32_t)(B0)) << 0 | \ + ((uint32_t)(B1)) << 8 | \ + ((uint32_t)(B2)) << 16 | \ + ((uint32_t)(B3)) << 24) + +// Get byte BYTE of the SBox. +#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)]) + +// Given an unsigned integer value 'X' and a byte index, +// returns a uint8_t value for the byte at the given index. +#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> (BYTE_IDX * 8))) + +// Apply the nonlinear transformation tau to a 32 bit word B - section 6.2.1. +// of the IETF draft. +#define ZVKSED_SUB_BYTES(B) \ + ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \ + ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \ + ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \ + ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3))) + +// Perform the linear transformation L to a 32 bit word S and xor it with a 32 +// bit word X - section 6.2.2. of the IETF draft. +#define ZVKSED_ROUND(X, S) \ + ((X) ^ \ + ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \ + ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24))) + +// Perform the linear transformation L' to a 32 bit word S and xor it with a 32 +// bit word X - section 6.2.2. of the IETF draft. +#define ZVKSED_ROUND_KEY(X, S) \ + ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23))) + +#endif // RISCV_ZVKSED_MACROS_H_ -- cgit v1.1 From a55f96ae9380d5cc9bef05e8b9e82e54d5d6ec35 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 1 Jun 2023 18:09:07 -0700 Subject: Zvk: Implement Zvksh, vector SM3 Hash Function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the Zvksh sub-extension, "ShangMi Suite: SM3 Hash Function Instructions": - vsm3me.vv, message expansion, - vsm3c.vi, compression rounds. This also introduces a SM3 specific header for common logic. Co-authored-by: Raghav Gupta Co-authored-by: Albert Jakieła Co-authored-by: Kornel Dulęba Signed-off-by: Eric Gouriou --- riscv/insns/vsm3c_vi.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ riscv/insns/vsm3me_vv.h | 39 +++++++++++++++++++++++++++++++ riscv/riscv.mk.in | 5 ++++ riscv/zvksh_ext_macros.h | 47 +++++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 riscv/insns/vsm3c_vi.h create mode 100644 riscv/insns/vsm3me_vv.h create mode 100644 riscv/zvksh_ext_macros.h diff --git a/riscv/insns/vsm3c_vi.h b/riscv/insns/vsm3c_vi.h new file mode 100644 index 0000000..b3e8121 --- /dev/null +++ b/riscv/insns/vsm3c_vi.h @@ -0,0 +1,60 @@ +// vsm3c.vi vd, vs2, rnd + +#include "zvksh_ext_macros.h" + +require_vsm3_constraints; + +VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP( + {}, + // No need to validate or normalize 'zimm5' here as this is a 5 bits value + // and all values in 0-31 are valid. + const reg_t round = zimm5;, + { + // {H, G, F, E, D, C, B, A} <- vd + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A); + // {_, _, w5, w4, _, _, w1, w0} <- vs2 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, + UNUSED _unused_w7, UNUSED _unused_w6, w5, w4, + UNUSED _unused_w3, UNUSED _unused_w2, w1, w0); + const uint32_t x0 = w0 ^ w4; // W'[0] in spec documentation. + const uint32_t x1 = w1 ^ w5; // W'[1] + + // Two rounds of compression. 
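+ // Each round j computes, following the SM3 specification
+ // (all additions are modulo 2^32):
+ //   ss1 = ROL32(ROL32(A, 12) + E + ROL32(T_j, j mod 32), 7)
+ //   ss2 = ss1 ^ ROL32(A, 12)
+ //   tt1 = FF_j(A, B, C) + D + ss2 + W'[j]
+ //   tt2 = GG_j(E, F, G) + H + ss1 + W[j]
+ // and then rotates the working state:
+ //   (A, B, C, D) <- (tt1, A, ROL32(B, 9), C)
+ //   (E, F, G, H) <- (P0(tt2), E, ROL32(F, 19), G)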
+ uint32_t ss1; + uint32_t ss2; + uint32_t tt1; + uint32_t tt2; + uint32_t j; + + j = 2 * round; + ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7); + ss2 = ss1 ^ ZVK_ROL32(A, 12); + tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0; + tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0; + D = C; + const uint32_t C1 = ZVK_ROL32(B, 9); + B = A; + const uint32_t A1 = tt1; + H = G; + const uint32_t G1 = ZVK_ROL32(F, 19); + F = E; + const uint32_t E1 = ZVKSH_P0(tt2); + + j = 2 * round + 1; + ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7); + ss2 = ss1 ^ ZVK_ROL32(A1, 12); + tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1; + tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1; + D = C1; + const uint32_t C2 = ZVK_ROL32(B, 9); + B = A1; + const uint32_t A2 = tt1; + H = G1; + const uint32_t G2 = ZVK_ROL32(F, 19); + F = E1; + const uint32_t E2 = ZVKSH_P0(tt2); + + // Update the destination register. + SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2); + } +); diff --git a/riscv/insns/vsm3me_vv.h b/riscv/insns/vsm3me_vv.h new file mode 100644 index 0000000..dd6cb52 --- /dev/null +++ b/riscv/insns/vsm3me_vv.h @@ -0,0 +1,39 @@ +// vsm3me.vv vd, vs2, vs1 + +#include "zvk_ext_macros.h" +#include "zvksh_ext_macros.h" + +// Per the SM3 spec, the message expansion computes new words Wi as: +// W[i] = ( P_1( W[i-16] xor W[i-9] xor ( W[i-3] <<< 15 ) ) +// xor ( W[i-13] <<< 7 ) +// xor W[i-6])) +// Using arguments M16 = W[i-16], M9 = W[i-9], etc., +// where Mk stands for "W[i Minus k]", we define the "W function": +#define ZVKSH_W(M16, M9, M3, M13, M6) \ + (ZVKSH_P1((M16) ^ (M9) ^ ZVK_ROL32((M3), 15)) ^ ZVK_ROL32((M13), 7) ^ (M6)) + +require_vsm3_constraints; + +VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP( + {}, + { + // {w7, w6, w5, w4, w3, w2, w1, w0} <- vs1 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs1, w7, w6, w5, w4, w3, w2, w1, w0); + // {w15, w14, w13, w12, w11, w10, w9, w8} <- vs2 + EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, w15, w14, w13, w12, w11, w10, w9, w8); + + // Arguments are W[i-16], W[i-9], W[i-13], W[i-6]. + // Note that some of the newly computed words are used in later invocations. + const uint32_t w16 = ZVKSH_W(w0, w7, w13, w3, w10); + const uint32_t w17 = ZVKSH_W(w1, w8, w14, w4, w11); + const uint32_t w18 = ZVKSH_W(w2, w9, w15, w5, w12); + const uint32_t w19 = ZVKSH_W(w3, w10, w16, w6, w13); + const uint32_t w20 = ZVKSH_W(w4, w11, w17, w7, w14); + const uint32_t w21 = ZVKSH_W(w5, w12, w18, w8, w15); + const uint32_t w22 = ZVKSH_W(w6, w13, w19, w9, w16); + const uint32_t w23 = ZVKSH_W(w7, w14, w20, w10, w17); + + // Update the destination register. + SET_EGU32x8_WORDS_BE_BSWAP(vd, w23, w22, w21, w20, w19, w18, w17, w16); + } +); diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in index c774e1b..a3e125f 100644 --- a/riscv/riscv.mk.in +++ b/riscv/riscv.mk.in @@ -1392,6 +1392,10 @@ riscv_insn_ext_zvksed = \ vsm4r_vs \ vsm4r_vv \ +riscv_insn_ext_zvksh = \ + vsm3c_vi \ + vsm3me_vv \ + riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvbb) \ $(riscv_insn_ext_zvbc) \ @@ -1399,6 +1403,7 @@ riscv_insn_ext_zvk = \ $(riscv_insn_ext_zvkned) \ $(riscv_insn_ext_zvknh) \ $(riscv_insn_ext_zvksed) \ + $(riscv_insn_ext_zvksh) \ riscv_insn_list = \ $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \ diff --git a/riscv/zvksh_ext_macros.h b/riscv/zvksh_ext_macros.h new file mode 100644 index 0000000..71c5a09 --- /dev/null +++ b/riscv/zvksh_ext_macros.h @@ -0,0 +1,47 @@ +// Helper macros and functions to help implement instructions defined as part of +// the RISC-V Zvksh extension (vectorized SM3). 
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_INSNS_ZVKSH_COMMON_H_
+#define RISCV_INSNS_ZVKSH_COMMON_H_
+
+// Constraints common to all vsm3* instructions:
+// - Zvksh is enabled
+// - VSEW == 32
+// - EGW (256) <= LMUL * VLEN
+// - No overlap of vd and vs2.
+//
+// The constraint that vstart and vl are both EGS (8) aligned
+// is checked in the VI_ZVK_..._EGU32x8_..._LOOP macros.
+#define require_vsm3_constraints \
+ do { \
+ require_zvksh; \
+ require(P.VU.vsew == 32); \
+ require_egw_fits(256); \
+ require(insn.rd() != insn.rs2()); \
+ } while (false)
+
+#define FF1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define FF2(X, Y, Z) (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z)))
+
+// Boolean function FF_j - section 4.3. of the IETF draft.
+#define ZVKSH_FF(X, Y, Z, J) (((J) <= 15) ? FF1(X, Y, Z) : FF2(X, Y, Z))
+
+#define GG1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define GG2(X, Y, Z) (((X) & (Y)) | ((~(X)) & (Z)))
+
+// Boolean function GG_j - section 4.3. of the IETF draft.
+#define ZVKSH_GG(X, Y, Z, J) (((J) <= 15) ? GG1(X, Y, Z) : GG2(X, Y, Z))
+
+#define T1 0x79CC4519
+#define T2 0x7A879D8A
+
+// T_j constant - section 4.2. of the IETF draft.
+#define ZVKSH_T(J) (((J) <= 15) ? (T1) : (T2))
+
+// Permutation functions P_0 and P_1 - section 4.4 of the IETF draft.
+#define ZVKSH_P0(X) ((X) ^ ZVK_ROL32((X), 9) ^ ZVK_ROL32((X), 17))
+#define ZVKSH_P1(X) ((X) ^ ZVK_ROL32((X), 15) ^ ZVK_ROL32((X), 23))
+
+#endif // RISCV_INSNS_ZVKSH_COMMON_H_
--
cgit v1.1
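As a cross-check of the helpers above, a self-contained scalar sketch of one
SM3 compression round (the update that vsm3c.vi applies twice per element
group) might look as follows. It is illustrative only; the local helpers
simply restate ZVKSH_FF/GG/T/P0 and ZVK_ROL32 in plain C++:

    #include <cstdint>

    static inline uint32_t rol32(uint32_t x, unsigned n) {
      n &= 31;
      return n ? ((x << n) | (x >> (32 - n))) : x;
    }
    // FF_j / GG_j / T_j / P_0, as defined in zvksh_ext_macros.h.
    static inline uint32_t ff(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
      return (j <= 15) ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
    }
    static inline uint32_t gg(uint32_t x, uint32_t y, uint32_t z, unsigned j) {
      return (j <= 15) ? (x ^ y ^ z) : ((x & y) | ((~x) & z));
    }
    static inline uint32_t t(unsigned j) {
      return (j <= 15) ? 0x79CC4519u : 0x7A879D8Au;
    }
    static inline uint32_t p0(uint32_t x) {
      return x ^ rol32(x, 9) ^ rol32(x, 17);
    }

    // One compression round. 'st' holds {A, B, C, D, E, F, G, H},
    // 'w' is W[j], 'wp' is W'[j] = W[j] ^ W[j+4], 'j' is the round index.
    static void sm3_round(uint32_t st[8], uint32_t w, uint32_t wp, unsigned j) {
      uint32_t &a = st[0], &b = st[1], &c = st[2], &d = st[3];
      uint32_t &e = st[4], &f = st[5], &g = st[6], &h = st[7];
      const uint32_t ss1 = rol32(rol32(a, 12) + e + rol32(t(j), j % 32), 7);
      const uint32_t ss2 = ss1 ^ rol32(a, 12);
      const uint32_t tt1 = ff(a, b, c, j) + d + ss2 + wp;
      const uint32_t tt2 = gg(e, f, g, j) + h + ss1 + w;
      d = c;  c = rol32(b, 9);  b = a;  a = tt1;
      h = g;  g = rol32(f, 19); f = e;  e = p0(tt2);
    }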