Zvk: Implement Zvkg, Vector GCM/GMAC instructions

Implement the proposed instructions in Zvkg: vghsh.vv, Vector Add-Multiply over GHASH Galois-Field, and vgmul.vv, Vector Multiply over GHASH Galois-Field. vghsh.vv performs one step of the GHASH routine as described in "NIST Special Publication 800-38D", a.k.a. the AES-GCM specification. The logic was written to closely track the pseudo-code in the Zvk specification. Signed-off-by: Eric Gouriou <ego@rivosinc.com> Co-authored-by: Kornel Duleba <mindal@semihalf.com> Signed-off-by: Eric Gouriou <ego@rivosinc.com>
author: Eric Gouriou <ego@rivosinc.com> 2023-06-01 18:07:22 -0700
committer: Eric Gouriou <ego@rivosinc.com> 2023-06-19 14:30:34 -0700
commit: fbd4ca2eef884b6835e848d761b3e375a66fc47a (patch)
tree: 6fff44dce7b7a5adc93d10886cd52929f1a2c32e /riscv
parent: d633af2b180391b6f73f84f56d8b305a3af7c152 (diff)
download: spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.zip
spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.tar.gz
spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.tar.bz2
4 files changed, 89 insertions, 2 deletions
diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h
new file mode 100644
index 0000000..bcbfe74
--- /dev/null
+++ b/riscv/insns/vghsh_vv.h
@@ -0,0 +1,38 @@
+// vghsh.vv vd, vs2, vs1
+// Per 128-bit element group: vd = brev8(brev8(vd ^ vs1) * brev8(vs2)), a carry-less product reduced with the 0x87 polynomial.
+#include "zvk_ext_macros.h"
+
+require_zvkg;  // Presumably rejects the instruction unless Zvkg is enabled -- confirm in the macro definition.
+require(P.VU.vsew == 32);  // Only SEW=32 is accepted.
+require_egw_fits(128);  // Presumably checks that a 128-bit element group fits the current vector setup -- confirm.
+
+VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+  {},  // Empty first block argument.
+  {
+    EGU32x4_t Y = vd;   // Current partial hash
+    EGU32x4_t X = vs1;  // Block cipher output
+    EGU32x4_t H = vs2;  // Hash subkey
+
+    EGU32x4_BREV8(H);  // H = brev8(H)
+    EGU32x4_t Z = {};  // Product accumulator, starts at zero.
+
+    // S = brev8(Y ^ X)
+    EGU32x4_t S;
+    EGU32x4_XOR(S, Y, X);
+    EGU32x4_BREV8(S);
+
+    for (int bit = 0; bit < 128; bit++) {  // Shift-and-xor multiply: one bit of S per iteration.
+      if (EGU32x4_ISSET(S, bit)) {
+        EGU32x4_XOREQ(Z, H);  // Bit set in S: fold the current multiple of H into Z.
+      }
+
+      const bool reduce = EGU32x4_ISSET(H, 127);  // Sample bit 127 of H before the shift below.
+      EGU32x4_LSHIFT(H);  // Left shift by 1.
+      if (reduce) {
+        H[0] ^= 0x87; // Reduce: replace the shifted-out x^128 term with x^7 + x^2 + x^1 + 1.
+      }
+    }
+    EGU32x4_BREV8(Z);  // Apply brev8 to the product before write-back.
+    vd = Z;
+  }
+);
diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h
new file mode 100644
index 0000000..820b396
--- /dev/null
+++ b/riscv/insns/vgmul_vv.h
@@ -0,0 +1,32 @@
+// vgmul.vv vd, vs2
+// Per 128-bit element group: vd = brev8(brev8(vd) * brev8(vs2)), a carry-less product reduced with the 0x87 polynomial.
+#include "zvk_ext_macros.h"
+
+require_zvkg;  // Presumably rejects the instruction unless Zvkg is enabled -- confirm in the macro definition.
+require(P.VU.vsew == 32);  // Only SEW=32 is accepted.
+require_egw_fits(128);  // Presumably checks that a 128-bit element group fits the current vector setup -- confirm.
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+  {},  // Empty first block argument.
+  {
+    EGU32x4_t Y = vd;  // Multiplier
+    EGU32x4_BREV8(Y);  // Y = brev8(Y)
+    EGU32x4_t H = vs2;  // Multiplicand
+    EGU32x4_BREV8(H);  // H = brev8(H)
+    EGU32x4_t Z = {};  // Product accumulator, starts at zero.
+
+    for (int bit = 0; bit < 128; bit++) {  // Shift-and-xor multiply: one bit of Y per iteration.
+      if (EGU32x4_ISSET(Y, bit)) {
+        EGU32x4_XOREQ(Z, H);  // Bit set in Y: fold the current multiple of H into Z.
+      }
+
+      bool reduce = EGU32x4_ISSET(H, 127);  // Sample bit 127 of H before the shift below.
+      EGU32x4_LSHIFT(H);  // Left shift by 1
+      if (reduce) {
+        H[0] ^= 0x87; // Reduce: replace the shifted-out x^128 term with x^7 + x^2 + x^1 + 1.
+      }
+    }
+    EGU32x4_BREV8(Z);  // Apply brev8 to the product before write-back.
+    vd = Z;
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index dcf2640..5562c09 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1364,9 +1364,14 @@ riscv_insn_ext_zvbc = \
 	vclmulh_vv \
 	vclmulh_vx \
 
+riscv_insn_ext_zvkg = \
+	vghsh_vv \
+	vgmul_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
+	$(riscv_insn_ext_zvkg) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
index 7efbac8..bf893f9 100644
--- a/riscv/zvk_ext_macros.h
+++ b/riscv/zvk_ext_macros.h
@@ -942,8 +942,8 @@
 // Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
 // in A (mutated) with the bytes in B (unchanged).
 #define EGU32x4_XOREQ(MUT_A, CONST_B) \
-  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
-    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  for (std::size_t idx = 0; idx < 4; ++idx) { \
+    (MUT_A)[idx] ^= (CONST_B)[idx]; \
   }
 
 // Performs  "DST = A ^ B;", i.e., DST (overwritten) receives
@@ -953,6 +953,18 @@
     (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
   }
 
+// Performs  "DST = A ^ B;", i.e., each of the 4 elements of DST (overwritten) receives the xor of the
+// same-index elements of A and B (both unchanged). static_assert requires decltype of each operand to be EGU32x4_t.
+#define EGU32x4_XOR(DST, A, B) \
+  do { \
+    static_assert(std::is_same<EGU32x4_t, decltype(A)>::value); \
+    static_assert(std::is_same<EGU32x4_t, decltype(B)>::value); \
+    static_assert(std::is_same<EGU32x4_t, decltype(DST)>::value); \
+    for (std::size_t idx = 0; idx < 4; ++idx) { \
+      (DST)[idx] = (A)[idx] ^ (B)[idx]; \
+    } \
+  } while (0)
+
 //
 // Common bit manipulations logic.
 //
author	Eric Gouriou <ego@rivosinc.com>	2023-06-01 18:07:22 -0700
committer	Eric Gouriou <ego@rivosinc.com>	2023-06-19 14:30:34 -0700
commit	fbd4ca2eef884b6835e848d761b3e375a66fc47a (patch)
tree	6fff44dce7b7a5adc93d10886cd52929f1a2c32e /riscv
parent	d633af2b180391b6f73f84f56d8b305a3af7c152 (diff)
download	spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.zip spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.tar.gz spike-fbd4ca2eef884b6835e848d761b3e375a66fc47a.tar.bz2