From 00873aa61acae4a17c1d269cddf1885e83b50102 Mon Sep 17 00:00:00 2001
From: Eric Gouriou
Date: Thu, 1 Jun 2023 18:07:32 -0700
Subject: Zvk: Implement Zvknh[ab], NIST Suite: Vector SHA-2

Implement the instructions part of the Zvknha and Zvknhb sub-extensions:
- vsha2ms.vv, message schedule
- vsha2ch.vv / vsha2cl.vv, compression rounds

A header file for common macros is added.

Signed-off-by: Eric Gouriou
---
 riscv/insns/vsha2ch_vv.h |  61 +++++++++++++++++++
 riscv/insns/vsha2cl_vv.h |  62 +++++++++++++++++++
 riscv/insns/vsha2ms_vv.h |  63 +++++++++++++++++++
 riscv/riscv.mk.in        |   7 +++
 riscv/zvknh_ext_macros.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 348 insertions(+)
 create mode 100644 riscv/insns/vsha2ch_vv.h
 create mode 100644 riscv/insns/vsha2cl_vv.h
 create mode 100644 riscv/insns/vsha2ms_vv.h
 create mode 100644 riscv/zvknh_ext_macros.h

diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h
new file mode 100644
index 0000000..34c6e05
--- /dev/null
+++ b/riscv/insns/vsha2ch_vv.h
@@ -0,0 +1,61 @@
+// vsha2ch.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f} <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+        EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2,
+                                 UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU32x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f} <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+        EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2,
+                                 UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU64x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h
new file mode 100644
index 0000000..4a1df09
--- /dev/null
+++ b/riscv/insns/vsha2cl_vv.h
@@ -0,0 +1,62 @@
+// vsha2cl.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f} <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+        EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+                                 kw1, kw0);
+
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU32x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f} <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1. "kw" stands for K+W
+        EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+                                 kw1, kw0);
+
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU64x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
+
diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h
new file mode 100644
index 0000000..8f1ca08
--- /dev/null
+++ b/riscv/insns/vsha2ms_vv.h
@@ -0,0 +1,63 @@
+// vsha2ms.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {w3, w2, w1, w0} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0);
+        // {w11, w10, w9, w4} <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4);
+        // {w15, w14, w13, w12} <- vs1
+        EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+        const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14, w9, w1, w0);
+        const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1);
+        const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2);
+        const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3);
+
+        // Update the destination register.
+        SET_EGU32x4_BE(vd, w19, w18, w17, w16);
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {w3, w2, w1, w0} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0);
+        // {w11, w10, w9, w4} <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4);
+        // {w15, w14, w13, w12} <- vs1
+        EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+        const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14, w9, w1, w0);
+        const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1);
+        const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2);
+        const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3);
+
+        // Update the destination register.
+        SET_EGU64x4_BE(vd, w19, w18, w17, w16);
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 5562c09..4ce088f 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1368,10 +1368,17 @@ riscv_insn_ext_zvkg= \
   vghsh_vv \
   vgmul_vv \
 
+# Covers both Zvknha and Zvknhb.
+riscv_insn_ext_zvknh = \
+  vsha2cl_vv \
+  vsha2ch_vv \
+  vsha2ms_vv \
+
 riscv_insn_ext_zvk = \
   $(riscv_insn_ext_zvbb) \
   $(riscv_insn_ext_zvbc) \
   $(riscv_insn_ext_zvkg) \
+  $(riscv_insn_ext_zvknh) \
 
 riscv_insn_list = \
   $(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h
new file mode 100644
index 0000000..b50818b
--- /dev/null
+++ b/riscv/zvknh_ext_macros.h
@@ -0,0 +1,155 @@
+// Helper macros to implement instructions defined as part of
+// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKNH_EXT_MACROS_H_
+#define RISCV_ZVKNH_EXT_MACROS_H_
+
+// Constraints common to all vsha* instructions, across all VSEW:
+//  - VSEW is 32 (SHA-256) or 64 (SHA-512)
+//  - No overlap of vd with vs1 or vs2.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP
+// macros.
+#define require_vsha2_common_constraints \
+  do { \
+    require(P.VU.vsew == 32 || P.VU.vsew == 64); \
+    require(insn.rd() != insn.rs1()); \
+    require(insn.rd() != insn.rs2()); \
+  } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==32.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsha2_vsew32_constraints \
+  do { \
+    require_zvknh_256; \
+    require_egw_fits(128); \
+  } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==64.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros.
+#define require_vsha2_vsew64_constraints \
+  do { \
+    require_zvknh_512; \
+    require_egw_fits(256); \
+  } while (false)
+
+//
+// SHA-256 and SHA-512 common logic
+//
+
+// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz
+#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z)))
+
+// Maj(x, y, z) = (xy) ⊕ (xz) ⊕ (yz) = xy | xz | yz
+#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))
+
+//
+// SHA-256
+//
+
+// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x)
+#define ZVK_SHA256_SUM0(X) \
+  (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22))
+
+// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x)
+#define ZVK_SHA256_SUM1(X) \
+  (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25))
+
+// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3(x)
+#define ZVK_SHA256_SIG0(X) \
+  (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3))
+
+// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x)
+#define ZVK_SHA256_SIG1(X) \
+  (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \
+  (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 64 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+//
+// SHA-512
+//
+
+// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x)
+#define ZVK_SHA512_SUM0(X) \
+  (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39))
+
+// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x)
+#define ZVK_SHA512_SUM1(X) \
+  (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41))
+
+// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x)
+#define ZVK_SHA512_SIG0(X) \
+  (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7))
+
+// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x)
+#define ZVK_SHA512_SIG1(X) \
+  (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \
+  (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 80 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However, removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+#endif  // RISCV_ZVKNH_EXT_MACROS_H_
-- 
cgit v1.1
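A note for readers, outside the patch proper: the scalar form of the recurrence implemented by ZVK_SHA256_SCHEDULE may make the vsha2ms.vv register layout easier to follow. The sketch below is illustrative only and is not part of the commit; the helper names ror32, sig0, sig1, and sha256_schedule_group are hypothetical, not Spike identifiers.

#include <stdint.h>

/* 32-bit rotate right, mirroring ZVK_ROR32 (assumed 0 < n < 32 here). */
static inline uint32_t ror32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

/* sig0/sig1 from FIPS PUB 180-4, matching ZVK_SHA256_SIG0/SIG1 above. */
static inline uint32_t sig0(uint32_t x) {
  return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}
static inline uint32_t sig1(uint32_t x) {
  return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/*
 * Schedule recurrence: W[t] = sig1(W[t-2]) + W[t-7] + sig0(W[t-15]) + W[t-16].
 * One vsha2ms.vv element group consumes
 *   vd  = {W3, W2, W1, W0},
 *   vs2 = {W11, W10, W9, W4},
 *   vs1 = {W15, W14, W13, W12}   (W13 is unused),
 * and produces vd = {W19, W18, W17, W16}.
 */
static void sha256_schedule_group(const uint32_t w[16], uint32_t out[4]) {
  out[0] = sig1(w[14]) + w[9]  + sig0(w[1]) + w[0];   /* W16 */
  out[1] = sig1(w[15]) + w[10] + sig0(w[2]) + w[1];   /* W17 */
  out[2] = sig1(out[0]) + w[11] + sig0(w[3]) + w[2];  /* W18 */
  out[3] = sig1(out[1]) + w[12] + sig0(w[4]) + w[3];  /* W19 */
}

W18 and W19 chain on the freshly computed W16 and W17, which is why the loop body in vsha2ms_vv.h computes the four schedule words in order rather than independently.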