diff options
Diffstat (limited to 'riscv')
-rw-r--r-- | riscv/insns/sm4_common.h | 1 | ||||
-rw-r--r-- | riscv/insns/vsm4k_vi.h | 52 | ||||
-rw-r--r-- | riscv/insns/vsm4r_vs.h | 51 | ||||
-rw-r--r-- | riscv/insns/vsm4r_vv.h | 37 | ||||
-rw-r--r-- | riscv/riscv.mk.in | 6 | ||||
-rw-r--r-- | riscv/zvksed_ext_macros.h | 60 |
6 files changed, 206 insertions, 1 deletions
diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
index 17f129f..24d6ce1 100644
--- a/riscv/insns/sm4_common.h
+++ b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
     0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20,
     0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48
 };
-
diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h
new file mode 100644
index 0000000..8f52e68
--- /dev/null
+++ b/riscv/insns/vsm4k_vi.h
@@ -0,0 +1,52 @@
+// vsm4k.vi vd, vs2, round#
+
+#include "zvksed_ext_macros.h"
+
+// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft.
+static constexpr uint32_t zvksed_ck[32] = {
+  0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+  0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+  0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+  0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+  0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+  0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+  0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+  0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+  {},
+  // The following statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the 'round' variable
+  // declared and defined here to be visible in the loop block.
+  // Only consider the bottom 3 bits of the immediate, ensuring that
+  // 'round' is in the valid range [0, 7].
+  const reg_t round = zimm5 & 0x7;,
+  // Per Element Group body.
+  {
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
+    uint32_t S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);
+
+    B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);
+
+    B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);
+
+    B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h
new file mode 100644
index 0000000..44011eb
--- /dev/null
+++ b/riscv/insns/vsm4r_vs.h
@@ -0,0 +1,51 @@
+// vsm4r.vs vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+// No overlap of vd and vs2.
+require(insn.rd() != insn.rs2());
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
+  const uint32_t rk0 = scalar_key[0];
+  const uint32_t rk1 = scalar_key[1];
+  const uint32_t rk2 = scalar_key[2];
+  const uint32_t rk3 = scalar_key[3];,
+  {
+    EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
+
+    // {x0, x1, x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(state, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h
new file mode 100644
index 0000000..9a18cec
--- /dev/null
+++ b/riscv/insns/vsm4r_vv.h
@@ -0,0 +1,37 @@
+// vsm4r.vv vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+  {},
+  {
+    // vd = {x0, x1, x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(vd, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 2d75662..c774e1b 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1387,12 +1387,18 @@ riscv_insn_ext_zvknh = \
 	vsha2ch_vv \
 	vsha2ms_vv \
 
+riscv_insn_ext_zvksed = \
+	vsm4k_vi \
+	vsm4r_vs \
+	vsm4r_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
 	$(riscv_insn_ext_zvkg) \
 	$(riscv_insn_ext_zvkned) \
 	$(riscv_insn_ext_zvknh) \
+	$(riscv_insn_ext_zvksed) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
new file mode 100644
index 0000000..46e399b
--- /dev/null
+++ b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions to help implement instructions defined as part of
+// the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints common to all vsm4* instructions:
+//  - Zvksed is enabled
+//  - VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+  do { \
+    require_zvksed; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t)
+// provided in "Little Endian" (LE) order, i.e., from least significant (B0)
+// to most significant (B3).
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+  (((uint32_t)(B0)) << 0 | \
+   ((uint32_t)(B1)) << 8 | \
+   ((uint32_t)(B2)) << 16 | \
+   ((uint32_t)(B3)) << 24)
+
+// Get byte BYTE of the SBox.
+#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer value 'X' and a byte index (0 is the least
+// significant byte), returns a uint8_t value for the byte at that index.
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
+
+// Apply the nonlinear transformation tau to a 32 bit word B - section 6.2.1.
+// of the IETF draft.
+#define ZVKSED_SUB_BYTES(B) \
+  ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Perform the linear transformation L to a 32 bit word S and xor it with a 32
+// bit word X - section 6.2.2. of the IETF draft.
+#define ZVKSED_ROUND(X, S) \
+  ((X) ^ \
+   ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+    ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Perform the linear transformation L' to a 32 bit word S and xor it with a 32
+// bit word X - section 6.2.2. of the IETF draft.
+#define ZVKSED_ROUND_KEY(X, S) \
+  ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
+
+#endif  // RISCV_ZVKSED_MACROS_H_