// riscv/insns/vsm3c_vi.h

// vsm3c.vi vd, vs2, rnd

#include "zvksh_ext_macros.h"

// Semantics of vsm3c.vi: for each 8 x 32-bit element group, read the working
// state {H..A} from vd and four message words from vs2, run two consecutive
// compression rounds (round indices 2*zimm5 and 2*zimm5 + 1), and write the
// resulting state back to vd.
//
// NOTE(review): 'require_vsm3_constraints' and the loop macro are defined
// elsewhere (presumably in zvksh_ext_macros.h or a header it pulls in); the
// operand checks and the iteration pattern are not visible here -- confirm
// against those definitions.
require_vsm3_constraints;

// NOTE(review): the three macro arguments look like (1) an extra
// precondition block, empty here, (2) setup code evaluated outside the
// per-element-group body, and (3) the per-element-group body -- confirm
// against the macro definition.
//
// Everything below is a macro argument. The preprocessor splits arguments on
// commas that are not nested inside parentheses (braces do not protect
// them), so the body must not contain such commas; this is why every local
// is declared in its own statement.
VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(
  {},
  // No need to validate or normalize 'zimm5' here as this is a 5 bits value
  // and all values in 0-31 are valid.
  const reg_t round = zimm5;,
  {
    // {H, G, F, E, D, C, B, A} <- vd
    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A);
    // {_, _, w5, w4, _, _, w1, w0} <- vs2
    // Only w0, w1, w4 and w5 are consumed below; the other four words are
    // bound to names tagged UNUSED.
    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2,
                                   UNUSED _unused_w7, UNUSED _unused_w6, w5, w4,
                                   UNUSED _unused_w3, UNUSED _unused_w2, w1, w0);
    const uint32_t x0 = w0 ^ w4;  // W'[0] in spec documentation.
    const uint32_t x1 = w1 ^ w5;  // W'[1]

    // Two rounds of compression.
    // The temporaries are shared by both rounds and reassigned in the second.
    // Their names follow the SS1/SS2/TT1/TT2 intermediates of the SM3 round
    // function, which the statements below mirror step by step.
    uint32_t ss1;
    uint32_t ss2;
    uint32_t tt1;
    uint32_t tt2;
    uint32_t j;

    // --- First round: index j = 2 * round, message inputs w0 and x0. ---
    j = 2 * round;
    ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
    ss2 = ss1 ^ ZVK_ROL32(A, 12);
    tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0;
    tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0;
    // Rotate the state. Statement order matters: each old value is read
    // before the variable holding it is overwritten (C before D = C, B before
    // B = A, and so on). The new A, C, E and G go into fresh constants
    // (A1, C1, E1, G1) instead of overwriting A, C, E and G.
    D = C;
    const uint32_t C1 = ZVK_ROL32(B, 9);
    B = A;
    const uint32_t A1 = tt1;
    H = G;
    const uint32_t G1 = ZVK_ROL32(F, 19);
    F = E;
    const uint32_t E1 = ZVKSH_P0(tt2);

    // --- Second round: index j = 2 * round + 1, message inputs w1 and x1. ---
    // Operates on the state after the first round: A1/C1/E1/G1 together with
    // the updated B, D, F and H.
    j = 2 * round + 1;
    ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
    ss2 = ss1 ^ ZVK_ROL32(A1, 12);
    tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1;
    tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1;
    // Same rotation as above. Here B still holds the original A and F the
    // original E, so C2 and G2 are rotations of those values.
    D = C1;
    const uint32_t C2 = ZVK_ROL32(B, 9);
    B = A1;
    const uint32_t A2 = tt1;
    H = G1;
    const uint32_t G2 = ZVK_ROL32(F, 19);
    F = E1;
    const uint32_t E2 = ZVKSH_P0(tt2);

    // Update the destination register.
    // After the second round H == G1, F == E1, D == C1 and B == A1, so the
    // words written here are the complete state after both rounds, passed in
    // the same argument order as the extraction from vd above.
    SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2);
  }
);