src/vector-examples.adoc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

[appendix]
== Vector Assembly Code Examples

The following are provided as non-normative text to help explain the vector ISA.

=== Vector-vector add example

----
include::example/vvaddint32.s[lines=4..-1]
----

=== Example with mixed-width mask and compute.

----
# Code using one width for predicate and different width for masked
# compute.
#   int8_t a[]; int32_t b[], c[];
#   for (i=0;  i<n; i++) { b[i] =  (a[i] < 5) ? c[i] : 1; }
#
# Mixed-width code that keeps SEW/LMUL=8
  loop:
    vsetvli a4, a0, e8, m1, ta, ma   # Byte vector for predicate calc
    vle8.v v1, (a1)               # Load a[i]
      add a1, a1, a4              # Bump pointer.
    vmslt.vi v0, v1, 5            # a[i] < 5?

    vsetvli x0, a0, e32, m4, ta, mu  # Vector of 32-bit values.
      sub a0, a0, a4              # Decrement count
    vmv.v.i v4, 1                 # Splat immediate to destination
    vle32.v v4, (a3), v0.t        # Load requested elements of C, others undisturbed
      sll t1, a4, 2
      add a3, a3, t1              # Bump pointer.
    vse32.v v4, (a2)              # Store b[i].
      add a2, a2, t1              # Bump pointer.
      bnez a0, loop               # Any more?
----

=== Memcpy example

----
include::example/memcpy.s[lines=4..-1]
----

=== Conditional example

----
# (int16) z[i] = ((int8) x[i] < 5) ? (int16) a[i] : (int16) b[i];
#

loop:
    vsetvli t0, a0, e8, m1, ta, ma # Use 8b elements.
    vle8.v v0, (a1)         # Get x[i]
      sub a0, a0, t0        # Decrement element count
      add a1, a1, t0        # x[i] Bump pointer
    vmslt.vi v0, v0, 5      # Set mask in v0
    vsetvli x0, x0, e16, m2, ta, mu  # Use 16b elements.
      slli t0, t0, 1        # Multiply by 2 bytes
    vle16.v v2, (a2), v0.t  # z[i] = a[i] case
    vmnot.m v0, v0          # Invert v0
      add a2, a2, t0        # a[i] bump pointer
    vle16.v v2, (a3), v0.t  # z[i] = b[i] case
      add a3, a3, t0        # b[i] bump pointer
    vse16.v v2, (a4)        # Store z
      add a4, a4, t0        # z[i] bump pointer
      bnez a0, loop
----
=== SAXPY example

----
include::example/saxpy.s[lines=4..-1]
----

=== SGEMM example

----
include::example/sgemm.S[lines=4..-1]
----

=== Division approximation example

----
# v1 = v1 / v2 to almost 23 bits of precision.

vfrec7.v v3, v2             # Estimate 1/v2
  li t0, 0x40000000
vmv.v.x v4, t0              # Splat 2.0
vfnmsac.vv v4, v2, v3       # 2.0 - v2 * est(1/v2)
vfmul.vv v3, v3, v4         # Better estimate of 1/v2
vmv.v.x v4, t0              # Splat 2.0
vfnmsac.vv v4, v2, v3       # 2.0 - v2 * est(1/v2)
vfmul.vv v3, v3, v4         # Better estimate of 1/v2
vfmul.vv v1, v1, v3         # Estimate of v1/v2
----

=== Square root approximation example

----
# v1 = sqrt(v1) to almost 23 bits of precision.

  fmv.w.x ft0, x0           # Mask off zero inputs
vmfne.vf v0, v1, ft0        #   to avoid div by zero
vfrsqrt7.v v2, v1, v0.t     # Estimate 1/sqrt(x)
vmfne.vf v0, v2, ft0, v0.t  # Additionally mask off +inf inputs
  li t0, 0x40400000
vmv.v.x v4, t0              # Splat 3.0
vfmul.vv v3, v1, v2, v0.t   # x * est
vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
vfmul.vv v3, v3, v2, v0.t   # est * (-x * est * est + 3)
  li t0, 0x3f000000
  fmv.w.x ft0, t0           # 0.5
vfmul.vf v2, v3, ft0, v0.t  # Estimate to 14 bits
vfmul.vv v3, v1, v2, v0.t   # x * est
vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
vfmul.vv v3, v3, v2, v0.t   # est * (-x * est * est + 3)
vfmul.vf v2, v3, ft0, v0.t  # Estimate to 23 bits
vfmul.vv v1, v2, v1, v0.t   # x * 1/sqrt(x)
----

=== C standard library strcmp example

----
include::example/strcmp.s[lines=4..-1]
----

include::fraclmul.adoc[]