author     Bill Traynor <wmat@riscv.org>   2024-03-19 23:14:38 -0400
committer  GitHub <noreply@github.com>     2024-03-19 23:14:38 -0400
commit     aa5dce0b1ffda7eaa74491156c4b507d2e4d6460 (patch)
tree       70f9de3d76050528c75b82211998a251a9374b19
parent     a4382e9c8e285360a88d8056c1253e1525552393 (diff)
parent     7013a901500bafd72be3a7413eca342bf69b1860 (diff)
Merge pull request #1088 from riscv/vector (riscv-isa-release-aa5dce0-2024-03-20)
-rw-r--r--  src/c-st-ext.adoc                      |    3
-rw-r--r--  src/calling-convention.adoc            |   29
-rw-r--r--  src/example/memcpy.s                   |   17
-rw-r--r--  src/example/saxpy.s                    |   29
-rw-r--r--  src/example/sgemm.S                    |  221
-rw-r--r--  src/example/strcmp.s                   |   34
-rw-r--r--  src/example/strcpy.s                   |   20
-rw-r--r--  src/example/strlen.s                   |   22
-rw-r--r--  src/example/strncpy.s                  |   36
-rw-r--r--  src/example/vvaddint32.s               |   22
-rw-r--r--  src/fraclmul.adoc                      |  174
-rw-r--r--  src/images/wavedrom/v-inst-table.adoc  |  210
-rw-r--r--  src/images/wavedrom/valu-format.adoc   |  104
-rw-r--r--  src/images/wavedrom/vcfg-format.adoc   |   47
-rw-r--r--  src/images/wavedrom/vfrec7.adoc        |  136
-rw-r--r--  src/images/wavedrom/vfrsqrt7.adoc      |  137
-rw-r--r--  src/images/wavedrom/vmem-format.adoc   |  108
-rw-r--r--  src/images/wavedrom/vtype-format.adoc  |   28
-rw-r--r--  src/resources/themes/riscv-spec.yml    |    1
-rw-r--r--  src/riscv-privileged.adoc              |    5
-rw-r--r--  src/riscv-unprivileged.adoc            |   10
-rw-r--r--  src/v-st-ext.adoc                      | 5185
-rw-r--r--  src/vector-examples.adoc               |  125
23 files changed, 6698 insertions(+), 5 deletions(-)
diff --git a/src/c-st-ext.adoc b/src/c-st-ext.adoc
index ca248f6..4cc36cd 100644
--- a/src/c-st-ext.adoc
+++ b/src/c-st-ext.adoc
@@ -306,8 +306,7 @@ These instructions use the CI format.
C.LWSP loads a 32-bit value from memory into register _rd_. It computes
an effective address by adding the _zero_-extended offset, scaled by 4,
to the stack pointer, `x2`. It expands to `lw rd, offset(x2)`. C.LWSP is
-only valid when _rd_&#x2260;x0 the code
-points with _rd_=x0 are reserved.
+only valid when _rd_&#x2260;x0; the code points with _rd_=x0 are reserved.
C.LDSP is an RV64C/RV128C-only instruction that loads a 64-bit value
from memory into register _rd_. It computes its effective address by
diff --git a/src/calling-convention.adoc b/src/calling-convention.adoc
new file mode 100644
index 0000000..f5cb079
--- /dev/null
+++ b/src/calling-convention.adoc
@@ -0,0 +1,29 @@
+[appendix]
+== Calling Convention for Vector State (Not authoritative - Placeholder Only)
+
+NOTE: This Appendix is only a placeholder to help explain the
+conventions used in the code examples, and is not considered frozen or
+part of the ratification process. The official RISC-V psABI document
+is being expanded to specify the vector calling conventions.
+
+In the RISC-V psABI, the vector registers `v0`-`v31` are all caller-saved.
+The `vl` and `vtype` CSRs are also caller-saved.
+
+Procedures may assume that `vstart` is zero upon entry. Procedures may
+assume that `vstart` is zero upon return from a procedure call.
+
+NOTE: Application software should normally not write `vstart` explicitly.
+Any procedure that does explicitly write `vstart` to a nonzero value must
+zero `vstart` before either returning or calling another procedure.
+
+The `vxrm` and `vxsat` fields of `vcsr` have thread storage duration.
+
+Executing a system call causes all caller-saved vector registers
+(`v0`-`v31`, `vl`, `vtype`) and `vstart` to become unspecified.
+
+NOTE: This scheme allows system calls that cause context switches to avoid
+saving and later restoring the vector registers.
+
+NOTE: Most OSes will choose to either leave these registers intact or reset
+them to their initial state to avoid leaking information across process
+boundaries.
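+
+The following non-normative sketch shows what these conventions imply for a
+caller that needs vector values across a call: because `v0`-`v31`, `vl`, and
+`vtype` are all caller-saved, the caller must spill and re-establish them
+itself. The callee name `process`, the single spilled register, and the
+`e32, m1` setting are illustrative only.
+
+----
+    csrr    t0, vlenb                 # Bytes per vector register
+    sub     sp, sp, t0                # (stack alignment handling omitted)
+    vs1r.v  v8, (sp)                  # Spill one vector register (whole-register store)
+    csrr    s1, vl                    # Remember vl (s1 assumed saved elsewhere)
+    call    process                   # May clobber v0-v31, vl, vtype, and t-registers
+    vl1r.v  v8, (sp)                  # Reload the spilled vector register
+    csrr    t0, vlenb                 # vlenb is constant, so re-read it after the call
+    add     sp, sp, t0
+    vsetvli x0, s1, e32, m1, ta, ma   # Re-establish vl and vtype for the caller's loop
+----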
diff --git a/src/example/memcpy.s b/src/example/memcpy.s
new file mode 100644
index 0000000..5f6318a
--- /dev/null
+++ b/src/example/memcpy.s
@@ -0,0 +1,17 @@
+ .text
+ .balign 4
+ .global memcpy
+ # void *memcpy(void* dest, const void* src, size_t n)
+ # a0=dest, a1=src, a2=n
+ #
+ memcpy:
+ mv a3, a0 # Copy destination
+ loop:
+ vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
+ vle8.v v0, (a1) # Load bytes
+ add a1, a1, t0 # Bump pointer
+ sub a2, a2, t0 # Decrement count
+ vse8.v v0, (a3) # Store bytes
+ add a3, a3, t0 # Bump pointer
+ bnez a2, loop # Any more?
+ ret # Return
diff --git a/src/example/saxpy.s b/src/example/saxpy.s
new file mode 100644
index 0000000..de7f224
--- /dev/null
+++ b/src/example/saxpy.s
@@ -0,0 +1,29 @@
+ .text
+ .balign 4
+ .global saxpy
+# void
+# saxpy(size_t n, const float a, const float *x, float *y)
+# {
+# size_t i;
+# for (i=0; i<n; i++)
+# y[i] = a * x[i] + y[i];
+# }
+#
+# register arguments:
+# a0 n
+# fa0 a
+# a1 x
+# a2 y
+
+saxpy:
+ vsetvli a4, a0, e32, m8, ta, ma
+ vle32.v v0, (a1)
+ sub a0, a0, a4
+ slli a4, a4, 2
+ add a1, a1, a4
+ vle32.v v8, (a2)
+ vfmacc.vf v8, fa0, v0
+ vse32.v v8, (a2)
+ add a2, a2, a4
+ bnez a0, saxpy
+ ret
diff --git a/src/example/sgemm.S b/src/example/sgemm.S
new file mode 100644
index 0000000..e29cc8d
--- /dev/null
+++ b/src/example/sgemm.S
@@ -0,0 +1,221 @@
+ .text
+ .balign 4
+ .global sgemm_nn
+# RV64IDV system
+#
+# void
+# sgemm_nn(size_t n,
+# size_t m,
+# size_t k,
+# const float*a, // m * k matrix
+# size_t lda,
+# const float*b, // k * n matrix
+# size_t ldb,
+# float*c, // m * n matrix
+# size_t ldc)
+#
+# c += a*b (alpha=1, no transpose on input matrices)
+# matrices stored in C row-major order
+
+#define n a0
+#define m a1
+#define k a2
+#define ap a3
+#define astride a4
+#define bp a5
+#define bstride a6
+#define cp a7
+#define cstride t0
+#define kt t1
+#define nt t2
+#define bnp t3
+#define cnp t4
+#define akp t5
+#define bkp s0
+#define nvl s1
+#define ccp s2
+#define amp s3
+
+# Use args as additional temporaries
+#define ft12 fa0
+#define ft13 fa1
+#define ft14 fa2
+#define ft15 fa3
+
+# This version holds a 16*VLMAX block of C matrix in vector registers
+# in the inner loop, but otherwise does no cache or TLB tiling.
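+#
+# Note: FRAMESIZE and the OFFSET(sp) save/restore slots below are symbolic
+# placeholders; a complete version would define FRAMESIZE and use distinct
+# stack offsets for s0-s2 and for the ninth argument (ldc) read from the
+# caller's frame.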
+
+sgemm_nn:
+ addi sp, sp, -FRAMESIZE
+ sd s0, OFFSET(sp)
+ sd s1, OFFSET(sp)
+ sd s2, OFFSET(sp)
+
+ # Check for zero size matrices
+ beqz n, exit
+ beqz m, exit
+ beqz k, exit
+
+ # Convert elements strides to byte strides.
+ ld cstride, OFFSET(sp) # Get arg from stack frame
+ slli astride, astride, 2
+ slli bstride, bstride, 2
+ slli cstride, cstride, 2
+
+ slti t6, m, 16
+ bnez t6, end_rows
+
+c_row_loop: # Loop across rows of C blocks
+
+ mv nt, n # Initialize n counter for next row of C blocks
+
+ mv bnp, bp # Initialize B n-loop pointer to start
+ mv cnp, cp # Initialize C n-loop pointer
+
+c_col_loop: # Loop across one row of C blocks
+ vsetvli nvl, nt, e32, ta, ma # 32-bit vectors, LMUL=1
+
+ mv akp, ap # reset pointer into A to beginning
+ mv bkp, bnp # step to next column in B matrix
+
+    # Initialize current C submatrix block from memory.
+ vle32.v v0, (cnp); add ccp, cnp, cstride;
+ vle32.v v1, (ccp); add ccp, ccp, cstride;
+ vle32.v v2, (ccp); add ccp, ccp, cstride;
+ vle32.v v3, (ccp); add ccp, ccp, cstride;
+ vle32.v v4, (ccp); add ccp, ccp, cstride;
+ vle32.v v5, (ccp); add ccp, ccp, cstride;
+ vle32.v v6, (ccp); add ccp, ccp, cstride;
+ vle32.v v7, (ccp); add ccp, ccp, cstride;
+ vle32.v v8, (ccp); add ccp, ccp, cstride;
+ vle32.v v9, (ccp); add ccp, ccp, cstride;
+ vle32.v v10, (ccp); add ccp, ccp, cstride;
+ vle32.v v11, (ccp); add ccp, ccp, cstride;
+ vle32.v v12, (ccp); add ccp, ccp, cstride;
+ vle32.v v13, (ccp); add ccp, ccp, cstride;
+ vle32.v v14, (ccp); add ccp, ccp, cstride;
+ vle32.v v15, (ccp)
+
+
+ mv kt, k # Initialize inner loop counter
+
+ # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
+ # Software pipeline loads
+ flw ft0, (akp); add amp, akp, astride;
+ flw ft1, (amp); add amp, amp, astride;
+ flw ft2, (amp); add amp, amp, astride;
+ flw ft3, (amp); add amp, amp, astride;
+ # Get vector from B matrix
+ vle32.v v16, (bkp)
+
+ # Loop on inner dimension for current C block
+ k_loop:
+ vfmacc.vf v0, ft0, v16
+ add bkp, bkp, bstride
+ flw ft4, (amp)
+ add amp, amp, astride
+ vfmacc.vf v1, ft1, v16
+ addi kt, kt, -1 # Decrement k counter
+ flw ft5, (amp)
+ add amp, amp, astride
+ vfmacc.vf v2, ft2, v16
+ flw ft6, (amp)
+ add amp, amp, astride
+ flw ft7, (amp)
+ vfmacc.vf v3, ft3, v16
+ add amp, amp, astride
+ flw ft8, (amp)
+ add amp, amp, astride
+ vfmacc.vf v4, ft4, v16
+ flw ft9, (amp)
+ add amp, amp, astride
+ vfmacc.vf v5, ft5, v16
+ flw ft10, (amp)
+ add amp, amp, astride
+ vfmacc.vf v6, ft6, v16
+ flw ft11, (amp)
+ add amp, amp, astride
+ vfmacc.vf v7, ft7, v16
+ flw ft12, (amp)
+ add amp, amp, astride
+ vfmacc.vf v8, ft8, v16
+ flw ft13, (amp)
+ add amp, amp, astride
+ vfmacc.vf v9, ft9, v16
+ flw ft14, (amp)
+ add amp, amp, astride
+ vfmacc.vf v10, ft10, v16
+ flw ft15, (amp)
+ add amp, amp, astride
+ addi akp, akp, 4 # Move to next column of a
+ vfmacc.vf v11, ft11, v16
+ beqz kt, 1f # Don't load past end of matrix
+ flw ft0, (akp)
+ add amp, akp, astride
+1: vfmacc.vf v12, ft12, v16
+ beqz kt, 1f
+ flw ft1, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v13, ft13, v16
+ beqz kt, 1f
+ flw ft2, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v14, ft14, v16
+ beqz kt, 1f # Exit out of loop
+ flw ft3, (amp)
+ add amp, amp, astride
+ vfmacc.vf v15, ft15, v16
+ vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
+ j k_loop
+
+1: vfmacc.vf v15, ft15, v16
+
+ # Save C matrix block back to memory
+ vse32.v v0, (cnp); add ccp, cnp, cstride;
+ vse32.v v1, (ccp); add ccp, ccp, cstride;
+ vse32.v v2, (ccp); add ccp, ccp, cstride;
+ vse32.v v3, (ccp); add ccp, ccp, cstride;
+ vse32.v v4, (ccp); add ccp, ccp, cstride;
+ vse32.v v5, (ccp); add ccp, ccp, cstride;
+ vse32.v v6, (ccp); add ccp, ccp, cstride;
+ vse32.v v7, (ccp); add ccp, ccp, cstride;
+ vse32.v v8, (ccp); add ccp, ccp, cstride;
+ vse32.v v9, (ccp); add ccp, ccp, cstride;
+ vse32.v v10, (ccp); add ccp, ccp, cstride;
+ vse32.v v11, (ccp); add ccp, ccp, cstride;
+ vse32.v v12, (ccp); add ccp, ccp, cstride;
+ vse32.v v13, (ccp); add ccp, ccp, cstride;
+ vse32.v v14, (ccp); add ccp, ccp, cstride;
+ vse32.v v15, (ccp)
+
+ # Following tail instructions should be scheduled earlier in free slots during C block save.
+ # Leaving here for clarity.
+
+ # Bump pointers for loop across blocks in one row
+ slli t6, nvl, 2
+ add cnp, cnp, t6 # Move C block pointer over
+ add bnp, bnp, t6 # Move B block pointer over
+ sub nt, nt, nvl # Decrement element count in n dimension
+ bnez nt, c_col_loop # Any more to do?
+
+ # Move to next set of rows
+ addi m, m, -16 # Did 16 rows above
+ slli t6, astride, 4 # Multiply astride by 16
+ add ap, ap, t6 # Move A matrix pointer down 16 rows
+ slli t6, cstride, 4 # Multiply cstride by 16
+ add cp, cp, t6 # Move C matrix pointer down 16 rows
+
+ slti t6, m, 16
+ beqz t6, c_row_loop
+
+ # Handle end of matrix with fewer than 16 rows.
+ # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
+end_rows:
+ # Not done.
+
+exit:
+ ld s0, OFFSET(sp)
+ ld s1, OFFSET(sp)
+ ld s2, OFFSET(sp)
+ addi sp, sp, FRAMESIZE
+ ret
diff --git a/src/example/strcmp.s b/src/example/strcmp.s
new file mode 100644
index 0000000..c657703
--- /dev/null
+++ b/src/example/strcmp.s
@@ -0,0 +1,34 @@
+ .text
+ .balign 4
+ .global strcmp
+ # int strcmp(const char *src1, const char* src2)
+strcmp:
+ ## Using LMUL=2, but same register names work for larger LMULs
+ li t1, 0 # Initial pointer bump
+loop:
+ vsetvli t0, x0, e8, m2, ta, ma # Max length vectors of bytes
+ add a0, a0, t1 # Bump src1 pointer
+ vle8ff.v v8, (a0) # Get src1 bytes
+ add a1, a1, t1 # Bump src2 pointer
+ vle8ff.v v16, (a1) # Get src2 bytes
+
+ vmseq.vi v0, v8, 0 # Flag zero bytes in src1
+ vmsne.vv v1, v8, v16 # Flag if src1 != src2
+ vmor.mm v0, v0, v1 # Combine exit conditions
+
+ vfirst.m a2, v0 # ==0 or != ?
+ csrr t1, vl # Get number of bytes fetched
+
+ bltz a2, loop # Loop if all same and no zero byte
+
+ add a0, a0, a2 # Get src1 element address
+ lbu a3, (a0) # Get src1 byte from memory
+
+ add a1, a1, a2 # Get src2 element address
+ lbu a4, (a1) # Get src2 byte from memory
+
+ sub a0, a3, a4 # Return value.
+
+ ret
+
+
diff --git a/src/example/strcpy.s b/src/example/strcpy.s
new file mode 100644
index 0000000..109112d
--- /dev/null
+++ b/src/example/strcpy.s
@@ -0,0 +1,20 @@
+ .text
+ .balign 4
+ .global strcpy
+ # char* strcpy(char *dst, const char* src)
+strcpy:
+ mv a2, a0 # Copy dst
+ li t0, -1 # Infinite AVL
+loop:
+ vsetvli x0, t0, e8, m8, ta, ma # Max length vectors of bytes
+ vle8ff.v v8, (a1) # Get src bytes
+ csrr t1, vl # Get number of bytes fetched
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ vfirst.m a3, v1 # Zero found?
+ add a1, a1, t1 # Bump pointer
+ vmsif.m v0, v1 # Set mask up to and including zero byte.
+ vse8.v v8, (a2), v0.t # Write out bytes
+ add a2, a2, t1 # Bump pointer
+ bltz a3, loop # Zero byte not found, so loop
+
+ ret
diff --git a/src/example/strlen.s b/src/example/strlen.s
new file mode 100644
index 0000000..1c3af4b
--- /dev/null
+++ b/src/example/strlen.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global strlen
+# size_t strlen(const char *str)
+# a0 holds *str
+
+strlen:
+ mv a3, a0 # Save start
+loop:
+ vsetvli a1, x0, e8, m8, ta, ma # Vector of bytes of maximum length
+ vle8ff.v v8, (a3) # Load bytes
+ csrr a1, vl # Get bytes read
+ vmseq.vi v0, v8, 0 # Set v0[i] where v8[i] = 0
+ vfirst.m a2, v0 # Find first set bit
+ add a3, a3, a1 # Bump pointer
+ bltz a2, loop # Not found?
+
+ add a0, a0, a1 # Sum start + bump
+ add a3, a3, a2 # Add index
+ sub a0, a3, a0 # Subtract start address+bump
+
+ ret
diff --git a/src/example/strncpy.s b/src/example/strncpy.s
new file mode 100644
index 0000000..87e5410
--- /dev/null
+++ b/src/example/strncpy.s
@@ -0,0 +1,36 @@
+ .text
+ .balign 4
+ .global strncpy
+ # char* strncpy(char *dst, const char* src, size_t n)
+strncpy:
+ mv a3, a0 # Copy dst
+loop:
+ vsetvli x0, a2, e8, m8, ta, ma # Vectors of bytes.
+ vle8ff.v v8, (a1) # Get src bytes
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ csrr t1, vl # Get number of bytes fetched
+ vfirst.m a4, v1 # Zero found?
+ vmsbf.m v0, v1 # Set mask up to before zero byte.
+ vse8.v v8, (a3), v0.t # Write out non-zero bytes
+ bgez a4, zero_tail # Zero remaining bytes.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump dest pointer
+ add a1, a1, t1 # Bump src pointer
+ bnez a2, loop # Anymore?
+
+ ret
+
+zero_tail:
+ sub a2, a2, a4 # Subtract count on non-zero bytes.
+ add a3, a3, a4 # Advance past non-zero bytes.
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ vmv.v.i v0, 0 # Splat zero.
+
+zero_loop:
+ vse8.v v0, (a3) # Store zero.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump pointer
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ bnez a2, zero_loop # Anymore?
+
+ ret
diff --git a/src/example/vvaddint32.s b/src/example/vvaddint32.s
new file mode 100644
index 0000000..22305d9
--- /dev/null
+++ b/src/example/vvaddint32.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global vvaddint32
+ # vector-vector add routine of 32-bit integers
+ # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+ # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+ #
+ # a0 = n, a1 = x, a2 = y, a3 = z
+ # Non-vector instructions are indented
+vvaddint32:
+ vsetvli t0, a0, e32, ta, ma # Set vector length based on 32-bit vectors
+ vle32.v v0, (a1) # Get first vector
+ sub a0, a0, t0 # Decrement number done
+ slli t0, t0, 2 # Multiply number done by 4 bytes
+ add a1, a1, t0 # Bump pointer
+ vle32.v v1, (a2) # Get second vector
+ add a2, a2, t0 # Bump pointer
+ vadd.vv v2, v0, v1 # Sum vectors
+ vse32.v v2, (a3) # Store result
+ add a3, a3, t0 # Bump pointer
+ bnez a0, vvaddint32 # Loop back
+ ret # Finished
diff --git a/src/fraclmul.adoc b/src/fraclmul.adoc
new file mode 100644
index 0000000..6f12f58
--- /dev/null
+++ b/src/fraclmul.adoc
@@ -0,0 +1,174 @@
+=== Fractional LMUL example
+
+This appendix presents a non-normative example to help explain where
+compilers can make good use of the fractional LMUL feature.
+
+Consider the following (admittedly contrived) loop written in C:
+
+----
+void add_ref(long N,
+ signed char *restrict c_c, signed char *restrict c_a, signed char *restrict c_b,
+ long *restrict l_c, long *restrict l_a, long *restrict l_b,
+ long *restrict l_d, long *restrict l_e, long *restrict l_f,
+ long *restrict l_g, long *restrict l_h, long *restrict l_i,
+ long *restrict l_j, long *restrict l_k, long *restrict l_l,
+ long *restrict l_m) {
+ long i;
+ for (i = 0; i < N; i++) {
+ c_c[i] = c_a[i] + c_b[i]; // Note this 'char' addition that creates a mixed type situation
+ l_c[i] = l_a[i] + l_b[i];
+ l_f[i] = l_d[i] + l_e[i];
+ l_i[i] = l_g[i] + l_h[i];
+ l_l[i] = l_k[i] + l_j[i];
+ l_m[i] += l_m[i] + l_c[i] + l_f[i] + l_i[i] + l_l[i];
+ }
+}
+----
+
+The example loop has high register pressure due to the many input variables
+and temporaries required. The compiler recognizes two data types within
+the loop: an 8-bit 'char' and a 64-bit 'long'. Without fractional LMUL, the
+compiler would be forced to use LMUL=1 for the 8-bit computation and LMUL=8 for
+the 64-bit computations, to keep an equal number of elements across all
+computations within the same loop iteration. Under LMUL=8, only 4 vector
+register groups are available to the register allocator. Given the large
+number of 64-bit variables and temporaries required in this loop, the compiler
+ends up generating a lot of spill code. The code below demonstrates this effect:
+
+----
+.LBB0_4: # %vector.body
+ # =>This Inner Loop Header: Depth=1
+ add s9, a2, s6
+ vsetvli s1, zero, e8,m1,ta,mu
+ vle8.v v25, (s9)
+ add s1, a3, s6
+ vle8.v v26, (s1)
+ vadd.vv v25, v26, v25
+ add s1, a1, s6
+ vse8.v v25, (s1)
+ add s9, a5, s10
+ vsetvli s1, zero, e64,m8,ta,mu
+ vle64.v v8, (s9)
+ add s1, a6, s10
+ vle64.v v16, (s1)
+ add s1, a7, s10
+ vle64.v v24, (s1)
+ add s1, s3, s10
+ vle64.v v0, (s1)
+ sd a0, -112(s0)
+ ld a0, -128(s0)
+ vs8r.v v0, (a0) # Spill LMUL=8
+ add s9, t6, s10
+ add s11, t5, s10
+ add ra, t2, s10
+ add s1, t3, s10
+ vle64.v v0, (s9)
+ ld s9, -136(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (s11)
+ ld s9, -144(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (ra)
+ ld s9, -160(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (s1)
+ ld s1, -152(s0)
+ vs8r.v v0, (s1) # Spill LMUL=8
+ vadd.vv v16, v16, v8
+ ld s1, -128(s0)
+ vl8r.v v8, (s1) # Reload LMUL=8
+ vadd.vv v8, v8, v24
+ ld s1, -136(s0)
+ vl8r.v v24, (s1) # Reload LMUL=8
+ ld s1, -144(s0)
+ vl8r.v v0, (s1) # Reload LMUL=8
+ vadd.vv v24, v0, v24
+ ld s1, -128(s0)
+ vs8r.v v24, (s1) # Spill LMUL=8
+ ld s1, -152(s0)
+ vl8r.v v0, (s1) # Reload LMUL=8
+ ld s1, -160(s0)
+ vl8r.v v24, (s1) # Reload LMUL=8
+ vadd.vv v0, v0, v24
+ add s1, a4, s10
+ vse64.v v16, (s1)
+ add s1, s2, s10
+ vse64.v v8, (s1)
+ vadd.vv v8, v8, v16
+ add s1, t4, s10
+ ld s9, -128(s0)
+ vl8r.v v16, (s9) # Reload LMUL=8
+ vse64.v v16, (s1)
+ add s9, t0, s10
+ vadd.vv v8, v8, v16
+ vle64.v v16, (s9)
+ add s1, t1, s10
+ vse64.v v0, (s1)
+ vadd.vv v8, v8, v0
+ vsll.vi v16, v16, 1
+ vadd.vv v8, v8, v16
+ vse64.v v8, (s9)
+ add s6, s6, s7
+ add s10, s10, s8
+ bne s6, s4, .LBB0_4
+----
+
+If instead of using LMUL=1 for the 8-bit computation, the compiler is allowed
+to use a fractional LMUL=1/2, then the 64-bit computations can be performed
+using LMUL=4 (note that the same ratio of 64-bit elements and 8-bit elements is
+preserved as in the previous example). Now the compiler has 8 available
+registers to perform register allocation, resulting in no spill code, as
+shown in the loop below:
+
+----
+.LBB0_4: # %vector.body
+ # =>This Inner Loop Header: Depth=1
+ add s9, a2, s6
+ vsetvli s1, zero, e8,mf2,ta,mu // LMUL=1/2 !
+ vle8.v v25, (s9)
+ add s1, a3, s6
+ vle8.v v26, (s1)
+ vadd.vv v25, v26, v25
+ add s1, a1, s6
+ vse8.v v25, (s1)
+ add s9, a5, s10
+ vsetvli s1, zero, e64,m4,ta,mu // LMUL=4
+ vle64.v v28, (s9)
+ add s1, a6, s10
+ vle64.v v8, (s1)
+ vadd.vv v28, v8, v28
+ add s1, a7, s10
+ vle64.v v8, (s1)
+ add s1, s3, s10
+ vle64.v v12, (s1)
+ add s1, t6, s10
+ vle64.v v16, (s1)
+ add s1, t5, s10
+ vle64.v v20, (s1)
+ add s1, a4, s10
+ vse64.v v28, (s1)
+ vadd.vv v8, v12, v8
+ vadd.vv v12, v20, v16
+ add s1, t2, s10
+ vle64.v v16, (s1)
+ add s1, t3, s10
+ vle64.v v20, (s1)
+ add s1, s2, s10
+ vse64.v v8, (s1)
+ add s9, t4, s10
+ vadd.vv v16, v20, v16
+ add s11, t0, s10
+ vle64.v v20, (s11)
+ vse64.v v12, (s9)
+ add s1, t1, s10
+ vse64.v v16, (s1)
+ vsll.vi v20, v20, 1
+ vadd.vv v28, v8, v28
+ vadd.vv v28, v28, v12
+ vadd.vv v28, v28, v16
+ vadd.vv v28, v28, v20
+ vse64.v v28, (s11)
+ add s6, s6, s7
+ add s10, s10, s8
+ bne s6, s4, .LBB0_4
+----
diff --git a/src/images/wavedrom/v-inst-table.adoc b/src/images/wavedrom/v-inst-table.adoc
new file mode 100644
index 0000000..0c02220
--- /dev/null
+++ b/src/images/wavedrom/v-inst-table.adoc
@@ -0,0 +1,210 @@
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+[cols="<,<,<,<,<,<,<,<,<,<,<,<,<",options="headers"]
+|===
+5+| Integer 4+| Integer 4+| FP
+
+| funct3 | | | | | funct3 | | | | funct3 | | |
+| OPIVV |V| | | | OPMVV{nbsp} |V| | | OPFVV |V| |
+| OPIVX | |X| | | OPMVX{nbsp} | |X| | OPFVF | |F|
+| OPIVI | | |I| | | | | | | | |
+|===
+
+[cols="<,<,<,<,<,<,<,<,<,<,<,<,<",options="headers"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 000000 |V|X|I| vadd | 000000 |V| | vredsum | 000000 |V|F| vfadd
+| 000001 | | | | | 000001 |V| | vredand | 000001 |V| | vfredusum
+| 000010 |V|X| | vsub | 000010 |V| | vredor | 000010 |V|F| vfsub
+| 000011 | |X|I| vrsub | 000011 |V| | vredxor | 000011 |V| | vfredosum
+| 000100 |V|X| | vminu | 000100 |V| | vredminu | 000100 |V|F| vfmin
+| 000101 |V|X| | vmin | 000101 |V| | vredmin | 000101 |V| | vfredmin
+| 000110 |V|X| | vmaxu | 000110 |V| | vredmaxu | 000110 |V|F| vfmax
+| 000111 |V|X| | vmax | 000111 |V| | vredmax | 000111 |V| | vfredmax
+| 001000 | | | | | 001000 |V|X| vaaddu | 001000 |V|F| vfsgnj
+| 001001 |V|X|I| vand | 001001 |V|X| vaadd | 001001 |V|F| vfsgnjn
+| 001010 |V|X|I| vor | 001010 |V|X| vasubu | 001010 |V|F| vfsgnjx
+| 001011 |V|X|I| vxor | 001011 |V|X| vasub | 001011 | | |
+| 001100 |V|X|I| vrgather | 001100 | | | | 001100 | | |
+| 001101 | | | | | 001101 | | | | 001101 | | |
+| 001110 | |X|I| vslideup | 001110 | |X| vslide1up | 001110 | |F| vfslide1up
+| 001110 |V| | |vrgatherei16| | | | | | | |
+| 001111 | |X|I| vslidedown | 001111 | |X| vslide1down | 001111 | |F| vfslide1down
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 010000 |V|X|I| vadc | 010000 |V| | VWXUNARY0 | 010000 |V| | VWFUNARY0
+| | | | | | 010000 | |X| VRXUNARY0 | 010000 | |F| VRFUNARY0
+| 010001 |V|X|I| vmadc | 010001 | | | | 010001 | | |
+| 010010 |V|X| | vsbc | 010010 |V| | VXUNARY0 | 010010 |V| | VFUNARY0
+| 010011 |V|X| | vmsbc | 010011 | | | | 010011 |V| | VFUNARY1
+| 010100 | | | | | 010100 |V| | VMUNARY0 | 010100 | | |
+| 010101 | | | | | 010101 | | | | 010101 | | |
+| 010110 | | | | | 010110 | | | | 010110 | | |
+| 010111 |V|X|I| vmerge/vmv | 010111 |V| | vcompress | 010111 | |F| vfmerge/vfmv
+| 011000 |V|X|I| vmseq | 011000 |V| | vmandn | 011000 |V|F| vmfeq
+| 011001 |V|X|I| vmsne | 011001 |V| | vmand | 011001 |V|F| vmfle
+| 011010 |V|X| | vmsltu | 011010 |V| | vmor | 011010 | | |
+| 011011 |V|X| | vmslt | 011011 |V| | vmxor | 011011 |V|F| vmflt
+| 011100 |V|X|I| vmsleu | 011100 |V| | vmorn | 011100 |V|F| vmfne
+| 011101 |V|X|I| vmsle | 011101 |V| | vmnand | 011101 | |F| vmfgt
+| 011110 | |X|I| vmsgtu | 011110 |V| | vmnor | 011110 | | |
+| 011111 | |X|I| vmsgt | 011111 |V| | vmxnor | 011111 | |F| vmfge
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 100000 |V|X|I| vsaddu | 100000 |V|X| vdivu | 100000 |V|F| vfdiv
+| 100001 |V|X|I| vsadd | 100001 |V|X| vdiv | 100001 | |F| vfrdiv
+| 100010 |V|X| | vssubu | 100010 |V|X| vremu | 100010 | | |
+| 100011 |V|X| | vssub | 100011 |V|X| vrem | 100011 | | |
+| 100100 | | | | | 100100 |V|X| vmulhu | 100100 |V|F| vfmul
+| 100101 |V|X|I| vsll | 100101 |V|X| vmul | 100101 | | |
+| 100110 | | | | | 100110 |V|X| vmulhsu | 100110 | | |
+| 100111 |V|X| | vsmul | 100111 |V|X| vmulh | 100111 | |F| vfrsub
+| 100111 | | |I| vmv<nr>r | | | | | | | |
+| 101000 |V|X|I| vsrl | 101000 | | | | 101000 |V|F| vfmadd
+| 101001 |V|X|I| vsra | 101001 |V|X| vmadd | 101001 |V|F| vfnmadd
+| 101010 |V|X|I| vssrl | 101010 | | | | 101010 |V|F| vfmsub
+| 101011 |V|X|I| vssra | 101011 |V|X| vnmsub | 101011 |V|F| vfnmsub
+| 101100 |V|X|I| vnsrl | 101100 | | | | 101100 |V|F| vfmacc
+| 101101 |V|X|I| vnsra | 101101 |V|X| vmacc | 101101 |V|F| vfnmacc
+| 101110 |V|X|I| vnclipu | 101110 | | | | 101110 |V|F| vfmsac
+| 101111 |V|X|I| vnclip | 101111 |V|X| vnmsac | 101111 |V|F| vfnmsac
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 110000 |V| | | vwredsumu | 110000 |V|X| vwaddu | 110000 |V|F| vfwadd
+| 110001 |V| | | vwredsum | 110001 |V|X| vwadd | 110001 |V| | vfwredusum
+| 110010 | | | | | 110010 |V|X| vwsubu | 110010 |V|F| vfwsub
+| 110011 | | | | | 110011 |V|X| vwsub | 110011 |V| | vfwredosum
+| 110100 | | | | | 110100 |V|X| vwaddu.w | 110100 |V|F| vfwadd.w
+| 110101 | | | | | 110101 |V|X| vwadd.w | 110101 | | |
+| 110110 | | | | | 110110 |V|X| vwsubu.w | 110110 |V|F| vfwsub.w
+| 110111 | | | | | 110111 |V|X| vwsub.w | 110111 | | |
+| 111000 | | | | | 111000 |V|X| vwmulu | 111000 |V|F| vfwmul
+| 111001 | | | | | 111001 | | | | 111001 | | |
+| 111010 | | | | | 111010 |V|X| vwmulsu | 111010 | | |
+| 111011 | | | | | 111011 |V|X| vwmul | 111011 | | |
+| 111100 | | | | | 111100 |V|X| vwmaccu | 111100 |V|F| vfwmacc
+| 111101 | | | | | 111101 |V|X| vwmacc | 111101 |V|F| vfwnmacc
+| 111110 | | | | | 111110 | |X| vwmaccus | 111110 |V|F| vfwmsac
+| 111111 | | | | | 111111 |V|X| vwmaccsu | 111111 |V|F| vfwnmsac
+|===
+
+<<<
+
+.VRXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs2 |
+
+| 00000 | vmv.s.x
+|===
+
+.VWXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00000 | vmv.x.s
+| 10000 | vcpop
+| 10001 | vfirst
+|===
+
+.VXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00010 | vzext.vf8
+| 00011 | vsext.vf8
+| 00100 | vzext.vf4
+| 00101 | vsext.vf4
+| 00110 | vzext.vf2
+| 00111 | vsext.vf2
+|===
+
+.VRFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs2 |
+
+| 00000 | vfmv.s.f
+|===
+
+.VWFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00000 | vfmv.f.s
+|===
+
+.VFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 | name
+
+2+| single-width converts
+| 00000 | vfcvt.xu.f.v
+| 00001 | vfcvt.x.f.v
+| 00010 | vfcvt.f.xu.v
+| 00011 | vfcvt.f.x.v
+| 00110 | vfcvt.rtz.xu.f.v
+| 00111 | vfcvt.rtz.x.f.v
+| |
+2+| widening converts
+| 01000 | vfwcvt.xu.f.v
+| 01001 | vfwcvt.x.f.v
+| 01010 | vfwcvt.f.xu.v
+| 01011 | vfwcvt.f.x.v
+| 01100 | vfwcvt.f.f.v
+| 01110 | vfwcvt.rtz.xu.f.v
+| 01111 | vfwcvt.rtz.x.f.v
+| |
+2+| narrowing converts
+| 10000 | vfncvt.xu.f.w
+| 10001 | vfncvt.x.f.w
+| 10010 | vfncvt.f.xu.w
+| 10011 | vfncvt.f.x.w
+| 10100 | vfncvt.f.f.w
+| 10101 | vfncvt.rod.f.f.w
+| 10110 | vfncvt.rtz.xu.f.w
+| 10111 | vfncvt.rtz.x.f.w
+|===
+
+.VFUNARY1 encoding space
+[cols="2,14"]
+|===
+| vs1 | name
+
+| 00000 | vfsqrt.v
+| 00100 | vfrsqrt7.v
+| 00101 | vfrec7.v
+| 10000 | vfclass.v
+|===
+
+
+.VMUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00001 | vmsbf
+| 00010 | vmsof
+| 00011 | vmsif
+| 10000 | viota
+| 10001 | vid
+|===
+
+
diff --git a/src/images/wavedrom/valu-format.adoc b/src/images/wavedrom/valu-format.adoc
new file mode 100644
index 0000000..cdd3447
--- /dev/null
+++ b/src/images/wavedrom/valu-format.adoc
@@ -0,0 +1,104 @@
+Formats for Vector Arithmetic Instructions under OP-V major opcode
+
+////
+31 26 25 24 20 19 15 14 12 11 7 6 0
+ funct6 | vm | vs2 | vs1 | 0 0 0 | vd |1010111| OP-V (OPIVV)
+ funct6 | vm | vs2 | vs1 | 0 0 1 | vd/rd |1010111| OP-V (OPFVV)
+ funct6 | vm | vs2 | vs1 | 0 1 0 | vd/rd |1010111| OP-V (OPMVV)
+ funct6 | vm | vs2 | imm[4:0] | 0 1 1 | vd |1010111| OP-V (OPIVI)
+ funct6 | vm | vs2 | rs1 | 1 0 0 | vd |1010111| OP-V (OPIVX)
+ funct6 | vm | vs2 | rs1 | 1 0 1 | vd |1010111| OP-V (OPFVF)
+ funct6 | vm | vs2 | rs1 | 1 1 0 | vd/rd |1010111| OP-V (OPMVX)
+ 6 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPIVV'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 0},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPFVV'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 1},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPMVV'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 2},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: ['OPIVI']},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 3},
+ {bits: 5, name: 'imm[4:0]', type: 5},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPIVX'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 4},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPFVF'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 5},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPMVX'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 6},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
diff --git a/src/images/wavedrom/vcfg-format.adoc b/src/images/wavedrom/vcfg-format.adoc
new file mode 100644
index 0000000..ac0353c
--- /dev/null
+++ b/src/images/wavedrom/vcfg-format.adoc
@@ -0,0 +1,47 @@
+Formats for Vector Configuration Instructions under OP-V major opcode
+
+////
+ 31 30 25 24 20 19 15 14 12 11 7 6 0
+ 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli
+ 1 | 1| zimm[ 9:0] | uimm[4:0]| 1 1 1 | rd |1010111| vsetivli
+ 1 | 000000 | rs2 | rs1 | 1 1 1 | rd |1010111| vsetvl
+ 1 6 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetvli'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 11, name: 'vtypei[10:0]', type: 5},
+ {bits: 1, name: '0'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetivli'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'uimm[4:0]', type: 5},
+ {bits: 10, name: 'vtypei[9:0]', type: 5},
+ {bits: 1, name: '1'},
+ {bits: 1, name: '1'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetvl'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'rs2', type: 4},
+ {bits: 6, name: 0x00},
+ {bits: 1, name: 1},
+]}
+....
diff --git a/src/images/wavedrom/vfrec7.adoc b/src/images/wavedrom/vfrec7.adoc
new file mode 100644
index 0000000..d33f44e
--- /dev/null
+++ b/src/images/wavedrom/vfrec7.adoc
@@ -0,0 +1,136 @@
+.vfrec7.v common-case lookup table contents
+[%autowidth,float="center",align="center",options="header"]
+|===
+
+| sig[MSB -: 7] | sig_out[MSB -: 7]
+
+| 0 | 127
+| 1 | 125
+| 2 | 123
+| 3 | 121
+| 4 | 119
+| 5 | 117
+| 6 | 116
+| 7 | 114
+| 8 | 112
+| 9 | 110
+| 10 | 109
+| 11 | 107
+| 12 | 105
+| 13 | 104
+| 14 | 102
+| 15 | 100
+| 16 | 99
+| 17 | 97
+| 18 | 96
+| 19 | 94
+| 20 | 93
+| 21 | 91
+| 22 | 90
+| 23 | 88
+| 24 | 87
+| 25 | 85
+| 26 | 84
+| 27 | 83
+| 28 | 81
+| 29 | 80
+| 30 | 79
+| 31 | 77
+| 32 | 76
+| 33 | 75
+| 34 | 74
+| 35 | 72
+| 36 | 71
+| 37 | 70
+| 38 | 69
+| 39 | 68
+| 40 | 66
+| 41 | 65
+| 42 | 64
+| 43 | 63
+| 44 | 62
+| 45 | 61
+| 46 | 60
+| 47 | 59
+| 48 | 58
+| 49 | 57
+| 50 | 56
+| 51 | 55
+| 52 | 54
+| 53 | 53
+| 54 | 52
+| 55 | 51
+| 56 | 50
+| 57 | 49
+| 58 | 48
+| 59 | 47
+| 60 | 46
+| 61 | 45
+| 62 | 44
+| 63 | 43
+| 64 | 42
+| 65 | 41
+| 66 | 40
+| 67 | 40
+| 68 | 39
+| 69 | 38
+| 70 | 37
+| 71 | 36
+| 72 | 35
+| 73 | 35
+| 74 | 34
+| 75 | 33
+| 76 | 32
+| 77 | 31
+| 78 | 31
+| 79 | 30
+| 80 | 29
+| 81 | 28
+| 82 | 28
+| 83 | 27
+| 84 | 26
+| 85 | 25
+| 86 | 25
+| 87 | 24
+| 88 | 23
+| 89 | 23
+| 90 | 22
+| 91 | 21
+| 92 | 21
+| 93 | 20
+| 94 | 19
+| 95 | 19
+| 96 | 18
+| 97 | 17
+| 98 | 17
+| 99 | 16
+| 100 | 15
+| 101 | 15
+| 102 | 14
+| 103 | 14
+| 104 | 13
+| 105 | 12
+| 106 | 12
+| 107 | 11
+| 108 | 11
+| 109 | 10
+| 110 | 9
+| 111 | 9
+| 112 | 8
+| 113 | 8
+| 114 | 7
+| 115 | 7
+| 116 | 6
+| 117 | 5
+| 118 | 5
+| 119 | 4
+| 120 | 4
+| 121 | 3
+| 122 | 3
+| 123 | 2
+| 124 | 2
+| 125 | 1
+| 126 | 1
+| 127 | 0
+
+|===
diff --git a/src/images/wavedrom/vfrsqrt7.adoc b/src/images/wavedrom/vfrsqrt7.adoc
new file mode 100644
index 0000000..8ebc621
--- /dev/null
+++ b/src/images/wavedrom/vfrsqrt7.adoc
@@ -0,0 +1,137 @@
+.vfrsqrt7.v common-case lookup table contents
+[%autowidth,float=center,align=center,options="header"]
+|===
+
+|exp[0] | sig[MSB -: 6] | sig_out[MSB -: 7]
+
+| 0| 0 | 52
+| 0| 1 | 51
+| 0| 2 | 50
+| 0| 3 | 48
+| 0| 4 | 47
+| 0| 5 | 46
+| 0| 6 | 44
+| 0| 7 | 43
+| 0| 8 | 42
+| 0| 9 | 41
+| 0| 10 | 40
+| 0| 11 | 39
+| 0| 12 | 38
+| 0| 13 | 36
+| 0| 14 | 35
+| 0| 15 | 34
+| 0| 16 | 33
+| 0| 17 | 32
+| 0| 18 | 31
+| 0| 19 | 30
+| 0| 20 | 30
+| 0| 21 | 29
+| 0| 22 | 28
+| 0| 23 | 27
+| 0| 24 | 26
+| 0| 25 | 25
+| 0| 26 | 24
+| 0| 27 | 23
+| 0| 28 | 23
+| 0| 29 | 22
+| 0| 30 | 21
+| 0| 31 | 20
+| 0| 32 | 19
+| 0| 33 | 19
+| 0| 34 | 18
+| 0| 35 | 17
+| 0| 36 | 16
+| 0| 37 | 16
+| 0| 38 | 15
+| 0| 39 | 14
+| 0| 40 | 14
+| 0| 41 | 13
+| 0| 42 | 12
+| 0| 43 | 12
+| 0| 44 | 11
+| 0| 45 | 10
+| 0| 46 | 10
+| 0| 47 | 9
+| 0| 48 | 9
+| 0| 49 | 8
+| 0| 50 | 7
+| 0| 51 | 7
+| 0| 52 | 6
+| 0| 53 | 6
+| 0| 54 | 5
+| 0| 55 | 4
+| 0| 56 | 4
+| 0| 57 | 3
+| 0| 58 | 3
+| 0| 59 | 2
+| 0| 60 | 2
+| 0| 61 | 1
+| 0| 62 | 1
+| 0| 63 | 0
+
+| 1| 0 | 127
+| 1| 1 | 125
+| 1| 2 | 123
+| 1| 3 | 121
+| 1| 4 | 119
+| 1| 5 | 118
+| 1| 6 | 116
+| 1| 7 | 114
+| 1| 8 | 113
+| 1| 9 | 111
+| 1| 10 | 109
+| 1| 11 | 108
+| 1| 12 | 106
+| 1| 13 | 105
+| 1| 14 | 103
+| 1| 15 | 102
+| 1| 16 | 100
+| 1| 17 | 99
+| 1| 18 | 97
+| 1| 19 | 96
+| 1| 20 | 95
+| 1| 21 | 93
+| 1| 22 | 92
+| 1| 23 | 91
+| 1| 24 | 90
+| 1| 25 | 88
+| 1| 26 | 87
+| 1| 27 | 86
+| 1| 28 | 85
+| 1| 29 | 84
+| 1| 30 | 83
+| 1| 31 | 82
+| 1| 32 | 80
+| 1| 33 | 79
+| 1| 34 | 78
+| 1| 35 | 77
+| 1| 36 | 76
+| 1| 37 | 75
+| 1| 38 | 74
+| 1| 39 | 73
+| 1| 40 | 72
+| 1| 41 | 71
+| 1| 42 | 70
+| 1| 43 | 70
+| 1| 44 | 69
+| 1| 45 | 68
+| 1| 46 | 67
+| 1| 47 | 66
+| 1| 48 | 65
+| 1| 49 | 64
+| 1| 50 | 63
+| 1| 51 | 63
+| 1| 52 | 62
+| 1| 53 | 61
+| 1| 54 | 60
+| 1| 55 | 59
+| 1| 56 | 59
+| 1| 57 | 58
+| 1| 58 | 57
+| 1| 59 | 56
+| 1| 60 | 56
+| 1| 61 | 55
+| 1| 62 | 54
+| 1| 63 | 53
+
+|===
\ No newline at end of file
diff --git a/src/images/wavedrom/vmem-format.adoc b/src/images/wavedrom/vmem-format.adoc
new file mode 100644
index 0000000..f9b25ee
--- /dev/null
+++ b/src/images/wavedrom/vmem-format.adoc
@@ -0,0 +1,108 @@
+Format for Vector Load Instructions under LOAD-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| mop | vm | lumop | rs1 | width | vd |0000111| VL* unit-stride
+ nf | mew| mop | vm | rs2 | rs1 | width | vd |0000111| VLS* strided
+ nf | mew| mop | vm | vs2 | rs1 | width | vd |0000111| VLX* indexed
+ 3 1 2 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VL* unit-stride'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'lumop'},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VLS* strided'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'rs2', attr: 'stride', type: 4},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VLX* indexed'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'vs2', attr: 'address offsets', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+Format for Vector Store Instructions under STORE-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| mop | vm | sumop | rs1 | width | vs3 |0100111| VS* unit-stride
+ nf | mew| mop | vm | rs2 | rs1 | width | vs3 |0100111| VSS* strided
+ nf | mew| mop | vm | vs2 | rs1 | width | vs3 |0100111| VSX* indexed
+ 3 1 2 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VS* unit-stride'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'sumop'},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VSS* strided'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'rs2', attr: 'stride', type: 4},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VSX* indexed'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'vs2', attr: 'address offsets', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
diff --git a/src/images/wavedrom/vtype-format.adoc b/src/images/wavedrom/vtype-format.adoc
new file mode 100644
index 0000000..9e6ab34
--- /dev/null
+++ b/src/images/wavedrom/vtype-format.adoc
@@ -0,0 +1,28 @@
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 3, name: 'vlmul[2:0]'},
+ {bits: 3, name: 'vsew[2:0]'},
+ {bits: 1, name: 'vta'},
+ {bits: 1, name: 'vma'},
+ {bits: 23, name: 'reserved'},
+ {bits: 1, name: 'vill'},
+]}
+....
+
+NOTE: This diagram shows the layout for RV32 systems, whereas in
+general `vill` should be at bit XLEN-1.
+
+.`vtype` register layout
+[cols=">2,4,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Bits | Name | Description
+
+| XLEN-1 | vill | Illegal value if set
+| XLEN-2:8 | 0 | Reserved if non-zero
+| 7 | vma | Vector mask agnostic
+| 6 | vta | Vector tail agnostic
+| 5:3 | vsew[2:0] | Selected element width (SEW) setting
+| 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting
+|===
diff --git a/src/resources/themes/riscv-spec.yml b/src/resources/themes/riscv-spec.yml
index 5cb07c9..e8332fc 100644
--- a/src/resources/themes/riscv-spec.yml
+++ b/src/resources/themes/riscv-spec.yml
@@ -250,6 +250,7 @@ figure:
align: center
table:
background_color: $page_background_color
+ font-size: 9
#head_background_color: #2596be
#head_font_color: $base_font_color
head_font_style: bold
diff --git a/src/riscv-privileged.adoc b/src/riscv-privileged.adoc
index bddef4f..7ca9ad1 100644
--- a/src/riscv-privileged.adoc
+++ b/src/riscv-privileged.adoc
@@ -51,6 +51,11 @@ endif::[]
:hide-uri-scheme:
:stem: latexmath
:footnote:
+:le: &#8804;
+:ge: &#8805;
+:ne: &#8800;
+:approx: &#8776;
+:inf: &#8734;
_Contributors to all versions of the spec in alphabetical order (please contact
editors to suggest corrections): Krste Asanović, Peter Ashenden, Rimas
diff --git a/src/riscv-unprivileged.adoc b/src/riscv-unprivileged.adoc
index f0537a5..7a3ab3a 100644
--- a/src/riscv-unprivileged.adoc
+++ b/src/riscv-unprivileged.adoc
@@ -47,6 +47,11 @@ endif::[]
:hide-uri-scheme:
:stem: latexmath
:footnote:
+:le: &#8804;
+:ge: &#8805;
+:ne: &#8800;
+:approx: &#8776;
+:inf: &#8734;
:csrname: envcfg
_Contributors to all versions of the spec in alphabetical order (please contact editors to suggest
@@ -139,6 +144,11 @@ include::mm-eplan.adoc[]
//memory.tex
include::mm-formal.adoc[]
//end of memory.tex, memory-model-alloy.tex, memory-model-herd.tex
+//Appendices for Vector
+include::vector-examples.adoc[]
+include::calling-convention.adoc[]
+//include::fraclmul.adoc[]
+//End of Vector appendices
include::index.adoc[]
// this is generated generated from index markers.
include::bibliography.adoc[]
diff --git a/src/v-st-ext.adoc b/src/v-st-ext.adoc
index 88dcf8d..194e448 100644
--- a/src/v-st-ext.adoc
+++ b/src/v-st-ext.adoc
@@ -1,9 +1,6 @@
[[vector]]
== "V" Standard Extension for Vector Operations, Version 1.0
-The specification is currently hosted at
-https://github.com/riscv/riscv-v-spec.
-
[NOTE]
====
_The base vector extension is intended to provide general support for
@@ -12,3 +9,5185 @@ with later vector extensions supporting richer functionality for certain
domains._
====
+=== Introduction
+
+This document is version 1.1-draft of the RISC-V vector extension.
+
+NOTE: This version holds updates gathered after the start of the
+public review. The spec will have a final update to version 2.0 at
+time of ratification.
+
+This spec includes the complete set of currently frozen vector
+instructions. Other instructions that have been considered during
+development but are not present in this document are not included in
+the review and ratification process, and may be completely revised or
+abandoned. Section <<sec-vector-extensions>> lists the standard
+vector extensions and which instructions and element widths are
+supported by each extension.
+
+=== Implementation-defined Constant Parameters
+
+Each hart supporting a vector extension defines two parameters:
+
+. The maximum size in bits of a vector element that any operation can produce or consume, _ELEN_ {ge} 8, which
+must be a power of 2.
+. The number of bits in a single vector register, _VLEN_ {ge} ELEN, which must be a power of 2, and must be no greater than 2^16^.
+
+Standard vector extensions (Section <<sec-vector-extensions>>) and
+architecture profiles may set further constraints on _ELEN_ and _VLEN_.
+
+NOTE: Future extensions may allow ELEN {gt} VLEN by holding one
+element using bits from multiple vector registers, but this current
+proposal does not include this option.
+
+NOTE: The upper limit on VLEN allows software to know that indices
+will fit into 16 bits (largest VLMAX of 65,536 occurs for LMUL=8 and
+SEW=8 with VLEN=65,536). Any future extension beyond 64Kib per vector
+register will require new configuration instructions such that
+software using the old configuration instructions does not see greater
+vector lengths.
+
+The vector extension supports writing binary code that under certain
+constraints will execute portably on harts with different values for
+the VLEN parameter, provided the harts support the required element
+types and instructions.
+
+NOTE: Code can be written that will expose differences in
+implementation parameters.
+
+NOTE: In general, thread contexts with active vector state cannot be
+migrated during execution between harts that have any difference in
+VLEN or ELEN parameters.
+
+=== Vector Extension Programmer's Model
+
+The vector extension adds 32 vector registers, and seven unprivileged
+CSRs (`vstart`, `vxsat`, `vxrm`, `vcsr`, `vtype`, `vl`, `vlenb`) to a
+base scalar RISC-V ISA.
+
+.New vector CSRs
+[cols="2,2,2,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Address | Privilege | Name | Description
+
+| 0x008 | URW | vstart | Vector start position
+| 0x009 | URW | vxsat | Fixed-Point Saturate Flag
+| 0x00A | URW | vxrm | Fixed-Point Rounding Mode
+| 0x00F | URW | vcsr | Vector control and status register
+| 0xC20 | URO | vl | Vector length
+| 0xC21 | URO | vtype | Vector data type register
+| 0xC22 | URO | vlenb | VLEN/8 (vector register length in bytes)
+|===
+
+NOTE: The four CSR numbers `0x00B`-`0x00E` are tentatively reserved
+for future vector CSRs, some of which may be mirrored into `vcsr`.
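+
+A non-normative sketch of reading the unprivileged vector CSRs; the
+destination registers chosen here are arbitrary:
+
+----
+    csrr  t0, vlenb     # VLEN/8: vector register length in bytes
+    csrr  t1, vl        # Current vector length in elements
+    csrr  t2, vtype     # Current vtype setting (bit XLEN-1 is vill)
+----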
+
+==== Vector Registers
+
+The vector extension adds 32 architectural vector registers,
+`v0`-`v31` to the base scalar RISC-V ISA.
+
+Each vector register has a fixed VLEN bits of state.
+
+==== Vector Context Status in `mstatus`
+
+A vector context status field, `VS`, is added to `mstatus[10:9]` and shadowed
+in `sstatus[10:9]`. It is defined analogously to the floating-point context
+status field, `FS`.
+
+Attempts to execute any vector instruction, or to access the vector
+CSRs, raise an illegal-instruction exception when `mstatus.VS` is
+set to Off.
+
+When `mstatus.VS` is set to Initial or Clean, executing any
+instruction that changes vector state, including the vector CSRs, will
+change `mstatus.VS` to Dirty.
+Implementations may also change `mstatus.VS` from Initial or Clean to Dirty
+at any time, even when there is no change in vector state.
+
+NOTE: Accurate setting of `mstatus.VS` is an optimization. Software
+will typically use VS to reduce context-swap overhead.
+
+If `mstatus.VS` is Dirty, `mstatus.SD` is 1;
+otherwise, `mstatus.SD` is set in accordance with existing specifications.
+
+Implementations may have a writable `misa.V` field. Analogous to the
+way in which the floating-point unit is handled, the `mstatus.VS`
+field may exist even if `misa.V` is clear.
+
+NOTE: Allowing `mstatus.VS` to exist when `misa.V` is clear enables
+vector emulation and simplifies handling of `mstatus.VS` in systems
+with writable `misa.V`.
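+
+A minimal, non-normative machine-mode sketch of enabling the vector unit
+before executing vector code, using the `mstatus[10:9]` field position given
+above; the choice of the Initial state is illustrative:
+
+----
+    li    t0, 0x600          # Mask for mstatus.VS (bits 10:9)
+    csrc  mstatus, t0        # Clear the VS field (Off)
+    li    t0, 0x200          # VS = 01 (Initial)
+    csrs  mstatus, t0        # Vector instructions and CSR accesses now legal
+----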
+
+==== Vector Context Status in `vsstatus`
+
+When the hypervisor extension is present, a vector context status field, `VS`,
+is added to `vsstatus[10:9]`.
+It is defined analogously to the floating-point context status field, `FS`.
+
+When V=1, both `vsstatus.VS` and `mstatus.VS` are in effect: attempts to
+execute any vector instruction, or to access the vector CSRs, raise an
+illegal-instruction exception when either field is set to Off.
+
+When V=1 and neither `vsstatus.VS` nor `mstatus.VS` is set to Off, executing
+any instruction that changes vector state, including the vector CSRs, will
+change both `mstatus.VS` and `vsstatus.VS` to Dirty.
+Implementations may also change `mstatus.VS` or `vsstatus.VS` from Initial or
+Clean to Dirty at any time, even when there is no change in vector state.
+
+If `vsstatus.VS` is Dirty, `vsstatus.SD` is 1;
+otherwise, `vsstatus.SD` is set in accordance with existing specifications.
+
+If `mstatus.VS` is Dirty, `mstatus.SD` is 1;
+otherwise, `mstatus.SD` is set in accordance with existing specifications.
+
+For implementations with a writable `misa.V` field,
+the `vsstatus.VS` field may exist even if `misa.V` is clear.
+
+==== Vector type register, `vtype`
+
+The read-only XLEN-wide _vector_ _type_ CSR, `vtype`, provides the
+default type used to interpret the contents of the vector register
+file, and can only be updated by `vset{i}vl{i}` instructions. The
+vector type determines the organization of elements in each
+vector register, and how multiple vector registers are grouped. The
+`vtype` register also indicates how masked-off elements and elements
+past the current vector length in a vector result are handled.
+
+NOTE: Allowing updates only via the `vset{i}vl{i}` instructions
+simplifies maintenance of the `vtype` register state.
+
+The `vtype` register has five fields, `vill`, `vma`, `vta`,
+`vsew[2:0]`, and `vlmul[2:0]`. Bits `vtype[XLEN-2:8]` should be
+written with zero, and non-zero values in this field are reserved.
+
+include::images/wavedrom/vtype-format.adoc[]
+
+NOTE: A small implementation supporting ELEN=32 requires only seven
+bits of state in `vtype`: two bits for `ma` and `ta`, two bits for
+`vsew[1:0]` and three bits for `vlmul[2:0]`. The illegal value
+represented by `vill` can be internally encoded using the illegal 64-bit
+combination in `vsew[1:0]` without requiring an additional storage
+bit to hold `vill`.
+
+NOTE: Further standard and custom vector extensions may extend these
+fields to support a greater variety of data types.
+
+NOTE: The primary motivation for the `vtype` CSR is to allow the
+vector instruction set to fit into a 32-bit instruction encoding
+space. A separate `vset{i}vl{i}` instruction can be used to set `vl`
+and/or `vtype` fields before execution of a vector instruction, and
+implementations may choose to fuse these two instructions into a single
+internal vector microop. In many cases, the `vl` and `vtype` values
+can be reused across multiple instructions, reducing the static and
+dynamic instruction overhead from the `vset{i}vl{i}` instructions. It
+is anticipated that a future extended 64-bit instruction encoding
+would allow these fields to be specified statically in the instruction
+encoding.
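+
+The following non-normative fragment illustrates the reuse described in the
+note above: a single `vsetvli` serves a run of vector instructions that share
+the same `vl` and `vtype` (register numbers are arbitrary):
+
+----
+    vsetvli t0, a0, e32, m4, ta, ma   # Set vl and vtype once
+    vle32.v v0, (a1)                  # These four instructions reuse the same
+    vle32.v v4, (a2)                  #   vl/vtype without further vsetvli
+    vadd.vv v8, v0, v4
+    vse32.v v8, (a3)
+----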
+
+===== Vector selected element width `vsew[2:0]`
+
+The value in `vsew` sets the dynamic _selected_ _element_ _width_
+(SEW). By default, a vector register is viewed as being divided into
+VLEN/SEW elements.
+
+.vsew[2:0] (selected element width) encoding
+[cols="1,1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+3+| vsew[2:0] | SEW
+
+| 0 | 0 | 0 | 8
+| 0 | 0 | 1 | 16
+| 0 | 1 | 0 | 32
+| 0 | 1 | 1 | 64
+| 1 | X | X | Reserved
+|===
+
+NOTE: While it is anticipated the larger `vsew[2:0]` encodings
+(`100`-`111`) will be used to encode larger SEW, the encodings are
+formally _reserved_ at this point.
+
+.Example VLEN = 128 bits
+[cols=">,>"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| SEW | Elements per vector register
+
+| 64 | 2
+| 32 | 4
+| 16 | 8
+| 8 | 16
+|===
+
+The supported element width may vary with LMUL.
+
+NOTE: The current set of standard vector extensions does not vary
+supported element width with LMUL. Some future extensions may support
+larger SEWs only when bits from multiple vector registers are combined
+using LMUL. In this case, software that relies on large SEW should
+attempt to use the largest LMUL, and hence the fewest vector register
+groups, to increase the number of implementations on which the code
+will run. The `vill` bit in `vtype` should be checked after setting
+`vtype` to see if the configuration is supported, and an alternate
+code path should be provided if it is not. Alternatively, a profile
+can mandate the minimum SEW at each LMUL setting.
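+
+A non-normative sketch of that check: request the desired configuration, then
+test `vill`, which occupies bit XLEN-1 of `vtype`, and branch to an alternate
+code path if the setting is unsupported (the `fallback` label is illustrative):
+
+----
+    vsetvli t0, a0, e64, m2, ta, ma   # Request SEW=64, LMUL=2
+    csrr    t1, vtype                 # Read back vtype
+    bltz    t1, fallback              # vill set (sign bit): configuration unsupported
+----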
+
+===== Vector Register Grouping (`vlmul[2:0]`)
+
+Multiple vector registers can be grouped together, so that a single
+vector instruction can operate on multiple vector registers. The term
+_vector_ _register_ _group_ is used herein to refer to one or more
+vector registers used as a single operand to a vector instruction.
+Vector register groups can be used to provide greater execution
+efficiency for longer application vectors, but the main reason for
+their inclusion is to allow double-width or larger elements to be
+operated on with the same vector length as single-width elements. The
+vector length multiplier, _LMUL_, when greater than 1, represents the
+default number of vector registers that are combined to form a vector
+register group. Implementations must support LMUL integer values of
+1, 2, 4, and 8.
+
+
+NOTE: The vector architecture includes instructions that take multiple
+source and destination vector operands with different element widths,
+but the same number of elements. The effective LMUL (EMUL) of each
+vector operand is determined by the number of registers required to
+hold the elements. For example, for a widening add operation, such as
+add 32-bit values to produce 64-bit results, a double-width result
+requires twice the LMUL of the single-width inputs.
+
+LMUL can also be a fractional value, reducing the number of bits used
+in a single vector register. Fractional LMUL is used to increase the
+number of effective usable vector register groups when operating on
+mixed-width values.
+
+NOTE: With only integer LMUL values, a loop operating on a range of
+sizes would have to allocate at least one whole vector register
+(LMUL=1) for the narrowest data type and then would consume multiple
+vector registers (LMUL>1) to form a vector register group for each
+wider vector operand. This can limit the number of vector register groups
+available. With fractional LMUL, the widest values need occupy only a
+single vector register while narrower values can occupy a fraction of
+a single vector register, allowing all 32 architectural vector
+register names to be used for different values in a vector loop even
+when handling mixed-width values. Fractional LMUL implies portions of
+vector registers are unused, but in some cases, having more shorter
+register-resident vectors improves efficiency relative to fewer longer
+register-resident vectors.
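+
+The following non-normative fragment illustrates the mixed-width usage that
+fractional LMUL enables: 8-bit source data is widened to 32 bits while the
+SEW/LMUL ratio, and therefore the element count, stays constant, so each
+operand occupies at most one vector register (register numbers and the
+vl-preserving `vsetvli` form are illustrative):
+
+----
+    vsetvli   t0, a0, e8, mf4, ta, ma   # 8-bit elements at LMUL=1/4
+    vle8.v    v1, (a1)
+    vsetvli   x0, x0, e16, mf2, ta, ma  # Same SEW/LMUL ratio, so vl is unchanged
+    vzext.vf2 v2, v1                    # 16-bit values still fit in one register
+    vsetvli   x0, x0, e32, m1, ta, ma
+    vzext.vf2 v3, v2                    # 32-bit values occupy a single register (LMUL=1)
+----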
+
+Implementations must provide fractional LMUL settings that allow the
+narrowest supported type to occupy a fraction of a vector register
+corresponding to the ratio of the narrowest supported type's width to
+that of the largest supported type's width. In general, the
+requirement is to support LMUL {ge} SEW~MIN~/ELEN, where SEW~MIN~ is
+the narrowest supported SEW value and ELEN is the widest supported SEW
+value. In the standard extensions, SEW~MIN~=8. For
+standard vector extensions with ELEN=32, fractional LMULs of 1/2 and
+1/4 must be supported. For standard vector extensions with ELEN=64,
+fractional LMULs of 1/2, 1/4, and 1/8 must be supported.
+
+NOTE: When LMUL < SEW~MIN~/ELEN, there is no guarantee
+an implementation would have enough bits in the fractional vector
+register to store at least one element, as VLEN=ELEN is a
+valid implementation choice. For example, with VLEN=ELEN=32,
+and SEW~MIN~=8, an LMUL of 1/8 would only provide four bits of
+storage in a vector register.
+
+For a given supported fractional LMUL setting, implementations must support
+SEW settings between SEW~MIN~ and LMUL * ELEN, inclusive.
+
+The use of `vtype` encodings with LMUL < SEW~MIN~/ELEN is
+__reserved__, but implementations can set `vill` if they do not
+support these configurations.
+
+NOTE: Requiring all implementations to set `vill` in this case would
+prohibit future use of this case in an extension, so to allow for a
+future definition of LMUL<SEW~MIN~/ELEN behavior, we
+consider the use of this case to be __reserved__.
+
+NOTE: It is recommended that assemblers provide a warning (not an
+error) if a `vsetvli` instruction attempts to write an LMUL < SEW~MIN~/ELEN.
+
+LMUL is set by the signed `vlmul` field in `vtype` (i.e., LMUL =
+2^`vlmul[2:0]`^).
+
+The derived value VLMAX = LMUL*VLEN/SEW represents the maximum number
+of elements that can be operated on with a single vector instruction
+given the current SEW and LMUL settings as shown in the table below.
+
+[cols="1,1,1,2,2,5,5"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+ 3+| vlmul[2:0] | LMUL | #groups | VLMAX | Registers grouped with register __n__
+
+| 1 | 0 | 0 | - | - | - | reserved
+| 1 | 0 | 1 | 1/8| 32 | VLEN/SEW/8 | `v` __n__ (single register in group)
+| 1 | 1 | 0 | 1/4| 32 | VLEN/SEW/4 | `v` __n__ (single register in group)
+| 1 | 1 | 1 | 1/2| 32 | VLEN/SEW/2 | `v` __n__ (single register in group)
+| 0 | 0 | 0 | 1 | 32 | VLEN/SEW | `v` __n__ (single register in group)
+| 0 | 0 | 1 | 2 | 16 | 2*VLEN/SEW | `v` __n__, `v` __n__+1
+| 0 | 1 | 0 | 4 | 8 | 4*VLEN/SEW | `v` __n__, ..., `v` __n__+3
+| 0 | 1 | 1 | 8 | 4 | 8*VLEN/SEW | `v` __n__, ..., `v` __n__+7
+|===
+
+When LMUL=2, the vector register group contains vector register `v`
+__n__ and vector register `v` __n__+1, providing twice the vector
+length in bits. Instructions specifying an LMUL=2 vector register group
+with an odd-numbered vector register are reserved.
+
+When LMUL=4, the vector register group contains four vector registers,
+and instructions specifying an LMUL=4 vector register group using vector
+register numbers that are not multiples of four are reserved.
+
+When LMUL=8, the vector register group contains eight vector
+registers, and instructions specifying an LMUL=8 vector register group
+using register numbers that are not multiples of eight are reserved.
+
+Mask registers are always contained in a single vector register,
+regardless of LMUL.
+
+[[sec-agnostic]]
+===== Vector Tail Agnostic and Vector Mask Agnostic `vta` and `vma`
+
+These two bits modify the behavior of destination tail elements and
+destination inactive masked-off elements respectively during the
+execution of vector instructions. The tail and inactive sets contain
+element positions that are not receiving new results during a vector
+operation, as defined in Section <<sec-inactive-defs>>.
+
+All systems must support all four options:
+
+[cols="1,1,3,3"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| `vta` | `vma` | Tail Elements | Inactive Elements
+
+| 0 | 0 | undisturbed | undisturbed
+| 0 | 1 | undisturbed | agnostic
+| 1 | 0 | agnostic | undisturbed
+| 1 | 1 | agnostic | agnostic
+|===
+
+Mask destination tail elements are always treated as tail-agnostic,
+regardless of the setting of `vta`.
+
+When a set is marked undisturbed, the corresponding set of destination
+elements in a vector register group retain the value they previously
+held.
+
+When a set is marked agnostic, the corresponding set of destination
+elements in any vector destination operand can either retain the value
+they previously held or be overwritten with 1s. Within a single vector
+instruction, each destination element can be either left undisturbed
+or overwritten with 1s, in any combination, and the pattern of
+undisturbed or overwritten with 1s is not required to be deterministic
+when the instruction is executed with the same inputs.
+
+NOTE: The agnostic policy was added to accommodate machines with
+vector register renaming. With an undisturbed policy, all elements
+would have to be read from the old physical destination vector
+register to be copied into the new physical destination vector
+register. This causes an inefficiency when these inactive or tail
+values are not required for subsequent calculations.
+
+NOTE: The value of all 1s instead of all 0s was chosen for the
+overwrite value to discourage software developers from depending on
+the value written.
+
+NOTE: A simple in-order implementation can ignore the settings and
+simply execute all vector instructions using the undisturbed
+policy. The `vta` and `vma` state bits must still be provided in
+`vtype` for compatibility and to support thread migration.
+
+NOTE: An out-of-order implementation can choose to implement
+tail-agnostic + mask-agnostic using tail-agnostic + mask-undisturbed
+to reduce implementation complexity.
+
+NOTE: The definition of agnostic result policy is left loose to
+accommodate migrating application threads between harts on a small
+in-order core (which probably leaves agnostic regions undisturbed) and
+harts on a larger out-of-order core with register renaming (which
+probably overwrites agnostic elements with 1s). As it might be
+necessary to restart in the middle, we allow arbitrary mixing of
+agnostic policies within a single vector instruction. This allowed
+mixing of policies also enables implementations that might change
+policies for different granules of a vector register, for example,
+using undisturbed within a granule that is actively operated on but
+renaming to all 1s for granules in the tail.
+
+In addition, except for mask load instructions, any element in the
+tail of a mask result can also be written with the value the
+mask-producing operation would have calculated with `vl`=VLMAX.
+Furthermore, for mask-logical instructions and `vmsbf.m`, `vmsif.m`,
+`vmsof.m` mask-manipulation instructions, any element in the tail of
+the result can be written with the value the mask-producing operation
+would have calculated with `vl`=VLEN, SEW=8, and LMUL=8 (i.e., all
+bits of the mask register can be overwritten).
+
+NOTE: Mask tails are always treated as agnostic to reduce complexity
+of managing mask data, which can be written at bit granularity. There
+appears to be little software need to support tail-undisturbed for
+mask register values. Allowing mask-generating instructions to write
+back the result of the instruction avoids the need for logic to mask
+out the tail, except mask loads cannot write memory values to
+destination mask tails as this would imply accessing memory past
+software intent.
+
+The assembly syntax adds two mandatory flags to the `vsetvli` instruction:
+
+----
+ ta # Tail agnostic
+ tu # Tail undisturbed
+ ma # Mask agnostic
+ mu # Mask undisturbed
+
+ vsetvli t0, a0, e32, m4, ta, ma # Tail agnostic, mask agnostic
+ vsetvli t0, a0, e32, m4, tu, ma # Tail undisturbed, mask agnostic
+ vsetvli t0, a0, e32, m4, ta, mu # Tail agnostic, mask undisturbed
+ vsetvli t0, a0, e32, m4, tu, mu # Tail undisturbed, mask undisturbed
+----
+
+NOTE: Prior to v0.9, when these flags were not specified on a
+`vsetvli`, they defaulted to mask-undisturbed/tail-undisturbed. The
+use of `vsetvli` without these flags is deprecated, however, and
+specifying a flag setting is now mandatory. The default should
+perhaps be tail-agnostic/mask-agnostic, so software has to specify
+when it cares about the non-participating elements, but given the
+historical meaning of the instruction prior to introduction of these
+flags, it was decided to always require them in future assembly code.
+
+===== Vector Type Illegal `vill`
+
+The `vill` bit is used to encode that a previous `vset{i}vl{i}`
+instruction attempted to write an unsupported value to `vtype`.
+
+NOTE: The `vill` bit is held in bit XLEN-1 of the CSR to support
+checking for illegal values with a branch on the sign bit.
+
+If the `vill` bit is set, then any attempt to execute a vector instruction
+that depends upon `vtype` will raise an illegal-instruction exception.
+
+NOTE: `vset{i}vl{i}` and whole register loads and stores do not depend
+upon `vtype`.
+
+When the `vill` bit is set, the other XLEN-1 bits in `vtype` shall be
+zero.
+
+==== Vector Length Register `vl`
+
+The _XLEN_-bit-wide read-only `vl` CSR can only be updated by the
+`vset{i}vl{i}` instructions, and the _fault-only-first_ vector load
+instruction variants.
+
+The `vl` register holds an unsigned integer specifying the number of
+elements to be updated with results from a vector instruction, as
+further detailed in Section <<sec-inactive-defs>>.
+
+NOTE: The number of bits implemented in `vl` depends on the
+implementation's maximum vector length of the smallest supported
+type. The smallest vector implementation with VLEN=32 and supporting
+SEW=8 would need at least six bits in `vl` to hold the values 0-32
+(VLEN=32, with LMUL=8 and SEW=8, yields VLMAX=32).
+
+==== Vector Byte Length `vlenb`
+
+The _XLEN_-bit-wide read-only CSR `vlenb` holds the value VLEN/8,
+i.e., the vector register length in bytes.
+
+NOTE: The value in `vlenb` is a design-time constant in any
+implementation.
+
+NOTE: Without this CSR, several instructions are needed to calculate
+VLEN in bytes, and the code has to disturb current `vl` and `vtype`
+settings which require them to be saved and restored.
+
+==== Vector Start Index CSR `vstart`
+
+The _XLEN_-bit-wide read-write `vstart` CSR specifies the index of the
+first element to be executed by a vector instruction, as described in
+Section <<sec-inactive-defs>>.
+
+Normally, `vstart` is only written by hardware on a trap on a vector
+instruction, with the `vstart` value representing the element on which
+the trap was taken (either a synchronous exception or an asynchronous
+interrupt), and at which execution should resume after a resumable
+trap is handled.
+
+All vector instructions are defined to begin execution with the
+element number given in the `vstart` CSR, leaving earlier elements in
+the destination vector undisturbed, and to reset the `vstart` CSR to
+zero at the end of execution.
+
+NOTE: All vector instructions, including `vset{i}vl{i}`, reset the `vstart`
+CSR to zero.
+
+`vstart` is not modified by vector instructions that raise illegal-instruction
+exceptions.
+
+The `vstart` CSR is defined to have only enough writable bits to hold
+the largest element index (one less than the maximum VLMAX).
+
+NOTE: The maximum vector length is obtained with the largest LMUL
+setting (8) and the smallest SEW setting (8), so VLMAX~max~ =
+8*VLEN/8 = VLEN. For example, for VLEN=256, `vstart` would have 8 bits
+to represent indices from 0 through 255.
+
+The use of `vstart` values greater than the largest element index for
+the current `vtype` setting is reserved.
+
+NOTE: It is recommended that implementations trap if `vstart` is out
+of bounds. It is not required to trap, as a possible future use of
+upper `vstart` bits is to store imprecise trap information.
+
+The `vstart` CSR is writable by unprivileged code, but non-zero
+`vstart` values may cause vector instructions to run substantially
+slower on some implementations, so `vstart` should not be used by
+application programmers. A few vector instructions cannot be
+executed with a non-zero `vstart` value and will raise an illegal
+instruction exception as defined below.
+
+NOTE: Making `vstart` visible to unprivileged code supports user-level
+threading libraries.
+
+Implementations are permitted to raise illegal instruction exceptions when
+attempting to execute a vector instruction with a value of `vstart` that the
+implementation can never produce when executing that same instruction with
+the same `vtype` setting.
+
+NOTE: For example, some implementations will never take interrupts during
+execution of a vector arithmetic instruction, instead waiting until the
+instruction completes to take the interrupt. Such implementations are
+permitted to raise an illegal instruction exception when attempting to execute
+a vector arithmetic instruction when `vstart` is nonzero.
+
+NOTE: When migrating a software thread between two harts with
+different microarchitectures, the `vstart` value might not be
+supported by the new hart microarchitecture. The runtime on the
+receiving hart might then have to emulate instruction execution up to the
+next supported `vstart` element position. Alternatively, migration events
+can be constrained to only occur at mutually supported `vstart`
+locations.
+
+==== Vector Fixed-Point Rounding Mode Register `vxrm`
+
+The vector fixed-point rounding-mode register holds a two-bit
+read-write rounding-mode field in the least-significant bits
+(`vxrm[1:0]`). The upper bits, `vxrm[XLEN-1:2]`, should be written as
+zeros.
+
+The vector fixed-point rounding-mode is given a separate CSR address
+to allow independent access, but is also reflected as a field in
+`vcsr`.
+
+NOTE: A new rounding mode can be set while saving the original
+rounding mode using a single `csrwi` instruction.
+
+The fixed-point rounding algorithm is specified as follows.
+Suppose the pre-rounding result is `v`, and `d` bits of that result are to be
+rounded off.
+Then the rounded result is `(v >> d) + r`, where `r` depends on the rounding
+mode as specified in the following table.
+
+.vxrm encoding
+//[cols="1,1,4,10,5"]
+[%autowidth,float="center",align="center",cols="<,<,<,<,<",options="header"]
+|===
+2+| `vxrm[1:0]` | Abbreviation | Rounding Mode | Rounding increment, `r`
+
+| 0 | 0 | rnu | round-to-nearest-up (add +0.5 LSB) | `v[d-1]`
+| 0 | 1 | rne | round-to-nearest-even | `v[d-1] & (v[d-2:0]{ne}0 \| v[d])`
+| 1 | 0 | rdn | round-down (truncate) | `0`
+| 1 | 1 | rod | round-to-odd (OR bits into LSB, aka "jam") | `!v[d] & v[d-1:0]{ne}0`
+|===
+
+The rounding functions:
+----
+roundoff_unsigned(v, d) = (unsigned(v) >> d) + r
+roundoff_signed(v, d) = (signed(v) >> d) + r
+----
+are used to represent this operation in the instruction descriptions below.
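+
+[NOTE]
+--
+The following non-normative C sketch implements `roundoff_unsigned` for
+64-bit working values, with the rounding increment `r` taken directly
+from the `vxrm` table above; it is illustrative only.
+
+----
+#include <stdint.h>
+#include <stdio.h>
+
+/* Rounding increment r for pre-rounding value v with d low-order bits
+ * to be rounded off (d > 0), per the vxrm encoding table. */
+static uint64_t round_inc(uint64_t v, unsigned d, unsigned vxrm)
+{
+    uint64_t vd      = (v >> d) & 1;                           /* v[d]        */
+    uint64_t vdm1    = (v >> (d - 1)) & 1;                     /* v[d-1]      */
+    uint64_t low_any = d > 1 && (v & ((1ULL << (d - 1)) - 1)); /* v[d-2:0]!=0 */
+    switch (vxrm & 3) {
+    case 0:  return vdm1;                    /* rnu: round-to-nearest-up   */
+    case 1:  return vdm1 & (low_any | vd);   /* rne: round-to-nearest-even */
+    case 2:  return 0;                       /* rdn: round-down (truncate) */
+    default: return !vd & (vdm1 | low_any);  /* rod: round-to-odd          */
+    }
+}
+
+static uint64_t roundoff_unsigned(uint64_t v, unsigned d, unsigned vxrm)
+{
+    return d == 0 ? v : (v >> d) + round_inc(v, d, vxrm);
+}
+
+int main(void)
+{
+    /* Rounding 0b1011 off by d=2 bits with rnu: 0b10 + v[1]=1 -> 3. */
+    printf("%llu\n", (unsigned long long)roundoff_unsigned(0xB, 2, 0));
+    return 0;
+}
+----
+--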
+
+==== Vector Fixed-Point Saturation Flag `vxsat`
+
+The `vxsat` CSR has a single read-write least-significant bit
+(`vxsat[0]`) that indicates if a fixed-point instruction has had to
+saturate an output value to fit into a destination format.
+Bits `vxsat[XLEN-1:1]` should be written as zeros.
+
+The `vxsat` bit is mirrored in `vcsr`.
+
+==== Vector Control and Status Register `vcsr`
+
+The `vxrm` and `vxsat` separate CSRs can also be accessed via fields
+in the _XLEN_-bit-wide vector control and status CSR, `vcsr`.
+
+.vcsr layout
+[cols=">2,4,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Bits | Name | Description
+
+| XLEN-1:3 | | Reserved
+| 2:1 | vxrm[1:0] | Fixed-point rounding mode
+| 0 | vxsat | Fixed-point accrued saturation flag
+|===
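+
+[NOTE]
+--
+A non-normative C sketch of the field packing shown above (`vxsat` in
+bit 0, `vxrm` in bits 2:1):
+
+----
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t vcsr_pack(unsigned vxrm, unsigned vxsat)
+{
+    return ((uint64_t)(vxrm & 3) << 1) | (vxsat & 1);
+}
+
+static unsigned vcsr_vxrm(uint64_t vcsr)  { return (unsigned)(vcsr >> 1) & 3; }
+static unsigned vcsr_vxsat(uint64_t vcsr) { return (unsigned)vcsr & 1; }
+
+int main(void)
+{
+    uint64_t vcsr = vcsr_pack(2, 1);   /* rdn rounding, saturation flag set */
+    printf("vcsr=0x%llx vxrm=%u vxsat=%u\n",
+           (unsigned long long)vcsr, vcsr_vxrm(vcsr), vcsr_vxsat(vcsr));
+    return 0;
+}
+----
+--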
+
+==== State of Vector Extension at Reset
+
+The vector extension must have a consistent state at reset. In
+particular, `vtype` and `vl` must have values that can be read and
+then restored with a single `vsetvl` instruction.
+
+NOTE: It is recommended that at reset, `vtype.vill` is set, the
+remaining bits in `vtype` are zero, and `vl` is set to zero.
+
+The `vstart`, `vxrm`, `vxsat` CSRs can have arbitrary values at reset.
+
+NOTE: Most uses of the vector unit will require an initial `vset{i}vl{i}`,
+which will reset `vstart`. The `vxrm` and `vxsat` fields should be
+reset explicitly in software before use.
+
+The vector registers can have arbitrary values at reset.
+
+=== Mapping of Vector Elements to Vector Register State
+
+The following diagrams illustrate how different width elements are
+packed into the bytes of a vector register depending on the current
+SEW and LMUL settings, as well as implementation VLEN. Elements are
+packed into each vector register with the least-significant byte in
+the lowest-numbered bits.
+
+The mapping was chosen to provide the simplest and most portable model
+for software, but might appear to incur large wiring cost for wider
+vector datapaths on certain operations. The vector instruction set
+was expressly designed to support implementations that internally
+rearrange vector data for different SEW to reduce datapath wiring
+costs, while externally preserving the simple software model.
+
+NOTE: For example, microarchitectures can track the EEW with which a
+vector register was written, and then insert additional scrambling
+operations to rearrange data if the register is accessed with a
+different EEW.
+
+==== Mapping for LMUL = 1
+
+When LMUL=1, elements are simply packed in order from the
+least-significant to most-significant bits of the vector register.
+
+NOTE: To increase readability, vector register layouts are drawn with
+bytes ordered from right to left with increasing byte address. Bits
+within an element are numbered in a little-endian format with
+increasing bit index from right to left corresponding to increasing
+magnitude.
+
+----
+LMUL=1 examples.
+
+The element index is given in hexadecimal and is shown placed at the
+least-significant byte of the stored element.
+
+
+ VLEN=32b
+
+ Byte 3 2 1 0
+
+ SEW=8b 3 2 1 0
+ SEW=16b 1 0
+ SEW=32b 0
+
+ VLEN=64b
+
+ Byte 7 6 5 4 3 2 1 0
+
+ SEW=8b 7 6 5 4 3 2 1 0
+ SEW=16b 3 2 1 0
+ SEW=32b 1 0
+ SEW=64b 0
+
+ VLEN=128b
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=16b 7 6 5 4 3 2 1 0
+ SEW=32b 3 2 1 0
+ SEW=64b 1 0
+
+ VLEN=256b
+
+ Byte 1F1E1D1C1B1A19181716151413121110 F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b 1F1E1D1C1B1A19181716151413121110 F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=16b F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=32b 7 6 5 4 3 2 1 0
+ SEW=64b 3 2 1 0
+----
+
+==== Mapping for LMUL < 1
+
+When LMUL < 1, only the first LMUL*VLEN/SEW elements in the vector
+register are used. The remaining space in the vector register is
+treated as part of the tail, and hence must obey the vta setting.
+
+----
+ Example, VLEN=128b, LMUL=1/4
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b - - - - - - - - - - - - 3 2 1 0
+ SEW=16b - - - - - - 1 0
+ SEW=32b - - - 0
+----
+
+==== Mapping for LMUL > 1
+
+When vector registers are grouped, the elements of the vector register
+group are packed contiguously in element order beginning with the
+lowest-numbered vector register and moving to the
+next-highest-numbered vector register in the group once each vector
+register is filled.
+
+----
+ LMUL > 1 examples
+
+ VLEN=32b, SEW=8b, LMUL=2
+
+ Byte 3 2 1 0
+ v2*n 3 2 1 0
+ v2*n+1 7 6 5 4
+
+ VLEN=32b, SEW=16b, LMUL=2
+
+ Byte 3 2 1 0
+ v2*n 1 0
+ v2*n+1 3 2
+
+ VLEN=32b, SEW=16b, LMUL=4
+
+ Byte 3 2 1 0
+ v4*n 1 0
+ v4*n+1 3 2
+ v4*n+2 5 4
+ v4*n+3 7 6
+
+ VLEN=32b, SEW=32b, LMUL=4
+
+ Byte 3 2 1 0
+ v4*n 0
+ v4*n+1 1
+ v4*n+2 2
+ v4*n+3 3
+
+ VLEN=64b, SEW=32b, LMUL=2
+
+ Byte 7 6 5 4 3 2 1 0
+ v2*n 1 0
+ v2*n+1 3 2
+
+ VLEN=64b, SEW=32b, LMUL=4
+
+ Byte 7 6 5 4 3 2 1 0
+ v4*n 1 0
+ v4*n+1 3 2
+ v4*n+2 5 4
+ v4*n+3 7 6
+
+ VLEN=128b, SEW=32b, LMUL=2
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+ v2*n 3 2 1 0
+ v2*n+1 7 6 5 4
+
+ VLEN=128b, SEW=32b, LMUL=4
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+ v4*n 3 2 1 0
+ v4*n+1 7 6 5 4
+ v4*n+2 B A 9 8
+ v4*n+3 F E D C
+----
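+
+[NOTE]
+--
+The register-group layouts illustrated above can be summarized by the
+following non-normative C sketch, which reports, for an integer LMUL
+{ge} 1, which register of the group and which byte range hold element
+_i_; it is illustrative only.
+
+----
+#include <stdio.h>
+
+/* vbase: base register number of the group, vlen/sew in bits, i: element
+ * index within the group.  Bytes are numbered as in the diagrams above,
+ * with the element's least-significant byte at the lowest offset. */
+static void locate_element(int vbase, int vlen, int sew, int i)
+{
+    int elts_per_reg = vlen / sew;        /* elements held by one register */
+    int reg  = vbase + i / elts_per_reg;  /* register within the group     */
+    int slot = i % elts_per_reg;          /* element slot inside it        */
+    int byte = slot * (sew / 8);          /* starting byte offset          */
+    printf("element %d -> v%d, bytes [%d..%d]\n",
+           i, reg, byte, byte + sew / 8 - 1);
+}
+
+int main(void)
+{
+    /* Matches the VLEN=128b, SEW=32b, LMUL=4 diagram: element 5 occupies
+     * bytes 4..7 of v4*n+1 (here n=1, so v5). */
+    locate_element(4, 128, 32, 5);
+    return 0;
+}
+----
+--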
+
+[[sec-mapping-mixed]]
+==== Mapping across Mixed-Width Operations
+
+The vector ISA is designed to support mixed-width operations without
+requiring additional explicit rearrangement instructions. The
+recommended software strategy when operating on multiple vectors with
+different precision values is to modify `vtype` dynamically to keep
+SEW/LMUL constant (and hence VLMAX constant).
+
+The following example shows four different packed element widths (8b,
+16b, 32b, 64b) in a VLEN=128b implementation. The vector register
+grouping factor (LMUL) is increased by the relative element size such
+that each group can hold the same number of vector elements (VLMAX=8
+in this example) to simplify stripmining code.
+
+----
+Example VLEN=128b, with SEW/LMUL=16
+
+Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+vn - - - - - - - - 7 6 5 4 3 2 1 0 SEW=8b, LMUL=1/2
+
+vn 7 6 5 4 3 2 1 0 SEW=16b, LMUL=1
+
+v2*n 3 2 1 0 SEW=32b, LMUL=2
+v2*n+1 7 6 5 4
+
+v4*n 1 0 SEW=64b, LMUL=4
+v4*n+1 3 2
+v4*n+2 5 4
+v4*n+3 7 6
+----
+
+The following table shows each possible constant SEW/LMUL operating
+point for loops with mixed-width operations. Each column represents a
+constant SEW/LMUL operating point. Entries in table are the LMUL
+values that yield that column's SEW/LMUL value for the datawidth on
+that row. In each column, an LMUL setting for a datawidth indicates
+that it can be aligned with the other datawidths in the same column
+that also have an LMUL setting, such that all have the same VLMAX.
+
+|===
+| 7+^| SEW/LMUL
+| | 1 | 2 | 4 | 8 | 16 | 32 | 64
+
+| SEW= 8 | 8 | 4 | 2 | 1 | 1/2 | 1/4 | 1/8
+| SEW= 16 | | 8 | 4 | 2 | 1 | 1/2 | 1/4
+| SEW= 32 | | | 8 | 4 | 2 | 1 | 1/2
+| SEW= 64 | | | | 8 | 4 | 2 | 1
+|===
+
+Larger LMUL settings can also be used simply to increase vector length to
+reduce instruction fetch and dispatch overheads in cases where fewer
+vector register groups are needed.
+
+[[sec-mask-register-layout]]
+==== Mask Register Layout
+
+A vector mask occupies only one vector register regardless of SEW and
+LMUL.
+
+Each element is allocated a single mask bit in a mask vector register.
+The mask bit for element _i_ is located in bit _i_ of the mask
+register, independent of SEW or LMUL.
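+
+[NOTE]
+--
+Because the mask bit for element _i_ is simply bit _i_ of the mask
+register, extracting it from a byte image of the register is a one-line
+operation; the following non-normative C helper illustrates this.
+
+----
+#include <stdint.h>
+#include <stddef.h>
+
+/* Read mask bit i from a little-endian byte image of a mask register.
+ * Illustrative only. */
+int mask_bit(const uint8_t *mask_reg, size_t i)
+{
+    return (mask_reg[i / 8] >> (i % 8)) & 1;
+}
+----
+--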
+
+=== Vector Instruction Formats
+
+The instructions in the vector extension fit under two existing major
+opcodes (LOAD-FP and STORE-FP) and one new major opcode (OP-V).
+
+Vector loads and stores are encoded within the scalar floating-point
+load and store major opcodes (LOAD-FP/STORE-FP). The vector load and
+store encodings repurpose a portion of the standard scalar
+floating-point load/store 12-bit immediate field to provide further
+vector instruction encoding, with bit 25 holding the standard vector
+mask bit (see <<sec-vector-mask-encoding>>).
+
+include::images/wavedrom/vmem-format.adoc[]
+
+include::images/wavedrom/valu-format.adoc[]
+
+include::images/wavedrom/vcfg-format.adoc[]
+
+Vector instructions can have scalar or vector source operands and
+produce scalar or vector results, and most vector instructions can be
+performed either unconditionally or conditionally under a mask.
+
+Vector loads and stores move bit patterns between vector register
+elements and memory. Vector arithmetic instructions operate on values
+held in vector register elements.
+
+==== Scalar Operands
+
+Scalar operands can be immediates, or taken from the `x` registers,
+the `f` registers, or element 0 of a vector register. Scalar results
+are written to an `x` or `f` register or to element 0 of a vector
+register. Any vector register can be used to hold a scalar regardless
+of the current LMUL setting.
+
+NOTE: Zfinx ("F in X") is a new ISA extension where
+floating-point instructions take their arguments from the integer
+register file. The vector extension is also compatible with Zfinx,
+where the Zfinx vector extension has vector-scalar floating-point
+instructions taking their scalar argument from the `x` registers.
+
+NOTE: We considered but did not pursue overlaying the `f` registers on
+`v` registers. The adopted approach reduces vector register pressure,
+avoids interactions with the standard calling convention, simplifies
+high-performance scalar floating-point design, and provides
+compatibility with the Zfinx ISA option. Overlaying `f` with `v`
+would provide the advantage of lowering the number of state bits in
+some implementations, but complicates high-performance designs and
+would prevent compatibility with the Zfinx ISA option.
+
+[[sec-vec-operands]]
+==== Vector Operands
+
+Each vector operand has an _effective_ _element_ _width_ (EEW) and an
+_effective_ LMUL (EMUL) that is used to determine the size and
+location of all the elements within a vector register group. By
+default, for most operands of most instructions, EEW=SEW and
+EMUL=LMUL.
+
+Some vector instructions have source and destination vector operands
+with the same number of elements but different widths, so that EEW and
+EMUL differ from SEW and LMUL respectively but EEW/EMUL = SEW/LMUL.
+For example, most widening arithmetic instructions have a source group
+with EEW=SEW and EMUL=LMUL but have a destination group with EEW=2*SEW and
+EMUL=2*LMUL. Narrowing instructions have a source operand that has
+EEW=2*SEW and EMUL=2*LMUL but with a destination where EEW=SEW and EMUL=LMUL.
+
+Vector operands or results may occupy one or more vector registers
+depending on EMUL, but are always specified using the lowest-numbered
+vector register in the group. Using other than the lowest-numbered
+vector register to specify a vector register group is a reserved
+encoding.
+
+A vector register cannot be used to provide source operands with more
+than one EEW for a single instruction. A mask register source is
+considered to have EEW=1 for this constraint. An encoding that would
+result in the same vector register being read with two or more
+different EEWs, including when the vector register appears at
+different positions within two or more vector register groups, is
+reserved.
+
+NOTE: In practice, there is no software benefit to reading the same
+register with different EEW in the same instruction, and this
+constraint reduces complexity for implementations that internally
+rearrange data dependent on EEW.
+
+A destination vector register group can overlap a source vector register
+group only if one of the following holds:
+
+- The destination EEW equals the source EEW.
+- The destination EEW is smaller than the source EEW and the overlap is in
+ the lowest-numbered part of the source register group (e.g., when LMUL=1,
+ `vnsrl.wi v0, v0, 3` is legal, but a destination of `v1` is not).
+- The destination EEW is greater than the source EEW, the source EMUL is
+ at least 1, and the overlap is in the highest-numbered part of the
+ destination register group (e.g., when LMUL=8, `vzext.vf4 v0, v6` is legal,
+ but a source of `v0`, `v2`, or `v4` is not).
+
+For the purpose of determining register group overlap constraints,
+mask elements have EEW=1.
+
+NOTE: The overlap constraints are designed to support resumable
+exceptions in machines without register renaming.
+
+Any instruction encoding that violates the overlap constraints is reserved.
+
+When source and destination registers overlap and have different EEW, the
+instruction is mask- and tail-agnostic, regardless of the setting of the
+`vta` and `vma` bits in `vtype`.
+
+The largest vector register group used by an instruction cannot be
+greater than 8 vector registers (i.e., EMUL{le}8), and if a vector
+instruction would require greater than 8 vector registers in a group,
+the instruction encoding is reserved. For example, a widening
+operation that produces a widened vector register group result when
+LMUL=8 is reserved as this would imply a result EMUL=16.
+
+Widened scalar values, e.g., input and output to a widening reduction
+operation, are held in the first element of a vector register and
+have EMUL=1.
+
+==== Vector Masking
+
+Masking is supported on many vector instructions. Element operations
+that are masked off (inactive) never generate exceptions. The
+destination vector register elements corresponding to masked-off
+elements are handled with either a mask-undisturbed or mask-agnostic
+policy depending on the setting of the `vma` bit in `vtype` (Section
+<<sec-agnostic>>).
+
+The mask value used to control execution of a masked vector
+instruction is always supplied by vector register `v0`.
+
+NOTE: Masks are held in vector registers, rather than in a separate mask
+register file, to reduce total architectural state and to simplify the ISA.
+
+NOTE: Future vector extensions may provide longer instruction
+encodings with space for a full mask register specifier.
+
+The destination vector register group for a masked vector instruction
+cannot overlap the source mask register (`v0`), unless the destination
+vector register is being written with a mask value (e.g., compares)
+or the scalar result of a reduction. These instruction encodings are
+reserved.
+
+NOTE: This constraint supports restart with a non-zero `vstart` value.
+
+Other vector registers can be used to hold working mask values, and
+mask vector logical operations are provided to perform predicate
+calculations. [[sec-mask-vector-logical]]
+
+As specified in Section <<sec-agnostic>>, mask destination values are
+always treated as tail-agnostic, regardless of the setting of `vta`.
+
+[[sec-vector-mask-encoding]]
+===== Mask Encoding
+
+Where available, masking is encoded in a single-bit `vm` field in the
+ instruction (`inst[25]`).
+
+[cols="1,15"]
+|===
+| vm | Description
+
+| 0 | vector result, only where v0.mask[i] = 1
+| 1 | unmasked
+|===
+
+Vector masking is represented in assembler code as another vector
+operand, with `.t` indicating that the operation occurs when
+`v0.mask[i]` is `1` (`t` for "true"). If no masking operand is
+specified, unmasked vector execution (`vm=1`) is assumed.
+
+----
+ vop.v* v1, v2, v3, v0.t # enabled where v0.mask[i]=1, vm=0
+ vop.v* v1, v2, v3 # unmasked vector operation, vm=1
+----
+
+NOTE: Even though the current vector extensions only support one vector
+mask register `v0` and only the true form of predication, the assembly
+syntax writes it out in full to be compatible with future extensions
+that might add a mask register specifier and support both true and
+complement mask values. The `.t` suffix on the masking operand also helps
+to visually encode the use of a mask.
+
+NOTE: The `.mask` suffix is not part of the assembly syntax.
+We only append it in contexts where a mask vector is subscripted,
+e.g., `v0.mask[i]`.
+
+[[sec-inactive-defs]]
+==== Prestart, Active, Inactive, Body, and Tail Element Definitions
+
+The destination element indices operated on during a vector
+instruction's execution can be divided into three disjoint subsets.
+
+* The _prestart_ elements are those whose element index is less than the
+initial value in the `vstart` register. The prestart elements do not
+raise exceptions and do not update the destination vector register.
+
+* The _body_ elements are those whose element index is greater than or equal
+to the initial value in the `vstart` register, and less than the current
+vector length setting in `vl`. The body can be split into two disjoint subsets:
+
+** The _active_ elements during a vector instruction's execution are the
+elements within the body and where the current mask is enabled at that element
+position. The active elements can raise exceptions and update the destination
+vector register group.
+
+** The _inactive_ elements are the elements within the body
+but where the current mask is disabled at that element
+position. The inactive elements do not raise exceptions and do not
+update any destination vector register group unless masked agnostic is
+specified (`vtype.vma`=1), in which case inactive elements may be
+overwritten with 1s.
+
+* The _tail_ elements during a vector instruction's execution are the
+elements past the current vector length setting specified in `vl`.
+The tail elements do not raise exceptions, and do not update any
+destination vector register group unless tail agnostic is specified
+(`vtype.vta`=1), in which case tail elements may be overwritten with
+1s, or with the result of the instruction in the case of
+mask-producing instructions except for mask loads. When LMUL < 1, the
+tail includes the elements past VLMAX that are held in the same vector
+register.
+
+----
+ for element index x
+ prestart(x) = (0 <= x < vstart)
+ body(x) = (vstart <= x < vl)
+ tail(x) = (vl <= x < max(VLMAX,VLEN/SEW))
+ mask(x) = unmasked || v0.mask[x] == 1
+ active(x) = body(x) && mask(x)
+ inactive(x) = body(x) && !mask(x)
+----
+
+When `vstart` {ge} `vl`, there are no body elements, and no elements
+are updated in any destination vector register group, including that
+no tail elements are updated with agnostic values.
+
+NOTE: As a consequence, when `vl`=0, no elements, including agnostic
+elements, are updated in the destination vector register group
+regardless of `vstart`.
+
+Instructions that write an `x` register or `f` register
+do so even when `vstart` {ge} `vl`, including when `vl`=0.
+
+NOTE: Some instructions such as `vslidedown` and `vrgather` may read
+indices past `vl` or even VLMAX in source vector register groups. The
+general policy is to return the value 0 when the index is greater than
+VLMAX in the source vector register group.
+
+[[sec-vector-config]]
+=== Configuration-Setting Instructions (`vsetvli`/`vsetivli`/`vsetvl`)
+
+One of the common approaches to handling a large number of elements is
+"stripmining" where each iteration of a loop handles some number of elements,
+and the iterations continue until all elements have been processed. The RISC-V
+vector specification provides direct, portable support for this approach.
+The application specifies the total number of elements to be processed (the application vector length or AVL) as a
+candidate value for `vl`, and the hardware responds via a general-purpose
+register with the (frequently smaller) number of elements that the hardware
+will handle per iteration (stored in `vl`), based on the microarchitectural
+implementation and the `vtype` setting. A straightforward loop structure,
+shown in <<example-stripmine-sew>>, depicts the ease with which the code keeps
+track of the remaining number of elements and the amount per iteration handled
+by hardware.
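+
+[NOTE]
+--
+The stripmining pattern can be modeled in plain C as shown below.
+`vsetvli_model` is a hypothetical stand-in for the hardware's `vl`
+selection (here the simplest legal policy, `vl` = min(AVL, VLMAX)); it
+is not an intrinsic or part of the specification.
+
+----
+#include <stddef.h>
+
+/* Hypothetical model of the vl returned by vsetvli.  Real hardware may
+ * legally return other values for VLMAX < AVL < 2*VLMAX (see
+ * "Constraints on Setting vl"). */
+static size_t vsetvli_model(size_t avl, size_t vlmax)
+{
+    return avl < vlmax ? avl : vlmax;
+}
+
+/* Stripmined elementwise add: each loop iteration stands in for one
+ * pass of vector instructions operating on vl elements. */
+void vadd_stripmine(size_t n, const int *a, const int *b, int *c, size_t vlmax)
+{
+    while (n > 0) {
+        size_t vl = vsetvli_model(n, vlmax);  /* elements handled this iteration */
+        for (size_t i = 0; i < vl; i++)
+            c[i] = a[i] + b[i];
+        a += vl; b += vl; c += vl;            /* bump pointers                   */
+        n -= vl;                              /* decrement remaining AVL         */
+    }
+}
+----
+--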
+
+A set of instructions is provided to allow rapid configuration of the
+values in `vl` and `vtype` to match application needs. The
+`vset{i}vl{i}` instructions set the `vtype` and `vl` CSRs based on
+their arguments, and write the new value of `vl` into `rd`.
+
+----
+ vsetvli rd, rs1, vtypei # rd = new vl, rs1 = AVL, vtypei = new vtype setting
+ vsetivli rd, uimm, vtypei # rd = new vl, uimm = AVL, vtypei = new vtype setting
+ vsetvl rd, rs1, rs2 # rd = new vl, rs1 = AVL, rs2 = new vtype value
+----
+
+include::images/wavedrom/vcfg-format.adoc[]
+
+==== `vtype` encoding
+
+include::images/wavedrom/vtype-format.adoc[]
+
+The new `vtype` value is encoded in the immediate fields of `vsetvli`
+and `vsetivli`, and in the `rs2` register for `vsetvl`.
+
+----
+ Suggested assembler names used for vset{i}vli vtypei immediate
+
+ e8 # SEW=8b
+ e16 # SEW=16b
+ e32 # SEW=32b
+ e64 # SEW=64b
+
+ mf8 # LMUL=1/8
+ mf4 # LMUL=1/4
+ mf2 # LMUL=1/2
+ m1 # LMUL=1, assumed if m setting absent
+ m2 # LMUL=2
+ m4 # LMUL=4
+ m8 # LMUL=8
+
+Examples:
+ vsetvli t0, a0, e8, ta, ma # SEW= 8, LMUL=1
+ vsetvli t0, a0, e8, m2, ta, ma # SEW= 8, LMUL=2
+ vsetvli t0, a0, e32, mf2, ta, ma # SEW=32, LMUL=1/2
+----
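+
+[NOTE]
+--
+As a non-normative illustration, the following C sketch assembles a
+`vtypei` value, assuming the field placement shown in the `vtype` format
+diagram (`vlmul` in bits 2:0, `vsew` in bits 5:3 with SEW = 8*2^`vsew`^,
+`vta` in bit 6, `vma` in bit 7).
+
+----
+#include <stdio.h>
+
+/* Pack a vtype immediate from its fields; values as in the tables above
+ * (e.g., vlmul=0b111 is mf2, vsew=2 is e32). */
+static unsigned vtype_imm(unsigned vlmul, unsigned vsew, int ta, int ma)
+{
+    return (vlmul & 7) | ((vsew & 7) << 3) | ((ta != 0) << 6) | ((ma != 0) << 7);
+}
+
+int main(void)
+{
+    /* e32, mf2, ta, ma -- as in the third vsetvli example above. */
+    printf("vtypei = 0x%x\n", vtype_imm(7, 2, 1, 1));
+    return 0;
+}
+----
+--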
+
+The `vsetvl` variant operates similarly to `vsetvli` except that it
+takes a `vtype` value from `rs2` and can be used for context restore.
+
+===== Unsupported `vtype` Values
+
+If the `vtype` value is not supported by the implementation, then
+the `vill` bit is set in `vtype`, the remaining bits in `vtype` are
+set to zero, and the `vl` register is also set to zero.
+
+NOTE: Earlier drafts required a trap when setting `vtype` to an
+illegal value. However, this would have added the first
+data-dependent trap on a CSR write to the ISA. Implementations could
+choose to trap when illegal values are written to `vtype` instead of
+setting `vill`, to allow emulation to support new configurations for
+forward-compatibility. The current scheme supports light-weight
+runtime interrogation of the supported vector unit configurations by
+checking if `vill` is clear for a given setting.
+
+A `vtype` value with `vill` set is treated as an unsupported
+configuration.
+
+Implementations must consider all bits of the `vtype` value to
+determine if the configuration is supported. An unsupported value in
+any location within the `vtype` value must result in `vill` being set.
+
+NOTE: In particular, all XLEN bits of the register `vtype` argument to
+the `vsetvl` instruction must be checked. Implementations cannot
+ignore fields they do not implement. All bits must be checked to
+ensure that new code assuming unsupported vector features in `vtype`
+traps instead of executing incorrectly on an older implementation.
+
+==== AVL encoding
+
+The new vector
+length setting is based on AVL, which for `vsetvli` and `vsetvl` is encoded in the `rs1` and `rd`
+fields as follows:
+
+.AVL used in `vsetvli` and `vsetvl` instructions
+[cols="2,2,10,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| `rd` | `rs1` | AVL value | Effect on `vl`
+| - | !x0 | Value in `x[rs1]` | Normal stripmining
+| !x0 | x0 | ~0 | Set `vl` to VLMAX
+| x0 | x0 | Value in `vl` register | Keep existing `vl` (of course, `vtype` may change)
+|===
+
+When `rs1` is not `x0`, the AVL is an unsigned integer held in the `x`
+register specified by `rs1`, and the new `vl` value is also written to
+the `x` register specified by `rd`.
+
+When `rs1=x0` but `rd!=x0`, the maximum unsigned integer value (`~0`)
+is used as the AVL, and the resulting VLMAX is written to `vl` and
+also to the `x` register specified by `rd`.
+
+When `rs1=x0` and `rd=x0`, the instruction operates as if the current
+vector length in `vl` is used as the AVL, and the resulting value is
+written to `vl`, but not to a destination register. This form can
+only be used when VLMAX and hence `vl` is not actually changed by the
+new SEW/LMUL ratio. Use of the instruction with a new SEW/LMUL ratio
+that would result in a change of VLMAX is reserved.
+Use of the instruction is also reserved if `vill` was 1 beforehand.
+Implementations may set `vill` in either case.
+
+NOTE: This last form of the instructions allows the `vtype` register to
+be changed while maintaining the current `vl`, provided VLMAX is not
+reduced. This design was chosen to ensure `vl` would always hold a
+legal value for current `vtype` setting. The current `vl` value can
+be read from the `vl` CSR. The `vl` value could be reduced by this
+instruction if the new SEW/LMUL ratio causes VLMAX to shrink, and so
+this case has been reserved as it is not clear this is a generally
+useful operation, and implementations can otherwise assume `vl` is not
+changed by this instruction to optimize their microarchitecture.
+
+For the `vsetivli` instruction, the AVL is encoded as a 5-bit
+zero-extended immediate (0--31) in the `rs1` field.
+
+NOTE: The encoding of AVL for `vsetivli` is the same as for regular
+CSR immediate values.
+
+NOTE: The `vsetivli` instruction provides more compact code when the
+dimensions of vectors are small and known to fit inside the vector
+registers, in which case there is no stripmining overhead.
+
+==== Constraints on Setting `vl`
+
+The `vset{i}vl{i}` instructions first set VLMAX according to their `vtype`
+argument, then set `vl` obeying the following constraints:
+
+. `vl = AVL` if `AVL {le} VLMAX`
+. `ceil(AVL / 2) {le} vl {le} VLMAX` if `AVL < (2 * VLMAX)`
+. `vl = VLMAX` if `AVL {ge} (2 * VLMAX)`
+. Deterministic on any given implementation for same input AVL and VLMAX values
+. These specific properties follow from the prior rules:
+.. `vl = 0` if `AVL = 0`
+.. `vl > 0` if `AVL > 0`
+.. `vl {le} VLMAX`
+.. `vl {le} AVL`
+.. a value read from `vl` when used as the AVL argument to `vset{i}vl{i}` results in the same
+value in `vl`, provided the resultant VLMAX equals the value of VLMAX at the time that `vl` was read
+
+[NOTE]
+--
+The `vl` setting rules are designed to be sufficiently strict to
+preserve `vl` behavior across register spills and context swaps for
+`AVL {le} VLMAX`, yet flexible enough to enable implementations to improve
+vector lane utilization for `AVL > VLMAX`.
+
+For example, this permits an implementation to set `vl = ceil(AVL / 2)`
+for `VLMAX < AVL < 2*VLMAX` in order to evenly distribute work over the
+last two iterations of a stripmine loop.
+Requirement 2 ensures that the first stripmine iteration of reduction
+loops uses the largest vector length of all iterations, even in the case
+of `AVL < 2*VLMAX`.
+This allows software to avoid needing to explicitly calculate a running
+maximum of vector lengths observed during a stripmined loop.
+Requirement 2 also allows an implementation to set `vl` to VLMAX for `VLMAX < AVL < 2*VLMAX`.
+--
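+
+[NOTE]
+--
+The rules above can be checked mechanically; the following non-normative
+C predicate reports whether a particular `vl` result is legal for a
+given AVL and VLMAX (constraint 4, determinism, is a property of an
+implementation rather than of a single value).
+
+----
+#include <stdbool.h>
+#include <stdint.h>
+
+bool vl_is_legal(uint64_t avl, uint64_t vlmax, uint64_t vl)
+{
+    if (avl <= vlmax)
+        return vl == avl;                           /* rule 1 */
+    if (avl < 2 * vlmax)
+        return vl >= (avl + 1) / 2 && vl <= vlmax;  /* rule 2 */
+    return vl == vlmax;                             /* rule 3 */
+}
+----
+--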
+
+[[example-stripmine-sew]]
+==== Example of stripmining and changes to SEW
+
+The SEW and LMUL settings can be changed dynamically to provide high
+throughput on mixed-width operations in a single loop.
+----
+# Example: Load 16-bit values, widen multiply to 32b, shift 32b result
+# right by 3, store 32b values.
+# On entry:
+# a0 holds the total number of elements to process
+# a1 holds the address of the source array
+# a2 holds the address of the destination array
+
+loop:
+ vsetvli a3, a0, e16, m4, ta, ma # vtype = 16-bit integer vectors;
+ # also update a3 with vl (# of elements this iteration)
+ vle16.v v4, (a1) # Get 16b vector
+ slli t1, a3, 1 # Multiply # elements this iteration by 2 bytes/source element
+ add a1, a1, t1 # Bump pointer
+ vwmul.vx v8, v4, x10 # Widening multiply into 32b in <v8--v15>
+
+ vsetvli x0, x0, e32, m8, ta, ma # Operate on 32b values
+ vsrl.vi v8, v8, 3
+ vse32.v v8, (a2) # Store vector of 32b elements
+ slli t1, a3, 2 # Multiply # elements this iteration by 4 bytes/destination element
+ add a2, a2, t1 # Bump pointer
+ sub a0, a0, a3 # Decrement count by vl
+ bnez a0, loop # Any more?
+----
+
+[[sec-vector-memory]]
+=== Vector Loads and Stores
+
+Vector loads and stores move values between vector registers and
+memory.
+Vector loads and stores can be masked, and they only access memory or raise
+exceptions for active elements.
+Masked vector loads do not update inactive elements in the destination vector
+register group, unless masked agnostic is specified (`vtype.vma`=1).
+All vector loads and stores may
+generate and accept a non-zero `vstart` value.
+
+==== Vector Load/Store Instruction Encoding
+
+Vector loads and stores are encoded within the scalar floating-point
+load and store major opcodes (LOAD-FP/STORE-FP). The vector load and
+store encodings repurpose a portion of the standard scalar
+floating-point load/store 12-bit immediate field to provide further
+vector instruction encoding, with bit 25 holding the standard vector
+mask bit (see <<sec-vector-mask-encoding>>).
+
+include::images/wavedrom/vmem-format.adoc[]
+
+[cols="4,12"]
+|===
+| Field | Description
+
+| rs1[4:0] | specifies x register holding base address
+| rs2[4:0] | specifies x register holding stride
+| vs2[4:0] | specifies v register holding address offsets
+| vs3[4:0] | specifies v register holding store data
+| vd[4:0] | specifies v register destination of load
+| vm | specifies whether vector masking is enabled (0 = mask enabled, 1 = mask disabled)
+| width[2:0] | specifies size of memory elements, and distinguishes from FP scalar
+| mew | extended memory element width. See <<sec-vector-loadstore-width-encoding>>
+| mop[1:0] | specifies memory addressing mode
+| nf[2:0] | specifies the number of fields in each segment, for segment load/stores
+| lumop[4:0]/sumop[4:0] | are additional fields encoding variants of unit-stride instructions
+|===
+
+Vector memory unit-stride and constant-stride operations directly
+encode EEW of the data to be transferred statically in the instruction
+to reduce the number of `vtype` changes when accessing memory in a
+mixed-width routine. Indexed operations use the explicit EEW encoding
+in the instruction to set the size of the indices used, and use
+SEW/LMUL to specify the data width.
+
+==== Vector Load/Store Addressing Modes
+
+The vector extension supports unit-stride, strided, and
+indexed (scatter/gather) addressing modes. Vector load/store base
+registers and strides are taken from the GPR `x` registers.
+
+The base effective address for all vector accesses is given by the
+contents of the `x` register named in `rs1`.
+
+Vector unit-stride operations access elements stored contiguously in
+memory starting from the base effective address.
+
+Vector constant-strided operations access the first memory element at the base
+effective address, and then access subsequent elements at address
+increments given by the byte offset contained in the `x` register
+specified by `rs2`.
+
+Vector indexed operations add the contents of each element of the
+vector offset operand specified by `vs2` to the base effective address
+to give the effective address of each element. The data vector
+register group has EEW=SEW, EMUL=LMUL, while the offset vector
+register group has EEW encoded in the instruction and
+EMUL=(EEW/SEW)*LMUL.
+
+The vector offset operand is treated as a vector of byte-address
+offsets.
+
+NOTE: The indexed operations can also be used to access fields within
+a vector of objects, where the `vs2` vector holds pointers to the base
+of the objects and the scalar `x` register holds the offset of the
+member field in each object. Supporting this case is why the indexed
+operations were not defined to scale the element indices by the data
+EEW.
+
+If the vector offset elements are narrower than XLEN, they are
+zero-extended to XLEN before adding to the base effective address. If
+the vector offset elements are wider than XLEN, the least-significant
+XLEN bits are used in the address calculation. An implementation must
+raise an illegal instruction exception if the EEW is not supported for
+offset elements.
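+
+[NOTE]
+--
+The effective-address calculations for the three addressing modes can be
+summarized by the following non-normative C sketch, which assumes
+XLEN=64; address arithmetic wraps modulo 2^XLEN^.
+
+----
+#include <stdint.h>
+
+typedef uint64_t xlen_t;              /* assume XLEN=64 for this sketch */
+
+/* Unit-stride: element i is at base + i*(EEW/8). */
+xlen_t ea_unit_stride(xlen_t base, unsigned eew_bits, xlen_t i)
+{
+    return base + i * (eew_bits / 8);
+}
+
+/* Strided: element i is at base + i*stride, with the byte stride taken
+ * from x[rs2]; negative and zero strides are allowed. */
+xlen_t ea_strided(xlen_t base, int64_t stride, xlen_t i)
+{
+    return base + (xlen_t)((int64_t)i * stride);
+}
+
+/* Indexed: element i is at base + offset[i], where the offset element
+ * has already been zero-extended or truncated to XLEN bits. */
+xlen_t ea_indexed(xlen_t base, xlen_t offset_elem)
+{
+    return base + offset_elem;
+}
+----
+--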
+
+NOTE: A profile may place an upper limit on the maximum supported index
+EEW (e.g., only up to XLEN) smaller than ELEN.
+
+The vector addressing modes are encoded using the 2-bit `mop[1:0]`
+field.
+
+.encoding for loads
+[cols="1,1,7,6"]
+|===
+2+| mop [1:0] | Description | Opcodes
+
+| 0 | 0 | unit-stride | VLE<EEW>
+| 0 | 1 | indexed-unordered | VLUXEI<EEW>
+| 1 | 0 | strided | VLSE<EEW>
+| 1 | 1 | indexed-ordered | VLOXEI<EEW>
+|===
+
+.encoding for stores
+[cols="1,1,7,6"]
+|===
+2+| mop [1:0] | Description | Opcodes
+
+| 0 | 0 | unit-stride | VSE<EEW>
+| 0 | 1 | indexed-unordered | VSUXEI<EEW>
+| 1 | 0 | strided | VSSE<EEW>
+| 1 | 1 | indexed-ordered | VSOXEI<EEW>
+|===
+
+Vector unit-stride and constant-stride memory accesses do not
+guarantee ordering between individual element accesses. The vector
+indexed load and store memory operations have two forms, ordered and
+unordered. The indexed-ordered variants preserve element ordering on
+memory accesses.
+
+For unordered instructions (`mop[1:0]`!=11) there is no guarantee on
+element access order. If the accesses are to a strongly ordered IO
+region, the element accesses can be initiated in any order.
+
+NOTE: To provide ordered vector accesses to a strongly ordered IO
+region, the ordered indexed instructions should be used.
+
+For implementations with precise vector traps, exceptions on
+indexed-unordered stores must also be precise.
+
+Additional unit-stride vector addressing modes are encoded using the
+5-bit `lumop` and `sumop` fields in the unit-stride load and store
+instruction encodings respectively.
+
+.lumop
+[cols="1,1,1,1,1,11"]
+|===
+5+| lumop[4:0] | Description
+
+| 0 | 0 | 0 | 0 | 0 | unit-stride load
+| 0 | 1 | 0 | 0 | 0 | unit-stride, whole register load
+| 0 | 1 | 0 | 1 | 1 | unit-stride, mask load, EEW=8
+| 1 | 0 | 0 | 0 | 0 | unit-stride fault-only-first
+| x | x | x | x | x | other encodings reserved
+|===
+
+.sumop
+[cols="1,1,1,1,1,11"]
+|===
+5+| sumop[4:0] | Description
+
+| 0 | 0 | 0 | 0 | 0 | unit-stride store
+| 0 | 1 | 0 | 0 | 0 | unit-stride, whole register store
+| 0 | 1 | 0 | 1 | 1 | unit-stride, mask store, EEW=8
+| x | x | x | x | x | other encodings reserved
+|===
+
+The `nf[2:0]` field encodes the number of fields in each segment. For
+regular vector loads and stores, `nf`=0, indicating that a single
+value is moved between a vector register group and memory at each
+element position. Larger values in the `nf` field are used to access
+multiple contiguous fields within a segment as described below in
+Section <<sec-aos>>.
+
+The `nf[2:0]` field also encodes the number of whole vector registers
+to transfer for the whole vector register load/store instructions.
+
+[[sec-vector-loadstore-width-encoding]]
+==== Vector Load/Store Width Encoding
+
+Vector loads and stores have an EEW encoded directly in the
+instruction. The corresponding EMUL is calculated as EMUL =
+(EEW/SEW)*LMUL. If the EMUL would be out of range (EMUL>8 or
+EMUL<1/8), the instruction encoding is reserved. The vector register
+groups must have legal register specifiers for the selected EMUL,
+otherwise the instruction encoding is reserved.
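+
+[NOTE]
+--
+A non-normative C sketch of the EMUL calculation and the reserved-encoding
+checks described above, with LMUL carried as the fraction
+`lmul_num`/`lmul_den`:
+
+----
+#include <stdbool.h>
+
+/* Returns false if the data operand of a load/store with encoded width
+ * eew is a reserved encoding under the current sew and LMUL, either
+ * because EMUL = (EEW/SEW)*LMUL falls outside [1/8, 8] or because the
+ * register specifier is not aligned to the group size. */
+bool emul_ok(unsigned eew, unsigned sew,
+             unsigned lmul_num, unsigned lmul_den, unsigned vreg)
+{
+    unsigned num = eew * lmul_num;    /* EMUL = num/den */
+    unsigned den = sew * lmul_den;
+
+    if (num > 8 * den || 8 * num < den)
+        return false;                 /* EMUL > 8 or EMUL < 1/8 */
+
+    unsigned emul_int = num / den;    /* 0 when EMUL is fractional */
+    if (emul_int > 1 && (vreg % emul_int) != 0)
+        return false;                 /* group must start at a multiple of EMUL */
+
+    return true;
+}
+----
+--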
+
+Vector unit-stride and constant-stride instructions use the EEW/EMUL encoded in the
+instruction for the data values, while vector indexed loads and stores
+use the EEW/EMUL encoded in the instruction for the index values and
+the SEW/LMUL encoded in `vtype` for the data values.
+
+Vector loads and stores are encoded using width values that are not
+claimed by the standard scalar floating-point loads and stores.
+
+Implementations must provide vector loads and stores with EEWs
+corresponding to all supported SEW settings. Vector load/store
+encodings for unsupported EEW widths must raise an illegal
+instruction exception.
+
+.Width encoding for vector loads and stores.
+[cols="5,1,1,1,1,>3,>3,>3,3"]
+|===
+| | mew 3+| width [2:0] | Mem bits | Data Reg bits | Index bits | Opcodes
+
+| Standard scalar FP | x | 0 | 0 | 1 | 16| FLEN | - | FLH/FSH
+| Standard scalar FP | x | 0 | 1 | 0 | 32| FLEN | - | FLW/FSW
+| Standard scalar FP | x | 0 | 1 | 1 | 64| FLEN | - | FLD/FSD
+| Standard scalar FP | x | 1 | 0 | 0 | 128| FLEN | - | FLQ/FSQ
+| Vector 8b element | 0 | 0 | 0 | 0 | 8| 8 | - | VLxE8/VSxE8
+| Vector 16b element | 0 | 1 | 0 | 1 | 16| 16 | - | VLxE16/VSxE16
+| Vector 32b element | 0 | 1 | 1 | 0 | 32| 32 | - | VLxE32/VSxE32
+| Vector 64b element | 0 | 1 | 1 | 1 | 64| 64 | - | VLxE64/VSxE64
+| Vector 8b index | 0 | 0 | 0 | 0 | SEW | SEW | 8 | VLxEI8/VSxEI8
+| Vector 16b index | 0 | 1 | 0 | 1 | SEW | SEW | 16 | VLxEI16/VSxEI16
+| Vector 32b index | 0 | 1 | 1 | 0 | SEW | SEW | 32 | VLxEI32/VSxEI32
+| Vector 64b index | 0 | 1 | 1 | 1 | SEW | SEW | 64 | VLxEI64/VSxEI64
+| Reserved | 1 | X | X | X | - | - | - |
+|===
+
+Mem bits is the size of each element accessed in memory.
+
+Data reg bits is the size of each data element accessed in register.
+
+Index bits is the size of each index accessed in register.
+
+The `mew` bit (`inst[28]`) when set is expected to be used to encode
+expanded memory sizes of 128 bits and above, but these encodings are
+currently reserved.
+
+==== Vector Unit-Stride Instructions
+
+----
+ # Vector unit-stride loads and stores
+
+ # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vle8.v vd, (rs1), vm # 8-bit unit-stride load
+ vle16.v vd, (rs1), vm # 16-bit unit-stride load
+ vle32.v vd, (rs1), vm # 32-bit unit-stride load
+ vle64.v vd, (rs1), vm # 64-bit unit-stride load
+
+ # vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vse8.v vs3, (rs1), vm # 8-bit unit-stride store
+ vse16.v vs3, (rs1), vm # 16-bit unit-stride store
+ vse32.v vs3, (rs1), vm # 32-bit unit-stride store
+ vse64.v vs3, (rs1), vm # 64-bit unit-stride store
+----
+
+Additional unit-stride mask load and store instructions are
+provided to transfer mask values to/from memory. These
+operate similarly to unmasked byte loads or stores (EEW=8), except that
+the effective vector length is ``evl``=ceil(``vl``/8) (i.e., EMUL=1),
+and the destination register is always written with a tail-agnostic
+policy.
+
+----
+ # Vector unit-stride mask load
+ vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8)
+
+ # Vector unit-stride mask store
+ vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8)
+----
+
+`vlm.v` and `vsm.v` are encoded with the same `width[2:0]`=0 encoding as
+`vle8.v` and `vse8.v`, but are distinguished by different
+`lumop` and `sumop` encodings. Since `vlm.v` and `vsm.v` operate as byte loads and stores,
+`vstart` is in units of bytes for these instructions.
+
+NOTE: `vlm.v` and `vsm.v` respect the `vill` field in `vtype`, as
+they depend on `vtype` indirectly through its constraints on `vl`.
+
+NOTE: The previous assembler mnemonics `vle1.v` and `vse1.v` were
+confusing as length was handled differently for these instructions
+versus other element load/store instructions. To avoid software
+churn, these older assembly mnemonics are being retained as aliases.
+
+NOTE: The primary motivation to provide mask load and store is to
+support machines that internally rearrange data to reduce
+cross-datapath wiring. However, these instructions also provide a convenient
+mechanism to use packed bit vectors in memory as mask values,
+and also reduce the cost of mask spill/fill by reducing need to change
+`vl`.
+
+==== Vector Strided Instructions
+
+----
+ # Vector strided loads and stores
+
+ # vd destination, rs1 base address, rs2 byte stride
+ vlse8.v vd, (rs1), rs2, vm # 8-bit strided load
+ vlse16.v vd, (rs1), rs2, vm # 16-bit strided load
+ vlse32.v vd, (rs1), rs2, vm # 32-bit strided load
+ vlse64.v vd, (rs1), rs2, vm # 64-bit strided load
+
+ # vs3 store data, rs1 base address, rs2 byte stride
+ vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store
+ vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store
+ vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store
+ vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store
+----
+
+Negative and zero strides are supported.
+
+Element accesses within a strided instruction are unordered with
+respect to each other.
+
+When `rs2`=`x0`, an implementation is allowed, but not required,
+to perform fewer memory operations than the number of active elements,
+and may perform different numbers of memory operations across
+different dynamic executions of the same static instruction.
+
+NOTE: Compilers must not use the `x0` form for `rs2` when the stride
+value is zero if the intent is to require that all memory accesses be
+performed.
+
+When `rs2!=x0` and the value of `x[rs2]=0`, the implementation must
+perform one memory access for each active element (but these accesses
+will not be ordered).
+
+NOTE: As with other architectural mandates, implementations must
+_appear_ to perform each memory access. Microarchitectures are
+free to optimize away accesses that would not be observed by another
+agent, for example, in idempotent memory regions obeying RVWMO. For
+non-idempotent memory regions, where by definition each access can be
+observed by a device, the optimization would not be possible.
+
+NOTE: When repeating ordered vector accesses to the same memory
+address are required, then an ordered indexed operation can be used.
+
+==== Vector Indexed Instructions
+
+----
+ # Vector indexed loads and stores
+
+ # Vector indexed-unordered load instructions
+ # vd destination, rs1 base address, vs2 byte offsets
+ vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data
+ vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data
+ vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data
+ vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data
+
+ # Vector indexed-ordered load instructions
+ # vd destination, rs1 base address, vs2 byte offsets
+ vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data
+ vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data
+ vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data
+ vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data
+
+ # Vector indexed-unordered store instructions
+ # vs3 store data, rs1 base address, vs2 byte offsets
+ vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data
+ vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data
+ vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data
+ vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data
+
+ # Vector indexed-ordered store instructions
+ # vs3 store data, rs1 base address, vs2 byte offsets
+ vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data
+ vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data
+ vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data
+ vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data
+
+----
+
+NOTE: The assembler syntax for indexed loads and stores uses
+``ei``__x__ instead of ``e``__x__ to indicate the statically encoded EEW
+is of the index not the data.
+
+NOTE: The indexed operations mnemonics have a "U" or "O" to
+distinguish between unordered and ordered, while the other vector
+addressing modes have no character. While this is perhaps a little
+less consistent, this approach minimizes disruption to existing
+software, as VSXEI previously meant "ordered" - and the opcode can be
+retained as an alias during transition to help reduce software churn.
+
+==== Unit-stride Fault-Only-First Loads
+
+The unit-stride fault-only-first load instructions are used to
+vectorize loops with data-dependent exit conditions ("while" loops).
+These instructions execute as a regular load except that they will
+only take a trap caused by a synchronous exception on element 0. If
+element 0 raises an exception, `vl` is not modified, and the trap is
+taken. If an element > 0 raises an exception, the corresponding trap
+is not taken, and the vector length `vl` is reduced to the index of
+the element that would have raised an exception.
+
+Load instructions may overwrite active destination vector register
+group elements past the element index at which the trap is reported.
+Similarly, fault-only-first load instructions may update active destination
+elements past the element that causes trimming of the vector length
+(but not past the original vector length). The values of these
+spurious updates do not have to correspond to the values in memory at
+the addressed memory locations. Non-idempotent memory locations can
+only be accessed when it is known the corresponding element load
+operation will not be restarted due to a trap or vector-length
+trimming.
+
+----
+ # Vector unit-stride fault-only-first loads
+
+ # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load
+ vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load
+ vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load
+ vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load
+----
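+
+[NOTE]
+--
+The following non-normative C sketch models the `vl`-trimming behavior of
+`vle8ff.v` for `vstart`=0 with no masking; `probe_load8` is a
+hypothetical helper that reports whether a one-byte load at an address
+would raise a synchronous exception.
+
+----
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Hypothetical: true if a byte load at addr would fault.  A real
+ * implementation discovers this by attempting the access. */
+extern bool probe_load8(const uint8_t *addr);
+
+/* Returns -1 if the trap is taken (fault on element 0, vl unchanged);
+ * otherwise returns the new vl, trimmed to the first faulting index. */
+ptrdiff_t vle8ff_model(uint8_t *vd, const uint8_t *base, size_t vl)
+{
+    for (size_t i = 0; i < vl; i++) {
+        if (probe_load8(base + i))
+            return i == 0 ? -1 : (ptrdiff_t)i;
+        vd[i] = base[i];              /* element loaded normally */
+    }
+    return (ptrdiff_t)vl;             /* no fault: vl unchanged */
+}
+----
+--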
+
+----
+strlen example using unit-stride fault-only-first instruction
+
+include::example/strlen.s[lines=4..-1]
+----
+
+NOTE: There is a security concern with fault-on-first loads, as they
+can be used to probe for valid effective addresses. The unit-stride
+versions only allow probing a region immediately contiguous to a known
+region, and so reduce the security impact when used in unprivileged
+code. However, code running in S-mode can establish arbitrary page
+translations that allow probing of random guest physical addresses
+provided by a hypervisor. Strided and scatter/gather fault-only-first
+instructions are not provided due to lack of encoding space, but they
+can also represent a larger security hole, allowing even unprivileged
+software to easily check multiple random pages for accessibility
+without experiencing a trap. This standard does not address possible
+security mitigations for fault-only-first instructions.
+
+Even when an exception is not raised, implementations are permitted to process
+fewer than `vl` elements and reduce `vl` accordingly, but if `vstart`=0 and
+`vl`>0, then at least one element must be processed.
+
+When the fault-only-first instruction takes a trap due to an
+interrupt, implementations should not reduce `vl` and should instead
+set a `vstart` value.
+
+NOTE: When the fault-only-first instruction would trigger a debug
+data-watchpoint trap on an element after the first, implementations
+should not reduce `vl` but instead should trigger the debug trap as
+otherwise the event might be lost.
+
+[[sec-aos]]
+==== Vector Load/Store Segment Instructions
+
+The vector load/store segment instructions move multiple contiguous
+fields in memory to and from consecutively numbered vector registers.
+
+NOTE: The name "segment" reflects that the items moved are subarrays
+with homogeneous elements. These operations can be used to transpose
+arrays between memory and registers, and can support operations on
+"array-of-structures" datatypes by unpacking each field in a structure
+into a separate vector register.
+
+The three-bit `nf` field in the vector instruction encoding is an
+unsigned integer that contains one less than the number of fields per
+segment, _NFIELDS_.
+
+[[fig-nf]]
+.NFIELDS Encoding
+[cols="1,1,1,13"]
+|===
+3+| nf[2:0] | NFIELDS
+
+| 0 | 0 | 0 | 1
+| 0 | 0 | 1 | 2
+| 0 | 1 | 0 | 3
+| 0 | 1 | 1 | 4
+| 1 | 0 | 0 | 5
+| 1 | 0 | 1 | 6
+| 1 | 1 | 0 | 7
+| 1 | 1 | 1 | 8
+|===
+
+The EMUL setting must be such that EMUL * NFIELDS {le} 8, otherwise
+the instruction encoding is reserved.
+
+NOTE: The product ceil(EMUL) * NFIELDS represents the number of underlying
+vector registers that will be touched by a segmented load or store
+instruction. This constraint makes this total no larger than 1/4 of
+the architectural register file, and the same as for regular
+operations with EMUL=8.
+
+Each field will be held in successively numbered vector register
+groups. When EMUL>1, each field will occupy a vector register group
+held in multiple successively numbered vector registers, and the
+vector register group for each field must follow the usual vector
+register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, each
+field's vector register group must start at an even-numbered vector register,
+but does not have to start at a vector register number that is a multiple of 8).
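+
+As a non-normative illustration of the EMUL=2, NFIELDS=4 case above (the
+register numbers, the pointer in `a1`, and the `vtype` setting are
+assumptions, and the segment load syntax is defined in the following
+subsections):
+
+----
+ # Illustration only: EMUL=2, NFIELDS=4, so EMUL*NFIELDS = 8 registers are used.
+ vsetvli t0, a0, e32, m2, ta, ma  # SEW=32, LMUL=2, hence data EMUL=2
+ vlseg4e32.v v8, (a1)             # field 0 -> v8-v9,  field 1 -> v10-v11,
+                                  # field 2 -> v12-v13, field 3 -> v14-v15
+----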
+
+If the vector register numbers accessed by the segment load or store
+would increment past 31, then the instruction encoding is reserved.
+
+NOTE: This constraint is to help allow for forward-compatibility with
+a possible future longer instruction encoding that has more
+addressable vector registers.
+
+The `vl` register gives the number of segments to move, which is
+equal to the number of elements transferred to each vector register
+group. Masking is also applied at the level of whole segments.
+
+For segment loads and stores, the individual memory accesses used to
+access fields within each segment are unordered with respect to each
+other even for ordered indexed segment loads and stores.
+
+The `vstart` value is in units of whole segments. If a trap occurs during
+access to a segment, it is implementation-defined whether a subset
+of the faulting segment's accesses is performed before the trap is taken.
+
+===== Vector Unit-Stride Segment Loads and Stores
+
+The vector unit-stride load and store segment instructions move packed
+contiguous segments into multiple destination vector register groups.
+
+NOTE: Where the segments hold structures with heterogeneous-sized
+fields, software can later unpack individual structure fields using
+additional instructions after the segment load brings data into the
+vector registers.
+
+The assembler prefixes `vlseg`/`vsseg` are used for unit-stride
+segment loads and stores respectively.
+
+----
+ # Format
+ vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template
+ vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template
+
+ # Examples
+ vlseg8e8.v vd, (rs1), vm # Load eight vector registers, each holding one byte field per segment.
+
+ vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory
+----
+
+For loads, the `vd` register will hold the first field loaded from the
+segment. For stores, the `vs3` register is read to provide the first
+field to be stored to each segment.
+
+----
+ # Example 1
+ # Memory structure holds packed RGB pixels (24-bit data structure, 8 bits per component)
+ vsetvli a1, t0, e8, ta, ma
+ vlseg3e8.v v8, (a0), vm
+ # v8 holds the red pixels
+ # v9 holds the green pixels
+ # v10 holds the blue pixels
+
+ # Example 2
+ # Memory structure holds complex values, 32b for real and 32b for imaginary
+ vsetvli a1, t0, e32, ta, ma
+ vlseg2e32.v v8, (a0), vm
+ # v8 holds real
+ # v9 holds imaginary
+----
+
+There are also fault-only-first versions of the unit-stride instructions.
+
+----
+ # Template for vector fault-only-first unit-stride segment loads.
+ vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads
+----
+
+For fault-only-first segment loads, if an exception is detected partway
+through accessing a segment, regardless of whether the element index is zero,
+it is implementation-defined whether a subset of the segment is loaded.
+
+These instructions may overwrite destination vector register group
+elements past the point at which a trap is reported or past the point
+at which vector length is trimmed.
+
+===== Vector Strided Segment Loads and Stores
+
+Vector strided segment loads and stores move contiguous segments where
+each segment is separated by the byte-stride offset given in the `rs2`
+GPR argument.
+
+NOTE: Negative and zero strides are supported.
+
+----
+ # Format
+ vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads
+ vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores
+
+ # Examples
+ vsetvli a1, t0, e8, ta, ma
+ vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i],
+ # and bytes at addresses x5+i*x6+1 into v5[i],
+ # and bytes at addresses x5+i*x6+2 into v6[i].
+
+ # Examples
+ vsetvli a1, t0, e32, ta, ma
+ vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6
+ # and words from v3[i] to address x5+i*x6+4
+----
+
+Accesses to the fields within each segment can occur in any order,
+including the case where the byte stride is such that segments overlap
+in memory.
+
+===== Vector Indexed Segment Loads and Stores
+
+Vector indexed segment loads and stores move contiguous segments where
+each segment is located at an address given by adding the scalar base
+address in the `rs1` field to byte offsets in vector register `vs2`.
+Both ordered and unordered forms are provided, where the ordered forms
+access segments in element order. However, even for the ordered form,
+accesses to the fields within an individual segment are not ordered
+with respect to each other.
+
+The data vector register group has EEW=SEW, EMUL=LMUL, while the index
+vector register group has EEW encoded in the instruction with
+EMUL=(EEW/SEW)*LMUL.
+The EMUL * NFIELDS {le} 8 constraint applies to the data vector register group.
+
+----
+ # Format
+ vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads
+ vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads
+ vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores
+ vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores
+
+ # Examples
+ vsetvli a1, t0, e8, ta, ma
+ vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i],
+ # and bytes at addresses x5+v3[i]+1 into v5[i],
+ # and bytes at addresses x5+v3[i]+2 into v6[i].
+
+ # Examples
+ vsetvli a1, t0, e32, ta, ma
+ vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i]
+ # and words from v3[i] to address x5+v5[i]+4
+----
+
+For vector indexed segment loads, the destination vector register
+groups cannot overlap the source vector register group (specified by
+`vs2`), else the instruction encoding is reserved.
+
+NOTE: This constraint supports restart of indexed segment loads
+that raise exceptions partway through loading a structure.
+
+==== Vector Load/Store Whole Register Instructions
+
+Format for Vector Load Whole Register Instructions under LOAD-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| 00 | 1| 01000 | rs1 | width | vd |0000111| VL<nf>R
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x07, attr: 'VL*R*'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 8, attr: 'lumop'},
+ {bits: 1, name: 1, attr: 'vm'},
+  {bits: 2, name: 0, attr: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+Format for Vector Store Whole Register Instructions under STORE-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | 0 | 00 | 1| 01000 | rs1 | 000 | vs3 |0100111| VS<nf>R
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VS*R*'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+  {bits: 3, name: 0},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 8, attr: 'sumop'},
+ {bits: 1, name: 1, attr: 'vm'},
+  {bits: 2, name: 0, attr: 'mop'},
+  {bits: 1, name: 0, attr: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+These instructions load and store whole vector register groups.
+
+NOTE: These instructions are intended to be used to save and restore
+vector registers when the type or length of the current contents of
+the vector register is not known, or where modifying `vl` and `vtype`
+would be costly. Examples include compiler register spills, vector
+function calls where values are passed in vector registers, interrupt
+handlers, and OS context switches. Software can determine the number
+of bytes transferred by reading the `vlenb` register.
+
+The load instructions have an EEW encoded in the `mew` and `width`
+fields following the pattern of regular unit-stride loads.
+
+NOTE: Because in-register byte layouts are identical to in-memory byte
+layouts, the same data is written to the destination register group
+regardless of EEW.
+Hence, it would have sufficed to provide only EEW=8 variants.
+The full set of EEW variants is provided so that the encoded EEW can be used
+as a hint to indicate the destination register group will next be accessed
+with this EEW, which aids implementations that rearrange data internally.
+
+The vector whole register store instructions are encoded similarly to
+an unmasked unit-stride store of elements with EEW=8.
+
+The `nf` field encodes how many vector registers to load and store using the NFIELDS encoding (Figure <<fig-nf>>).
+The encoded number of registers must be a power of 2 and the vector
+register numbers must be aligned as with a vector register group,
+otherwise the instruction encoding is reserved. NFIELDS
+indicates the number of vector registers to transfer, numbered
+successively after the base. Only NFIELDS values of 1, 2, 4, 8 are
+supported, with other values reserved. When multiple registers are
+transferred, the lowest-numbered vector register is held in the
+lowest-numbered memory addresses and successive vector register
+numbers are placed contiguously in memory.
+
+The instructions operate with an effective vector length,
+`evl`=NFIELDS*VLEN/EEW, regardless of current settings in `vtype` and
+`vl`. The usual property that no elements are written if `vstart`
+{ge} `vl` does not apply to these instructions. Instead, no elements
+are written if `vstart` {ge} `evl`.
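+
+NOTE: As an illustration (assuming VLEN=128), `vl2re32.v` operates with
+`evl` = 2*128/32 = 8, regardless of the current values in `vl` and `vtype`.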
+
+The instructions operate similarly to unmasked unit-stride load and
+store instructions, with the base address passed in the scalar `x`
+register specified by `rs1`.
+
+Implementations are allowed to raise a misaligned address exception on
+whole register loads and stores if the base address is not naturally
+aligned to the larger of the size of the encoded EEW in bytes (EEW/8)
+or the implementation's smallest supported SEW size in bytes
+(SEW~MIN~/8).
+
+NOTE: Allowing misaligned exceptions to be raised based on
+non-alignment to the encoded EEW simplifies the implementation of these
+instructions. Some subset implementations might not support smaller
+SEW widths, so they are allowed to report misaligned exceptions for the
+smallest supported SEW even if it is larger than the encoded EEW. An extreme
+non-standard implementation might, for example, have SEW~MIN~>XLEN. Software
+environments can mandate the minimum alignment requirements to support
+an ABI.
+
+----
+ # Format of whole register load and store instructions.
+ vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v
+
+ vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0
+ vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0
+ vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0
+ vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0
+
+ vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v
+
+ vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0
+ vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0
+ vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0
+ vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0
+
+ vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v
+
+ vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0
+ vl4re16.v v4, (a0)
+ vl4re32.v v4, (a0)
+ vl4re64.v v4, (a0)
+
+ vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v
+
+ vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0
+ vl8re16.v v8, (a0)
+ vl8re32.v v8, (a0)
+ vl8re64.v v8, (a0)
+
+ vs1r.v v3, (a1) # Store v3 to address in a1
+ vs2r.v v2, (a1) # Store v2-v3 to address in a1
+ vs4r.v v4, (a1) # Store v4-v7 to address in a1
+ vs8r.v v8, (a1) # Store v8-v15 to address in a1
+----
+
+NOTE: Implementations should raise illegal instruction exceptions on
+`vl<nf>r` instructions for EEW values that are not supported.
+
+NOTE: We have considered adding a whole register mask load instruction
+(`vl1rm.v`) but have decided to omit it from the initial extension. The
+primary purpose would be to inform the microarchitecture that the data
+will be used as a mask. The same effect can be achieved with the
+following code sequence, whose cost is at most four instructions. Of
+these, the first could likely be removed as `vl` is often already
+in a scalar register, and the last might already be present if the
+following vector instruction needs a new SEW/LMUL. So, in the best case,
+only two instructions (of which only one performs vector operations)
+are needed to synthesize the effect of the dedicated instruction:
+----
+ csrr t0, vl # Save current vl (potentially not needed)
+ vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX
+ vlm.v v0, (a0) # Load mask register
+ vsetvli x0, t0, <new type> # Restore vl (potentially already present)
+----
+
+=== Vector Memory Alignment Constraints
+
+If an element accessed by a vector memory instruction is not naturally
+aligned to the size of the element, either the element is transferred
+successfully or an address misaligned exception is raised on that
+element.
+
+Support for misaligned vector memory accesses is independent of an
+implementation's support for misaligned scalar memory accesses.
+
+NOTE: An implementation may support some or all misaligned accesses in
+hardware for scalar memory accesses, for vector memory accesses, for
+both, or for neither. A separate PMA should be defined to determine if
+vector misaligned accesses are supported in the associated address range.
+
+Vector misaligned memory accesses follow the same rules for atomicity
+as scalar misaligned memory accesses.
+
+=== Vector Memory Consistency Model
+
+Vector memory instructions appear to execute in program order on the
+local hart.
+
+Vector memory instructions follow RVWMO at the instruction level.
+If the Ztso extension is implemented, vector memory instructions additionally
+follow RVTSO at the instruction level.
+
+Except for vector indexed-ordered loads and stores, element operations
+are unordered within the instruction.
+
+Vector indexed-ordered loads and stores read and write elements
+from/to memory in element order respectively,
+obeying RVWMO at the element level.
+
+NOTE: Ztso only imposes RVTSO at the instruction level; intra-instruction
+ordering follows RVWMO regardless of whether Ztso is implemented.
+
+NOTE: More formal definitions required.
+
+Instructions affected by the vector length register `vl` have a control
+dependency on `vl`, rather than a data dependency.
+Similarly, masked vector instructions have a control dependency on the source
+mask register, rather than a data dependency.
+
+NOTE: Treating the vector length and mask as control rather than data
+typically matches the semantics of the corresponding scalar code, where branch
+instructions ordinarily would have been used.
+Treating the mask as control allows masked vector load instructions to access
+memory before the mask value is known, without the need for
+a misspeculation-recovery mechanism.
+
+=== Vector Arithmetic Instruction Formats
+
+The vector arithmetic instructions use a new major opcode (OP-V =
+1010111~2~) which neighbors OP-FP. The three-bit `funct3` field is
+used to define sub-categories of vector instructions.
+
+include::images/wavedrom/valu-format.adoc[]
+
+[[sec-arithmetic-encoding]]
+==== Vector Arithmetic Instruction encoding
+
+The `funct3` field encodes the operand type and source locations.
+
+.funct3
+[cols="1,1,1,3,5,5"]
+|===
+3+| funct3[2:0] | Category | Operands | Type of scalar operand
+
+| 0 | 0 | 0 | OPIVV | vector-vector | N/A
+| 0 | 0 | 1 | OPFVV | vector-vector | N/A
+| 0 | 1 | 0 | OPMVV | vector-vector | N/A
+| 0 | 1 | 1 | OPIVI | vector-immediate | `imm[4:0]`
+| 1 | 0 | 0 | OPIVX | vector-scalar | GPR `x` register `rs1`
+| 1 | 0 | 1 | OPFVF | vector-scalar | FP `f` register `rs1`
+| 1 | 1 | 0 | OPMVX | vector-scalar | GPR `x` register `rs1`
+| 1 | 1 | 1 | OPCFG | scalars-imms | GPR `x` register `rs1` & `rs2`/`imm`
+|===
+
+Integer operations are performed using unsigned or two's-complement
+signed integer arithmetic depending on the opcode.
+
+NOTE: In this discussion, fixed-point operations are
+considered to be integer operations.
+
+All standard vector floating-point arithmetic operations follow the
+IEEE-754/2008 standard. All vector floating-point operations use the
+dynamic rounding mode in the `frm` register. Use of the `frm` field
+when it contains an invalid rounding mode by any vector floating-point
+instruction--even those that do not depend on the rounding mode, or
+when `vl`=0, or when `vstart` {ge} `vl`--is reserved.
+
+NOTE: All vector floating-point code will rely on a valid value in
+`frm`. Implementations can make all vector FP instructions report
+exceptions when the rounding mode is invalid to simplify control
+logic.
+
+Vector-vector operations take two vectors of operands from vector
+register groups specified by `vs2` and `vs1` respectively.
+
+Vector-scalar operations can have three possible forms. In all three forms,
+the vector register group operand is specified by `vs2`. The second
+scalar source operand comes from one of three alternative sources:
+
+. For integer operations, the scalar can be a 5-bit immediate, `imm[4:0]`, encoded
+in the `rs1` field. The value is sign-extended to SEW bits, unless
+otherwise specified.
+
+. For integer operations, the scalar can be taken from the scalar `x`
+register specified by `rs1`. If XLEN>SEW, the least-significant SEW
+bits of the `x` register are used, unless otherwise specified. If
+XLEN<SEW, the value from the `x` register is sign-extended to SEW
+bits.
+
+. For floating-point operations, the scalar can be taken from a scalar
+`f` register. If FLEN > SEW, the value in the `f` register is
+checked for a valid NaN-boxed value, in which case the
+least-significant SEW bits of the `f` register are used, else the
+canonical NaN value is used. Vector instructions where any
+floating-point vector operand's EEW is not a supported floating-point
+type width (which includes when FLEN < SEW) are reserved.
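+
+For example (illustrative): with FLEN=64 and SEW=32, the scalar operand of an
+OPFVF instruction is the low 32 bits of `f[rs1]` only when the upper 32 bits
+are all 1s (a properly NaN-boxed value); otherwise the canonical NaN is used.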
+
+NOTE: Some instructions _zero_-extend the 5-bit immediate, and denote this
+by naming the immediate `uimm` in the assembly syntax.
+
+NOTE: When adding a vector extension to the Zfinx/Zdinx/Zhinx
+extensions, floating-point scalar arguments are taken from the `x`
+registers. NaN-boxing is not supported in these extensions, and so
+the vector floating-point scalar value is produced using the same
+rules as for an integer scalar operand (i.e., when XLEN > SEW use the
+lowest SEW bits, when XLEN < SEW use the sign-extended value).
+
+Vector arithmetic instructions are masked under control of the `vm`
+field.
+
+----
+# Assembly syntax pattern for vector binary arithmetic instructions
+
+# Operations returning vector results, masked by vm (v0.t, <nothing>)
+vop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vop.vx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+vop.vi vd, vs2, imm, vm # integer vector-immediate vd[i] = vs2[i] op imm
+
+vfop.vv vd, vs2, vs1, vm # FP vector-vector operation vd[i] = vs2[i] fop vs1[i]
+vfop.vf vd, vs2, rs1, vm # FP vector-scalar operation vd[i] = vs2[i] fop f[rs1]
+----
+
+NOTE: In the encoding, `vs2` is the first operand, while `rs1/imm`
+is the second operand. This is the opposite of the standard scalar
+ordering. This arrangement retains the existing encoding conventions:
+instructions that read only one scalar register read it from
+`rs1`, and 5-bit immediates are sourced from the `rs1` field.
+
+----
+# Assembly syntax pattern for vector ternary arithmetic instructions (multiply-add)
+
+# Integer operations overwriting sum input
+vop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vs2[i] + vd[i]
+vop.vx vd, rs1, vs2, vm # vd[i] = x[rs1] * vs2[i] + vd[i]
+
+# Integer operations overwriting product input
+vop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vd[i] + vs2[i]
+vop.vx vd, rs1, vs2, vm # vd[i] = x[rs1] * vd[i] + vs2[i]
+
+# Floating-point operations overwriting sum input
+vfop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vs2[i] + vd[i]
+vfop.vf vd, rs1, vs2, vm # vd[i] = f[rs1] * vs2[i] + vd[i]
+
+# Floating-point operations overwriting product input
+vfop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vd[i] + vs2[i]
+vfop.vf vd, rs1, vs2, vm # vd[i] = f[rs1] * vd[i] + vs2[i]
+----
+
+NOTE: For ternary multiply-add operations, the assembler syntax always
+places the destination vector register first, followed by either `rs1`
+or `vs1`, then `vs2`. This ordering provides a more natural reading
+of the assembler for these ternary operations, as the multiply
+operands are always next to each other.
+
+[[sec-widening]]
+==== Widening Vector Arithmetic Instructions
+
+A few vector arithmetic instructions are defined to be __widening__
+operations where the destination vector register group has EEW=2*SEW
+and EMUL=2*LMUL. These are generally given a `vw*` prefix on the
+opcode, or `vfw*` for vector floating-point instructions.
+
+The first vector register group operand can be either single or
+double-width.
+
+----
+Assembly syntax pattern for vector widening arithmetic instructions
+
+# Double-width result, two single-width sources: 2*SEW = SEW op SEW
+vwop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vwop.vx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+
+# Double-width result, first source double-width, second source single-width: 2*SEW = 2*SEW op SEW
+vwop.wv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vwop.wx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+----
+
+NOTE: Originally, a `w` suffix was used on the opcode, but this could be
+confused with the use of a `w` suffix to mean word-sized operations on
+doubleword integers, so the `w` was moved to a prefix.
+
+NOTE: The floating-point widening operations were changed to `vfw*`
+from `vwf*` to be more consistent with any scalar widening
+floating-point operations that will be written as `fw*`.
+
+Widening instruction encodings must follow the constraints in Section
+<<sec-vec-operands>>.
+
+[[sec-narrowing]]
+==== Narrowing Vector Arithmetic Instructions
+
+A few instructions are provided to convert double-width source vectors
+into single-width destination vectors. These instructions convert a
+vector register group specified by `vs2` with EEW/EMUL=2*SEW/2*LMUL to a vector register
+group with the current SEW/LMUL setting. Where there is a second
+source vector register group (specified by `vs1`), this has the same
+(narrower) width as the result (i.e., EEW=SEW).
+
+NOTE: An alternative design decision would have been to treat SEW/LMUL
+as defining the size of the source vector register group. The choice
+here is motivated by the belief the chosen approach will require fewer
+`vtype` changes.
+
+NOTE: Compare operations that set a mask register are also
+implicitly a narrowing operation.
+
+A `vn*` prefix on the opcode is used to distinguish these instructions
+in the assembler, or a `vfn*` prefix for narrowing floating-point
+opcodes. The double-width source vector register group is signified
+by a `w` in the source operand suffix (e.g., `vnsra.wv`).
+
+----
+Assembly syntax pattern for vector narrowing arithmetic instructions
+
+# Single-width result vd, double-width source vs2, single-width source vs1/rs1
+# SEW = 2*SEW op SEW
+vnop.wv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vnop.wx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+----
+
+Narrowing instruction encodings must follow the constraints in Section
+<<sec-vec-operands>>.
+
+[[sec-vector-integer]]
+=== Vector Integer Arithmetic Instructions
+
+A set of vector integer arithmetic instructions is provided. Unless
+otherwise stated, integer operations wrap around on overflow.
+
+==== Vector Single-Width Integer Add and Subtract
+
+Vector integer add and subtract are provided. Reverse-subtract
+instructions are also provided for the vector-scalar forms.
+
+----
+# Integer adds.
+vadd.vv vd, vs2, vs1, vm # Vector-vector
+vadd.vx vd, vs2, rs1, vm # vector-scalar
+vadd.vi vd, vs2, imm, vm # vector-immediate
+
+# Integer subtract
+vsub.vv vd, vs2, vs1, vm # Vector-vector
+vsub.vx vd, vs2, rs1, vm # vector-scalar
+
+# Integer reverse subtract
+vrsub.vx vd, vs2, rs1, vm # vd[i] = x[rs1] - vs2[i]
+vrsub.vi vd, vs2, imm, vm # vd[i] = imm - vs2[i]
+----
+
+NOTE: A vector of integer values can be negated using a
+reverse-subtract instruction with a scalar operand of `x0`. An
+assembly pseudoinstruction `vneg.v vd,vs` = `vrsub.vx vd,vs,x0` is provided.
+
+==== Vector Widening Integer Add/Subtract
+
+The widening add/subtract instructions are provided in both signed and
+unsigned variants, depending on whether the narrower source operands
+are first sign- or zero-extended before forming the double-width sum.
+
+----
+# Widening unsigned integer add/subtract, 2*SEW = SEW +/- SEW
+vwaddu.vv vd, vs2, vs1, vm # vector-vector
+vwaddu.vx vd, vs2, rs1, vm # vector-scalar
+vwsubu.vv vd, vs2, vs1, vm # vector-vector
+vwsubu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed integer add/subtract, 2*SEW = SEW +/- SEW
+vwadd.vv vd, vs2, vs1, vm # vector-vector
+vwadd.vx vd, vs2, rs1, vm # vector-scalar
+vwsub.vv vd, vs2, vs1, vm # vector-vector
+vwsub.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening unsigned integer add/subtract, 2*SEW = 2*SEW +/- SEW
+vwaddu.wv vd, vs2, vs1, vm # vector-vector
+vwaddu.wx vd, vs2, rs1, vm # vector-scalar
+vwsubu.wv vd, vs2, vs1, vm # vector-vector
+vwsubu.wx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed integer add/subtract, 2*SEW = 2*SEW +/- SEW
+vwadd.wv vd, vs2, vs1, vm # vector-vector
+vwadd.wx vd, vs2, rs1, vm # vector-scalar
+vwsub.wv vd, vs2, vs1, vm # vector-vector
+vwsub.wx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: An integer value can be doubled in width using the widening add
+instructions with a scalar operand of `x0`. Assembly
+pseudoinstructions `vwcvt.x.x.v vd,vs,vm` = `vwadd.vx vd,vs,x0,vm` and
+`vwcvtu.x.x.v vd,vs,vm` = `vwaddu.vx vd,vs,x0,vm` are provided.
+
+==== Vector Integer Extension
+
+The vector integer extension instructions zero- or sign-extend a
+source vector integer operand with EEW less than SEW to fill SEW-sized
+elements in the destination. The EEW of the source is 1/2, 1/4, or
+1/8 of SEW, while EMUL of the source is (EEW/SEW)*LMUL. The
+destination has EEW equal to SEW and EMUL equal to LMUL.
+
+----
+vzext.vf2 vd, vs2, vm # Zero-extend SEW/2 source to SEW destination
+vsext.vf2 vd, vs2, vm # Sign-extend SEW/2 source to SEW destination
+vzext.vf4 vd, vs2, vm # Zero-extend SEW/4 source to SEW destination
+vsext.vf4 vd, vs2, vm # Sign-extend SEW/4 source to SEW destination
+vzext.vf8 vd, vs2, vm # Zero-extend SEW/8 source to SEW destination
+vsext.vf8 vd, vs2, vm # Sign-extend SEW/8 source to SEW destination
+----
+
+If the source EEW is not a supported width, or source EMUL would be
+below the minimum legal LMUL, the instruction encoding is reserved.
+
+NOTE: Standard vector load instructions access memory values that are
+the same size as the destination register elements. Some application
+code needs to operate on a range of operand widths in a wider element,
+for example, loading a byte from memory and adding to an eight-byte
+element. To avoid having to provide the cross-product of the number
+of vector load instructions by the number of data types (byte, word,
+halfword, and also signed/unsigned variants), we instead add explicit
+extension instructions that can be used if an appropriate widening
+arithmetic instruction is not available.
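+
+A sketch of this usage (illustrative only; the register numbers, the
+pointer in `a1`, and the `vtype` setting are assumptions):
+
+----
+ # Illustration: load bytes, zero-extend them, and add into 32-bit accumulators.
+ vsetvli   t0, a0, e32, m2, ta, ma  # SEW=32, LMUL=2
+ vle8.v    v4, (a1)                 # EEW=8 load, so EMUL=1/2
+ vzext.vf4 v8, v4                   # Zero-extend SEW/4=8-bit source to SEW=32
+ vadd.vv   v12, v12, v8             # Accumulate in 32-bit elements
+----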
+
+==== Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+
+To support multi-word integer arithmetic, instructions that operate on
+a carry bit are provided. For each operation (add or subtract), two
+instructions are provided: one to provide the result (SEW width), and
+the second to generate the carry output (single bit encoded as a mask
+boolean).
+
+The carry inputs and outputs are represented using the mask register
+layout as described in Section <<sec-mask-register-layout>>. Due to
+encoding constraints, the carry input must come from the implicit `v0`
+register, but carry outputs can be written to any vector register that
+respects the source/destination overlap restrictions.
+
+`vadc` and `vsbc` add or subtract the source operands and the carry-in or
+borrow-in, and write the result to vector register `vd`.
+These instructions are encoded as masked instructions (`vm=0`), but they operate
+on and write back all body elements.
+Encodings corresponding to the unmasked versions (`vm=1`) are reserved.
+
+`vmadc` and `vmsbc` add or subtract the source operands, optionally
+add the carry-in or subtract the borrow-in if masked (`vm=0`), and
+write the result back to mask register `vd`. If unmasked (`vm=1`),
+there is no carry-in or borrow-in. These instructions operate on and
+write back all body elements, even if masked. Because these
+instructions produce a mask value, they always operate with a
+tail-agnostic policy.
+
+----
+ # Produce sum with carry.
+
+ # vd[i] = vs2[i] + vs1[i] + v0.mask[i]
+ vadc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd[i] = vs2[i] + x[rs1] + v0.mask[i]
+ vadc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd[i] = vs2[i] + imm + v0.mask[i]
+ vadc.vim vd, vs2, imm, v0 # Vector-immediate
+
+ # Produce carry out in mask register format
+
+ # vd.mask[i] = carry_out(vs2[i] + vs1[i] + v0.mask[i])
+ vmadc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd.mask[i] = carry_out(vs2[i] + x[rs1] + v0.mask[i])
+ vmadc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd.mask[i] = carry_out(vs2[i] + imm + v0.mask[i])
+ vmadc.vim vd, vs2, imm, v0 # Vector-immediate
+
+ # vd.mask[i] = carry_out(vs2[i] + vs1[i])
+ vmadc.vv vd, vs2, vs1 # Vector-vector, no carry-in
+
+ # vd.mask[i] = carry_out(vs2[i] + x[rs1])
+ vmadc.vx vd, vs2, rs1 # Vector-scalar, no carry-in
+
+ # vd.mask[i] = carry_out(vs2[i] + imm)
+ vmadc.vi vd, vs2, imm # Vector-immediate, no carry-in
+----
+
+Because implementing a carry propagation requires executing two
+instructions with unchanged inputs, destructive accumulations will
+require an additional move to obtain correct results.
+
+----
+ # Example multi-word arithmetic sequence, accumulating into v4
+ vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1
+ vadc.vvm v4, v4, v8, v0 # Calc new sum
+ vmmv.m v0, v1 # Move temp carry into v0 for next word
+----
+
+The subtract-with-borrow instruction `vsbc` performs the equivalent
+function to support multi-word subtraction. There are
+no subtract-with-immediate instructions.
+
+----
+ # Produce difference with borrow.
+
+ # vd[i] = vs2[i] - vs1[i] - v0.mask[i]
+ vsbc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd[i] = vs2[i] - x[rs1] - v0.mask[i]
+ vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # Produce borrow out in mask register format
+
+ # vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i])
+ vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i])
+ vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd.mask[i] = borrow_out(vs2[i] - vs1[i])
+ vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in
+
+ # vd.mask[i] = borrow_out(vs2[i] - x[rs1])
+ vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in
+----
+
+For `vmsbc`, the borrow is defined to be 1 iff the difference, prior to
+truncation, is negative.
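+
+For example (illustrative, SEW=8): subtracting 10 from 5 gives -5 prior to
+truncation, so `vmsbc` writes a borrow of 1 for that element, while `vsbc`
+(with a zero borrow-in) produces the wrapped result 0xFB.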
+
+For `vadc` and `vsbc`, the instruction encoding is reserved if the
+destination vector register is `v0`.
+
+NOTE: This constraint corresponds to the constraint on masked vector
+operations that overwrite the mask register.
+
+==== Vector Bitwise Logical Instructions
+
+----
+# Bitwise logical operations.
+vand.vv vd, vs2, vs1, vm # Vector-vector
+vand.vx vd, vs2, rs1, vm # vector-scalar
+vand.vi vd, vs2, imm, vm # vector-immediate
+
+vor.vv vd, vs2, vs1, vm # Vector-vector
+vor.vx vd, vs2, rs1, vm # vector-scalar
+vor.vi vd, vs2, imm, vm # vector-immediate
+
+vxor.vv vd, vs2, vs1, vm # Vector-vector
+vxor.vx vd, vs2, rs1, vm # vector-scalar
+vxor.vi vd, vs2, imm, vm # vector-immediate
+----
+
+NOTE: With an immediate of -1, the vector-immediate form of the `vxor`
+instruction provides a bitwise NOT operation. This is provided as
+an assembler pseudoinstruction `vnot.v vd,vs,vm` = `vxor.vi vd,vs,-1,vm`.
+
+==== Vector Single-Width Shift Instructions
+
+A full set of vector shift instructions is provided, including
+logical shift left (`sll`), and logical (zero-extending `srl`) and
+arithmetic (sign-extending `sra`) shift right. The data to be shifted
+is in the vector register group specified by `vs2` and the shift
+amount value can come from a vector register group `vs1`, a scalar
+integer register `rs1`, or a zero-extended 5-bit immediate. Only the low
+lg2(SEW) bits of the shift-amount value are used to control the shift
+amount.
+
+----
+# Bit shift operations
+vsll.vv vd, vs2, vs1, vm # Vector-vector
+vsll.vx vd, vs2, rs1, vm # vector-scalar
+vsll.vi vd, vs2, uimm, vm # vector-immediate
+
+vsrl.vv vd, vs2, vs1, vm # Vector-vector
+vsrl.vx vd, vs2, rs1, vm # vector-scalar
+vsrl.vi vd, vs2, uimm, vm # vector-immediate
+
+vsra.vv vd, vs2, vs1, vm # Vector-vector
+vsra.vx vd, vs2, rs1, vm # vector-scalar
+vsra.vi vd, vs2, uimm, vm # vector-immediate
+----
+
+==== Vector Narrowing Integer Right Shift Instructions
+
+The narrowing right shifts extract a smaller field from a wider
+operand and have both zero-extending (`srl`) and sign-extending
+(`sra`) forms. The shift amount can come from a vector register
+group, a scalar `x` register, or a zero-extended 5-bit immediate.
+The low lg2(2*SEW) bits of the shift-amount value are
+used (e.g., the low 6 bits for a SEW=64-bit to SEW=32-bit narrowing
+operation).
+
+----
+ # Narrowing shift right logical, SEW = (2*SEW) >> SEW
+ vnsrl.wv vd, vs2, vs1, vm # vector-vector
+ vnsrl.wx vd, vs2, rs1, vm # vector-scalar
+ vnsrl.wi vd, vs2, uimm, vm # vector-immediate
+
+ # Narrowing shift right arithmetic, SEW = (2*SEW) >> SEW
+ vnsra.wv vd, vs2, vs1, vm # vector-vector
+ vnsra.wx vd, vs2, rs1, vm # vector-scalar
+ vnsra.wi vd, vs2, uimm, vm # vector-immediate
+----
+
+NOTE: Future extensions might add support for versions that narrow to
+a destination that is 1/4 the width of the source.
+
+NOTE: An integer value can be halved in width using the narrowing integer
+shift instructions with a scalar operand of `x0`. An assembly
+pseudoinstruction is provided `vncvt.x.x.w vd,vs,vm` = `vnsrl.wx vd,vs,x0,vm`.
+
+==== Vector Integer Compare Instructions
+
+The following integer compare instructions write 1 to the destination
+mask register element if the comparison evaluates to true, and 0
+otherwise. The destination mask vector is always held in a single
+vector register, with a layout of elements as described in Section
+<<sec-mask-register-layout>>. The destination mask vector register
+may be the same as the source vector mask register (`v0`).
+
+----
+# Set if equal
+vmseq.vv vd, vs2, vs1, vm # Vector-vector
+vmseq.vx vd, vs2, rs1, vm # vector-scalar
+vmseq.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if not equal
+vmsne.vv vd, vs2, vs1, vm # Vector-vector
+vmsne.vx vd, vs2, rs1, vm # vector-scalar
+vmsne.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if less than, unsigned
+vmsltu.vv vd, vs2, vs1, vm # Vector-vector
+vmsltu.vx vd, vs2, rs1, vm # Vector-scalar
+
+# Set if less than, signed
+vmslt.vv vd, vs2, vs1, vm # Vector-vector
+vmslt.vx vd, vs2, rs1, vm # vector-scalar
+
+# Set if less than or equal, unsigned
+vmsleu.vv vd, vs2, vs1, vm # Vector-vector
+vmsleu.vx vd, vs2, rs1, vm # vector-scalar
+vmsleu.vi vd, vs2, imm, vm # Vector-immediate
+
+# Set if less than or equal, signed
+vmsle.vv vd, vs2, vs1, vm # Vector-vector
+vmsle.vx vd, vs2, rs1, vm # vector-scalar
+vmsle.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if greater than, unsigned
+vmsgtu.vx vd, vs2, rs1, vm # Vector-scalar
+vmsgtu.vi vd, vs2, imm, vm # Vector-immediate
+
+# Set if greater than, signed
+vmsgt.vx vd, vs2, rs1, vm # Vector-scalar
+vmsgt.vi vd, vs2, imm, vm # Vector-immediate
+
+# Following two instructions are not provided directly
+# Set if greater than or equal, unsigned
+# vmsgeu.vx vd, vs2, rs1, vm # Vector-scalar
+# Set if greater than or equal, signed
+# vmsge.vx vd, vs2, rs1, vm # Vector-scalar
+----
+
+The following table indicates how all comparisons are implemented in
+native machine code.
+
+----
+Comparison Assembler Mapping Assembler Pseudoinstruction
+
+va < vb vmslt{u}.vv vd, va, vb, vm
+va <= vb vmsle{u}.vv vd, va, vb, vm
+va > vb vmslt{u}.vv vd, vb, va, vm vmsgt{u}.vv vd, va, vb, vm
+va >= vb vmsle{u}.vv vd, vb, va, vm vmsge{u}.vv vd, va, vb, vm
+
+va < x vmslt{u}.vx vd, va, x, vm
+va <= x vmsle{u}.vx vd, va, x, vm
+va > x vmsgt{u}.vx vd, va, x, vm
+va >= x see below
+
+va < i vmsle{u}.vi vd, va, i-1, vm vmslt{u}.vi vd, va, i, vm
+va <= i vmsle{u}.vi vd, va, i, vm
+va > i vmsgt{u}.vi vd, va, i, vm
+va >= i vmsgt{u}.vi vd, va, i-1, vm vmsge{u}.vi vd, va, i, vm
+
+va, vb vector register groups
+x scalar integer register
+i immediate
+----
+
+NOTE: The immediate forms of `vmslt{u}.vi` are not provided as the
+immediate value can be decreased by 1 and the `vmsle{u}.vi` variants
+used instead. The `vmsle.vi` range is -16 to 15, resulting in an
+effective `vmslt.vi` range of -15 to 16. The `vmsleu.vi` range is 0
+to 15 giving an effective `vmsltu.vi` range of 1 to 16 (Note,
+`vmsltu.vi` with immediate 0 is not useful as it is always
+false).
+
+NOTE: Because the 5-bit vector immediates are always sign-extended,
+when the high bit of the `simm5` immediate is set, `vmsleu.vi` also
+supports unsigned immediate values in the range `2^SEW^-16` to
+`2^SEW^-1`, allowing corresponding `vmsltu.vi` compares against
+unsigned immediates in the range `2^SEW^-15` to `2^SEW^`. Note that
+`vmsltu.vi` with immediate `2^SEW^` is not useful as it is always
+true.
+
+Similarly, `vmsge{u}.vi` is not provided and the compare is
+implemented using `vmsgt{u}.vi` with the immediate decremented by one.
+The resulting effective `vmsge.vi` range is -15 to 16, and the
+resulting effective `vmsgeu.vi` range is 1 to 16 (Note, `vmsgeu.vi` with
+immediate 0 is not useful as it is always true).
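+
+For example (illustrative): the pseudoinstruction `vmsge.vi vd, va, 16, vm`
+can be implemented as `vmsgt.vi vd, va, 15, vm`.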
+
+NOTE: The `vmsgt` forms for register scalar and immediates are provided
+to allow a single compare instruction to provide the correct
+polarity of mask value without using additional mask logical
+instructions.
+
+To reduce encoding space, the `vmsge{u}.vx` form is not directly
+provided, and so the `va {ge} x` case requires special treatment.
+
+NOTE: The `vmsge{u}.vx` could potentially be encoded in a
+non-orthogonal way under the unused OPIVI variant of `vmslt{u}`. These
+would be the only instructions in OPIVI that use a scalar `x` register,
+however. Alternatively, a further two funct6 encodings could be used,
+but these would have a different operand format (writes to mask
+register) than others in the same group of 8 funct6 encodings. The
+current PoR is to omit these instructions and to synthesize where
+needed as described below.
+
+The `vmsge{u}.vx` operation can be synthesized by reducing the
+value of `x` by 1 and using the `vmsgt{u}.vx` instruction, when it is
+known that this will not underflow the representation in `x`.
+
+----
+Sequences to synthesize `vmsge{u}.vx` instruction
+
+va >= x, x > minimum
+
+ addi t0, x, -1; vmsgt{u}.vx vd, va, t0, vm
+----
+
+The above sequence will usually be the most efficient implementation,
+but assembler pseudoinstructions can be provided for cases where the
+range of `x` is unknown.
+
+----
+unmasked va >= x
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x
+ expansion: vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd
+
+masked va >= x, vd != v0
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t
+ expansion: vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
+
+masked va >= x, vd == v0
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ expansion: vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt
+
+masked va >= x, any vd
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ expansion: vmslt{u}.vx vt, va, x; vmandn.mm vt, v0, vt; vmandn.mm vd, vd, v0; vmor.mm vd, vt, vd
+
+ The vt argument to the pseudoinstruction must name a temporary vector register that is
+ not the same as vd and which will be clobbered by the pseudoinstruction.
+----
+
+Compares effectively AND in the mask under a mask-undisturbed policy if the destination register is `v0`, e.g.,
+
+----
+ # (a < b) && (b < c) in two instructions when mask-undisturbed
+ vmslt.vv v0, va, vb # All body elements written
+ vmslt.vv v0, vb, vc, v0.t # Only update at set mask
+----
+
+Compares write mask registers, and so always operate under a
+tail-agnostic policy.
+
+==== Vector Integer Min/Max Instructions
+
+Signed and unsigned integer minimum and maximum instructions are
+supported.
+
+----
+# Unsigned minimum
+vminu.vv vd, vs2, vs1, vm # Vector-vector
+vminu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed minimum
+vmin.vv vd, vs2, vs1, vm # Vector-vector
+vmin.vx vd, vs2, rs1, vm # vector-scalar
+
+# Unsigned maximum
+vmaxu.vv vd, vs2, vs1, vm # Vector-vector
+vmaxu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed maximum
+vmax.vv vd, vs2, vs1, vm # Vector-vector
+vmax.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Integer Multiply Instructions
+
+The single-width multiply instructions perform a SEW-bit*SEW-bit
+multiply to generate a 2*SEW-bit product, then return one half of the
+product in the SEW-bit-wide destination. The `*mul*` versions write
+the low word of the product to the destination register, while the
+`*mulh*` versions write the high word of the product to the
+destination register.
+
+----
+# Signed multiply, returning low bits of product
+vmul.vv vd, vs2, vs1, vm # Vector-vector
+vmul.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed multiply, returning high bits of product
+vmulh.vv vd, vs2, vs1, vm # Vector-vector
+vmulh.vx vd, vs2, rs1, vm # vector-scalar
+
+# Unsigned multiply, returning high bits of product
+vmulhu.vv vd, vs2, vs1, vm # Vector-vector
+vmulhu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed(vs2)-Unsigned multiply, returning high bits of product
+vmulhsu.vv vd, vs2, vs1, vm # Vector-vector
+vmulhsu.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: There is no `vmulhus.vx` opcode to return the high half of an
+unsigned-vector * signed-scalar product. The scalar can be splatted
+to a vector, then a `vmulhsu.vv` used, as sketched below.
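+
+A sketch of that sequence (illustrative only; the register choices and an
+LMUL=1 configuration are assumptions):
+
+----
+ # Illustration: high half of unsigned-vector (v4) * signed-scalar (a0) products.
+ vmv.v.x    v8, a0        # Splat the signed scalar in a0 across v8
+ vmulhsu.vv v12, v8, v4   # signed(v8[i]) * unsigned(v4[i]), high SEW bits
+----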
+
+NOTE: The current `vmulh*` opcodes perform simple fractional
+multiplies, but with no option to scale, round, and/or saturate the
+result. A possible future extension could consider variants of `vmulh`,
+`vmulhu`, and `vmulhsu` that use the `vxrm` rounding mode when discarding
+the low half of the product. There is no possibility of overflow in these
+cases.
+
+==== Vector Integer Divide Instructions
+
+The vector integer divide and remainder instructions are equivalent to the
+RISC-V standard scalar integer divide and remainder instructions, with the
+same results for extreme inputs.
+
+----
+ # Unsigned divide.
+ vdivu.vv vd, vs2, vs1, vm # Vector-vector
+ vdivu.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Signed divide
+ vdiv.vv vd, vs2, vs1, vm # Vector-vector
+ vdiv.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Unsigned remainder
+ vremu.vv vd, vs2, vs1, vm # Vector-vector
+ vremu.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Signed remainder
+ vrem.vv vd, vs2, vs1, vm # Vector-vector
+ vrem.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: The decision to include integer divide and remainder was
+contentious. The argument in favor is that without a standard
+instruction, software would have to pick some algorithm to perform the
+operation, which would likely perform poorly on some
+microarchitectures versus others.
+
+NOTE: There is no instruction to perform a "scalar divide by vector"
+operation.
+
+==== Vector Widening Integer Multiply Instructions
+
+The widening integer multiply instructions return the full 2*SEW-bit
+product from an SEW-bit*SEW-bit multiply.
+
+----
+# Widening signed-integer multiply
+vwmul.vv vd, vs2, vs1, vm # vector-vector
+vwmul.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening unsigned-integer multiply
+vwmulu.vv vd, vs2, vs1, vm # vector-vector
+vwmulu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed(vs2)-unsigned integer multiply
+vwmulsu.vv vd, vs2, vs1, vm # vector-vector
+vwmulsu.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Integer Multiply-Add Instructions
+
+The integer multiply-add instructions are destructive and are provided
+in two forms, one that overwrites the addend or minuend
+(`vmacc`, `vnmsac`) and one that overwrites the first multiplicand
+(`vmadd`, `vnmsub`).
+
+The low half of the product is added to or subtracted from the third operand.
+
+NOTE: `sac` is intended to be read as "subtract from accumulator". The
+opcode is `vnmsac` to match the (unfortunately counterintuitive)
+floating-point `fnmsub` instruction definition. Similarly for the
+`vnmsub` opcode.
+
+----
+# Integer multiply-add, overwrite addend
+vmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vmacc.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Integer multiply-sub, overwrite minuend
+vnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vnmsac.vx vd, rs1, vs2, vm # vd[i] = -(x[rs1] * vs2[i]) + vd[i]
+
+# Integer multiply-add, overwrite multiplicand
+vmadd.vv vd, vs1, vs2, vm # vd[i] = (vs1[i] * vd[i]) + vs2[i]
+vmadd.vx vd, rs1, vs2, vm # vd[i] = (x[rs1] * vd[i]) + vs2[i]
+
+# Integer multiply-sub, overwrite multiplicand
+vnmsub.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) + vs2[i]
+vnmsub.vx vd, rs1, vs2, vm # vd[i] = -(x[rs1] * vd[i]) + vs2[i]
+----
+
+==== Vector Widening Integer Multiply-Add Instructions
+
+The widening integer multiply-add instructions add the full 2*SEW-bit
+product from a SEW-bit*SEW-bit multiply to a 2*SEW-bit value and
+produce a 2*SEW-bit result. All combinations of signed and unsigned
+multiply operands are supported.
+
+----
+# Widening unsigned-integer multiply-add, overwrite addend
+vwmaccu.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vwmaccu.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Widening signed-integer multiply-add, overwrite addend
+vwmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vwmacc.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Widening signed-unsigned-integer multiply-add, overwrite addend
+vwmaccsu.vv vd, vs1, vs2, vm # vd[i] = +(signed(vs1[i]) * unsigned(vs2[i])) + vd[i]
+vwmaccsu.vx vd, rs1, vs2, vm # vd[i] = +(signed(x[rs1]) * unsigned(vs2[i])) + vd[i]
+
+# Widening unsigned-signed-integer multiply-add, overwrite addend
+vwmaccus.vx vd, rs1, vs2, vm # vd[i] = +(unsigned(x[rs1]) * signed(vs2[i])) + vd[i]
+----
+
+==== Vector Integer Merge Instructions
+
+The vector integer merge instructions combine two source operands
+based on a mask. Unlike regular arithmetic instructions, the
+merge operates on all body elements (i.e., the set of elements from
+`vstart` up to the current vector length in `vl`).
+
+The `vmerge` instructions are encoded as masked instructions (`vm=0`).
+The instructions combine two
+sources as follows. At elements where the mask value is zero, the
+first operand is copied to the destination element, otherwise the
+second operand is copied to the destination element. The first
+operand is always a vector register group specified by `vs2`. The
+second operand is a vector register group specified by `vs1` or a
+scalar `x` register specified by `rs1` or a 5-bit sign-extended
+immediate.
+
+----
+vmerge.vvm vd, vs2, vs1, v0 # vd[i] = v0.mask[i] ? vs1[i] : vs2[i]
+vmerge.vxm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? x[rs1] : vs2[i]
+vmerge.vim vd, vs2, imm, v0 # vd[i] = v0.mask[i] ? imm : vs2[i]
+----
+
+==== Vector Integer Move Instructions
+
+The vector integer move instructions copy a source operand to a vector
+register group.
+The `vmv.v.v` variant copies a vector register group, whereas the `vmv.v.x`
+and `vmv.v.i` variants __splat__ a scalar register or immediate to all active
+elements of the destination vector register group.
+These instructions are encoded as unmasked instructions (`vm=1`).
+The first operand specifier (`vs2`) must contain `v0`, and any other vector
+register number in `vs2` is _reserved_.
+
+----
+vmv.v.v vd, vs1 # vd[i] = vs1[i]
+vmv.v.x vd, rs1 # vd[i] = x[rs1]
+vmv.v.i vd, imm # vd[i] = imm
+----
+
+NOTE: Mask values can be widened into SEW-width elements using a
+sequence `vmv.v.i vd, 0; vmerge.vim vd, vd, 1, v0`.
+
+NOTE: The vector integer move instructions share the encoding with the vector
+merge instructions, but with `vm=1` and `vs2=v0`.
+
+The form `vmv.v.v vd, vd`, which leaves body elements unchanged,
+can be used to indicate that the register will next be used
+with an EEW equal to SEW.
+
+NOTE: Implementations that internally reorganize data according to EEW
+can shuffle the internal representation according to SEW.
+Implementations that do not internally reorganize data can dynamically
+elide this instruction and treat it as a NOP.
+
+NOTE: The `vmv.v.v vd, vd` instruction is not a RISC-V HINT, as a
+tail-agnostic setting may cause an architectural state change on some
+implementations.
+
+[[sec-vector-fixed-point]]
+=== Vector Fixed-Point Arithmetic Instructions
+
+The preceding set of integer arithmetic instructions is extended to support
+fixed-point arithmetic.
+
+A fixed-point number is a two's-complement signed or unsigned integer
+interpreted as the numerator in a fraction with an implicit denominator.
+The fixed-point instructions are intended to be applied to the numerators;
+it is the responsibility of software to manage the denominators.
+An N-bit element can hold two's-complement signed integers in the
+range -2^N-1^...+2^N-1^-1, and unsigned integers in the range 0
+... +2^N^-1. The fixed-point instructions help preserve precision in
+narrow operands by supporting scaling and rounding, and can handle
+overflow by saturating results into the destination format range.
+
+NOTE: The widening integer operations described above can also be used
+to avoid overflow.
+
+==== Vector Single-Width Saturating Add and Subtract
+
+Saturating forms of integer add and subtract are provided, for both
+signed and unsigned integers. If the result would overflow the
+destination, the result is replaced with the closest representable
+value, and the `vxsat` bit is set.
+
+----
+# Saturating adds of unsigned integers.
+vsaddu.vv vd, vs2, vs1, vm # Vector-vector
+vsaddu.vx vd, vs2, rs1, vm # vector-scalar
+vsaddu.vi vd, vs2, imm, vm # vector-immediate
+
+# Saturating adds of signed integers.
+vsadd.vv vd, vs2, vs1, vm # Vector-vector
+vsadd.vx vd, vs2, rs1, vm # vector-scalar
+vsadd.vi vd, vs2, imm, vm # vector-immediate
+
+# Saturating subtract of unsigned integers.
+vssubu.vv vd, vs2, vs1, vm # Vector-vector
+vssubu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Saturating subtract of signed integers.
+vssub.vv vd, vs2, vs1, vm # Vector-vector
+vssub.vx vd, vs2, rs1, vm # vector-scalar
+----
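+
+For example (illustrative, SEW=8): `vsaddu` of the element values 200 and 100
+saturates the result to 255, and `vsadd` of 100 and 50 saturates to 127; in
+both cases the `vxsat` bit is set.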
+
+==== Vector Single-Width Averaging Add and Subtract
+
+The averaging add and subtract instructions right shift the result by
+one bit and round off the result according to the setting in `vxrm`.
+Both unsigned and signed versions are provided.
+For `vaaddu` and `vaadd` there can be no overflow in the result.
+For `vasub` and `vasubu`, overflow is ignored and the result wraps around.
+
+NOTE: For `vasub`, overflow occurs only when subtracting the smallest number
+from the largest number under `rnu` or `rne` rounding.
+
+----
+# Averaging add
+
+# Averaging adds of unsigned integers.
+vaaddu.vv vd, vs2, vs1, vm # roundoff_unsigned(vs2[i] + vs1[i], 1)
+vaaddu.vx vd, vs2, rs1, vm # roundoff_unsigned(vs2[i] + x[rs1], 1)
+
+# Averaging adds of signed integers.
+vaadd.vv vd, vs2, vs1, vm # roundoff_signed(vs2[i] + vs1[i], 1)
+vaadd.vx vd, vs2, rs1, vm # roundoff_signed(vs2[i] + x[rs1], 1)
+
+# Averaging subtract
+
+# Averaging subtract of unsigned integers.
+vasubu.vv vd, vs2, vs1, vm # roundoff_unsigned(vs2[i] - vs1[i], 1)
+vasubu.vx vd, vs2, rs1, vm # roundoff_unsigned(vs2[i] - x[rs1], 1)
+
+# Averaging subtract of signed integers.
+vasub.vv vd, vs2, vs1, vm # roundoff_signed(vs2[i] - vs1[i], 1)
+vasub.vx vd, vs2, rs1, vm # roundoff_signed(vs2[i] - x[rs1], 1)
+----
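+
+For example (illustrative, SEW=8, `vxrm` = rnu): `vaaddu` of the element
+values 5 and 8 computes roundoff_unsigned(13, 1) = (13 >> 1) + 1 = 7.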
+
+==== Vector Single-Width Fractional Multiply with Rounding and Saturation
+
+The signed fractional multiply instruction produces a 2*SEW product of
+the two SEW inputs, then shifts the result right by SEW-1 bits,
+rounding these bits according to `vxrm`, then saturates the result to
+fit into SEW bits. If the result causes saturation, the `vxsat` bit
+is set.
+
+----
+# Signed saturating and rounding fractional multiply
+# See vxrm description for rounding calculation
+vsmul.vv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1))
+vsmul.vx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1))
+----
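+
+For example (illustrative, SEW=8): multiplying the fixed-point values 0x40 and
+0x40 (each representing 0.5 with an implicit denominator of 128) yields the
+16-bit product 0x1000; shifting right by SEW-1 = 7 bits gives 0x20 (0.25), with
+no rounding or saturation needed in this case.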
+
+NOTE: When multiplying two N-bit signed numbers, the largest magnitude
+is obtained for -2^N-1^ * -2^N-1^ producing a result +2^2N-2^, which
+has a single (zero) sign bit when held in 2N bits. All other products
+have two sign bits in 2N bits. To retain greater precision in N
+result bits, the product is shifted right by one bit less than N,
+saturating the largest magnitude result but increasing result
+precision by one bit for all other products.
+
+NOTE: We do not provide an equivalent fractional multiply where one
+input is unsigned, as these would retain all upper SEW bits and would
+not need to saturate. This operation is partly covered by the
+`vmulhu` and `vmulhsu` instructions, for the case where rounding is
+simply truncation (`rdn`).
+
+==== Vector Single-Width Scaling Shift Instructions
+
+These instructions shift the input value right, and round off the
+shifted out bits according to `vxrm`. The scaling right shifts have
+both zero-extending (`vssrl`) and sign-extending (`vssra`) forms. The
+data to be shifted is in the vector register group specified by `vs2`
+and the shift amount value can come from a vector register group
+`vs1`, a scalar integer register `rs1`, or a zero-extended 5-bit
+immediate. Only the low lg2(SEW) bits of the shift-amount value are
+used to control the shift amount.
+
+----
+ # Scaling shift right logical
+ vssrl.vv vd, vs2, vs1, vm # vd[i] = roundoff_unsigned(vs2[i], vs1[i])
+ vssrl.vx vd, vs2, rs1, vm # vd[i] = roundoff_unsigned(vs2[i], x[rs1])
+ vssrl.vi vd, vs2, uimm, vm # vd[i] = roundoff_unsigned(vs2[i], uimm)
+
+ # Scaling shift right arithmetic
+ vssra.vv vd, vs2, vs1, vm # vd[i] = roundoff_signed(vs2[i],vs1[i])
+ vssra.vx vd, vs2, rs1, vm # vd[i] = roundoff_signed(vs2[i], x[rs1])
+ vssra.vi vd, vs2, uimm, vm # vd[i] = roundoff_signed(vs2[i], uimm)
+----
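+
+NOTE: A non-normative worked example of the scaling shift, with
+arbitrary operand values. As described above, only the low lg2(SEW)
+bits of the shift amount are used, so with SEW=32 a shift amount of 35
+shifts by 3.
+
+----
+# vssra.vi with vs2[i] = 30, uimm = 3
+#   roundoff_signed(30, 3): 30 >> 3 = 3, discarded bits = 0b110
+#   vxrm = rnu: 3 + 1 = 4
+#   vxrm = rdn: 3 + 0 = 3
+----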
+
+==== Vector Narrowing Fixed-Point Clip Instructions
+
+The `vnclip` instructions are used to pack a fixed-point value into a
+narrower destination. The instructions support rounding, scaling, and
+saturation into the final destination format. The source data is in
+the vector register group specified by `vs2`. The scaling shift amount
+value can come from a vector register group `vs1`, a scalar integer
+register `rs1`, or a zero-extended 5-bit immediate. The low
+lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the
+low 6 bits for a SEW=64-bit to SEW=32-bit narrowing operation) are
+used to control the right shift amount, which provides the scaling.
+
+----
+# Narrowing unsigned clip
+# SEW 2*SEW SEW
+ vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i]))
+ vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1]))
+ vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm))
+
+# Narrowing signed clip
+ vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i]))
+ vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1]))
+ vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm))
+----
+
+For `vnclipu`/`vnclip`, the rounding mode is specified in the `vxrm`
+CSR. Rounding occurs around the least-significant bit of the
+destination and before saturation.
+
+For `vnclipu`, the shifted rounded source value is treated as an
+unsigned integer and saturates if the result would overflow the
+destination viewed as an unsigned integer.
+
+NOTE: There is no single instruction that can saturate a signed value
+into an unsigned destination. A sequence of two vector instructions
+that first removes negative numbers by performing a max against 0
+using `vmax` then clips the resulting unsigned value into the
+destination using `vnclipu` can be used if setting the `vxsat` value for
+negative numbers is not required. A `vsetvli` is required in between
+these two instructions to change SEW.
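+
+NOTE: A non-normative sketch of the two-instruction sequence described
+above, clipping signed 32-bit elements to unsigned 16-bit elements.
+The register assignments and the use of `a0` for the application
+vector length are arbitrary:
+
+----
+    vsetvli t0, a0, e32, m4, ta, ma   # Operate at the source width
+    vmax.vx v8, v8, zero              # Replace negative elements with 0
+    vsetvli t0, a0, e16, m2, ta, ma   # Change SEW to the destination width
+    vnclipu.wi v4, v8, 0              # Saturate the 32-bit values into 16-bit elements
+----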
+
+For `vnclip`, the shifted rounded source value is treated as a signed
+integer and saturates if the result would overflow the destination viewed
+as a signed integer.
+
+If any destination element is saturated, the `vxsat` bit is set in the
+`vxsat` register.
+
+[[sec-vector-float]]
+=== Vector Floating-Point Instructions
+
+The standard vector floating-point instructions treat elements as
+IEEE-754/2008-compatible values. If the EEW of a vector
+floating-point operand does not correspond to a supported IEEE
+floating-point type, the instruction encoding is reserved.
+
+NOTE: Whether floating-point is supported, and for which element
+widths, is determined by the specific vector extension. The current
+set of extensions includes support for 32-bit and 64-bit floating-point
+values. When 16-bit and 128-bit element widths are added, they will
+also be treated as IEEE-754/2008-compatible values. Other
+floating-point formats may be supported in future extensions.
+
+Vector floating-point instructions require the presence of base scalar
+floating-point extensions corresponding to the supported vector
+floating-point element widths.
+
+NOTE: In particular, future vector extensions supporting 16-bit
+half-precision floating-point values will also require some scalar
+half-precision floating-point support.
+
+If the floating-point unit status field `mstatus.FS` is `Off` then any
+attempt to execute a vector floating-point instruction will raise an
+illegal instruction exception. Any vector floating-point instruction
+that modifies any floating-point extension state (i.e., floating-point
+CSRs or `f` registers) must set `mstatus.FS` to `Dirty`.
+
+If the hypervisor extension is implemented and V=1, the `vsstatus.FS` field is
+additionally in effect for vector floating-point instructions. If
+`vsstatus.FS` or `mstatus.FS` is `Off` then any
+attempt to execute a vector floating-point instruction will raise an
+illegal instruction exception. Any vector floating-point instruction
+that modifies any floating-point extension state (i.e., floating-point
+CSRs or `f` registers) must set both `mstatus.FS` and `vsstatus.FS` to `Dirty`.
+
+The vector floating-point instructions have the same behavior as the
+scalar floating-point instructions with regard to NaNs.
+
+Scalar values for floating-point vector-scalar operations are sourced
+as described in Section <<sec-arithmetic-encoding>>.
+
+==== Vector Floating-Point Exception Flags
+
+A vector floating-point exception at any active floating-point element
+sets the standard FP exception flags in the `fflags` register. Inactive
+elements do not set FP exception flags.
+
+==== Vector Single-Width Floating-Point Add/Subtract Instructions
+
+----
+ # Floating-point add
+ vfadd.vv vd, vs2, vs1, vm # Vector-vector
+ vfadd.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point subtract
+ vfsub.vv vd, vs2, vs1, vm # Vector-vector
+ vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1]
+ vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i]
+----
+
+==== Vector Widening Floating-Point Add/Subtract Instructions
+
+----
+# Widening FP add/subtract, 2*SEW = SEW +/- SEW
+vfwadd.vv vd, vs2, vs1, vm # vector-vector
+vfwadd.vf vd, vs2, rs1, vm # vector-scalar
+vfwsub.vv vd, vs2, vs1, vm # vector-vector
+vfwsub.vf vd, vs2, rs1, vm # vector-scalar
+
+# Widening FP add/subtract, 2*SEW = 2*SEW +/- SEW
+vfwadd.wv vd, vs2, vs1, vm # vector-vector
+vfwadd.wf vd, vs2, rs1, vm # vector-scalar
+vfwsub.wv vd, vs2, vs1, vm # vector-vector
+vfwsub.wf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Floating-Point Multiply/Divide Instructions
+
+----
+ # Floating-point multiply
+ vfmul.vv vd, vs2, vs1, vm # Vector-vector
+ vfmul.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point divide
+ vfdiv.vv vd, vs2, vs1, vm # Vector-vector
+ vfdiv.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Reverse floating-point divide vector = scalar / vector
+ vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i]
+----
+
+==== Vector Widening Floating-Point Multiply
+
+----
+# Widening floating-point multiply
+vfwmul.vv vd, vs2, vs1, vm # vector-vector
+vfwmul.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+
+All four varieties of fused multiply-add are provided, each in two
+destructive forms that overwrite one of the operands, either the
+addend or the first multiplicand.
+
+----
+# FP multiply-accumulate, overwrites addend
+vfmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vfmacc.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) + vd[i]
+
+# FP negate-(multiply-accumulate), overwrites subtrahend
+vfnmacc.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) - vd[i]
+vfnmacc.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) - vd[i]
+
+# FP multiply-subtract-accumulator, overwrites subtrahend
+vfmsac.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) - vd[i]
+vfmsac.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) - vd[i]
+
+# FP negate-(multiply-subtract-accumulator), overwrites minuend
+vfnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vfnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i]
+
+# FP multiply-add, overwrites multiplicand
+vfmadd.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vd[i]) + vs2[i]
+vfmadd.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vd[i]) + vs2[i]
+
+# FP negate-(multiply-add), overwrites multiplicand
+vfnmadd.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) - vs2[i]
+vfnmadd.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vd[i]) - vs2[i]
+
+# FP multiply-sub, overwrites multiplicand
+vfmsub.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vd[i]) - vs2[i]
+vfmsub.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vd[i]) - vs2[i]
+
+# FP negate-(multiply-sub), overwrites multiplicand
+vfnmsub.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) + vs2[i]
+vfnmsub.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vd[i]) + vs2[i]
+----
+
+NOTE: While we considered using the two unused rounding modes
+in the scalar FP FMA encoding to provide a few non-destructive FMAs,
+these would complicate microarchitectures by being the only maskable
+operations with three inputs and a separate output.
+
+==== Vector Widening Floating-Point Fused Multiply-Add Instructions
+
+The widening floating-point fused multiply-add instructions all
+overwrite the wide addend with the result. The multiplier inputs are
+all SEW wide, while the addend and destination are 2*SEW bits wide.
+
+----
+# FP widening multiply-accumulate, overwrites addend
+vfwmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vfwmacc.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) + vd[i]
+
+# FP widening negate-(multiply-accumulate), overwrites addend
+vfwnmacc.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) - vd[i]
+vfwnmacc.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) - vd[i]
+
+# FP widening multiply-subtract-accumulator, overwrites addend
+vfwmsac.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) - vd[i]
+vfwmsac.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) - vd[i]
+
+# FP widening negate-(multiply-subtract-accumulator), overwrites addend
+vfwnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vfwnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i]
+----
+
+==== Vector Floating-Point Square-Root Instruction
+
+This is a unary vector-vector instruction.
+
+----
+ # Floating-point square root
+ vfsqrt.v vd, vs2, vm # Vector-vector square root
+----
+
+==== Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+
+----
+ # Floating-point reciprocal square-root estimate to 7 bits.
+ vfrsqrt7.v vd, vs2, vm
+----
+
+This is a unary vector-vector instruction that returns an estimate of
+1/sqrt(x) accurate to 7 bits.
+
+NOTE: An earlier draft version used the assembler name `vfrsqrte7`,
+but this was deemed to cause confusion with the ``e``__x__ notation for element
+width. The earlier name can be retained as an alias in tool chains for
+backward compatibility.
+
+The following table describes the instruction's behavior for all
+classes of floating-point inputs:
+
+[cols="1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Input | Output | Exceptions raised
+
+| -{inf} {le} _x_ < -0.0 | canonical NaN | NV
+| -0.0 | -{inf} | DZ
+| +0.0 | +{inf} | DZ
+| +0.0 < _x_ < +{inf} | _estimate of 1/sqrt(x)_ |
+| +{inf} | +0.0 |
+| qNaN | canonical NaN |
+| sNaN | canonical NaN | NV
+|===
+
+NOTE: All positive normal and subnormal inputs produce normal outputs.
+
+NOTE: The output value is independent of the dynamic rounding mode.
+
+For the non-exceptional cases, the low bit of the exponent and the six high
+bits of the significand (after the leading one) are concatenated and used to
+address the following table.
+The output of the table becomes the seven high bits of the result significand
+(after the leading one); the remainder of the result significand is zero.
+Subnormal inputs are normalized and the exponent adjusted appropriately before
+the lookup.
+The output exponent is chosen to make the result approximate the reciprocal of
+the square root of the argument.
+
+More precisely, the result is computed as follows.
+Let the normalized input exponent be equal to the input exponent if the input
+is normal, or 0 minus the number of leading zeros in the significand
+otherwise.
+If the input is subnormal, the normalized input significand is given by
+shifting the input significand left by 1 minus the normalized input exponent,
+discarding the leading 1 bit.
+The output exponent equals floor((3*B - 1 - the normalized input exponent) / 2),
+where B is the exponent bias. The output sign equals the input sign.
+
+The following table gives the seven MSBs of the output significand as a
+function of the LSB of the normalized input exponent and the six MSBs of the
+normalized input significand; the other bits of the output significand are zero.
+
+include::images/wavedrom/vfrsqrt7.adoc[]
+
+NOTE: For example, when SEW=32, vfrsqrt7(0x00718abc ({approx} 1.043e-38)) = 0x5f080000 ({approx} 9.800e18), and vfrsqrt7(0x7f765432 ({approx} 3.274e38)) = 0x1f820000 ({approx} 5.506e-20).
+
+NOTE: The 7-bit accuracy was chosen as it requires 0, 1, 2, or 3
+Newton-Raphson iterations to converge close to bfloat16, FP16,
+FP32, and FP64 accuracy respectively. Future instructions can be defined
+with greater estimate accuracy.
+
+==== Vector Floating-Point Reciprocal Estimate Instruction
+
+----
+ # Floating-point reciprocal estimate to 7 bits.
+ vfrec7.v vd, vs2, vm
+----
+
+NOTE: An earlier draft version used the assembler name `vfrece7`,
+but this was deemed to cause confusion with the ``e``__x__ notation for element
+width. The earlier name can be retained as an alias in tool chains for
+backward compatibility.
+
+This is a unary vector-vector instruction that returns an estimate of
+1/x accurate to 7 bits.
+
+The following table describes the instruction's behavior for all
+classes of floating-point inputs, where _B_ is the exponent bias:
+
+[cols="1,1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Input (_x_) | Rounding Mode | Output (_y_ {approx} _1/x_) | Exceptions raised
+
+| -{inf} | _any_ | -0.0 |
+| -2^B+1^ < _x_ {le} -2^B^ (normal) | _any_ | -2^-(B+1)^ {ge} _y_ > -2^-B^ (subnormal, sig=01...) |
+| -2^B^ < _x_ {le} -2^B-1^ (normal) | _any_ | -2^-B^ {ge} _y_ > -2^-B+1^ (subnormal, sig=1...) |
+| -2^B-1^ < _x_ {le} -2^-B+1^ (normal) | _any_ | -2^-B+1^ {ge} _y_ > -2^B-1^ (normal) |
+| -2^-B+1^ < _x_ {le} -2^-B^ (subnormal, sig=1...) | _any_ | -2^B-1^ {ge} _y_ > -2^B^ (normal) |
+| -2^-B^ < _x_ {le} -2^-(B+1)^ (subnormal, sig=01...) | _any_ | -2^B^ {ge} _y_ > -2^B+1^ (normal) |
+| -2^-(B+1)^ < _x_ < -0.0 (subnormal, sig=00...) | RUP, RTZ | greatest-mag. negative finite value | NX, OF
+| -2^-(B+1)^ < _x_ < -0.0 (subnormal, sig=00...) | RDN, RNE, RMM | -{inf} | NX, OF
+| -0.0 | _any_ | -{inf} | DZ
+| +0.0 | _any_ | +{inf} | DZ
+| +0.0 < _x_ < 2^-(B+1)^ (subnormal, sig=00...) | RUP, RNE, RMM | +{inf} | NX, OF
+| +0.0 < _x_ < 2^-(B+1)^ (subnormal, sig=00...) | RDN, RTZ | greatest finite value | NX, OF
+| 2^-(B+1)^ {le} _x_ < 2^-B^ (subnormal, sig=01...) | _any_ | 2^B+1^ > _y_ {ge} 2^B^ (normal) |
+| 2^-B^ {le} _x_ < 2^-B+1^ (subnormal, sig=1...) | _any_ | 2^B^ > _y_ {ge} 2^B-1^ (normal) |
+| 2^-B+1^ {le} _x_ < 2^B-1^ (normal) | _any_ | 2^B-1^ > _y_ {ge} 2^-B+1^ (normal) |
+| 2^B-1^ {le} _x_ < 2^B^ (normal) | _any_ | 2^-B+1^ > _y_ {ge} 2^-B^ (subnormal, sig=1...) |
+| 2^B^ {le} _x_ < 2^B+1^ (normal) | _any_ | 2^-B^ > _y_ {ge} 2^-(B+1)^ (subnormal, sig=01...) |
+| +{inf} | _any_ | +0.0 |
+| qNaN | _any_ | canonical NaN |
+| sNaN | _any_ | canonical NaN | NV
+|===
+
+NOTE: Subnormal inputs with magnitude at least 2^-(B+1)^ produce normal outputs;
+other subnormal inputs produce infinite outputs.
+Normal inputs with magnitude at least 2^B-1^ produce subnormal outputs;
+other normal inputs produce normal outputs.
+
+NOTE: The output value depends on the dynamic rounding mode when
+the overflow exception is raised.
+
+For the non-exceptional cases, the seven high bits of the significand (after
+the leading one) are used to address the following table.
+The output of the table becomes the seven high bits of the result significand
+(after the leading one); the remainder of the result significand is zero.
+Subnormal inputs are normalized and the exponent adjusted appropriately before
+the lookup.
+The output exponent is chosen to make the result approximate the reciprocal of
+the argument, and subnormal outputs are denormalized accordingly.
+
+More precisely, the result is computed as follows.
+Let the normalized input exponent be equal to the input exponent if the input
+is normal, or 0 minus the number of leading zeros in the significand
+otherwise.
+The normalized output exponent equals (2*B - 1 - the normalized input exponent).
+If the normalized output exponent is outside the range [-1, 2*B], the result
+corresponds to one of the exceptional cases in the table above.
+
+If the input is subnormal, the normalized input significand is given by
+shifting the input significand left by 1 minus the normalized input exponent,
+discarding the leading 1 bit.
+Otherwise, the normalized input significand equals the input significand.
+The following table gives the seven MSBs of the normalized output significand
+as a function of the seven MSBs of the normalized input significand; the other
+bits of the normalized output significand are zero.
+
+include::images/wavedrom/vfrec7.adoc[]
+
+If the normalized output exponent is 0 or -1, the result is subnormal: the
+output exponent is 0, and the output significand is given by concatenating
+a 1 bit to the left of the normalized output significand, then shifting that
+quantity right by 1 minus the normalized output exponent.
+Otherwise, the output exponent equals the normalized output exponent, and the
+output significand equals the normalized output significand.
+The output sign equals the input sign.
+
+NOTE: For example, when SEW=32, vfrec7(0x00718abc ({approx} 1.043e-38)) = 0x7e900000 ({approx} 9.570e37), and vfrec7(0x7f765432 ({approx} 3.274e38)) = 0x00214000 ({approx} 3.053e-39).
+
+NOTE: The 7-bit accuracy was chosen as it requires 0, 1, 2, or 3
+Newton-Raphson iterations to converge close to bfloat16, FP16,
+FP32, and FP64 accuracy respectively. Future instructions can be defined
+with greater estimate accuracy.
+
+==== Vector Floating-Point MIN/MAX Instructions
+
+The vector floating-point `vfmin` and `vfmax` instructions have the
+same behavior as the corresponding scalar floating-point instructions
+in version 2.2 of the RISC-V F/D/Q extension: they perform the `minimumNumber`
+or `maximumNumber` operation on active elements.
+
+----
+ # Floating-point minimum
+ vfmin.vv vd, vs2, vs1, vm # Vector-vector
+ vfmin.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point maximum
+ vfmax.vv vd, vs2, vs1, vm # Vector-vector
+ vfmax.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Floating-Point Sign-Injection Instructions
+
+Vector versions of the scalar sign-injection instructions. The result
+takes all bits except the sign bit from the vector `vs2` operand.
+
+----
+ vfsgnj.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnj.vf vd, vs2, rs1, vm # vector-scalar
+
+ vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar
+
+ vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: A vector of floating-point values can be negated using a
+sign-injection instruction with both source operands set to the same
+vector operand. An assembly pseudoinstruction is provided: `vfneg.v vd,vs` = `vfsgnjn.vv vd,vs,vs`.
+
+NOTE: The absolute value of a vector of floating-point elements can be
+calculated using a sign-injection instruction with both source
+operands set to the same vector operand. An assembly
+pseudoinstruction is provided: `vfabs.v vd,vs` = `vfsgnjx.vv vd,vs,vs`.
+
+==== Vector Floating-Point Compare Instructions
+
+These vector FP compare instructions compare two source operands and
+write the comparison result to a mask register. The destination mask
+vector is always held in a single vector register, with a layout of
+elements as described in Section <<sec-mask-register-layout>>. The
+destination mask vector register may be the same as the source vector
+mask register (`v0`). Compares write mask registers, and so always
+operate under a tail-agnostic policy.
+
+The compare instructions follow the semantics of the scalar
+floating-point compare instructions. `vmfeq` and `vmfne` raise the invalid
+operation exception only on signaling NaN inputs. `vmflt`, `vmfle`, `vmfgt`,
+and `vmfge` raise the invalid operation exception on both signaling and
+quiet NaN inputs.
+`vmfne` writes 1 to the destination element when either
+operand is NaN, whereas the other compares write 0 when either operand
+is NaN.
+
+----
+ # Compare equal
+ vmfeq.vv vd, vs2, vs1, vm # Vector-vector
+ vmfeq.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare not equal
+ vmfne.vv vd, vs2, vs1, vm # Vector-vector
+ vmfne.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare less than
+ vmflt.vv vd, vs2, vs1, vm # Vector-vector
+ vmflt.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare less than or equal
+ vmfle.vv vd, vs2, vs1, vm # Vector-vector
+ vmfle.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare greater than
+ vmfgt.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare greater than or equal
+ vmfge.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+----
+Comparison Assembler Mapping Assembler pseudoinstruction
+
+va < vb vmflt.vv vd, va, vb, vm
+va <= vb vmfle.vv vd, va, vb, vm
+va > vb vmflt.vv vd, vb, va, vm vmfgt.vv vd, va, vb, vm
+va >= vb vmfle.vv vd, vb, va, vm vmfge.vv vd, va, vb, vm
+
+va < f vmflt.vf vd, va, f, vm
+va <= f vmfle.vf vd, va, f, vm
+va > f vmfgt.vf vd, va, f, vm
+va >= f vmfge.vf vd, va, f, vm
+
+va, vb vector register groups
+f scalar floating-point register
+----
+
+NOTE: Providing all forms is necessary to correctly handle unordered
+compares for NaNs.
+
+NOTE: C99 floating-point quiet compares can be implemented by masking
+the signaling compares when either input is NaN, as follows. When
+the comparand is a non-NaN constant, the middle two instructions can be
+omitted.
+
+----
+ # Example of implementing isgreater()
+ vmfeq.vv v0, va, va # Only set where A is not NaN.
+ vmfeq.vv v1, vb, vb # Only set where B is not NaN.
+ vmand.mm v0, v0, v1 # Only set where A and B are ordered,
+ vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
+----
+
+NOTE: In the above sequence, it is tempting to mask the second `vmfeq`
+instruction and remove the `vmand` instruction, but this more efficient
+sequence incorrectly fails to raise the invalid exception when an
+element of `va` contains a quiet NaN and the corresponding element in
+`vb` contains a signaling NaN.
+
+==== Vector Floating-Point Classify Instruction
+
+This is a unary vector-vector instruction that operates in the same
+way as the scalar classify instruction.
+
+----
+ vfclass.v vd, vs2, vm # Vector-vector
+----
+
+The 10-bit mask produced by this instruction is placed in the
+least-significant bits of the result elements. The upper (SEW-10)
+bits of the result are filled with zeros. The instruction is only
+defined for SEW=16b and above, so the result will always fit in the
+destination elements.
+
+==== Vector Floating-Point Merge Instruction
+
+A vector-scalar floating-point merge instruction is provided, which
+operates on all body elements from `vstart` up to the current vector
+length in `vl` regardless of mask value.
+
+The `vfmerge.vfm` instruction is encoded as a masked instruction (`vm=0`).
+At elements where the mask value is zero, the first vector operand is
+copied to the destination element, otherwise a scalar floating-point
+register value is copied to the destination element.
+
+----
+vfmerge.vfm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
+----
+
+[[sec-vector-float-move]]
+==== Vector Floating-Point Move Instruction
+
+The vector floating-point move instruction __splats__ a floating-point
+scalar operand to a vector register group. The instruction copies a
+scalar `f` register value to all active elements of a vector register
+group. This instruction is encoded as an unmasked instruction (`vm=1`).
+The instruction must have the `vs2` field set to `v0`, with all other
+values for `vs2` reserved.
+
+----
+vfmv.v.f vd, rs1 # vd[i] = f[rs1]
+----
+
+NOTE: The `vfmv.v.f` instruction shares the encoding with the `vfmerge.vfm`
+instruction, but with `vm=1` and `vs2=v0`.
+
+==== Single-Width Floating-Point/Integer Type-Convert Instructions
+
+Conversion operations are provided to convert to and from
+floating-point values and unsigned and signed integers, where both
+source and destination are SEW wide.
+
+----
+vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer.
+vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer.
+
+vfcvt.rtz.xu.f.v vd, vs2, vm # Convert float to unsigned integer, truncating.
+vfcvt.rtz.x.f.v vd, vs2, vm # Convert float to signed integer, truncating.
+
+vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float.
+vfcvt.f.x.v vd, vs2, vm # Convert signed integer to float.
+----
+
+The conversions follow the same rules on exceptional conditions as the
+scalar conversion instructions.
+The conversions use the dynamic rounding mode in `frm`, except for the `rtz`
+variants, which round towards zero.
+
+NOTE: The `rtz` variants are provided to accelerate truncating conversions
+from floating-point to integer, as is common in languages like C and Java.
+
+==== Widening Floating-Point/Integer Type-Convert Instructions
+
+A set of conversion instructions is provided to convert narrower
+integer and floating-point datatypes to a type of twice the
+width.
+
+----
+vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
+vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer.
+
+vfwcvt.rtz.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer, truncating.
+vfwcvt.rtz.x.f.v vd, vs2, vm # Convert float to double-width signed integer, truncating.
+
+vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
+vfwcvt.f.x.v vd, vs2, vm # Convert signed integer to double-width float.
+
+vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
+----
+
+These instructions have the same constraints on vector register overlap
+as other widening instructions (see <<sec-widening>>).
+
+NOTE: A double-width IEEE floating-point value can always represent a
+single-width integer exactly.
+
+NOTE: A double-width IEEE floating-point value can always represent a
+single-width IEEE floating-point value exactly.
+
+NOTE: A full set of floating-point widening conversions is not
+supported as single instructions, but any widening conversion can be
+implemented as several doubling steps with equivalent results and no
+additional exception flags raised.
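+
+NOTE: A non-normative sketch of such a doubling-step sequence,
+converting 16-bit signed integers to double-precision values via
+single-precision. The register assignments and the use of `a0` for
+the application vector length are arbitrary; both steps in this
+particular pairing happen to be exact, so no additional exception
+flags are raised:
+
+----
+    vsetvli t0, a0, e16, m1, ta, ma   # SEW=16
+    vfwcvt.f.x.v v8, v4               # int16 -> FP32 (SEW to 2*SEW)
+    vsetvli t0, a0, e32, m2, ta, ma   # SEW=32
+    vfwcvt.f.f.v v12, v8              # FP32 -> FP64 (SEW to 2*SEW)
+----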
+
+==== Narrowing Floating-Point/Integer Type-Convert Instructions
+
+A set of conversion instructions is provided to convert wider integer
+and floating-point datatypes to a type of half the width.
+
+----
+vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer.
+vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer.
+
+vfncvt.rtz.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer, truncating.
+vfncvt.rtz.x.f.w vd, vs2, vm # Convert double-width float to signed integer, truncating.
+
+vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
+vfncvt.f.x.w vd, vs2, vm # Convert double-width signed integer to float.
+
+vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float.
+vfncvt.rod.f.f.w vd, vs2, vm # Convert double-width float to single-width float,
+ # rounding towards odd.
+----
+
+These instructions have the same constraints on vector register overlap
+as other narrowing instructions (see <<sec-narrowing>>).
+
+NOTE: A full set of floating-point narrowing conversions is not
+supported as single instructions. Conversions can be implemented in
+a sequence of halving steps. Results are equivalently rounded and
+the same exception flags are raised if all but the last halving step
+use round-towards-odd (`vfncvt.rod.f.f.w`). Only the final step
+should use the desired rounding mode.
+
+NOTE: For `vfncvt.rod.f.f.w`, a finite value that exceeds the range of the
+destination format is converted to the destination format's largest finite value with the same sign.
+
+=== Vector Reduction Operations
+
+Vector reduction operations take a vector register group of elements
+and a scalar held in element 0 of a vector register, and perform a
+reduction using some binary operator, to produce a scalar result in
+element 0 of a vector register. The scalar input and output operands
+are held in element 0 of a single vector register, not a vector
+register group, so any vector register can be the scalar source or
+destination of a vector reduction regardless of LMUL setting.
+
+The destination vector register can overlap the source operands,
+including the mask register.
+
+NOTE: Vector reductions read and write the scalar operand and result
+into element 0 of a vector register instead of a scalar register to
+avoid a loss of decoupling with the scalar processor, and to support
+future polymorphic use with future types not supported in the scalar
+unit.
+
+Inactive elements from the source vector register group are excluded
+from the reduction, but the scalar operand is always included
+regardless of the mask values.
+
+The other elements in the destination vector register ( 0 < index <
+VLEN/SEW) are considered the tail and are managed with the current
+tail agnostic/undisturbed policy.
+
+If `vl`=0, no operation is performed and the destination register is
+not updated.
+
+NOTE: This choice of behavior for `vl`=0 reduces implementation
+complexity as it is consistent with other operations on vector
+register state. For the common case that the source and destination
+scalar operand are the same vector register, this behavior also
+produces the expected result. For the uncommon case that the source
+and destination scalar operand are in different vector registers, this
+instruction will not copy the source into the destination when `vl`=0.
+However, it is expected that in most of these cases it will be
+statically known that `vl` is not zero. In other cases, a check for
+`vl`=0 will have to be added to ensure that the source scalar is
+copied to the destination (e.g., by explicitly setting `vl`=1 and
+performing a register-register copy).
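+
+NOTE: A non-normative sketch of such a check, for the uncommon case
+where the scalar source and destination are different vector
+registers. The register assignments, element width, and the use of
+`a0` for the application vector length are arbitrary:
+
+----
+    bnez a0, do_reduce                 # Skip the fix-up when the length is non-zero
+    vsetivli zero, 1, e32, m1, ta, ma  # Explicitly set vl=1
+    vmv.v.v v2, v1                     # Copy the scalar operand v1[0] into v2[0]
+do_reduce:
+    # ... strip-mined reduction into v2 follows ...
+----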
+
+Traps on vector reduction instructions are always reported with a
+`vstart` of 0. Vector reduction operations raise an illegal
+instruction exception if `vstart` is non-zero.
+
+The assembler syntax for a reduction operation is `vredop.vs`, where
+the `.vs` suffix denotes the first operand is a vector register group
+and the second operand is a scalar stored in element 0 of a vector
+register.
+
+[[sec-vector-integer-reduce]]
+==== Vector Single-Width Integer Reduction Instructions
+
+All operands and results of single-width reduction instructions have
+the same SEW width. Overflows wrap around on arithmetic sums.
+
+----
+ # Simple reductions, where [*] denotes all active elements:
+ vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] )
+ vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] )
+ vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] )
+ vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] )
+ vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] )
+ vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] )
+ vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] )
+ vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] )
+----
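+
+NOTE: A non-normative sketch of a strip-mined sum reduction using
+`vredsum.vs`. The function name, register assignments, and calling
+convention shown are arbitrary:
+
+----
+    # int32_t vec_sum(size_t n, const int32_t *x)
+    #   a0 = n, a1 = x
+vec_sum:
+    vsetvli t0, zero, e32, m1, ta, ma
+    vmv.s.x v8, zero                  # Running sum in element 0 of v8
+loop:
+    vsetvli t0, a0, e32, m8, ta, ma   # Strip-mine the remaining elements
+    vle32.v v16, (a1)                 # Load a strip of elements
+    vredsum.vs v8, v16, v8            # v8[0] = v8[0] + sum(v16[*])
+    sub a0, a0, t0                    # Decrement the element count
+    slli t1, t0, 2                    # Bytes consumed by this strip
+    add a1, a1, t1                    # Bump the input pointer
+    bnez a0, loop                     # Any more?
+    vmv.x.s a0, v8                    # Move the accumulated sum to a0
+    ret
+----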
+
+[[sec-vector-integer-reduce-widen]]
+==== Vector Widening Integer Reduction Instructions
+
+The unsigned `vwredsumu.vs` instruction zero-extends the SEW-wide
+vector elements before summing them, then adds the 2*SEW-width scalar
+element, and stores the result in a 2*SEW-width scalar element.
+
+The `vwredsum.vs` instruction sign-extends the SEW-wide vector
+elements before summing them.
+
+For both `vwredsumu.vs` and `vwredsum.vs`, overflows wrap around.
+
+----
+ # Unsigned sum reduction into double-width accumulator
+ vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW))
+
+ # Signed sum reduction into double-width accumulator
+ vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW))
+----
+
+[[sec-vector-float-reduce]]
+==== Vector Single-Width Floating-Point Reduction Instructions
+
+----
+ # Simple reductions.
+ vfredosum.vs vd, vs2, vs1, vm # Ordered sum
+ vfredusum.vs vd, vs2, vs1, vm # Unordered sum
+ vfredmax.vs vd, vs2, vs1, vm # Maximum value
+ vfredmin.vs vd, vs2, vs1, vm # Minimum value
+
+----
+
+NOTE: The older assembler mnemonic `vfredsum` is retained as an alias for `vfredusum`.
+
+===== Vector Ordered Single-Width Floating-Point Sum Reduction
+
+The `vfredosum` instruction must sum the floating-point values in
+element order, starting with the scalar in `vs1[0]`--that is, it
+performs the computation:
+
+----
+ vd[0] = (((vs1[0] + vs2[0]) + vs2[1]) + ...) + vs2[vl-1]
+----
+where each addition operates identically to the scalar floating-point
+instructions in terms of raising exception flags and generating or
+propagating special values.
+
+NOTE: The ordered reduction supports compiler autovectorization, while
+the unordered FP sum allows for faster implementations.
+
+When the operation is masked (`vm=0`), the masked-off elements do not
+affect the result or the exception flags.
+
+NOTE: If no elements are active, no additions are performed, so the scalar in
+`vs1[0]` is simply copied to the destination register, without canonicalizing
+NaN values and without setting any exception flags. This behavior preserves
+the handling of NaNs, exceptions, and rounding when autovectorizing a scalar
+summation loop.
+
+===== Vector Unordered Single-Width Floating-Point Sum Reduction
+
+The unordered sum reduction instruction, `vfredusum`, provides an
+implementation more freedom in performing the reduction.
+
+The implementation must produce a result equivalent to a reduction tree
+composed of binary operator nodes, with the inputs being elements from
+the source vector register group (`vs2`) and the source scalar value
+(`vs1[0]`). Each operator in the tree accepts two inputs and produces
+one result.
+Each operator first computes an exact sum as a RISC-V scalar floating-point
+addition with infinite exponent range and precision, then converts this exact
+sum to a floating-point format with range and precision each at least as great
+as the element floating-point format indicated by SEW, rounding using the
+currently active floating-point dynamic rounding mode and raising exception
+flags as necessary.
+A different floating-point range and precision may be chosen for the result of
+each operator.
+A node where one input is derived only from elements masked-off or beyond the
+active vector length may either treat that input as the additive identity of the
+appropriate EEW or simply copy the other input to its output.
+The rounded result from the root node in the tree is converted (rounded again,
+using the dynamic rounding mode) to the standard floating-point format
+indicated by SEW.
+An implementation
+is allowed to add an additional additive identity to the final result.
+
+The additive identity is +0.0 when rounding down (towards -{inf}) or
+-0.0 for all other rounding modes.
+
+The reduction tree structure must be deterministic for a given value
+in `vtype` and `vl`.
+
+NOTE: As a consequence of this definition, implementations need not propagate
+NaN payloads through the reduction tree when no elements are active. In
+particular, if no elements are active and the scalar input is NaN,
+implementations are permitted to canonicalize the NaN and, if the NaN is
+signaling, set the invalid exception flag. Implementations are alternatively
+permitted to pass through the original NaN and set no exception flags, as with
+`vfredosum`.
+
+NOTE: The `vfredosum` instruction is a valid implementation of the
+`vfredusum` instruction.
+
+===== Vector Single-Width Floating-Point Max and Min Reductions
+
+The `vfredmin` and `vfredmax` instructions reduce the scalar argument in
+`vs1[0]` and active elements in `vs2` using the `minimumNumber` and
+`maximumNumber` operations, respectively.
+
+NOTE: Floating-point max and min reductions should return the same
+final value and raise the same exception flags regardless of operation
+order.
+
+NOTE: If no elements are active, the scalar in `vs1[0]` is simply copied to
+the destination register, without canonicalizing NaN values and without
+setting any exception flags.
+
+[[sec-vector-float-reduce-widen]]
+==== Vector Widening Floating-Point Reduction Instructions
+
+Widening forms of the sum reductions are provided that
+read and write a double-width reduction result.
+
+----
+ # Simple reductions.
+ vfwredosum.vs vd, vs2, vs1, vm # Ordered sum
+ vfwredusum.vs vd, vs2, vs1, vm # Unordered sum
+----
+
+NOTE: The older assembler mnemonic `vfwredsum` is retained as an alias for `vfwredusum`.
+
+The reduction of the SEW-width elements is performed as in the
+single-width reduction case, with the elements in `vs2` promoted
+to 2*SEW bits before adding to the 2*SEW-bit accumulator.
+
+NOTE: `vfwredosum.vs` handles inactive elements and NaN payloads analogously
+to `vfredosum.vs`; `vfwredusum.vs` does so analogously to `vfredusum.vs`.
+
+[[sec-vector-mask]]
+=== Vector Mask Instructions
+
+Several instructions are provided to help operate on mask values held in
+a vector register.
+
+[[sec-mask-register-logical]]
+==== Vector Mask-Register Logical Instructions
+
+Vector mask-register logical operations operate on mask registers.
+Each element in a mask register is a single bit, so these instructions
+all operate on single vector registers regardless of the setting of
+the `vlmul` field in `vtype`. They do not change the value of
+`vlmul`. The destination vector register may be the same as either
+source vector register.
+
+As with other vector instructions, the elements with indices less than
+`vstart` are unchanged, and `vstart` is reset to zero after execution.
+Vector mask logical instructions are always unmasked, so there are no
+inactive elements, and the encodings with `vm=0` are reserved.
+Mask elements past `vl`, the tail elements, are
+always updated with a tail-agnostic policy.
+
+----
+ vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i]
+ vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i])
+ vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i]
+ vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i]
+ vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i]
+ vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i])
+ vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i]
+ vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i])
+----
+
+NOTE: The previous assembler mnemonics `vmandnot` and `vmornot` have
+been changed to `vmandn` and `vmorn` to be consistent with the
+equivalent scalar instructions. The old `vmandnot` and `vmornot`
+mnemonics can be retained as assembler aliases for compatibility.
+
+Several assembler pseudoinstructions are defined as shorthand for
+common uses of mask logical operations:
+----
+ vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register
+ vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register
+ vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register
+ vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits
+----
+
+NOTE: The `vmmv.m` instruction was previously called `vmcpy.m`, but
+with the new mask layout it is more consistent to name it as a "mv"
+because bits are copied without interpretation. The `vmcpy.m` assembler
+pseudoinstruction can be retained for compatibility. For
+implementations that internally rearrange bits according to EEW, a
+`vmmv.m` instruction with the same source and destination can be used
+as an idiom to force an internal reformat into a mask vector.
+
+The set of eight mask logical instructions can generate any of the 16
+possible binary logical functions of the two input masks:
+
+[cols="1,1,1,1,12"]
+|===
+4+| inputs |
+
+| 0 | 0 | 1 | 1 | src1
+| 0 | 1 | 0 | 1 | src2
+|===
+
+[cols="1,1,1,1,6,6"]
+|===
+4+| output | instruction | pseudoinstruction
+
+| 0 | 0 | 0 | 0 | vmxor.mm vd, vd, vd | vmclr.m vd
+| 1 | 0 | 0 | 0 | vmnor.mm vd, src1, src2 |
+| 0 | 1 | 0 | 0 | vmandn.mm vd, src2, src1 |
+| 1 | 1 | 0 | 0 | vmnand.mm vd, src1, src1 | vmnot.m vd, src1
+| 0 | 0 | 1 | 0 | vmandn.mm vd, src1, src2 |
+| 1 | 0 | 1 | 0 | vmnand.mm vd, src2, src2 | vmnot.m vd, src2
+| 0 | 1 | 1 | 0 | vmxor.mm vd, src1, src2 |
+| 1 | 1 | 1 | 0 | vmnand.mm vd, src1, src2 |
+| 0 | 0 | 0 | 1 | vmand.mm vd, src1, src2 |
+| 1 | 0 | 0 | 1 | vmxnor.mm vd, src1, src2 |
+| 0 | 1 | 0 | 1 | vmand.mm vd, src2, src2 | vmmv.m vd, src2
+| 1 | 1 | 0 | 1 | vmorn.mm vd, src2, src1 |
+| 0 | 0 | 1 | 1 | vmand.mm vd, src1, src1 | vmmv.m vd, src1
+| 1 | 0 | 1 | 1 | vmorn.mm vd, src1, src2 |
+| 0 | 1 | 1 | 1 | vmor.mm vd, src1, src2 |
+| 1 | 1 | 1 | 1 | vmxnor.mm vd, vd, vd | vmset.m vd
+|===
+
+NOTE: The vector mask logical instructions are designed to be easily
+fused with a following masked vector operation to effectively expand
+the number of predicate registers by moving values into `v0` before
+use.
+
+
+==== Vector count population in mask `vcpop.m`
+
+----
+ vcpop.m rd, vs2, vm
+----
+
+NOTE: This instruction previously had the assembler mnemonic `vpopc.m`
+but was renamed to be consistent with the scalar instruction. The
+assembler instruction alias `vpopc.m` is being retained for software
+compatibility.
+
+The source operand is a single vector register holding mask register
+values as described in Section <<sec-mask-register-layout>>.
+
+The `vcpop.m` instruction counts the number of active mask elements of
+the vector source mask register that have the value 1 and writes the
+result to a scalar `x` register.
+
+The operation can be performed under a mask, in which case only the
+active elements are counted.
+
+----
+ vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] )
+----
+
+The `vcpop.m` instruction writes `x[rd]` even if `vl`=0 (with the
+value 0, since no mask elements are active).
+
+Traps on `vcpop.m` are always reported with a `vstart` of 0. The
+`vcpop.m` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+==== `vfirst` find-first-set mask bit
+
+----
+ vfirst.m rd, vs2, vm
+----
+
+The `vfirst` instruction finds the lowest-numbered active element of
+the source mask vector that has the value 1 and writes that element's
+index to a GPR. If no active element has the value 1, -1 is written
+to the GPR.
+
+NOTE: Software can assume that any negative value (highest bit set)
+corresponds to no element found, as vector lengths will never reach
+2^(XLEN-1)^ on any implementation.
+
+The `vfirst.m` instruction writes `x[rd]` even if `vl`=0 (with the
+value -1, since no mask elements are active).
+
+Traps on `vfirst` are always reported with a `vstart` of 0. The
+`vfirst` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+==== `vmsbf.m` set-before-first mask bit
+
+----
+ vmsbf.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsbf.m v2, v3
+ 0 0 0 0 0 0 1 1 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsbf.m v2, v3
+ 0 0 0 0 0 0 0 0 v2
+
+ 0 0 0 0 0 0 0 0 v3 contents
+ vmsbf.m v2, v3
+ 1 1 1 1 1 1 1 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsbf.m v2, v3, v0.t
+ 0 1 x x x x 1 1 v2 contents
+----
+
+The `vmsbf.m` instruction takes a mask register as input and writes
+results to a mask register. The instruction writes a 1 to all active
+mask elements before the first active source element that is a 1, then
+writes a 0 to that element and all following active elements. If
+there is no set bit in the active elements of the source vector, then
+all active elements in the destination are written with a 1.
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsbf.m` are always reported with a `vstart` of 0. The
+`vmsbf` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== `vmsif.m` set-including-first mask bit
+
+The vector mask set-including-first instruction is similar to
+set-before-first, except it also includes the element with a set bit.
+
+----
+ vmsif.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsif.m v2, v3
+ 0 0 0 0 0 1 1 1 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsif.m v2, v3
+ 0 0 0 0 0 0 0 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsif.m v2, v3, v0.t
+ 1 1 x x x x 1 1 v2 contents
+----
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsif.m` are always reported with a `vstart` of 0. The
+`vmsif` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== `vmsof.m` set-only-first mask bit
+
+The vector mask set-only-first instruction is similar to
+set-before-first, except it only sets the first element with a bit
+set, if any.
+
+----
+ vmsof.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsof.m v2, v3
+ 0 0 0 0 0 1 0 0 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsof.m v2, v3
+ 0 0 0 0 0 0 0 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 1 0 1 0 1 0 0 v3 contents
+ vmsof.m v2, v3, v0.t
+ 0 1 x x x x 0 0 v2 contents
+----
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsof.m` are always reported with a `vstart` of 0. The
+`vmsof` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== Example using vector mask instructions
+
+The following is an example of vectorizing a data-dependent exit loop.
+
+----
+include::example/strcpy.s[lines=4..-1]
+----
+----
+include::example/strncpy.s[lines=4..-1]
+----
+
+==== Vector Iota Instruction
+
+The `viota.m` instruction reads a source vector mask register and
+writes to each element of the destination vector register group the
+sum of all the bits of elements in the mask register
+whose index is less than the element, i.e., a parallel prefix sum of
+the mask values.
+
+This instruction can be masked, in which case only the enabled
+elements contribute to the sum.
+
+----
+ viota.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 0 0 1 v2 contents
+ viota.m v4, v2 # Unmasked
+ 2 2 2 1 1 1 1 0 v4 result
+
+ 1 1 1 0 1 0 1 1 v0 contents
+ 1 0 0 1 0 0 0 1 v2 contents
+ 2 3 4 5 6 7 8 9 v4 contents
+ viota.m v4, v2, v0.t # Masked, vtype.vma=0
+ 1 1 1 5 1 7 1 0 v4 results
+----
+
+The result value is zero-extended to fill the destination element if
+SEW is wider than the result. If the result value would overflow the
+destination SEW, the least-significant SEW bits are retained.
+
+Traps on `viota.m` are always reported with a `vstart` of 0, and
+execution is always restarted from the beginning when resuming after a
+trap handler. An illegal instruction exception is raised if `vstart`
+is non-zero.
+
+The destination register group cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+The `viota.m` instruction can be combined with memory scatter
+instructions (indexed stores) to perform vector compress functions.
+
+----
+ # Compact non-zero elements from input memory array to output memory array
+ #
+ # size_t compact_non_zero(size_t n, const int* in, int* out)
+ # {
+ # size_t i;
+ # size_t count = 0;
+ # int *p = out;
+ #
+ # for (i=0; i<n; i++)
+ # {
+ # const int v = *in++;
+ # if (v != 0)
+ # *p++ = v;
+ # }
+ #
+ # return (size_t) (p - out);
+ # }
+ #
+ # a0 = n
+ # a1 = &in
+ # a2 = &out
+
+compact_non_zero:
+ li a6, 0 # Clear count of non-zero elements
+loop:
+ vsetvli a5, a0, e32, m8, ta, ma # 32-bit integers
+ vle32.v v8, (a1) # Load input vector
+ sub a0, a0, a5 # Decrement number done
+ slli a5, a5, 2 # Multiply by four bytes
+ vmsne.vi v0, v8, 0 # Locate non-zero values
+ add a1, a1, a5 # Bump input pointer
+ vcpop.m a5, v0 # Count number of elements set in v0
+ viota.m v16, v0 # Get destination offsets of active elements
+ add a6, a6, a5 # Accumulate number of elements
+ vsll.vi v16, v16, 2, v0.t # Multiply offsets by four bytes
+ slli a5, a5, 2 # Multiply number of non-zero elements by four bytes
+ vsuxei32.v v8, (a2), v16, v0.t # Scatter using scaled viota results under mask
+ add a2, a2, a5 # Bump output pointer
+ bnez a0, loop # Any more?
+
+ mv a0, a6 # Return count
+ ret
+----
+
+==== Vector Element Index Instruction
+
+The `vid.v` instruction writes each element's index to the
+destination vector register group, from 0 to `vl`-1.
+
+----
+ vid.v vd, vm # Write element ID to destination.
+----
+
+The instruction can be masked. Masking does not change the
+index value written to active elements.
+
+The `vs2` field of the instruction must be set to `v0`, otherwise the
+encoding is _reserved_.
+
+The result value is zero-extended to fill the destination element if
+SEW is wider than the result. If the result value would overflow the
+destination SEW, the least-significant SEW bits are retained.
+
+NOTE: Microarchitectures can implement the `vid.v` instruction using the
+same datapath as `viota.m` but with an implicit set mask source.
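+
+NOTE: A non-normative example of the result, assuming `vl`=8 and an
+unmasked execution:
+
+----
+   7 6 5 4 3 2 1 0   Element number
+
+   vid.v v4
+   7 6 5 4 3 2 1 0   v4 contents
+----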
+
+[[sec-vector-permute]]
+=== Vector Permutation Instructions
+
+A range of permutation instructions are provided to move elements
+around within the vector registers.
+
+==== Integer Scalar Move Instructions
+
+The integer scalar read/write instructions transfer a single
+value between a scalar `x` register and element 0 of a vector
+register. The instructions ignore LMUL and vector register groups.
+
+----
+vmv.x.s rd, vs2 # x[rd] = vs2[0] (vs1=0)
+vmv.s.x vd, rs1 # vd[0] = x[rs1] (vs2=0)
+----
+
+The `vmv.x.s` instruction copies a single SEW-wide element from index 0 of the
+source vector register to a destination integer register. If SEW > XLEN, the
+least-significant XLEN bits are transferred and the upper SEW-XLEN bits are
+ignored. If SEW < XLEN, the value is sign-extended to XLEN bits.
+
+NOTE: `vmv.x.s` performs its operation even if `vstart` {ge} `vl` or `vl`=0.
+
+The `vmv.s.x` instruction copies the scalar integer register to element 0 of
+the destination vector register. If SEW < XLEN, the least-significant bits
+are copied and the upper XLEN-SEW bits are ignored. If SEW > XLEN, the value
+is sign-extended to SEW bits. The other elements in the destination vector
+register ( 0 < index < VLEN/SEW) are treated as tail elements using the
+current tail agnostic/undisturbed policy. If `vstart` {ge} `vl`, no
+operation is performed and the destination register is not updated.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The encodings corresponding to the masked versions (`vm=0`) of `vmv.x.s`
+and `vmv.s.x` are reserved.
+
+==== Floating-Point Scalar Move Instructions
+
+The floating-point scalar read/write instructions transfer a single
+value between a scalar `f` register and element 0 of a vector
+register. The instructions ignore LMUL and vector register groups.
+
+----
+vfmv.f.s rd, vs2 # f[rd] = vs2[0] (rs1=0)
+vfmv.s.f vd, rs1 # vd[0] = f[rs1] (vs2=0)
+----
+
+The `vfmv.f.s` instruction copies a single SEW-wide element from index
+0 of the source vector register to a destination scalar floating-point
+register.
+
+NOTE: `vfmv.f.s` performs its operation even if `vstart` {ge} `vl` or `vl`=0.
+
+The `vfmv.s.f` instruction copies the scalar floating-point register
+to element 0 of the destination vector register. The other elements
+in the destination vector register ( 0 < index < VLEN/SEW) are treated
+as tail elements using the current tail agnostic/undisturbed policy.
+If `vstart` {ge} `vl`, no operation is performed and the destination
+register is not updated.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The encodings corresponding to the masked versions (`vm=0`) of `vfmv.f.s`
+and `vfmv.s.f` are reserved.
+
+==== Vector Slide Instructions
+
+The slide instructions move elements up and down a vector register
+group.
+
+NOTE: The slide operations can be implemented much more efficiently
+than using the arbitrary register gather instruction. Implementations
+may optimize certain OFFSET values for `vslideup` and `vslidedown`.
+In particular, power-of-2 offsets may operate substantially faster
+than other offsets.
+
+For all of the `vslideup`, `vslidedown`, `v[f]slide1up`, and
+`v[f]slide1down` instructions, if `vstart` {ge} `vl`, the instruction performs no
+operation and leaves the destination vector register unchanged.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The tail agnostic/undisturbed policy is followed for tail elements.
+
+The slide instructions may be masked, with mask element _i_
+controlling whether _destination_ element _i_ is written. The mask
+undisturbed/agnostic policy is followed for inactive elements.
+
+===== Vector Slideup Instructions
+
+----
+ vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i]
+ vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i]
+----
+
+For `vslideup`, the value in `vl` specifies the maximum number of destination
+elements that are written. The start index (_OFFSET_) for the
+destination can be either specified using an unsigned integer in the
+`x` register specified by `rs1`, or a 5-bit immediate, zero-extended to XLEN bits.
+If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits.
+Destination elements _OFFSET_ through `vl`-1 are written if unmasked and
+if _OFFSET_ < `vl`.
+
+----
+ vslideup behavior for destination elements (`vstart` < `vl`)
+
+ OFFSET is amount to slideup, either from x register or a 5-bit immediate
+
+ 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged
+ max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
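+
+NOTE: A non-normative example, assuming `vl`=6, `vstart`=0, an unmasked
+execution, and arbitrary source element values `a`-`f`:
+
+----
+   5 4 3 2 1 0    Element number
+
+   f e d c b a    v4 contents
+                  vslideup.vi v8, v4, 2
+   d c b a u u    v8 contents ("u" = unchanged)
+----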
+
+The destination vector register group for `vslideup` cannot overlap
+the source vector register group, otherwise the instruction encoding
+is reserved.
+
+NOTE: The non-overlap constraint avoids WAR hazards on the
+input vectors during execution, and enables restart with non-zero
+`vstart`.
+
+===== Vector Slidedown Instructions
+
+----
+ vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]]
+ vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm]
+----
+
+For `vslidedown`, the value in `vl` specifies the maximum number of
+destination elements that are written. The remaining elements past
+`vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+The start index (_OFFSET_) for the source can be either specified
+using an unsigned integer in the `x` register specified by `rs1`, or a
+5-bit immediate, zero-extended to XLEN bits.
+If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits.
+
+----
+ vslidedown behavior for source elements for element i in slide (`vstart` < `vl`)
+ 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET]
+ VLMAX <= i+OFFSET src[i] = 0
+
+ vslidedown behavior for destination element i in slide (`vstart` < `vl`)
+ 0 <= i < vstart Unchanged
+ vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+
+----
+
+===== Vector Slide1up
+
+Variants of slide are provided that only move by one element but which
+also allow a scalar integer value to be inserted at the vacated
+element position.
+
+----
+ vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i]
+----
+
+The `vslide1up` instruction places the `x` register argument at
+location 0 of the destination vector register group, provided that
+element 0 is active, otherwise the destination element update follows the
+current mask agnostic/undisturbed policy. If XLEN < SEW, the value is
+sign-extended to SEW bits. If XLEN > SEW, the least-significant bits
+are copied over and the high XLEN-SEW bits are ignored.
+
+The remaining active `vl`-1 elements are copied over from index _i_ in
+the source vector register group to index _i_+1 in the destination
+vector register group.
+
+The `vl` register specifies the maximum number of destination vector
+register elements updated with source values, and remaining elements
+past `vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+
+----
+ vslide1up behavior when vl > 0
+
+ i < vstart unchanged
+ 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled
+ max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
+
+The `vslide1up` instruction requires that the destination vector
+register group does not overlap the source vector register group.
+Otherwise, the instruction encoding is reserved.
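+
+As a non-normative illustration of a common use, the following excerpt
+of a stripmine loop body (register assignments are arbitrary) makes
+each element's predecessor available in a second register, with `t1`
+carrying the last element of the previous chunk into the next
+iteration:
+
+----
+    vsetvli t0, a0, e32, m1, ta, ma
+    vle32.v v4, (a1)             # v4[i] = x[i] for this chunk
+    vslide1up.vx v8, v4, t1      # v8[0] = carried-in x[-1], v8[i] = v4[i-1]
+    addi t2, t0, -1
+    vslidedown.vx v12, v4, t2    # v12[0] = last element of this chunk
+    vmv.x.s t1, v12              # Carry it into the next iteration
+----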
+
+[[sec-vfslide1up]]
+===== Vector Floating-Point Slide1up Instruction
+
+----
+ vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i]
+----
+
+The `vfslide1up` instruction is defined analogously to `vslide1up`,
+but sources its scalar argument from an `f` register.
+
+===== Vector Slide1down Instruction
+
+The `vslide1down` instruction copies the values of the first `vl`-1
+active elements from index _i_+1 in the source vector register group
+to index _i_ in the destination vector register group.
+
+The `vl` register specifies the maximum number of destination vector
+register elements written with source values, and remaining elements
+past `vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+----
+ vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1]
+----
+
+The `vslide1down` instruction places the `x` register argument at
+location `vl`-1 in the destination vector register, provided that
+element `vl-1` is active, otherwise the destination element update
+follows the current mask agnostic/undisturbed policy.
+If XLEN < SEW, the value is sign-extended to SEW bits. If
+XLEN > SEW, the least-significant bits are copied over and the high
+XLEN-SEW bits are ignored.
+
+----
+ vslide1down behavior
+
+ i < vstart unchanged
+ vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled
+ vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
+
+NOTE: The `vslide1down` instruction can be used to load values into a
+vector register without using memory and without disturbing other
+vector registers. This provides a path for debuggers to modify the
+contents of a vector register, albeit slowly, with multiple repeated
+`vslide1down` invocations.
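+
+For example, a debugger could assemble a four-element vector from
+scalar values without touching memory, as in the following
+non-normative sketch (register and element choices are arbitrary):
+
+----
+    vsetivli x0, 4, e32, m1, ta, ma
+    vslide1down.vx v4, v4, t0    # v4 = { old[1], old[2], old[3], t0 }
+    vslide1down.vx v4, v4, t1
+    vslide1down.vx v4, v4, t2
+    vslide1down.vx v4, v4, t3    # v4 = { t0, t1, t2, t3 }
+----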
+
+[[sec-vfslide1down]]
+===== Vector Floating-Point Slide1down Instruction
+
+----
+ vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1]
+----
+
+The `vfslide1down` instruction is defined analogously to `vslide1down`,
+but sources its scalar argument from an `f` register.
+
+==== Vector Register Gather Instructions
+
+The vector register gather instructions read elements from a first
+source vector register group at locations given by a second source
+vector register group. The index values in the second vector are
+treated as unsigned integers. The source vector can be read at any
+index < VLMAX regardless of `vl`. The maximum number of elements to write to
+the destination register is given by `vl`, and the remaining elements
+past `vl` are handled according to the current tail policy
+(Section <<sec-agnostic>>). The operation can be masked, and the mask
+undisturbed/agnostic policy is followed for inactive elements.
+
+----
+vrgather.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+vrgatherei16.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+----
+
+The `vrgather.vv` form uses SEW/LMUL for both the data and
+indices. The `vrgatherei16.vv` form uses SEW/LMUL for the data in
+`vs2` but EEW=16 and EMUL = (16/SEW)*LMUL for the indices in `vs1`.
+
+NOTE: When SEW=8, `vrgather.vv` can only reference vector elements
+0-255. The `vrgatherei16` form can index 64K elements, and can also
+be used to reduce the register capacity needed to hold indices when
+SEW > 16.
+
+If an element index is out of range (`vs1[i]` {ge} VLMAX), then zero
+is returned for the element value.
+
+Vector-scalar and vector-immediate forms of the register gather are
+also provided. These read one element from the source vector at the
+given index, and write this value to the active elements
+of the destination vector register. The index value in the scalar
+register and the immediate, zero-extended to XLEN bits, are treated as
+unsigned integers. If XLEN > SEW, the index value is _not_ truncated
+to SEW bits.
+
+NOTE: These forms allow any vector element to be "splatted" to an entire vector.
+
+----
+vrgather.vx vd, vs2, rs1, vm # vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]]
+vrgather.vi vd, vs2, uimm, vm # vd[i] = (uimm >= VLMAX) ? 0 : vs2[uimm]
+----
+
+For any `vrgather` instruction, the destination vector register group
+cannot overlap with the source vector register groups, otherwise the
+instruction encoding is reserved.
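+
+As a non-normative illustration, the following sketch (register
+assignments are arbitrary) reverses the first `vl` elements of `v4`
+into `v8` by gathering with computed indices:
+
+----
+    vsetvli t0, a0, e32, m1, ta, ma
+    vid.v v12                    # v12[i] = i
+    addi t1, t0, -1
+    vrsub.vx v12, v12, t1        # v12[i] = (vl-1) - i
+    vrgather.vv v8, v4, v12      # v8[i] = v4[vl-1-i]
+----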
+
+==== Vector Compress Instruction
+
+The vector compress instruction allows elements selected by a vector
+mask register from a source vector register group to be packed into
+contiguous elements at the start of the destination vector register
+group.
+
+----
+ vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled
+----
+
+The vector mask register specified by `vs1` indicates which of the
+first `vl` elements of vector register group `vs2` should be extracted
+and packed into contiguous elements at the beginning of vector
+register `vd`. The remaining elements of `vd` are treated as tail
+elements according to the current tail policy (Section
+<<sec-agnostic>>).
+
+----
+ Example use of vcompress instruction
+
+ 8 7 6 5 4 3 2 1 0 Element number
+
+ 1 1 0 1 0 0 1 0 1 v0
+ 8 7 6 5 4 3 2 1 0 v1
+ 1 2 3 4 5 6 7 8 9 v2
+ vsetivli t0, 9, e8, m1, tu, ma
+ vcompress.vm v2, v1, v0
+ 1 2 3 4 8 7 5 2 0 v2
+----
+
+`vcompress` is encoded as an unmasked instruction (`vm=1`). The equivalent
+masked instruction (`vm=0`) is reserved.
+
+The destination vector register group cannot overlap the source vector
+register group or the source mask register, otherwise the instruction
+encoding is reserved.
+
+A trap on a `vcompress` instruction is always reported with a
+`vstart` of 0. Executing a `vcompress` instruction with a non-zero
+`vstart` raises an illegal instruction exception.
+
+NOTE: Although possible, `vcompress` is one of the more difficult
+instructions to restart with a non-zero `vstart`, so the assumption is
+that implementations will choose not to do that, but will instead
+restart from element 0. This means that elements in the destination
+register after `vstart` may already have been updated.
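+
+As a non-normative illustration, the following sketch (pointer and
+register assignments are arbitrary) uses `vcompress` to pack all
+elements of an `int32_t` array that are less than 5 into a contiguous
+output array:
+
+----
+    # a0 = element count, a1 = input pointer, a2 = output pointer
+loop:
+    vsetvli t0, a0, e32, m1, ta, ma
+    vle32.v v4, (a1)             # Load input chunk
+    sub a0, a0, t0               # Decrement element count
+    slli t2, t0, 2
+    add a1, a1, t2               # Bump input pointer
+    vmslt.vi v0, v4, 5           # Select elements < 5
+    vcompress.vm v8, v4, v0      # Pack selected elements to front of v8
+    vcpop.m t1, v0               # Count of selected elements
+    vsetvli x0, t1, e32, m1, ta, ma
+    vse32.v v8, (a2)             # Store only the packed elements
+    slli t2, t1, 2
+    add a2, a2, t2               # Bump output pointer
+    bnez a0, loop
+----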
+
+===== Synthesizing `vdecompress`
+
+There is no inverse `vdecompress` provided, as this operation can be
+readily synthesized using iota and a masked vrgather:
+
+----
+ Desired functionality of 'vdecompress'
+ 7 6 5 4 3 2 1 0 # vid
+
+ e d c b a # packed vector of 5 elements
+ 1 0 0 1 1 1 0 1 # mask vector of 8 elements
+ p q r s t u v w # destination register before vdecompress
+
+ e q r d c b v a # result of vdecompress
+----
+
+----
+ # v0 holds mask
+ # v1 holds packed data
+ # v11 holds input expanded vector and result
+ viota.m v10, v0 # Calc iota from mask in v0
+ vrgather.vv v11, v1, v10, v0.t # Expand into destination
+----
+----
+ p q r s t u v w # v11 destination register
+ e d c b a # v1 source vector
+ 1 0 0 1 1 1 0 1 # v0 mask vector
+
+ 4 4 4 3 2 1 1 0 # v10 result of viota.m
+ e q r d c b v a # v11 destination after vrgather using viota.m under mask
+----
+
+==== Whole Vector Register Move
+
+The `vmv<nr>r.v` instructions copy whole vector registers (i.e., all
+VLEN bits) and can copy whole vector register groups. The `nr` value
+in the opcode is the number of individual vector registers, NREG, to
+copy. The instructions operate as if EEW=SEW, EMUL = NREG, effective
+length `evl` = EMUL * VLEN/SEW.
+
+NOTE: These instructions are intended to aid compilers to shuffle
+vector registers without needing to know or change `vl` or `vtype`.
+
+NOTE: The usual property that no elements are written if `vstart` {ge} `vl`
+does not apply to these instructions.
+Instead, no elements are written if `vstart` {ge} `evl`.
+
+NOTE: If `vd` is equal to `vs2` the instruction is an architectural
+NOP, but is treated as a hint to implementations that rearrange data
+internally that the register group will next be accessed with an EEW
+equal to SEW.
+
+The instruction is encoded as an OPIVI instruction. The number of
+vector registers to copy is encoded in the low three bits of the
+`simm` field (`simm[2:0]`) using the same encoding as the `nf[2:0]` field for memory
+instructions (Figure <<fig-nf>>), i.e., `simm[2:0]` = NREG-1.
+
+The value of NREG must be 1, 2, 4, or 8, and values of `simm[4:0]`
+other than 0, 1, 3, and 7 are reserved.
+
+NOTE: A future extension may support other numbers of registers to be moved.
+
+NOTE: The instruction uses the same funct6 encoding as the `vsmul`
+instruction but with an immediate operand, and only the unmasked
+version (`vm=1`). This encoding is chosen as it is close to the
+related `vmerge` encoding, and it is unlikely the `vsmul` instruction
+would benefit from an immediate form.
+
+----
+ vmv<nr>r.v vd, vs2 # General form
+
+ vmv1r.v v1, v2 # Copy v1=v2
+ vmv2r.v v10, v12 # Copy v10=v12; v11=v13
+ vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11
+ vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15
+----
+
+The source and destination vector register numbers must be aligned
+appropriately for the vector register group size, and encodings with
+other vector register numbers are reserved.
+
+NOTE: A future extension may relax the vector register alignment
+restrictions.
+
+=== Exception Handling
+
+On a trap during a vector instruction (caused by either a synchronous
+exception or an asynchronous interrupt), the existing `*epc` CSR is
+written with a pointer to the trapping vector instruction, while the
+`vstart` CSR contains the element index on which the trap was
+taken.
+
+NOTE: We chose to add a `vstart` CSR to allow resumption of a
+partially executed vector instruction to reduce interrupt latencies
+and to simplify forward-progress guarantees. This is similar to the
+scheme in the IBM 3090 vector facility. To ensure forward progress
+without the `vstart` CSR, implementations would have to guarantee an
+entire vector instruction can always complete atomically without
+generating a trap. This is particularly difficult to ensure in the
+presence of strided or scatter/gather operations and demand-paged
+virtual memory.
+
+==== Precise vector traps
+
+NOTE: We assume most supervisor-mode environments with demand-paging
+will require precise vector traps.
+
+Precise vector traps require that:
+
+. all instructions older than the trapping vector instruction have committed their results
+. no instructions newer than the trapping vector instruction have altered architectural state
+. any operations within the trapping vector instruction affecting result elements preceding the index in the `vstart` CSR have committed their results
+. no operations within the trapping vector instruction affecting result elements at or following the index in the `vstart` CSR have altered architectural state, except if restarting and completing the affected vector instruction will nevertheless produce the correct final state.
+
+We relax the last requirement to allow elements following `vstart` to
+have been updated at the time the trap is reported, provided that
+re-executing the instruction from the given `vstart` will correctly
+overwrite those elements.
+
+In idempotent memory regions, vector store instructions may have
+updated elements in memory past the element causing a synchronous
+trap. Non-idempotent memory regions must not have been updated for
+indices equal to or greater than the element that caused a synchronous
+trap during a vector store instruction.
+
+Except where noted above, vector instructions are allowed to overwrite
+their inputs, and so in most cases, the vector instruction restart
+must be from the `vstart` element index. However, there are a number of
+cases where this overwrite is prohibited to enable execution of the
+vector instructions to be idempotent and hence restartable from an
+earlier index location.
+
+Implementations must ensure that forward progress can eventually be
+guaranteed for the element or segment reported by `vstart`.
+
+==== Imprecise vector traps
+
+Imprecise vector traps are traps that are not precise. In particular,
+instructions newer than `*epc` may have committed results, and
+instructions older than `*epc` may have not completed execution.
+Imprecise traps are primarily intended to be used in situations where
+reporting an error and terminating execution is the appropriate
+response.
+
+NOTE: A profile might specify that interrupts are precise while other
+traps are imprecise. We assume many embedded implementations will
+generate only imprecise traps for vector instructions on fatal errors,
+as they will not require resumable traps.
+
+Imprecise traps shall report the faulting element in `vstart` for
+traps caused by synchronous vector exceptions.
+
+There is no support for imprecise traps in the current standard extensions.
+
+==== Selectable precise/imprecise traps
+
+Some profiles may choose to provide a privileged mode bit to select
+between precise and imprecise vector traps. Imprecise mode would run
+at high-performance but possibly make it difficult to discern error
+causes, while precise mode would run more slowly, but support
+debugging of errors albeit with a possibility of not experiencing the
+same errors as in imprecise mode.
+
+This mechanism is not defined in the current standard extensions.
+
+==== Swappable traps
+
+Another trap mode can support swappable state in the vector unit,
+where on a trap, special instructions can save and restore the vector
+unit microarchitectural state, to allow execution to continue
+correctly around imprecise traps.
+
+This mechanism is not defined in the current standard extensions.
+
+NOTE: A future extension might define a standard way of saving and
+restoring opaque microarchitectural state from a vector unit
+implementation to support context switching with imprecise traps.
+
+[[sec-vector-extensions]]
+=== Standard Vector Extensions
+
+This section describes the standard vector extensions.
+A set of smaller extensions intended for embedded
+use are named with a "Zve" prefix, while a larger vector extension
+designed for application processors is named as a single-letter V
+extension. A set of vector length extension names with prefix "Zvl"
+are also provided.
+
+The initial vector extensions are designed to act as a base for
+additional vector extensions in various domains, including
+cryptography and machine learning.
+
+==== Zvl*: Minimum Vector Length Standard Extensions
+
+All standard vector extensions have a minimum required VLEN as
+described below. A set of vector length extensions are provided to
+increase the minimum vector length of a vector extension.
+
+NOTE: The vector length extensions can be used to either specify
+additional software or architecture profile requirements, or to
+advertise hardware capabilities.
+
+.Vector length extensions
+[cols="1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Extension | Minimum VLEN
+
+| Zvl32b | 32
+| Zvl64b | 64
+| Zvl128b | 128
+| Zvl256b | 256
+| Zvl512b | 512
+| Zvl1024b | 1024
+|===
+
+NOTE: Longer vector length extensions should follow the same pattern.
+
+NOTE: Every vector length extension effectively includes all shorter
+vector length extensions.
+
+NOTE: The syntax for extension names is being revised, and these names
+are subject to change. The trailing "b" will be required to
+disambiguate numeric fields from version numbers.
+
+NOTE: Explicit use of the Zvl32b extension string is not required for
+any standard vector extension as they all effectively mandate at least
+this minimum, but the string can be useful when stating hardware
+capabilities.
+
+==== Zve*: Vector Extensions for Embedded Processors
+
+The following five standard extensions are defined to provide varying
+degrees of vector support and are intended for use with embedded
+processors. Any of these extensions can be added to base ISAs with
+XLEN=32 or XLEN=64. The table lists the minimum VLEN and supported
+EEWs for each extension as well as what floating-point types are
+supported.
+
+.Embedded vector extensions
+[cols="1,1,2,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Extension | Minimum VLEN | Supported EEW | FP32 | FP64
+
+| Zve32x | 32 | 8, 16, 32 | N | N
+| Zve32f | 32 | 8, 16, 32 | Y | N
+| Zve64x | 64 | 8, 16, 32, 64 | N | N
+| Zve64f | 64 | 8, 16, 32, 64 | Y | N
+| Zve64d | 64 | 8, 16, 32, 64 | Y | Y
+|===
+
+The Zve32f and Zve64x extensions depend on the Zve32x extension.
+The Zve64f extension depends on the Zve32f and Zve64x extensions.
+The Zve64d extension depends on the Zve64f extension.
+
+All Zve* extensions have precise traps.
+
+NOTE: There is currently no standard support for handling imprecise
+traps, so standard extensions have to provide precise traps.
+
+All Zve* extensions provide support for EEW of 8, 16, and 32, and
+Zve64* extensions also support EEW of 64.
+
+All Zve* extensions support the vector configuration instructions
+(Section <<sec-vector-config>>).
+
+All Zve* extensions support all vector load and store instructions
+(Section <<sec-vector-memory>>), except Zve64* extensions do not
+support EEW=64 for index values when XLEN=32.
+
+All Zve* extensions support all vector integer instructions (Section
+<<sec-vector-integer>>), except that the `vmulh` integer multiply
+variants that return the high word of the product (`vmulh.vv`,
+`vmulh.vx`, `vmulhu.vv`, `vmulhu.vx`, `vmulhsu.vv`, `vmulhsu.vx`) are
+not included for EEW=64 in Zve64*.
+
+NOTE: Producing the high-word of a product can take substantial
+additional gates for large EEW.
+
+All Zve* extensions support all vector fixed-point arithmetic
+instructions (<<sec-vector-fixed-point>>), except that `vsmul.vv` and
+`vsmul.vx` are not included for EEW=64 in Zve64*.
+
+NOTE: As with `vmulh`, `vsmul` requires a large amount of additional
+logic, and 64-bit fixed-point multiplies are relatively rare.
+
+All Zve* extensions support all vector integer single-width and
+widening reduction operations (Sections <<sec-vector-integer-reduce>>,
+<<sec-vector-integer-reduce-widen>>).
+
+All Zve* extensions support all vector mask instructions (Section
+<<sec-vector-mask>>).
+
+All Zve* extensions support all vector permutation instructions
+(Section <<sec-vector-permute>>), except that Zve32x and Zve64x
+do not include those with floating-point operands, and Zve64f does not include those
+with EEW=64 floating-point operands.
+
+The Zve32x extension depends on the Zicsr extension.
+The Zve32f and Zve64f extensions depend upon the F extension,
+and implement all
+vector floating-point instructions (Section <<sec-vector-float>>) for
+floating-point operands with EEW=32. Vector single-width floating-point reduction
+operations (<<sec-vector-float-reduce>>) for EEW=32 are supported.
+
+The Zve64d extension depends upon the D extension,
+and implements all vector
+floating-point instructions (Section <<sec-vector-float>>) for
+floating-point operands with EEW=32 or EEW=64 (including widening
+instructions and conversions between FP32 and FP64). Vector
+single-width floating-point reductions (<<sec-vector-float-reduce>>)
+for EEW=32 and EEW=64 are supported as well as widening reductions
+from FP32 to FP64.
+
+==== V: Vector Extension for Application Processors
+
+The single-letter V extension is intended for use in application
+processor profiles.
+
+The `misa.v` bit is set for implementations providing `misa` and
+supporting V.
+
+The V vector extension has precise traps.
+
+The V vector extension depends upon the Zvl128b and Zve64d extensions.
+
+NOTE: The value of 128 was chosen as a compromise for application
+processors. Providing a larger VLEN allows stripmining code to be
+elided in some cases for short vectors, but also increases the size of
+the minimum implementation. Note that larger LMUL can be used to
+avoid stripmining for longer known-size application vectors at the
+cost of having fewer available vector register groups. For example, an
+LMUL of 8 allows vectors of up to sixteen 64-bit elements to be
+processed without stripmining using four vector register groups.
+
+The V extension supports EEW of 8, 16, 32, and 64.
+
+The V extension supports the vector configuration instructions
+(Section <<sec-vector-config>>).
+
+The V extension supports all vector load and store instructions
+(Section <<sec-vector-memory>>), except the V extension does not
+support EEW=64 for index values when XLEN=32.
+
+The V extension supports all vector integer instructions (Section
+<<sec-vector-integer>>).
+
+The V extension supports all vector fixed-point arithmetic
+instructions (<<sec-vector-fixed-point>>).
+
+The V extension supports all vector integer single-width and
+widening reduction operations (Sections <<sec-vector-integer-reduce>>,
+<<sec-vector-integer-reduce-widen>>).
+
+The V extension supports all vector mask instructions (Section
+<<sec-vector-mask>>).
+
+The V extension supports all vector permutation instructions (Section
+<<sec-vector-permute>>).
+
+The V extension depends upon the F and D
+extensions, and implements all vector floating-point instructions
+(Section <<sec-vector-float>>) for floating-point operands with EEW=32
+or EEW=64 (including widening instructions and conversions between
+FP32 and FP64). Vector single-width floating-point reductions
+(<<sec-vector-float-reduce>>) for EEW=32 and EEW=64 are supported as
+well as widening reductions from FP32 to FP64.
+
+[NOTE]
+====
+As is the case with other RISC-V extensions, it is valid to
+include overlapping extensions in the same ISA string. For example,
+RV64GCV and RV64GCV_Zve64f are both valid and equivalent ISA strings,
+as is RV64GCV_Zve64f_Zve32x_Zvl128b.
+====
+
+==== Zvfhmin: Vector Extension for Minimal Half-Precision Floating-Point
+
+The Zvfhmin extension provides minimal support for vectors of IEEE 754-2008
+binary16 values, adding conversions to and from binary32.
+When the Zvfhmin extension is implemented, the `vfwcvt.f.f.v` and
+`vfncvt.f.f.w` instructions become defined when SEW=16.
+The EEW=16 floating-point operands of these instructions use the binary16
+format.
+
+The Zvfhmin extension depends on the Zve32f extension.
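+
+As a non-normative illustration, the following sketch (register
+assignments are arbitrary) uses `vfwcvt.f.f.v` under Zvfhmin to widen
+an array of binary16 values into binary32 values, assuming `a0` holds
+the element count and `a1`/`a2` hold the source and destination
+pointers:
+
+----
+loop:
+    vsetvli t0, a0, e16, m1, ta, ma
+    vle16.v v4, (a1)             # Load binary16 elements (EEW=16)
+    sub a0, a0, t0               # Decrement element count
+    slli t1, t0, 1
+    add a1, a1, t1               # Bump input pointer (2 bytes/element)
+    vfwcvt.f.f.v v8, v4          # Widen to binary32 (EEW=32, EMUL=2)
+    vse32.v v8, (a2)             # Store binary32 elements (EEW=32, EMUL=2)
+    slli t1, t0, 2
+    add a2, a2, t1               # Bump output pointer (4 bytes/element)
+    bnez a0, loop
+----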
+
+==== Zvfh: Vector Extension for Half-Precision Floating-Point
+
+The Zvfh extension provides support for vectors of IEEE 754-2008
+binary16 values.
+When the Zvfh extension is implemented, all instructions in Sections
+<<sec-vector-float>>, <<sec-vector-float-reduce>>,
+<<sec-vector-float-reduce-widen>>, <<sec-vector-float-move>>,
+<<sec-vfslide1up>>, and <<sec-vfslide1down>>
+become defined when SEW=16.
+The EEW=16 floating-point operands of these instructions use the binary16
+format.
+
+Additionally, conversions between 8-bit integers and binary16 values are
+provided. The floating-point-to-integer narrowing conversions
+(`vfncvt[.rtz].x[u].f.w`) and integer-to-floating-point
+widening conversions (`vfwcvt.f.x[u].v`) become defined when SEW=8.
+
+The Zvfh extension depends on the Zve32f and Zfhmin extensions.
+
+NOTE: Requiring basic scalar half-precision support makes Zvfh's
+vector-scalar instructions substantially more useful.
+We considered requiring more complete scalar half-precision support, but we
+reasoned that, for many half-precision vector workloads, performing the scalar
+computation in single-precision will suffice.
+
+=== Vector Instruction Listing
+
+include::images/wavedrom/v-inst-table.adoc[]
+
diff --git a/src/vector-examples.adoc b/src/vector-examples.adoc
new file mode 100644
index 0000000..9e54acd
--- /dev/null
+++ b/src/vector-examples.adoc
@@ -0,0 +1,125 @@
+[appendix]
+== Vector Assembly Code Examples
+
+The following are provided as non-normative text to help explain the vector ISA.
+
+=== Vector-vector add example
+
+----
+include::example/vvaddint32.s[lines=4..-1]
+----
+
+=== Example with mixed-width mask and compute
+
+----
+# Code using one width for predicate and different width for masked
+# compute.
+# int8_t a[]; int32_t b[], c[];
+# for (i=0; i<n; i++) { b[i] = (a[i] < 5) ? c[i] : 1; }
+#
+# Mixed-width code that keeps SEW/LMUL=8
+ loop:
+ vsetvli a4, a0, e8, m1, ta, ma # Byte vector for predicate calc
+ vle8.v v1, (a1) # Load a[i]
+ add a1, a1, a4 # Bump pointer.
+ vmslt.vi v0, v1, 5 # a[i] < 5?
+
+ vsetvli x0, a0, e32, m4, ta, mu # Vector of 32-bit values.
+ sub a0, a0, a4 # Decrement count
+ vmv.v.i v4, 1 # Splat immediate to destination
+ vle32.v v4, (a3), v0.t # Load requested elements of C, others undisturbed
+ sll t1, a4, 2
+ add a3, a3, t1 # Bump pointer.
+ vse32.v v4, (a2) # Store b[i].
+ add a2, a2, t1 # Bump pointer.
+ bnez a0, loop # Any more?
+----
+
+=== Memcpy example
+
+----
+include::example/memcpy.s[lines=4..-1]
+----
+
+=== Conditional example
+
+----
+# (int16) z[i] = ((int8) x[i] < 5) ? (int16) a[i] : (int16) b[i];
+#
+
+loop:
+ vsetvli t0, a0, e8, m1, ta, ma # Use 8b elements.
+ vle8.v v0, (a1) # Get x[i]
+ sub a0, a0, t0 # Decrement element count
+ add a1, a1, t0 # x[i] Bump pointer
+ vmslt.vi v0, v0, 5 # Set mask in v0
+ vsetvli x0, x0, e16, m2, ta, mu # Use 16b elements.
+ slli t0, t0, 1 # Multiply by 2 bytes
+ vle16.v v2, (a2), v0.t # z[i] = a[i] case
+ vmnot.m v0, v0 # Invert v0
+ add a2, a2, t0 # a[i] bump pointer
+ vle16.v v2, (a3), v0.t # z[i] = b[i] case
+ add a3, a3, t0 # b[i] bump pointer
+ vse16.v v2, (a4) # Store z
+ add a4, a4, t0 # z[i] bump pointer
+ bnez a0, loop
+----
+
+=== SAXPY example
+
+----
+include::example/saxpy.s[lines=4..-1]
+----
+
+=== SGEMM example
+
+----
+include::example/sgemm.S[lines=4..-1]
+----
+
+=== Division approximation example
+
+----
+# v1 = v1 / v2 to almost 23 bits of precision.
+
+vfrec7.v v3, v2 # Estimate 1/v2
+ li t0, 0x40000000
+vmv.v.x v4, t0 # Splat 2.0
+vfnmsac.vv v4, v2, v3 # 2.0 - v2 * est(1/v2)
+vfmul.vv v3, v3, v4 # Better estimate of 1/v2
+vmv.v.x v4, t0 # Splat 2.0
+vfnmsac.vv v4, v2, v3 # 2.0 - v2 * est(1/v2)
+vfmul.vv v3, v3, v4 # Better estimate of 1/v2
+vfmul.vv v1, v1, v3 # Estimate of v1/v2
+----
+
+=== Square root approximation example
+
+----
+# v1 = sqrt(v1) to almost 23 bits of precision.
+
+ fmv.w.x ft0, x0 # Mask off zero inputs
+vmfne.vf v0, v1, ft0 # to avoid div by zero
+vfrsqrt7.v v2, v1, v0.t # Estimate 1/sqrt(x)
+vmfne.vf v0, v2, ft0, v0.t # Additionally mask off +inf inputs
+ li t0, 0x40400000
+vmv.v.x v4, t0 # Splat 3.0
+vfmul.vv v3, v1, v2, v0.t # x * est
+vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
+vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3)
+ li t0, 0x3f000000
+ fmv.w.x ft0, t0 # 0.5
+vfmul.vf v2, v3, ft0, v0.t # Estimate to 14 bits
+vfmul.vv v3, v1, v2, v0.t # x * est
+vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
+vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3)
+vfmul.vf v2, v3, ft0, v0.t # Estimate to 23 bits
+vfmul.vv v1, v2, v1, v0.t # x * 1/sqrt(x)
+----
+
+=== C standard library strcmp example
+
+----
+include::example/strcmp.s[lines=4..-1]
+----
+
+include::fraclmul.adoc[]