author    | Andrew Waterman <andrew@sifive.com>            | 2024-07-15 17:26:15 -0700
committer | GitHub <noreply@github.com>                    | 2024-07-15 17:26:15 -0700
commit    | 6d06d0e4a69b410474b44f7fe147646f55d17a83 (patch)
tree      | 924f063761ecf11bd03bd119b21ef6a7abf78a6e /src
parent    | 92e5663658d4064bc67f53c2496d871f507f7979 (diff)
parent    | d02409b77f274f85a7b1617192f3f30fd0037f3b (diff)
Merge pull request #1536 from riscv/dev/kbroch/fix-v-ext-inconsistent-pseudo-intendation (riscv-isa-release-6d06d0e-2024-07-16)
fix inconsistent indentation of pseudocode (left justify all of it)
Diffstat (limited to 'src')
-rw-r--r-- | src/v-st-ext.adoc | 689
1 file changed, 343 insertions, 346 deletions
diff --git a/src/v-st-ext.adoc b/src/v-st-ext.adoc index 5909667..467d8de 100644 --- a/src/v-st-ext.adoc +++ b/src/v-st-ext.adoc @@ -1549,19 +1549,19 @@ currently reserved. ==== Vector Unit-Stride Instructions ---- - # Vector unit-stride loads and stores +# Vector unit-stride loads and stores - # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) - vle8.v vd, (rs1), vm # 8-bit unit-stride load - vle16.v vd, (rs1), vm # 16-bit unit-stride load - vle32.v vd, (rs1), vm # 32-bit unit-stride load - vle64.v vd, (rs1), vm # 64-bit unit-stride load +# vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) +vle8.v vd, (rs1), vm # 8-bit unit-stride load +vle16.v vd, (rs1), vm # 16-bit unit-stride load +vle32.v vd, (rs1), vm # 32-bit unit-stride load +vle64.v vd, (rs1), vm # 64-bit unit-stride load - # vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>) - vse8.v vs3, (rs1), vm # 8-bit unit-stride store - vse16.v vs3, (rs1), vm # 16-bit unit-stride store - vse32.v vs3, (rs1), vm # 32-bit unit-stride store - vse64.v vs3, (rs1), vm # 64-bit unit-stride store +# vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>) +vse8.v vs3, (rs1), vm # 8-bit unit-stride store +vse16.v vs3, (rs1), vm # 16-bit unit-stride store +vse32.v vs3, (rs1), vm # 32-bit unit-stride store +vse64.v vs3, (rs1), vm # 64-bit unit-stride store ---- Additional unit-stride mask load and store instructions are @@ -1572,11 +1572,11 @@ and the destination register is always written with a tail-agnostic policy. ---- - # Vector unit-stride mask load - vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8) +# Vector unit-stride mask load +vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8) - # Vector unit-stride mask store - vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8) +# Vector unit-stride mask store +vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8) ---- `vlm.v` and `vsm.v` are encoded with the same `width[2:0]`=0 encoding as @@ -1602,19 +1602,19 @@ and also reduce the cost of mask spill/fill by reducing need to change ==== Vector Strided Instructions ---- - # Vector strided loads and stores +# Vector strided loads and stores - # vd destination, rs1 base address, rs2 byte stride - vlse8.v vd, (rs1), rs2, vm # 8-bit strided load - vlse16.v vd, (rs1), rs2, vm # 16-bit strided load - vlse32.v vd, (rs1), rs2, vm # 32-bit strided load - vlse64.v vd, (rs1), rs2, vm # 64-bit strided load +# vd destination, rs1 base address, rs2 byte stride +vlse8.v vd, (rs1), rs2, vm # 8-bit strided load +vlse16.v vd, (rs1), rs2, vm # 16-bit strided load +vlse32.v vd, (rs1), rs2, vm # 32-bit strided load +vlse64.v vd, (rs1), rs2, vm # 64-bit strided load - # vs3 store data, rs1 base address, rs2 byte stride - vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store - vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store - vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store - vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store +# vs3 store data, rs1 base address, rs2 byte stride +vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store +vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store +vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store +vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store ---- Negative and zero strides are supported. @@ -1648,36 +1648,35 @@ address are required, then an ordered indexed operation can be used. 
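As an illustrative aside (not part of this patch), the unit-stride forms above compose into the usual strip-mined copy loop. The sketch below is an assumption-laden outline rather than spec text: the register assignments (a0 = destination, a1 = source, a2 = byte count) and the label name are chosen only for illustration.

----
# Hypothetical byte-copy sketch using the unit-stride loads/stores listed above.
# Assumed calling convention: a0 = destination, a1 = source, a2 = byte count.
copy_bytes_sketch:
    vsetvli t0, a2, e8, m8, ta, ma   # Take up to VLMAX bytes this iteration
    vle8.v  v8, (a1)                 # Unit-stride load of t0 bytes
    vse8.v  v8, (a0)                 # Unit-stride store of t0 bytes
    add     a1, a1, t0               # Advance source pointer
    add     a0, a0, t0               # Advance destination pointer
    sub     a2, a2, t0               # Decrement remaining byte count
    bnez    a2, copy_bytes_sketch
    ret
----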
==== Vector Indexed Instructions ---- - # Vector indexed loads and stores +# Vector indexed loads and stores - # Vector indexed-unordered load instructions - # vd destination, rs1 base address, vs2 byte offsets - vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data - vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data - vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data - vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data +# Vector indexed-unordered load instructions +# vd destination, rs1 base address, vs2 byte offsets +vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data +vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data +vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data +vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data - # Vector indexed-ordered load instructions - # vd destination, rs1 base address, vs2 byte offsets - vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data - vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data - vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data - vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data +# Vector indexed-ordered load instructions +# vd destination, rs1 base address, vs2 byte offsets +vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data +vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data +vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data +vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data - # Vector indexed-unordered store instructions - # vs3 store data, rs1 base address, vs2 byte offsets - vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data - vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data - vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data - vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data - - # Vector indexed-ordered store instructions - # vs3 store data, rs1 base address, vs2 byte offsets - vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data - vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data - vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data - vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data +# Vector indexed-unordered store instructions +# vs3 store data, rs1 base address, vs2 byte offsets +vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data +vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data +vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data +vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data +# Vector indexed-ordered store instructions +# vs3 store data, rs1 base address, vs2 byte offsets +vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data +vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data +vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data +vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data ---- NOTE: The assembler syntax for indexed loads and stores uses @@ -1714,13 +1713,13 @@ operation will not be restarted due to a trap or vector-length trimming. 
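As a brief aside (not part of this patch), the indexed-unordered loads listed above implement a gather. The sketch below handles a single strip of up to VLMAX elements; the pointer/count assignments (a0 = 32-bit data array, a1 = 32-bit element indices, a2 = element count, a3 = output buffer) are assumptions for illustration.

----
# Hypothetical gather sketch: out[i] = x[idx[i]] for one strip of elements.
    vsetvli t0, a2, e32, m4, ta, ma  # Up to VLMAX 32-bit elements
    vle32.v v8, (a1)                 # Load element indices idx[i]
    vsll.vi v8, v8, 2                # Scale indices to byte offsets (4 bytes/element)
    vluxei32.v v16, (a0), v8         # Unordered indexed load: gather x[idx[i]]
    vse32.v v16, (a3)                # Store the gathered values contiguously
----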
---- - # Vector unit-stride fault-only-first loads +# Vector unit-stride fault-only-first loads - # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) - vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load - vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load - vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load - vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load +# vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) +vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load +vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load +vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load +vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load ---- ---- @@ -1837,14 +1836,14 @@ The assembler prefixes `vlseg`/`vsseg` are used for unit-stride segment loads and stores respectively. ---- - # Format - vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template - vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template +# Format +vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template +vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template - # Examples - vlseg8e8.v vd, (rs1), vm # Load eight vector registers with eight byte fields. +# Examples +vlseg8e8.v vd, (rs1), vm # Load eight vector registers with eight byte fields. - vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory +vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory ---- For loads, the `vd` register will hold the first field loaded from the @@ -1852,27 +1851,27 @@ segment. For stores, the `vs3` register is read to provide the first field to be stored to each segment. ---- - # Example 1 - # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) - vsetvli a1, t0, e8, m1, ta, ma - vlseg3e8.v v8, (a0), vm - # v8 holds the red pixels - # v9 holds the green pixels - # v10 holds the blue pixels +# Example 1 +# Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) +vsetvli a1, t0, e8, m1, ta, ma +vlseg3e8.v v8, (a0), vm +# v8 holds the red pixels +# v9 holds the green pixels +# v10 holds the blue pixels - # Example 2 - # Memory structure holds complex values, 32b for real and 32b for imaginary - vsetvli a1, t0, e32, m1, ta, ma - vlseg2e32.v v8, (a0), vm - # v8 holds real - # v9 holds imaginary +# Example 2 +# Memory structure holds complex values, 32b for real and 32b for imaginary +vsetvli a1, t0, e32, m1, ta, ma +vlseg2e32.v v8, (a0), vm +# v8 holds real +# v9 holds imaginary ---- There are also fault-only-first versions of the unit-stride instructions. ---- - # Template for vector fault-only-first unit-stride segment loads. - vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads +# Template for vector fault-only-first unit-stride segment loads. +vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads ---- For fault-only-first segment loads, if an exception is detected partway @@ -1892,20 +1891,20 @@ GPR argument. NOTE: Negative and zero strides are supported. 
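As an illustrative aside (not part of this patch), the fault-only-first load shown earlier in this section is the building block for speculative scans such as a string-length routine. The sketch below is only an outline under assumed conventions: a0 is taken to hold a pointer to a NUL-terminated string, and the length is returned in a0.

----
# Hypothetical strlen()-style sketch built on vle8ff.v.
strlen_sketch:
    mv      a3, a0                   # a3 = running pointer into the string
1:
    vsetvli a1, x0, e8, m8, ta, ma   # Request as many bytes as possible
    vle8ff.v v8, (a3)                # Fault-only-first load; vl may be trimmed
    csrr    a1, vl                   # Bytes actually examined this iteration
    vmseq.vi v0, v8, 0               # Mark NUL bytes
    vfirst.m a2, v0                  # Index of first NUL in this chunk, or -1
    add     a3, a3, a1               # Advance past the examined bytes
    bltz    a2, 1b                   # No NUL found yet: keep scanning
    sub     a3, a3, a1               # Back up to the start of the final chunk
    add     a3, a3, a2               # a3 now points at the NUL byte
    sub     a0, a3, a0               # Length = NUL address - start address
    ret
----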
---- - # Format - vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads - vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores +# Format +vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads +vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores - # Examples - vsetvli a1, t0, e8, m1, ta, ma - vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i], - # and bytes at addresses x5+i*x6+1 into v5[i], - # and bytes at addresses x5+i*x6+2 into v6[i]. +# Examples +vsetvli a1, t0, e8, m1, ta, ma +vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i], + # and bytes at addresses x5+i*x6+1 into v5[i], + # and bytes at addresses x5+i*x6+2 into v6[i]. - # Examples - vsetvli a1, t0, e32, m1, ta, ma - vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6 - # and words from v3[i] to address x5+i*x6+4 +# Examples +vsetvli a1, t0, e32, m1, ta, ma +vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6 + # and words from v3[i] to address x5+i*x6+4 ---- Accesses to the fields within each segment can occur in any order, @@ -1928,22 +1927,22 @@ EMUL=(EEW/SEW)*LMUL. The EMUL * NFIELDS {le} 8 constraint applies to the data vector register group. ---- - # Format - vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads - vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads - vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores - vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores +# Format +vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads +vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads +vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores +vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores - # Examples - vsetvli a1, t0, e8, m1, ta, ma - vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i], - # and bytes at addresses x5+v3[i]+1 into v5[i], - # and bytes at addresses x5+v3[i]+2 into v6[i]. +# Examples +vsetvli a1, t0, e8, m1, ta, ma +vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i], + # and bytes at addresses x5+v3[i]+1 into v5[i], + # and bytes at addresses x5+v3[i]+2 into v6[i]. - # Examples - vsetvli a1, t0, e32, m1, ta, ma - vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i] - # and words from v3[i] to address x5+v5[i]+4 +# Examples +vsetvli a1, t0, e32, m1, ta, ma +vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i] + # and words from v3[i] to address x5+v5[i]+4 ---- For vector indexed segment loads, the destination vector register @@ -2060,39 +2059,39 @@ environments can mandate the minimum alignment requirements to support an ABI. ---- - # Format of whole register load and store instructions. - vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v +# Format of whole register load and store instructions. 
+vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v - vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0 - vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0 - vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0 - vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0 +vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0 +vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0 +vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0 +vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0 - vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v +vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v - vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0 - vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0 - vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0 - vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0 +vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0 +vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0 +vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0 +vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0 - vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v +vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v - vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0 - vl4re16.v v4, (a0) - vl4re32.v v4, (a0) - vl4re64.v v4, (a0) +vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0 +vl4re16.v v4, (a0) +vl4re32.v v4, (a0) +vl4re64.v v4, (a0) - vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v +vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v - vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0 - vl8re16.v v8, (a0) - vl8re32.v v8, (a0) - vl8re64.v v8, (a0) +vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0 +vl8re16.v v8, (a0) +vl8re32.v v8, (a0) +vl8re64.v v8, (a0) - vs1r.v v3, (a1) # Store v3 to address in a1 - vs2r.v v2, (a1) # Store v2-v3 to address in a1 - vs4r.v v4, (a1) # Store v4-v7 to address in a1 - vs8r.v v8, (a1) # Store v8-v15 to address in a1 +vs1r.v v3, (a1) # Store v3 to address in a1 +vs2r.v v2, (a1) # Store v2-v3 to address in a1 +vs4r.v v4, (a1) # Store v4-v7 to address in a1 +vs8r.v v8, (a1) # Store v8-v15 to address in a1 ---- NOTE: Implementations should raise illegal instruction exceptions on @@ -2109,10 +2108,10 @@ following vector instruction needs a new SEW/LMUL. So, in best case only two instructions (of which only one performs vector operations) are needed to synthesize the effect of the dedicated instruction: ---- - csrr t0, vl # Save current vl (potentially not needed) - vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX - vlm.v v0, (a0) # Load mask register - vsetvli x0, t0, <new type> # Restore vl (potentially already present) +csrr t0, vl # Save current vl (potentially not needed) +vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX +vlm.v v0, (a0) # Load mask register +vsetvli x0, t0, <new type> # Restore vl (potentially already present) ---- === Vector Memory Alignment Constraints @@ -2306,7 +2305,7 @@ The first vector register group operand can be either single or double-width. 
---- -Assembly syntax pattern for vector widening arithmetic instructions +# Assembly syntax pattern for vector widening arithmetic instructions # Double-width result, two single-width sources: 2*SEW = SEW op SEW vwop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i] @@ -2526,10 +2525,10 @@ instructions with unchanged inputs, destructive accumulations will require an additional move to obtain correct results. ---- - # Example multi-word arithmetic sequence, accumulating into v4 - vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1 - vadc.vvm v4, v4, v8, v0 # Calc new sum - vmmv.m v0, v1 # Move temp carry into v0 for next word +# Example multi-word arithmetic sequence, accumulating into v4 +vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1 +vadc.vvm v4, v4, v8, v0 # Calc new sum +vmmv.m v0, v1 # Move temp carry into v0 for next word ---- The subtract with borrow instruction `vsbc` performs the equivalent @@ -2537,27 +2536,27 @@ function to support long word arithmetic for subtraction. There are no subtract with immediate instructions. ---- - # Produce difference with borrow. +# Produce difference with borrow. - # vd[i] = vs2[i] - vs1[i] - v0.mask[i] - vsbc.vvm vd, vs2, vs1, v0 # Vector-vector +# vd[i] = vs2[i] - vs1[i] - v0.mask[i] +vsbc.vvm vd, vs2, vs1, v0 # Vector-vector - # vd[i] = vs2[i] - x[rs1] - v0.mask[i] - vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar +# vd[i] = vs2[i] - x[rs1] - v0.mask[i] +vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar - # Produce borrow out in mask register format +# Produce borrow out in mask register format - # vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i]) - vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector +# vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i]) +vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector - # vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i]) - vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar +# vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i]) +vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar - # vd.mask[i] = borrow_out(vs2[i] - vs1[i]) - vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in +# vd.mask[i] = borrow_out(vs2[i] - vs1[i]) +vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in - # vd.mask[i] = borrow_out(vs2[i] - x[rs1]) - vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in +# vd.mask[i] = borrow_out(vs2[i] - x[rs1]) +vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in ---- For `vmsbc`, the borrow is defined to be 1 iff the difference, prior to @@ -2807,9 +2806,9 @@ masked va >= x, any vd Compares effectively AND in the mask under a mask-undisturbed policy if the destination register is `v0`, e.g., ---- - # (a < b) && (b < c) in two instructions when mask-undisturbed - vmslt.vv v0, va, vb # All body elements written - vmslt.vv v0, vb, vc, v0.t # Only update at set mask +# (a < b) && (b < c) in two instructions when mask-undisturbed +vmslt.vv v0, va, vb # All body elements written +vmslt.vv v0, vb, vc, v0.t # Only update at set mask ---- Compares write mask registers, and so always operate under a @@ -2883,21 +2882,21 @@ standard scalar integer multiply/divides, with the same results for extreme inputs. ---- - # Unsigned divide. - vdivu.vv vd, vs2, vs1, vm # Vector-vector - vdivu.vx vd, vs2, rs1, vm # vector-scalar +# Unsigned divide. 
+vdivu.vv vd, vs2, vs1, vm # Vector-vector +vdivu.vx vd, vs2, rs1, vm # vector-scalar - # Signed divide - vdiv.vv vd, vs2, vs1, vm # Vector-vector - vdiv.vx vd, vs2, rs1, vm # vector-scalar +# Signed divide +vdiv.vv vd, vs2, vs1, vm # Vector-vector +vdiv.vx vd, vs2, rs1, vm # vector-scalar - # Unsigned remainder - vremu.vv vd, vs2, vs1, vm # Vector-vector - vremu.vx vd, vs2, rs1, vm # vector-scalar +# Unsigned remainder +vremu.vv vd, vs2, vs1, vm # Vector-vector +vremu.vx vd, vs2, rs1, vm # vector-scalar - # Signed remainder - vrem.vv vd, vs2, vs1, vm # Vector-vector - vrem.vx vd, vs2, rs1, vm # vector-scalar +# Signed remainder +vrem.vv vd, vs2, vs1, vm # Vector-vector +vrem.vx vd, vs2, rs1, vm # vector-scalar ---- NOTE: The decision to include integer divide and remainder was @@ -3188,14 +3187,14 @@ used to control the right shift amount, which provides the scaling. ---- # Narrowing unsigned clip # SEW 2*SEW SEW - vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i])) - vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1])) - vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm)) +vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i])) +vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1])) +vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm)) # Narrowing signed clip - vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i])) - vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1])) - vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm)) +vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i])) +vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1])) +vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm)) ---- For `vnclipu`/`vnclip`, the rounding mode is specified in the `vxrm` @@ -3273,14 +3272,14 @@ elements do not set FP exception flags. 
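As an aside (not part of this patch), the narrowing clips above are a common way to requantize wider fixed-point data. The following sketch narrows unsigned 16-bit samples to saturated 8-bit values with a rounding right shift of 8; the pointer/count assignments in a0/a1/a2 and the rounding-mode choice are assumptions for illustration only.

----
# Hypothetical 16-bit -> 8-bit requantization sketch using vnclipu.wi.
# Assumed: a0 = source (uint16), a1 = destination (uint8), a2 = element count.
    csrwi   vxrm, 0                  # Round-to-nearest-up for the fixed-point shift
narrow_sketch:
    vsetvli t0, a2, e8, m1, ta, ma   # SEW=8 is the width of the narrowed result
    vle16.v v8, (a0)                 # EEW=16 load; source group is v8-v9 (EMUL=2)
    vnclipu.wi v10, v8, 8            # Shift right 8, round, saturate to 8 bits
    vse8.v  v10, (a1)                # Store narrowed bytes
    slli    t1, t0, 1                # 2 bytes consumed per source element
    add     a0, a0, t1
    add     a1, a1, t0
    sub     a2, a2, t0
    bnez    a2, narrow_sketch
----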
==== Vector Single-Width Floating-Point Add/Subtract Instructions ---- - # Floating-point add - vfadd.vv vd, vs2, vs1, vm # Vector-vector - vfadd.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point add +vfadd.vv vd, vs2, vs1, vm # Vector-vector +vfadd.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point subtract - vfsub.vv vd, vs2, vs1, vm # Vector-vector - vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1] - vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i] +# Floating-point subtract +vfsub.vv vd, vs2, vs1, vm # Vector-vector +vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1] +vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i] ---- ==== Vector Widening Floating-Point Add/Subtract Instructions @@ -3302,16 +3301,16 @@ vfwsub.wf vd, vs2, rs1, vm # vector-scalar ==== Vector Single-Width Floating-Point Multiply/Divide Instructions ---- - # Floating-point multiply - vfmul.vv vd, vs2, vs1, vm # Vector-vector - vfmul.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point multiply +vfmul.vv vd, vs2, vs1, vm # Vector-vector +vfmul.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point divide - vfdiv.vv vd, vs2, vs1, vm # Vector-vector - vfdiv.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point divide +vfdiv.vv vd, vs2, vs1, vm # Vector-vector +vfdiv.vf vd, vs2, rs1, vm # vector-scalar - # Reverse floating-point divide vector = scalar / vector - vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i] +# Reverse floating-point divide vector = scalar / vector +vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i] ---- ==== Vector Widening Floating-Point Multiply @@ -3396,15 +3395,15 @@ vfwnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i] This is a unary vector-vector instruction. ---- - # Floating-point square root - vfsqrt.v vd, vs2, vm # Vector-vector square root +# Floating-point square root +vfsqrt.v vd, vs2, vm # Vector-vector square root ---- ==== Vector Floating-Point Reciprocal Square-Root Estimate Instruction ---- - # Floating-point reciprocal square-root estimate to 7 bits. - vfrsqrt7.v vd, vs2, vm +# Floating-point reciprocal square-root estimate to 7 bits. +vfrsqrt7.v vd, vs2, vm ---- This is a unary vector-vector instruction that returns an estimate of @@ -3472,8 +3471,8 @@ with greater estimate accuracy. ==== Vector Floating-Point Reciprocal Estimate Instruction ---- - # Floating-point reciprocal estimate to 7 bits. - vfrec7.v vd, vs2, vm +# Floating-point reciprocal estimate to 7 bits. +vfrec7.v vd, vs2, vm ---- NOTE: An earlier draft version had used the assembler name `vfrece7` @@ -3572,13 +3571,13 @@ in version 2.2 of the RISC-V F/D/Q extension: they perform the `minimumNumber` or `maximumNumber` operation on active elements. ---- - # Floating-point minimum - vfmin.vv vd, vs2, vs1, vm # Vector-vector - vfmin.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point minimum +vfmin.vv vd, vs2, vs1, vm # Vector-vector +vfmin.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point maximum - vfmax.vv vd, vs2, vs1, vm # Vector-vector - vfmax.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point maximum +vfmax.vv vd, vs2, vs1, vm # Vector-vector +vfmax.vf vd, vs2, rs1, vm # vector-scalar ---- ==== Vector Floating-Point Sign-Injection Instructions @@ -3587,14 +3586,14 @@ Vector versions of the scalar sign-injection instructions. The result takes all bits except the sign bit from the vector `vs2` operands. 
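As an illustrative aside (not part of this patch), the fused multiply-add forms above map directly onto an axpy kernel. The sketch below computes y[i] += a * x[i] for single-precision data; the calling convention (a0 = element count, a1 = &x, a2 = &y, fa0 = scalar a) is assumed for illustration.

----
# Hypothetical single-precision axpy sketch using vfmacc.vf.
saxpy_sketch:
    vsetvli t0, a0, e32, m8, ta, ma  # Up to VLMAX 32-bit elements
    vle32.v v8, (a1)                 # Load x[i]
    vle32.v v16, (a2)                # Load y[i]
    vfmacc.vf v16, fa0, v8           # v16[i] += fa0 * v8[i]
    vse32.v v16, (a2)                # Store updated y[i]
    slli    t1, t0, 2                # 4 bytes per element
    add     a1, a1, t1
    add     a2, a2, t1
    sub     a0, a0, t0
    bnez    a0, saxpy_sketch
    ret
----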
---- - vfsgnj.vv vd, vs2, vs1, vm # Vector-vector - vfsgnj.vf vd, vs2, rs1, vm # vector-scalar +vfsgnj.vv vd, vs2, vs1, vm # Vector-vector +vfsgnj.vf vd, vs2, rs1, vm # vector-scalar - vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector - vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar +vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector +vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar - vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector - vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar +vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector +vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar ---- NOTE: A vector of floating-point values can be negated using a @@ -3626,27 +3625,27 @@ operand is NaN, whereas the other compares write 0 when either operand is NaN. ---- - # Compare equal - vmfeq.vv vd, vs2, vs1, vm # Vector-vector - vmfeq.vf vd, vs2, rs1, vm # vector-scalar +# Compare equal +vmfeq.vv vd, vs2, vs1, vm # Vector-vector +vmfeq.vf vd, vs2, rs1, vm # vector-scalar - # Compare not equal - vmfne.vv vd, vs2, vs1, vm # Vector-vector - vmfne.vf vd, vs2, rs1, vm # vector-scalar +# Compare not equal +vmfne.vv vd, vs2, vs1, vm # Vector-vector +vmfne.vf vd, vs2, rs1, vm # vector-scalar - # Compare less than - vmflt.vv vd, vs2, vs1, vm # Vector-vector - vmflt.vf vd, vs2, rs1, vm # vector-scalar +# Compare less than +vmflt.vv vd, vs2, vs1, vm # Vector-vector +vmflt.vf vd, vs2, rs1, vm # vector-scalar - # Compare less than or equal - vmfle.vv vd, vs2, vs1, vm # Vector-vector - vmfle.vf vd, vs2, rs1, vm # vector-scalar +# Compare less than or equal +vmfle.vv vd, vs2, vs1, vm # Vector-vector +vmfle.vf vd, vs2, rs1, vm # vector-scalar - # Compare greater than - vmfgt.vf vd, vs2, rs1, vm # vector-scalar +# Compare greater than +vmfgt.vf vd, vs2, rs1, vm # vector-scalar - # Compare greater than or equal - vmfge.vf vd, vs2, rs1, vm # vector-scalar +# Compare greater than or equal +vmfge.vf vd, vs2, rs1, vm # vector-scalar ---- ---- @@ -3675,11 +3674,11 @@ the comparand is a non-NaN constant, the middle two instructions can be omitted. ---- - # Example of implementing isgreater() - vmfeq.vv v0, va, va # Only set where A is not NaN. - vmfeq.vv v1, vb, vb # Only set where B is not NaN. - vmand.mm v0, v0, v1 # Only set where A and B are ordered, - vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. +# Example of implementing isgreater() +vmfeq.vv v0, va, va # Only set where A is not NaN. +vmfeq.vv v1, vb, vb # Only set where B is not NaN. +vmand.mm v0, v0, v1 # Only set where A and B are ordered, +vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. ---- NOTE: In the above sequence, it is tempting to mask the second `vmfeq` @@ -3694,7 +3693,7 @@ This is a unary vector-vector instruction that operates in the same way as the scalar classify instruction. ---- - vfclass.v vd, vs2, vm # Vector-vector +vfclass.v vd, vs2, vm # Vector-vector ---- The 10-bit mask produced by this instruction is placed in the @@ -3885,15 +3884,15 @@ All operands and results of single-width reduction instructions have the same SEW width. Overflows wrap around on arithmetic sums. 
---- - # Simple reductions, where [*] denotes all active elements: - vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] ) - vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] ) - vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] ) - vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] ) - vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] ) - vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] ) - vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] ) - vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] ) +# Simple reductions, where [*] denotes all active elements: +vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] ) +vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] ) +vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] ) +vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] ) +vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] ) +vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] ) +vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] ) +vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] ) ---- [[sec-vector-integer-reduce-widen]] @@ -3909,23 +3908,22 @@ elements before summing them. For both `vwredsumu.vs` and `vwredsum.vs`, overflows wrap around. ---- - # Unsigned sum reduction into double-width accumulator - vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW)) +# Unsigned sum reduction into double-width accumulator +vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW)) - # Signed sum reduction into double-width accumulator - vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW)) +# Signed sum reduction into double-width accumulator +vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW)) ---- [[sec-vector-float-reduce]] ==== Vector Single-Width Floating-Point Reduction Instructions ---- - # Simple reductions. - vfredosum.vs vd, vs2, vs1, vm # Ordered sum - vfredusum.vs vd, vs2, vs1, vm # Unordered sum - vfredmax.vs vd, vs2, vs1, vm # Maximum value - vfredmin.vs vd, vs2, vs1, vm # Minimum value - +# Simple reductions. +vfredosum.vs vd, vs2, vs1, vm # Ordered sum +vfredusum.vs vd, vs2, vs1, vm # Unordered sum +vfredmax.vs vd, vs2, vs1, vm # Maximum value +vfredmin.vs vd, vs2, vs1, vm # Minimum value ---- NOTE: Older assembler mnemonic `vfredsum` is retained as alias for `vfredusum`. @@ -4058,14 +4056,14 @@ Mask elements past `vl`, the tail elements, are always updated with a tail-agnostic policy. 
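As a brief aside (not part of this patch), the single-width reductions listed above are typically strip-mined with a running scalar kept in element 0 of a vector register. The register choices and calling convention below (a0 = element count, a1 = source of signed 32-bit values, result returned in a0) are assumptions for illustration.

----
# Hypothetical 32-bit sum-reduction sketch using vredsum.vs.
sum_sketch:
    vsetvli t0, x0, e32, m1, ta, ma  # Ensure SEW=32 before seeding element 0
    vmv.s.x v4, x0                   # v4[0] = 0: running accumulator
1:
    vsetvli t0, a0, e32, m8, ta, ma  # Up to VLMAX elements this iteration
    vle32.v v8, (a1)                 # Load a strip of inputs
    vredsum.vs v4, v8, v4            # v4[0] = v4[0] + sum(v8[0..vl-1])
    slli    t1, t0, 2                # 4 bytes per element
    add     a1, a1, t1
    sub     a0, a0, t0
    bnez    a0, 1b
    vmv.x.s a0, v4                   # Move the scalar result to a0
    ret
----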
---- - vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i] - vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i]) - vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i] - vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i] - vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i] - vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i]) - vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i] - vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i]) +vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i] +vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i]) +vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i] +vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i] +vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i] +vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i]) +vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i] +vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i]) ---- NOTE: The previous assembler mnemonics `vmandnot` and `vmornot` have @@ -4076,10 +4074,10 @@ mnemonics can be retained as assembler aliases for compatibility. Several assembler pseudoinstructions are defined as shorthand for common uses of mask logical operations: ---- - vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register - vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register - vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register - vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits +vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register +vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register +vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register +vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits ---- NOTE: The `vmmv.m` instruction was previously called `vmcpy.m`, but @@ -4132,7 +4130,7 @@ use. ==== Vector count population in mask `vcpop.m` ---- - vcpop.m rd, vs2, vm +vcpop.m rd, vs2, vm ---- NOTE: This instruction previously had the assembler mnemonic `vpopc.m` @@ -4151,7 +4149,7 @@ The operation can be performed under a mask, in which case only the masked elements are counted. ---- - vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] ) +vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] ) ---- The `vcpop.m` instruction writes `x[rd]` even if `vl`=0 (with the @@ -4164,7 +4162,7 @@ Traps on `vcpop.m` are always reported with a `vstart` of 0. The ==== `vfirst` find-first-set mask bit ---- - vfirst.m rd, vs2, vm +vfirst.m rd, vs2, vm ---- The `vfirst` instruction finds the lowest-numbered active element of @@ -4356,27 +4354,27 @@ The `viota.m` instruction can be combined with memory scatter instructions (indexed stores) to perform vector compress functions. 
---- - # Compact non-zero elements from input memory array to output memory array - # - # size_t compact_non_zero(size_t n, const int* in, int* out) - # { - # size_t i; - # size_t count = 0; - # int *p = out; - # - # for (i=0; i<n; i++) - # { - # const int v = *in++; - # if (v != 0) - # *p++ = v; - # } - # - # return (size_t) (p - out); - # } - # - # a0 = n - # a1 = &in - # a2 = &out +# Compact non-zero elements from input memory array to output memory array +# +# size_t compact_non_zero(size_t n, const int* in, int* out) +# { +# size_t i; +# size_t count = 0; +# int *p = out; +# +# for (i=0; i<n; i++) +# { +# const int v = *in++; +# if (v != 0) +# *p++ = v; +# } +# +# return (size_t) (p - out); +# } +# +# a0 = n +# a1 = &in +# a2 = &out compact_non_zero: li a6, 0 # Clear count of non-zero elements @@ -4406,7 +4404,7 @@ The `vid.v` instruction writes each element's index to the destination vector register group, from 0 to `vl`-1. ---- - vid.v vd, vm # Write element ID to destination. +vid.v vd, vm # Write element ID to destination. ---- The instruction can be masked. Masking does not change the @@ -4516,8 +4514,8 @@ undisturbed/agnostic policy is followed for inactive elements. ===== Vector Slideup Instructions ---- - vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i] - vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i] +vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i] +vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i] ---- For `vslideup`, the value in `vl` specifies the maximum number of destination @@ -4529,13 +4527,13 @@ Destination elements _OFFSET_ through `vl`-1 are written if unmasked and if _OFFSET_ < `vl`. ---- - vslideup behavior for destination elements (`vstart` < `vl`) +vslideup behavior for destination elements (`vstart` < `vl`) - OFFSET is amount to slideup, either from x register or a 5-bit immediate +OFFSET is amount to slideup, either from x register or a 5-bit immediate - 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged - max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged +max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- The destination vector register group for `vslideup` cannot overlap @@ -4549,8 +4547,8 @@ input vectors during execution, and enables restart with non-zero ===== Vector Slidedown Instructions ---- - vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]] - vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm] +vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]] +vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm] ---- For `vslidedown`, the value in `vl` specifies the maximum number of @@ -4564,15 +4562,14 @@ using an unsigned integer in the `x` register specified by `rs1`, or a If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits. 
---- - vslidedown behavior for source elements for element i in slide (`vstart` < `vl`) - 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET] - VLMAX <= i+OFFSET src[i] = 0 - - vslidedown behavior for destination element i in slide (`vstart` < `vl`) - 0 <= i < vstart Unchanged - vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy +vslidedown behavior for source elements for element i in slide (`vstart` < `vl`) + 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET] + VLMAX <= i+OFFSET src[i] = 0 +vslidedown behavior for destination element i in slide (`vstart` < `vl`) + 0 <= i < vstart Unchanged + vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- ===== Vector Slide1up @@ -4582,7 +4579,7 @@ also allow a scalar integer value to be inserted at the vacated element position. ---- - vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] +vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] ---- The `vslide1up` instruction places the `x` register argument at @@ -4603,12 +4600,12 @@ past `vl` are handled according to the current tail policy (Section ---- - vslide1up behavior when vl > 0 +vslide1up behavior when vl > 0 - i < vstart unchanged - 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled - max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + i < vstart unchanged + 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled +max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- The `vslide1up` instruction requires that the destination vector @@ -4619,7 +4616,7 @@ Otherwise, the instruction encoding is reserved. ===== Vector Floating-Point Slide1up Instruction ---- - vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] +vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] ---- The `vfslide1up` instruction is defined analogously to `vslide1up`, @@ -4637,7 +4634,7 @@ past `vl` are handled according to the current tail policy (Section <<sec-agnostic>>). ---- - vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] +vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] ---- The `vslide1down` instruction places the `x` register argument at @@ -4649,12 +4646,12 @@ XLEN > SEW, the least-significant bits are copied over and the high SEW-XLEN bits are ignored. ---- - vslide1down behavior +vslide1down behavior - i < vstart unchanged - vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled - vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + i < vstart unchanged +vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled +vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- NOTE: The `vslide1down` instruction can be used to load values into a @@ -4667,7 +4664,7 @@ contents of a vector register, albeit slowly, with multiple repeated ===== Vector Floating-Point Slide1down Instruction ---- - vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] +vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] ---- The `vfslide1down` instruction is defined analogously to `vslide1down`, @@ -4729,7 +4726,7 @@ contiguous elements at the start of the destination vector register group. 
---- - vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled +vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled ---- The vector mask register specified by `vs1` indicates which of the @@ -4740,16 +4737,16 @@ elements according to the current tail policy (Section <<sec-agnostic>>). ---- - Example use of vcompress instruction +Example use of vcompress instruction - 8 7 6 5 4 3 2 1 0 Element number +8 7 6 5 4 3 2 1 0 Element number - 1 1 0 1 0 0 1 0 1 v0 - 8 7 6 5 4 3 2 1 0 v1 - 1 2 3 4 5 6 7 8 9 v2 - vsetivli t0, 9, e8, m1, tu, ma - vcompress.vm v2, v1, v0 - 1 2 3 4 8 7 5 2 0 v2 +1 1 0 1 0 0 1 0 1 v0 +8 7 6 5 4 3 2 1 0 v1 +1 2 3 4 5 6 7 8 9 v2 + vsetivli t0, 9, e8, m1, tu, ma + vcompress.vm v2, v1, v0 +1 2 3 4 8 7 5 2 0 v2 ---- `vcompress` is encoded as an unmasked instruction (`vm=1`). The equivalent @@ -4775,30 +4772,30 @@ There is no inverse `vdecompress` provided, as this operation can be readily synthesized using iota and a masked vrgather: ---- - Desired functionality of 'vdecompress' - 7 6 5 4 3 2 1 0 # vid +Desired functionality of 'vdecompress' +7 6 5 4 3 2 1 0 # vid - e d c b a # packed vector of 5 elements - 1 0 0 1 1 1 0 1 # mask vector of 8 elements - p q r s t u v w # destination register before vdecompress + e d c b a # packed vector of 5 elements +1 0 0 1 1 1 0 1 # mask vector of 8 elements +p q r s t u v w # destination register before vdecompress - e q r d c b v a # result of vdecompress +e q r d c b v a # result of vdecompress ---- ---- - # v0 holds mask - # v1 holds packed data - # v11 holds input expanded vector and result - viota.m v10, v0 # Calc iota from mask in v0 - vrgather.vv v11, v1, v10, v0.t # Expand into destination +# v0 holds mask +# v1 holds packed data +# v11 holds input expanded vector and result +viota.m v10, v0 # Calc iota from mask in v0 +vrgather.vv v11, v1, v10, v0.t # Expand into destination ---- ---- - p q r s t u v w # v11 destination register - e d c b a # v1 source vector - 1 0 0 1 1 1 0 1 # v0 mask vector +p q r s t u v w # v11 destination register + e d c b a # v1 source vector +1 0 0 1 1 1 0 1 # v0 mask vector - 4 4 4 3 2 1 1 0 # v10 result of viota.m - e q r d c b v a # v11 destination after vrgather using viota.m under mask +4 4 4 3 2 1 1 0 # v10 result of viota.m +e q r d c b v a # v11 destination after vrgather using viota.m under mask ---- ==== Whole Vector Register Move @@ -4838,12 +4835,12 @@ related `vmerge` encoding, and it is unlikely the `vsmul` instruction would benefit from an immediate form. ---- - vmv<nr>r.v vd, vs2 # General form +vmv<nr>r.v vd, vs2 # General form - vmv1r.v v1, v2 # Copy v1=v2 - vmv2r.v v10, v12 # Copy v10=v12; v11=v13 - vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11 - vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15 +vmv1r.v v1, v2 # Copy v1=v2 +vmv2r.v v10, v12 # Copy v10=v12; v11=v13 +vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11 +vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15 ---- The source and destination vector register numbers must be aligned |