author    | Andrew Waterman <andrew@sifive.com>            | 2024-07-15 17:26:15 -0700
committer | GitHub <noreply@github.com>                    | 2024-07-15 17:26:15 -0700
commit    | 6d06d0e4a69b410474b44f7fe147646f55d17a83 (patch)
tree      | 924f063761ecf11bd03bd119b21ef6a7abf78a6e /src
parent    | 92e5663658d4064bc67f53c2496d871f507f7979 (diff)
parent    | d02409b77f274f85a7b1617192f3f30fd0037f3b (diff)
Merge pull request #1536 from riscv/dev/kbroch/fix-v-ext-inconsistent-pseudo-intendation (riscv-isa-release-6d06d0e-2024-07-16)
fix inconsistent indentation of pseudocode (left justify all of it)
Diffstat (limited to 'src')
-rw-r--r-- | src/v-st-ext.adoc | 689
1 file changed, 343 insertions, 346 deletions
diff --git a/src/v-st-ext.adoc b/src/v-st-ext.adoc index 5909667..467d8de 100644 --- a/src/v-st-ext.adoc +++ b/src/v-st-ext.adoc @@ -1549,19 +1549,19 @@ currently reserved. ==== Vector Unit-Stride Instructions ---- - # Vector unit-stride loads and stores +# Vector unit-stride loads and stores - # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) - vle8.v vd, (rs1), vm # 8-bit unit-stride load - vle16.v vd, (rs1), vm # 16-bit unit-stride load - vle32.v vd, (rs1), vm # 32-bit unit-stride load - vle64.v vd, (rs1), vm # 64-bit unit-stride load +# vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) +vle8.v vd, (rs1), vm # 8-bit unit-stride load +vle16.v vd, (rs1), vm # 16-bit unit-stride load +vle32.v vd, (rs1), vm # 32-bit unit-stride load +vle64.v vd, (rs1), vm # 64-bit unit-stride load - # vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>) - vse8.v vs3, (rs1), vm # 8-bit unit-stride store - vse16.v vs3, (rs1), vm # 16-bit unit-stride store - vse32.v vs3, (rs1), vm # 32-bit unit-stride store - vse64.v vs3, (rs1), vm # 64-bit unit-stride store +# vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>) +vse8.v vs3, (rs1), vm # 8-bit unit-stride store +vse16.v vs3, (rs1), vm # 16-bit unit-stride store +vse32.v vs3, (rs1), vm # 32-bit unit-stride store +vse64.v vs3, (rs1), vm # 64-bit unit-stride store ---- Additional unit-stride mask load and store instructions are @@ -1572,11 +1572,11 @@ and the destination register is always written with a tail-agnostic policy. ---- - # Vector unit-stride mask load - vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8) +# Vector unit-stride mask load +vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8) - # Vector unit-stride mask store - vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8) +# Vector unit-stride mask store +vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8) ---- `vlm.v` and `vsm.v` are encoded with the same `width[2:0]`=0 encoding as @@ -1602,19 +1602,19 @@ and also reduce the cost of mask spill/fill by reducing need to change ==== Vector Strided Instructions ---- - # Vector strided loads and stores +# Vector strided loads and stores - # vd destination, rs1 base address, rs2 byte stride - vlse8.v vd, (rs1), rs2, vm # 8-bit strided load - vlse16.v vd, (rs1), rs2, vm # 16-bit strided load - vlse32.v vd, (rs1), rs2, vm # 32-bit strided load - vlse64.v vd, (rs1), rs2, vm # 64-bit strided load +# vd destination, rs1 base address, rs2 byte stride +vlse8.v vd, (rs1), rs2, vm # 8-bit strided load +vlse16.v vd, (rs1), rs2, vm # 16-bit strided load +vlse32.v vd, (rs1), rs2, vm # 32-bit strided load +vlse64.v vd, (rs1), rs2, vm # 64-bit strided load - # vs3 store data, rs1 base address, rs2 byte stride - vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store - vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store - vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store - vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store +# vs3 store data, rs1 base address, rs2 byte stride +vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store +vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store +vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store +vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store ---- Negative and zero strides are supported. @@ -1648,36 +1648,35 @@ address are required, then an ordered indexed operation can be used. 
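As an illustrative aside (not part of this patch), the unit-stride forms above compose into the usual strip-mined copy loop. The sketch below is an assumption-laden outline rather than spec text: the register assignments (a0 = destination, a1 = source, a2 = byte count) and the label name are chosen only for illustration.

----
# Hypothetical byte-copy sketch using the unit-stride loads/stores listed above.
# Assumed calling convention: a0 = destination, a1 = source, a2 = byte count.
copy_bytes_sketch:
    vsetvli t0, a2, e8, m8, ta, ma   # Take up to VLMAX bytes this iteration
    vle8.v  v8, (a1)                 # Unit-stride load of t0 bytes
    vse8.v  v8, (a0)                 # Unit-stride store of t0 bytes
    add     a1, a1, t0               # Advance source pointer
    add     a0, a0, t0               # Advance destination pointer
    sub     a2, a2, t0               # Decrement remaining byte count
    bnez    a2, copy_bytes_sketch
    ret
----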
==== Vector Indexed Instructions ---- - # Vector indexed loads and stores +# Vector indexed loads and stores - # Vector indexed-unordered load instructions - # vd destination, rs1 base address, vs2 byte offsets - vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data - vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data - vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data - vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data +# Vector indexed-unordered load instructions +# vd destination, rs1 base address, vs2 byte offsets +vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data +vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data +vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data +vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data - # Vector indexed-ordered load instructions - # vd destination, rs1 base address, vs2 byte offsets - vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data - vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data - vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data - vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data +# Vector indexed-ordered load instructions +# vd destination, rs1 base address, vs2 byte offsets +vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data +vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data +vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data +vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data - # Vector indexed-unordered store instructions - # vs3 store data, rs1 base address, vs2 byte offsets - vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data - vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data - vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data - vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data - - # Vector indexed-ordered store instructions - # vs3 store data, rs1 base address, vs2 byte offsets - vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data - vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data - vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data - vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data +# Vector indexed-unordered store instructions +# vs3 store data, rs1 base address, vs2 byte offsets +vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data +vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data +vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data +vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data +# Vector indexed-ordered store instructions +# vs3 store data, rs1 base address, vs2 byte offsets +vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data +vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data +vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data +vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data ---- NOTE: The assembler syntax for indexed loads and stores uses @@ -1714,13 +1713,13 @@ operation will not be restarted due to a trap or vector-length trimming. 
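As a brief aside (not part of this patch), the indexed-unordered loads listed above implement a gather. The sketch below handles a single strip of up to VLMAX elements; the pointer/count assignments (a0 = 32-bit data array, a1 = 32-bit element indices, a2 = element count, a3 = output buffer) are assumptions for illustration.

----
# Hypothetical gather sketch: out[i] = x[idx[i]] for one strip of elements.
    vsetvli t0, a2, e32, m4, ta, ma  # Up to VLMAX 32-bit elements
    vle32.v v8, (a1)                 # Load element indices idx[i]
    vsll.vi v8, v8, 2                # Scale indices to byte offsets (4 bytes/element)
    vluxei32.v v16, (a0), v8         # Unordered indexed load: gather x[idx[i]]
    vse32.v v16, (a3)                # Store the gathered values contiguously
----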
---- - # Vector unit-stride fault-only-first loads +# Vector unit-stride fault-only-first loads - # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) - vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load - vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load - vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load - vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load +# vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>) +vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load +vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load +vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load +vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load ---- ---- @@ -1837,14 +1836,14 @@ The assembler prefixes `vlseg`/`vsseg` are used for unit-stride segment loads and stores respectively. ---- - # Format - vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template - vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template +# Format +vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template +vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template - # Examples - vlseg8e8.v vd, (rs1), vm # Load eight vector registers with eight byte fields. +# Examples +vlseg8e8.v vd, (rs1), vm # Load eight vector registers with eight byte fields. - vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory +vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory ---- For loads, the `vd` register will hold the first field loaded from the @@ -1852,27 +1851,27 @@ segment. For stores, the `vs3` register is read to provide the first field to be stored to each segment. ---- - # Example 1 - # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) - vsetvli a1, t0, e8, m1, ta, ma - vlseg3e8.v v8, (a0), vm - # v8 holds the red pixels - # v9 holds the green pixels - # v10 holds the blue pixels +# Example 1 +# Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) +vsetvli a1, t0, e8, m1, ta, ma +vlseg3e8.v v8, (a0), vm +# v8 holds the red pixels +# v9 holds the green pixels +# v10 holds the blue pixels - # Example 2 - # Memory structure holds complex values, 32b for real and 32b for imaginary - vsetvli a1, t0, e32, m1, ta, ma - vlseg2e32.v v8, (a0), vm - # v8 holds real - # v9 holds imaginary +# Example 2 +# Memory structure holds complex values, 32b for real and 32b for imaginary +vsetvli a1, t0, e32, m1, ta, ma +vlseg2e32.v v8, (a0), vm +# v8 holds real +# v9 holds imaginary ---- There are also fault-only-first versions of the unit-stride instructions. ---- - # Template for vector fault-only-first unit-stride segment loads. - vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads +# Template for vector fault-only-first unit-stride segment loads. +vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads ---- For fault-only-first segment loads, if an exception is detected partway @@ -1892,20 +1891,20 @@ GPR argument. NOTE: Negative and zero strides are supported. 
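As an illustrative aside (not part of this patch), the fault-only-first load shown earlier in this section is the building block for speculative scans such as a string-length routine. The sketch below is only an outline under assumed conventions: a0 is taken to hold a pointer to a NUL-terminated string, and the length is returned in a0.

----
# Hypothetical strlen()-style sketch built on vle8ff.v.
strlen_sketch:
    mv      a3, a0                   # a3 = running pointer into the string
1:
    vsetvli a1, x0, e8, m8, ta, ma   # Request as many bytes as possible
    vle8ff.v v8, (a3)                # Fault-only-first load; vl may be trimmed
    csrr    a1, vl                   # Bytes actually examined this iteration
    vmseq.vi v0, v8, 0               # Mark NUL bytes
    vfirst.m a2, v0                  # Index of first NUL in this chunk, or -1
    add     a3, a3, a1               # Advance past the examined bytes
    bltz    a2, 1b                   # No NUL found yet: keep scanning
    sub     a3, a3, a1               # Back up to the start of the final chunk
    add     a3, a3, a2               # a3 now points at the NUL byte
    sub     a0, a3, a0               # Length = NUL address - start address
    ret
----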
---- - # Format - vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads - vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores +# Format +vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads +vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores - # Examples - vsetvli a1, t0, e8, m1, ta, ma - vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i], - # and bytes at addresses x5+i*x6+1 into v5[i], - # and bytes at addresses x5+i*x6+2 into v6[i]. +# Examples +vsetvli a1, t0, e8, m1, ta, ma +vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i], + # and bytes at addresses x5+i*x6+1 into v5[i], + # and bytes at addresses x5+i*x6+2 into v6[i]. - # Examples - vsetvli a1, t0, e32, m1, ta, ma - vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6 - # and words from v3[i] to address x5+i*x6+4 +# Examples +vsetvli a1, t0, e32, m1, ta, ma +vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6 + # and words from v3[i] to address x5+i*x6+4 ---- Accesses to the fields within each segment can occur in any order, @@ -1928,22 +1927,22 @@ EMUL=(EEW/SEW)*LMUL. The EMUL * NFIELDS {le} 8 constraint applies to the data vector register group. ---- - # Format - vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads - vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads - vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores - vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores +# Format +vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads +vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads +vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores +vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores - # Examples - vsetvli a1, t0, e8, m1, ta, ma - vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i], - # and bytes at addresses x5+v3[i]+1 into v5[i], - # and bytes at addresses x5+v3[i]+2 into v6[i]. +# Examples +vsetvli a1, t0, e8, m1, ta, ma +vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i], + # and bytes at addresses x5+v3[i]+1 into v5[i], + # and bytes at addresses x5+v3[i]+2 into v6[i]. - # Examples - vsetvli a1, t0, e32, m1, ta, ma - vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i] - # and words from v3[i] to address x5+v5[i]+4 +# Examples +vsetvli a1, t0, e32, m1, ta, ma +vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i] + # and words from v3[i] to address x5+v5[i]+4 ---- For vector indexed segment loads, the destination vector register @@ -2060,39 +2059,39 @@ environments can mandate the minimum alignment requirements to support an ABI. ---- - # Format of whole register load and store instructions. - vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v +# Format of whole register load and store instructions. 
+vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v - vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0 - vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0 - vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0 - vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0 +vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0 +vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0 +vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0 +vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0 - vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v +vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v - vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0 - vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0 - vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0 - vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0 +vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0 +vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0 +vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0 +vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0 - vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v +vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v - vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0 - vl4re16.v v4, (a0) - vl4re32.v v4, (a0) - vl4re64.v v4, (a0) +vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0 +vl4re16.v v4, (a0) +vl4re32.v v4, (a0) +vl4re64.v v4, (a0) - vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v +vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v - vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0 - vl8re16.v v8, (a0) - vl8re32.v v8, (a0) - vl8re64.v v8, (a0) +vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0 +vl8re16.v v8, (a0) +vl8re32.v v8, (a0) +vl8re64.v v8, (a0) - vs1r.v v3, (a1) # Store v3 to address in a1 - vs2r.v v2, (a1) # Store v2-v3 to address in a1 - vs4r.v v4, (a1) # Store v4-v7 to address in a1 - vs8r.v v8, (a1) # Store v8-v15 to address in a1 +vs1r.v v3, (a1) # Store v3 to address in a1 +vs2r.v v2, (a1) # Store v2-v3 to address in a1 +vs4r.v v4, (a1) # Store v4-v7 to address in a1 +vs8r.v v8, (a1) # Store v8-v15 to address in a1 ---- NOTE: Implementations should raise illegal instruction exceptions on @@ -2109,10 +2108,10 @@ following vector instruction needs a new SEW/LMUL. So, in best case only two instructions (of which only one performs vector operations) are needed to synthesize the effect of the dedicated instruction: ---- - csrr t0, vl # Save current vl (potentially not needed) - vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX - vlm.v v0, (a0) # Load mask register - vsetvli x0, t0, <new type> # Restore vl (potentially already present) +csrr t0, vl # Save current vl (potentially not needed) +vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX +vlm.v v0, (a0) # Load mask register +vsetvli x0, t0, <new type> # Restore vl (potentially already present) ---- === Vector Memory Alignment Constraints @@ -2306,7 +2305,7 @@ The first vector register group operand can be either single or double-width. 
---- -Assembly syntax pattern for vector widening arithmetic instructions +# Assembly syntax pattern for vector widening arithmetic instructions # Double-width result, two single-width sources: 2*SEW = SEW op SEW vwop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i] @@ -2526,10 +2525,10 @@ instructions with unchanged inputs, destructive accumulations will require an additional move to obtain correct results. ---- - # Example multi-word arithmetic sequence, accumulating into v4 - vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1 - vadc.vvm v4, v4, v8, v0 # Calc new sum - vmmv.m v0, v1 # Move temp carry into v0 for next word +# Example multi-word arithmetic sequence, accumulating into v4 +vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1 +vadc.vvm v4, v4, v8, v0 # Calc new sum +vmmv.m v0, v1 # Move temp carry into v0 for next word ---- The subtract with borrow instruction `vsbc` performs the equivalent @@ -2537,27 +2536,27 @@ function to support long word arithmetic for subtraction. There are no subtract with immediate instructions. ---- - # Produce difference with borrow. +# Produce difference with borrow. - # vd[i] = vs2[i] - vs1[i] - v0.mask[i] - vsbc.vvm vd, vs2, vs1, v0 # Vector-vector +# vd[i] = vs2[i] - vs1[i] - v0.mask[i] +vsbc.vvm vd, vs2, vs1, v0 # Vector-vector - # vd[i] = vs2[i] - x[rs1] - v0.mask[i] - vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar +# vd[i] = vs2[i] - x[rs1] - v0.mask[i] +vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar - # Produce borrow out in mask register format +# Produce borrow out in mask register format - # vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i]) - vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector +# vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i]) +vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector - # vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i]) - vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar +# vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i]) +vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar - # vd.mask[i] = borrow_out(vs2[i] - vs1[i]) - vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in +# vd.mask[i] = borrow_out(vs2[i] - vs1[i]) +vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in - # vd.mask[i] = borrow_out(vs2[i] - x[rs1]) - vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in +# vd.mask[i] = borrow_out(vs2[i] - x[rs1]) +vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in ---- For `vmsbc`, the borrow is defined to be 1 iff the difference, prior to @@ -2807,9 +2806,9 @@ masked va >= x, any vd Compares effectively AND in the mask under a mask-undisturbed policy if the destination register is `v0`, e.g., ---- - # (a < b) && (b < c) in two instructions when mask-undisturbed - vmslt.vv v0, va, vb # All body elements written - vmslt.vv v0, vb, vc, v0.t # Only update at set mask +# (a < b) && (b < c) in two instructions when mask-undisturbed +vmslt.vv v0, va, vb # All body elements written +vmslt.vv v0, vb, vc, v0.t # Only update at set mask ---- Compares write mask registers, and so always operate under a @@ -2883,21 +2882,21 @@ standard scalar integer multiply/divides, with the same results for extreme inputs. ---- - # Unsigned divide. - vdivu.vv vd, vs2, vs1, vm # Vector-vector - vdivu.vx vd, vs2, rs1, vm # vector-scalar +# Unsigned divide. 
+vdivu.vv vd, vs2, vs1, vm # Vector-vector +vdivu.vx vd, vs2, rs1, vm # vector-scalar - # Signed divide - vdiv.vv vd, vs2, vs1, vm # Vector-vector - vdiv.vx vd, vs2, rs1, vm # vector-scalar +# Signed divide +vdiv.vv vd, vs2, vs1, vm # Vector-vector +vdiv.vx vd, vs2, rs1, vm # vector-scalar - # Unsigned remainder - vremu.vv vd, vs2, vs1, vm # Vector-vector - vremu.vx vd, vs2, rs1, vm # vector-scalar +# Unsigned remainder +vremu.vv vd, vs2, vs1, vm # Vector-vector +vremu.vx vd, vs2, rs1, vm # vector-scalar - # Signed remainder - vrem.vv vd, vs2, vs1, vm # Vector-vector - vrem.vx vd, vs2, rs1, vm # vector-scalar +# Signed remainder +vrem.vv vd, vs2, vs1, vm # Vector-vector +vrem.vx vd, vs2, rs1, vm # vector-scalar ---- NOTE: The decision to include integer divide and remainder was @@ -3188,14 +3187,14 @@ used to control the right shift amount, which provides the scaling. ---- # Narrowing unsigned clip # SEW 2*SEW SEW - vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i])) - vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1])) - vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm)) +vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i])) +vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1])) +vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm)) # Narrowing signed clip - vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i])) - vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1])) - vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm)) +vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i])) +vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1])) +vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm)) ---- For `vnclipu`/`vnclip`, the rounding mode is specified in the `vxrm` @@ -3273,14 +3272,14 @@ elements do not set FP exception flags. 
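As an aside (not part of this patch), the narrowing clips above are a common way to requantize wider fixed-point data. The following sketch narrows unsigned 16-bit samples to saturated 8-bit values with a rounding right shift of 8; the pointer/count assignments in a0/a1/a2 and the rounding-mode choice are assumptions for illustration only.

----
# Hypothetical 16-bit -> 8-bit requantization sketch using vnclipu.wi.
# Assumed: a0 = source (uint16), a1 = destination (uint8), a2 = element count.
    csrwi   vxrm, 0                  # Round-to-nearest-up for the fixed-point shift
narrow_sketch:
    vsetvli t0, a2, e8, m1, ta, ma   # SEW=8 is the width of the narrowed result
    vle16.v v8, (a0)                 # EEW=16 load; source group is v8-v9 (EMUL=2)
    vnclipu.wi v10, v8, 8            # Shift right 8, round, saturate to 8 bits
    vse8.v  v10, (a1)                # Store narrowed bytes
    slli    t1, t0, 1                # 2 bytes consumed per source element
    add     a0, a0, t1
    add     a1, a1, t0
    sub     a2, a2, t0
    bnez    a2, narrow_sketch
----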
==== Vector Single-Width Floating-Point Add/Subtract Instructions ---- - # Floating-point add - vfadd.vv vd, vs2, vs1, vm # Vector-vector - vfadd.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point add +vfadd.vv vd, vs2, vs1, vm # Vector-vector +vfadd.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point subtract - vfsub.vv vd, vs2, vs1, vm # Vector-vector - vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1] - vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i] +# Floating-point subtract +vfsub.vv vd, vs2, vs1, vm # Vector-vector +vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1] +vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i] ---- ==== Vector Widening Floating-Point Add/Subtract Instructions @@ -3302,16 +3301,16 @@ vfwsub.wf vd, vs2, rs1, vm # vector-scalar ==== Vector Single-Width Floating-Point Multiply/Divide Instructions ---- - # Floating-point multiply - vfmul.vv vd, vs2, vs1, vm # Vector-vector - vfmul.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point multiply +vfmul.vv vd, vs2, vs1, vm # Vector-vector +vfmul.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point divide - vfdiv.vv vd, vs2, vs1, vm # Vector-vector - vfdiv.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point divide +vfdiv.vv vd, vs2, vs1, vm # Vector-vector +vfdiv.vf vd, vs2, rs1, vm # vector-scalar - # Reverse floating-point divide vector = scalar / vector - vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i] +# Reverse floating-point divide vector = scalar / vector +vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i] ---- ==== Vector Widening Floating-Point Multiply @@ -3396,15 +3395,15 @@ vfwnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i] This is a unary vector-vector instruction. ---- - # Floating-point square root - vfsqrt.v vd, vs2, vm # Vector-vector square root +# Floating-point square root +vfsqrt.v vd, vs2, vm # Vector-vector square root ---- ==== Vector Floating-Point Reciprocal Square-Root Estimate Instruction ---- - # Floating-point reciprocal square-root estimate to 7 bits. - vfrsqrt7.v vd, vs2, vm +# Floating-point reciprocal square-root estimate to 7 bits. +vfrsqrt7.v vd, vs2, vm ---- This is a unary vector-vector instruction that returns an estimate of @@ -3472,8 +3471,8 @@ with greater estimate accuracy. ==== Vector Floating-Point Reciprocal Estimate Instruction ---- - # Floating-point reciprocal estimate to 7 bits. - vfrec7.v vd, vs2, vm +# Floating-point reciprocal estimate to 7 bits. +vfrec7.v vd, vs2, vm ---- NOTE: An earlier draft version had used the assembler name `vfrece7` @@ -3572,13 +3571,13 @@ in version 2.2 of the RISC-V F/D/Q extension: they perform the `minimumNumber` or `maximumNumber` operation on active elements. ---- - # Floating-point minimum - vfmin.vv vd, vs2, vs1, vm # Vector-vector - vfmin.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point minimum +vfmin.vv vd, vs2, vs1, vm # Vector-vector +vfmin.vf vd, vs2, rs1, vm # vector-scalar - # Floating-point maximum - vfmax.vv vd, vs2, vs1, vm # Vector-vector - vfmax.vf vd, vs2, rs1, vm # vector-scalar +# Floating-point maximum +vfmax.vv vd, vs2, vs1, vm # Vector-vector +vfmax.vf vd, vs2, rs1, vm # vector-scalar ---- ==== Vector Floating-Point Sign-Injection Instructions @@ -3587,14 +3586,14 @@ Vector versions of the scalar sign-injection instructions. The result takes all bits except the sign bit from the vector `vs2` operands. 
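As an illustrative aside (not part of this patch), the fused multiply-add forms above map directly onto an axpy kernel. The sketch below computes y[i] += a * x[i] for single-precision data; the calling convention (a0 = element count, a1 = &x, a2 = &y, fa0 = scalar a) is assumed for illustration.

----
# Hypothetical single-precision axpy sketch using vfmacc.vf.
saxpy_sketch:
    vsetvli t0, a0, e32, m8, ta, ma  # Up to VLMAX 32-bit elements
    vle32.v v8, (a1)                 # Load x[i]
    vle32.v v16, (a2)                # Load y[i]
    vfmacc.vf v16, fa0, v8           # v16[i] += fa0 * v8[i]
    vse32.v v16, (a2)                # Store updated y[i]
    slli    t1, t0, 2                # 4 bytes per element
    add     a1, a1, t1
    add     a2, a2, t1
    sub     a0, a0, t0
    bnez    a0, saxpy_sketch
    ret
----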
---- - vfsgnj.vv vd, vs2, vs1, vm # Vector-vector - vfsgnj.vf vd, vs2, rs1, vm # vector-scalar +vfsgnj.vv vd, vs2, vs1, vm # Vector-vector +vfsgnj.vf vd, vs2, rs1, vm # vector-scalar - vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector - vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar +vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector +vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar - vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector - vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar +vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector +vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar ---- NOTE: A vector of floating-point values can be negated using a @@ -3626,27 +3625,27 @@ operand is NaN, whereas the other compares write 0 when either operand is NaN. ---- - # Compare equal - vmfeq.vv vd, vs2, vs1, vm # Vector-vector - vmfeq.vf vd, vs2, rs1, vm # vector-scalar +# Compare equal +vmfeq.vv vd, vs2, vs1, vm # Vector-vector +vmfeq.vf vd, vs2, rs1, vm # vector-scalar - # Compare not equal - vmfne.vv vd, vs2, vs1, vm # Vector-vector - vmfne.vf vd, vs2, rs1, vm # vector-scalar +# Compare not equal +vmfne.vv vd, vs2, vs1, vm # Vector-vector +vmfne.vf vd, vs2, rs1, vm # vector-scalar - # Compare less than - vmflt.vv vd, vs2, vs1, vm # Vector-vector - vmflt.vf vd, vs2, rs1, vm # vector-scalar +# Compare less than +vmflt.vv vd, vs2, vs1, vm # Vector-vector +vmflt.vf vd, vs2, rs1, vm # vector-scalar - # Compare less than or equal - vmfle.vv vd, vs2, vs1, vm # Vector-vector - vmfle.vf vd, vs2, rs1, vm # vector-scalar +# Compare less than or equal +vmfle.vv vd, vs2, vs1, vm # Vector-vector +vmfle.vf vd, vs2, rs1, vm # vector-scalar - # Compare greater than - vmfgt.vf vd, vs2, rs1, vm # vector-scalar +# Compare greater than +vmfgt.vf vd, vs2, rs1, vm # vector-scalar - # Compare greater than or equal - vmfge.vf vd, vs2, rs1, vm # vector-scalar +# Compare greater than or equal +vmfge.vf vd, vs2, rs1, vm # vector-scalar ---- ---- @@ -3675,11 +3674,11 @@ the comparand is a non-NaN constant, the middle two instructions can be omitted. ---- - # Example of implementing isgreater() - vmfeq.vv v0, va, va # Only set where A is not NaN. - vmfeq.vv v1, vb, vb # Only set where B is not NaN. - vmand.mm v0, v0, v1 # Only set where A and B are ordered, - vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. +# Example of implementing isgreater() +vmfeq.vv v0, va, va # Only set where A is not NaN. +vmfeq.vv v1, vb, vb # Only set where B is not NaN. +vmand.mm v0, v0, v1 # Only set where A and B are ordered, +vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. ---- NOTE: In the above sequence, it is tempting to mask the second `vmfeq` @@ -3694,7 +3693,7 @@ This is a unary vector-vector instruction that operates in the same way as the scalar classify instruction. ---- - vfclass.v vd, vs2, vm # Vector-vector +vfclass.v vd, vs2, vm # Vector-vector ---- The 10-bit mask produced by this instruction is placed in the @@ -3885,15 +3884,15 @@ All operands and results of single-width reduction instructions have the same SEW width. Overflows wrap around on arithmetic sums. 
---- - # Simple reductions, where [*] denotes all active elements: - vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] ) - vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] ) - vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] ) - vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] ) - vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] ) - vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] ) - vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] ) - vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] ) +# Simple reductions, where [*] denotes all active elements: +vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] ) +vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] ) +vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] ) +vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] ) +vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] ) +vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] ) +vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] ) +vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] ) ---- [[sec-vector-integer-reduce-widen]] @@ -3909,23 +3908,22 @@ elements before summing them. For both `vwredsumu.vs` and `vwredsum.vs`, overflows wrap around. ---- - # Unsigned sum reduction into double-width accumulator - vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW)) +# Unsigned sum reduction into double-width accumulator +vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW)) - # Signed sum reduction into double-width accumulator - vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW)) +# Signed sum reduction into double-width accumulator +vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW)) ---- [[sec-vector-float-reduce]] ==== Vector Single-Width Floating-Point Reduction Instructions ---- - # Simple reductions. - vfredosum.vs vd, vs2, vs1, vm # Ordered sum - vfredusum.vs vd, vs2, vs1, vm # Unordered sum - vfredmax.vs vd, vs2, vs1, vm # Maximum value - vfredmin.vs vd, vs2, vs1, vm # Minimum value - +# Simple reductions. +vfredosum.vs vd, vs2, vs1, vm # Ordered sum +vfredusum.vs vd, vs2, vs1, vm # Unordered sum +vfredmax.vs vd, vs2, vs1, vm # Maximum value +vfredmin.vs vd, vs2, vs1, vm # Minimum value ---- NOTE: Older assembler mnemonic `vfredsum` is retained as alias for `vfredusum`. @@ -4058,14 +4056,14 @@ Mask elements past `vl`, the tail elements, are always updated with a tail-agnostic policy. 
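As a brief aside (not part of this patch), the single-width reductions listed above are typically strip-mined with a running scalar kept in element 0 of a vector register. The register choices and calling convention below (a0 = element count, a1 = source of signed 32-bit values, result returned in a0) are assumptions for illustration.

----
# Hypothetical 32-bit sum-reduction sketch using vredsum.vs.
sum_sketch:
    vsetvli t0, x0, e32, m1, ta, ma  # Ensure SEW=32 before seeding element 0
    vmv.s.x v4, x0                   # v4[0] = 0: running accumulator
1:
    vsetvli t0, a0, e32, m8, ta, ma  # Up to VLMAX elements this iteration
    vle32.v v8, (a1)                 # Load a strip of inputs
    vredsum.vs v4, v8, v4            # v4[0] = v4[0] + sum(v8[0..vl-1])
    slli    t1, t0, 2                # 4 bytes per element
    add     a1, a1, t1
    sub     a0, a0, t0
    bnez    a0, 1b
    vmv.x.s a0, v4                   # Move the scalar result to a0
    ret
----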
---- - vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i] - vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i]) - vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i] - vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i] - vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i] - vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i]) - vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i] - vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i]) +vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i] +vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i]) +vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i] +vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i] +vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i] +vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i]) +vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i] +vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i]) ---- NOTE: The previous assembler mnemonics `vmandnot` and `vmornot` have @@ -4076,10 +4074,10 @@ mnemonics can be retained as assembler aliases for compatibility. Several assembler pseudoinstructions are defined as shorthand for common uses of mask logical operations: ---- - vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register - vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register - vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register - vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits +vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register +vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register +vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register +vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits ---- NOTE: The `vmmv.m` instruction was previously called `vmcpy.m`, but @@ -4132,7 +4130,7 @@ use. ==== Vector count population in mask `vcpop.m` ---- - vcpop.m rd, vs2, vm +vcpop.m rd, vs2, vm ---- NOTE: This instruction previously had the assembler mnemonic `vpopc.m` @@ -4151,7 +4149,7 @@ The operation can be performed under a mask, in which case only the masked elements are counted. ---- - vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] ) +vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] ) ---- The `vcpop.m` instruction writes `x[rd]` even if `vl`=0 (with the @@ -4164,7 +4162,7 @@ Traps on `vcpop.m` are always reported with a `vstart` of 0. The ==== `vfirst` find-first-set mask bit ---- - vfirst.m rd, vs2, vm +vfirst.m rd, vs2, vm ---- The `vfirst` instruction finds the lowest-numbered active element of @@ -4356,27 +4354,27 @@ The `viota.m` instruction can be combined with memory scatter instructions (indexed stores) to perform vector compress functions. 
---- - # Compact non-zero elements from input memory array to output memory array - # - # size_t compact_non_zero(size_t n, const int* in, int* out) - # { - # size_t i; - # size_t count = 0; - # int *p = out; - # - # for (i=0; i<n; i++) - # { - # const int v = *in++; - # if (v != 0) - # *p++ = v; - # } - # - # return (size_t) (p - out); - # } - # - # a0 = n - # a1 = &in - # a2 = &out +# Compact non-zero elements from input memory array to output memory array +# +# size_t compact_non_zero(size_t n, const int* in, int* out) +# { +# size_t i; +# size_t count = 0; +# int *p = out; +# +# for (i=0; i<n; i++) +# { +# const int v = *in++; +# if (v != 0) +# *p++ = v; +# } +# +# return (size_t) (p - out); +# } +# +# a0 = n +# a1 = &in +# a2 = &out compact_non_zero: li a6, 0 # Clear count of non-zero elements @@ -4406,7 +4404,7 @@ The `vid.v` instruction writes each element's index to the destination vector register group, from 0 to `vl`-1. ---- - vid.v vd, vm # Write element ID to destination. +vid.v vd, vm # Write element ID to destination. ---- The instruction can be masked. Masking does not change the @@ -4516,8 +4514,8 @@ undisturbed/agnostic policy is followed for inactive elements. ===== Vector Slideup Instructions ---- - vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i] - vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i] +vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i] +vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i] ---- For `vslideup`, the value in `vl` specifies the maximum number of destination @@ -4529,13 +4527,13 @@ Destination elements _OFFSET_ through `vl`-1 are written if unmasked and if _OFFSET_ < `vl`. ---- - vslideup behavior for destination elements (`vstart` < `vl`) +vslideup behavior for destination elements (`vstart` < `vl`) - OFFSET is amount to slideup, either from x register or a 5-bit immediate +OFFSET is amount to slideup, either from x register or a 5-bit immediate - 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged - max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged +max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- The destination vector register group for `vslideup` cannot overlap @@ -4549,8 +4547,8 @@ input vectors during execution, and enables restart with non-zero ===== Vector Slidedown Instructions ---- - vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]] - vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm] +vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]] +vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm] ---- For `vslidedown`, the value in `vl` specifies the maximum number of @@ -4564,15 +4562,14 @@ using an unsigned integer in the `x` register specified by `rs1`, or a If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits. 
---- - vslidedown behavior for source elements for element i in slide (`vstart` < `vl`) - 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET] - VLMAX <= i+OFFSET src[i] = 0 - - vslidedown behavior for destination element i in slide (`vstart` < `vl`) - 0 <= i < vstart Unchanged - vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy +vslidedown behavior for source elements for element i in slide (`vstart` < `vl`) + 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET] + VLMAX <= i+OFFSET src[i] = 0 +vslidedown behavior for destination element i in slide (`vstart` < `vl`) + 0 <= i < vstart Unchanged + vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- ===== Vector Slide1up @@ -4582,7 +4579,7 @@ also allow a scalar integer value to be inserted at the vacated element position. ---- - vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] +vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] ---- The `vslide1up` instruction places the `x` register argument at @@ -4603,12 +4600,12 @@ past `vl` are handled according to the current tail policy (Section ---- - vslide1up behavior when vl > 0 +vslide1up behavior when vl > 0 - i < vstart unchanged - 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled - max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + i < vstart unchanged + 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled +max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- The `vslide1up` instruction requires that the destination vector @@ -4619,7 +4616,7 @@ Otherwise, the instruction encoding is reserved. ===== Vector Floating-Point Slide1up Instruction ---- - vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] +vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] ---- The `vfslide1up` instruction is defined analogously to `vslide1up`, @@ -4637,7 +4634,7 @@ past `vl` are handled according to the current tail policy (Section <<sec-agnostic>>). ---- - vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] +vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] ---- The `vslide1down` instruction places the `x` register argument at @@ -4649,12 +4646,12 @@ XLEN > SEW, the least-significant bits are copied over and the high SEW-XLEN bits are ignored. ---- - vslide1down behavior +vslide1down behavior - i < vstart unchanged - vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled - vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled - vl <= i < VLMAX Follow tail policy + i < vstart unchanged +vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled +vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled + vl <= i < VLMAX Follow tail policy ---- NOTE: The `vslide1down` instruction can be used to load values into a @@ -4667,7 +4664,7 @@ contents of a vector register, albeit slowly, with multiple repeated ===== Vector Floating-Point Slide1down Instruction ---- - vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] +vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] ---- The `vfslide1down` instruction is defined analogously to `vslide1down`, @@ -4729,7 +4726,7 @@ contiguous elements at the start of the destination vector register group. 
---- - vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled +vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled ---- The vector mask register specified by `vs1` indicates which of the @@ -4740,16 +4737,16 @@ elements according to the current tail policy (Section <<sec-agnostic>>). ---- - Example use of vcompress instruction +Example use of vcompress instruction - 8 7 6 5 4 3 2 1 0 Element number +8 7 6 5 4 3 2 1 0 Element number - 1 1 0 1 0 0 1 0 1 v0 - 8 7 6 5 4 3 2 1 0 v1 - 1 2 3 4 5 6 7 8 9 v2 - vsetivli t0, 9, e8, m1, tu, ma - vcompress.vm v2, v1, v0 - 1 2 3 4 8 7 5 2 0 v2 +1 1 0 1 0 0 1 0 1 v0 +8 7 6 5 4 3 2 1 0 v1 +1 2 3 4 5 6 7 8 9 v2 + vsetivli t0, 9, e8, m1, tu, ma + vcompress.vm v2, v1, v0 +1 2 3 4 8 7 5 2 0 v2 ---- `vcompress` is encoded as an unmasked instruction (`vm=1`). The equivalent @@ -4775,30 +4772,30 @@ There is no inverse `vdecompress` provided, as this operation can be readily synthesized using iota and a masked vrgather: ---- - Desired functionality of 'vdecompress' - 7 6 5 4 3 2 1 0 # vid +Desired functionality of 'vdecompress' +7 6 5 4 3 2 1 0 # vid - e d c b a # packed vector of 5 elements - 1 0 0 1 1 1 0 1 # mask vector of 8 elements - p q r s t u v w # destination register before vdecompress + e d c b a # packed vector of 5 elements +1 0 0 1 1 1 0 1 # mask vector of 8 elements +p q r s t u v w # destination register before vdecompress - e q r d c b v a # result of vdecompress +e q r d c b v a # result of vdecompress ---- ---- - # v0 holds mask - # v1 holds packed data - # v11 holds input expanded vector and result - viota.m v10, v0 # Calc iota from mask in v0 - vrgather.vv v11, v1, v10, v0.t # Expand into destination +# v0 holds mask +# v1 holds packed data +# v11 holds input expanded vector and result +viota.m v10, v0 # Calc iota from mask in v0 +vrgather.vv v11, v1, v10, v0.t # Expand into destination ---- ---- - p q r s t u v w # v11 destination register - e d c b a # v1 source vector - 1 0 0 1 1 1 0 1 # v0 mask vector +p q r s t u v w # v11 destination register + e d c b a # v1 source vector +1 0 0 1 1 1 0 1 # v0 mask vector - 4 4 4 3 2 1 1 0 # v10 result of viota.m - e q r d c b v a # v11 destination after vrgather using viota.m under mask +4 4 4 3 2 1 1 0 # v10 result of viota.m +e q r d c b v a # v11 destination after vrgather using viota.m under mask ---- ==== Whole Vector Register Move @@ -4838,12 +4835,12 @@ related `vmerge` encoding, and it is unlikely the `vsmul` instruction would benefit from an immediate form. ---- - vmv<nr>r.v vd, vs2 # General form +vmv<nr>r.v vd, vs2 # General form - vmv1r.v v1, v2 # Copy v1=v2 - vmv2r.v v10, v12 # Copy v10=v12; v11=v13 - vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11 - vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15 +vmv1r.v v1, v2 # Copy v1=v2 +vmv2r.v v10, v12 # Copy v10=v12; v11=v13 +vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11 +vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15 ---- The source and destination vector register numbers must be aligned |