author    Bill Traynor <wmat@riscv.org>  2024-03-20 14:50:06 -0400
committer GitHub <noreply@github.com>    2024-03-20 14:50:06 -0400
commit    918ba8b4f4beab4d7e6249b9bccd52a2a8323484 (patch)
tree      70da01235e9765c4a29e1bf0cc2fe66c154f57f5
parent    f86578fd959bdd4b71d3e7bbdc8a32f766a3b6a4 (diff)
parent    48ddf7c5014e48568685bf8c79626f144932a910 (diff)
download  riscv-isa-manual-918ba8b4f4beab4d7e6249b9bccd52a2a8323484.zip
          riscv-isa-manual-918ba8b4f4beab4d7e6249b9bccd52a2a8323484.tar.gz
          riscv-isa-manual-918ba8b4f4beab4d7e6249b9bccd52a2a8323484.tar.bz2
Merge branch 'main' into scalar-crypto
Signed-off-by: Bill Traynor <wmat@riscv.org>
-rw-r--r-- .github/workflows/isa-build.yml | 22
-rw-r--r-- .github/workflows/merge-and-release.yml | 88
-rw-r--r-- .gitignore | 3
-rw-r--r-- build/Makefile | 1
-rw-r--r-- dependencies/Gemfile | 1
-rw-r--r-- marchid.md | 1
-rw-r--r-- src/a-st-ext.adoc | 9
-rw-r--r-- src/b-st-ext.adoc | 3909
-rw-r--r-- src/c-st-ext.adoc | 7
-rw-r--r-- src/calling-convention.adoc | 29
-rw-r--r-- src/cmo.adoc | 1130
-rw-r--r-- src/example/memcpy.s | 17
-rw-r--r-- src/example/saxpy.s | 29
-rw-r--r-- src/example/sgemm.S | 221
-rw-r--r-- src/example/strcmp.s | 34
-rw-r--r-- src/example/strcpy.s | 20
-rw-r--r-- src/example/strlen.s | 22
-rw-r--r-- src/example/strncpy.s | 36
-rw-r--r-- src/example/vvaddint32.s | 22
-rw-r--r-- src/f-st-ext.adoc | 2
-rw-r--r-- src/fraclmul.adoc | 174
-rw-r--r-- src/hypervisor.adoc | 2
-rw-r--r-- src/images/bytefield/hstatusreg-rv32.edn | 4
-rw-r--r-- src/images/bytefield/hstatusreg.edn | 4
-rw-r--r-- src/images/bytefield/hypv-mstatus.edn | 6
-rw-r--r-- src/images/bytefield/miereg-standard.adoc | 3
-rw-r--r-- src/images/bytefield/mncause.edn | 6
-rw-r--r-- src/images/bytefield/mnstatus.edn | 20
-rw-r--r-- src/images/bytefield/vsstatusreg.edn | 6
-rw-r--r-- src/images/smepmp-visual-representation.png | bin 0 -> 89113 bytes
-rw-r--r-- src/images/wavedrom/ct-unconditional-2.adoc | 2
-rw-r--r-- src/images/wavedrom/v-inst-table.adoc | 210
-rw-r--r-- src/images/wavedrom/valu-format.adoc | 104
-rw-r--r-- src/images/wavedrom/vcfg-format.adoc | 47
-rw-r--r-- src/images/wavedrom/vfrec7.adoc | 136
-rw-r--r-- src/images/wavedrom/vfrsqrt7.adoc | 137
-rw-r--r-- src/images/wavedrom/vmem-format.adoc | 108
-rw-r--r-- src/images/wavedrom/vtype-format.adoc | 28
-rw-r--r-- src/intro.adoc | 2
-rw-r--r-- src/machine.adoc | 2
-rw-r--r-- src/mm-eplan.adoc | 2
-rw-r--r-- src/mm-formal.adoc | 68
-rw-r--r-- src/resources/themes/riscv-spec.yml | 10
-rw-r--r-- src/riscv-privileged.adoc | 9
-rw-r--r-- src/riscv-unprivileged.adoc | 35
-rw-r--r-- src/rnmi.adoc | 26
-rw-r--r-- src/rv-32-64g.adoc | 9
-rw-r--r-- src/rv32.adoc | 2
-rw-r--r-- src/smepmp.adoc | 171
-rw-r--r-- src/smstateen.adoc | 406
-rw-r--r-- src/sscofpmt.adoc | 189
-rw-r--r-- src/sstc.adoc | 190
-rw-r--r-- src/supervisor.adoc | 6
-rw-r--r-- src/v-st-ext.adoc | 5185
-rw-r--r-- src/vector-examples.adoc | 125
-rw-r--r-- src/zawrs.adoc | 105
-rw-r--r-- src/zc.adoc | 2611
-rw-r--r-- src/zfh.adoc | 2
-rw-r--r-- src/zicsr.adoc | 33
59 files changed, 15665 insertions, 123 deletions
diff --git a/.github/workflows/isa-build.yml b/.github/workflows/isa-build.yml
index 7135c26..ca1b4c5 100644
--- a/.github/workflows/isa-build.yml
+++ b/.github/workflows/isa-build.yml
@@ -28,7 +28,7 @@ jobs:
steps:
# Checkout the repository
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
# Set the short SHA for use in artifact names
- name: Set short SHA
@@ -57,7 +57,7 @@ jobs:
# Upload the priv-isa-asciidoc PDF file
- name: Upload priv-isa-asciidoc.pdf
if: steps.build_files.outcome == 'success'
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: priv-isa-asciidoc-${{ env.SHORT_SHA }}.pdf
path: ${{ github.workspace }}/build/priv-isa-asciidoc.pdf
@@ -66,7 +66,7 @@ jobs:
# Upload the priv-isa-asciidoc HTML file
- name: Upload priv-isa-asciidoc.html
if: steps.build_files.outcome == 'success'
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: priv-isa-asciidoc-${{ env.SHORT_SHA }}.html
path: ${{ github.workspace }}/build/priv-isa-asciidoc.html
@@ -75,7 +75,7 @@ jobs:
# Upload the unpriv-isa-asciidoc PDF file
- name: Upload unpriv-isa-asciidoc.pdf
if: steps.build_files.outcome == 'success'
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: unpriv-isa-asciidoc-${{ env.SHORT_SHA }}.pdf
path: ${{ github.workspace }}/build/unpriv-isa-asciidoc.pdf
@@ -84,24 +84,15 @@ jobs:
# Upload the unpriv-isa-asciidoc HTML file
- name: Upload unpriv-isa-asciidoc.html
if: steps.build_files.outcome == 'success'
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: unpriv-isa-asciidoc-${{ env.SHORT_SHA }}.html
path: ${{ github.workspace }}/build/unpriv-isa-asciidoc.html
retention-days: 7
- # Upload the priv-isa-latex PDF file
- - name: Upload riscv-privileged.pdf
- if: steps.build_files.outcome == 'success'
- uses: actions/upload-artifact@v3
- with:
- name: riscv-privileged-latex-${{ env.SHORT_SHA }}.pdf
- path: ${{ github.workspace }}/build/riscv-privileged.pdf
- retention-days: 7
-
- name: Create Release
if: steps.build_files.outcome == 'success' && github.event_name == 'workflow_dispatch' && github.event.inputs.create_release == 'true'
- uses: softprops/action-gh-release@v1
+ uses: softprops/action-gh-release@v2
with:
draft: false
tag_name: riscv-isa-release-${{ env.SHORT_SHA }}-${{ env.CURRENT_DATE }}
@@ -114,7 +105,6 @@ jobs:
${{ github.workspace }}/build/priv-isa-asciidoc.html
${{ github.workspace }}/build/unpriv-isa-asciidoc.pdf
${{ github.workspace }}/build/unpriv-isa-asciidoc.html
- ${{ github.workspace }}/build/riscv-privileged.pdf
env:
GITHUB_TOKEN: ${{ secrets.GHTOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/merge-and-release.yml b/.github/workflows/merge-and-release.yml
new file mode 100644
index 0000000..88390e0
--- /dev/null
+++ b/.github/workflows/merge-and-release.yml
@@ -0,0 +1,88 @@
+name: Release New ISA When Merging a PR
+
+on:
+ pull_request:
+ branches:
+ - main
+ types:
+ - closed
+
+jobs:
+ if_merged:
+ if: github.event.pull_request.merged == true
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - run: |
+ echo The PR was successfully merged.
+
+ - name: Set short SHA
+ run: echo "SHORT_SHA=$(echo ${GITHUB_SHA::7})" >> $GITHUB_ENV
+
+ - name: Get current date
+ run: echo "CURRENT_DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+
+ - name: Pull Container
+ id: pull_container_image
+ run: |
+ docker pull riscvintl/riscv-docs-base-container-image:latest
+
+ - name: Build Files
+ id: build_files
+ if: steps.pull_container_image.outcome == 'success'
+ run: |
+ docker run --rm -v ${{ github.workspace }}:/build riscvintl/riscv-docs-base-container-image:latest \
+ /bin/sh -c 'cd ./build && make'
+
+ # Upload the priv-isa-asciidoc PDF file
+ - name: Upload priv-isa-asciidoc.pdf
+ if: steps.build_files.outcome == 'success'
+ uses: actions/upload-artifact@v4
+ with:
+ name: priv-isa-asciidoc-${{ env.SHORT_SHA }}.pdf
+ path: ${{ github.workspace }}/build/priv-isa-asciidoc.pdf
+
+ # Upload the priv-isa-asciidoc HTML file
+ - name: Upload priv-isa-asciidoc.html
+ if: steps.build_files.outcome == 'success'
+ uses: actions/upload-artifact@v4
+ with:
+ name: priv-isa-asciidoc-${{ env.SHORT_SHA }}.html
+ path: ${{ github.workspace }}/build/priv-isa-asciidoc.html
+
+ # Upload the unpriv-isa-asciidoc PDF file
+ - name: Upload unpriv-isa-asciidoc.pdf
+ if: steps.build_files.outcome == 'success'
+ uses: actions/upload-artifact@v4
+ with:
+ name: unpriv-isa-asciidoc-${{ env.SHORT_SHA }}.pdf
+ path: ${{ github.workspace }}/build/unpriv-isa-asciidoc.pdf
+
+ # Upload the unpriv-isa-asciidoc HTML file
+ - name: Upload unpriv-isa-asciidoc.html
+ if: steps.build_files.outcome == 'success'
+ uses: actions/upload-artifact@v4
+ with:
+ name: unpriv-isa-asciidoc-${{ env.SHORT_SHA }}.html
+ path: ${{ github.workspace }}/build/unpriv-isa-asciidoc.html
+
+ - name: Create Release
+ uses: softprops/action-gh-release@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GHTOKEN }}
+ with:
+ tag_name: riscv-isa-release-${{ env.SHORT_SHA }}-${{ env.CURRENT_DATE }}
+ name: Release riscv-isa-release-${{ env.SHORT_SHA }}-${{ env.CURRENT_DATE }}
+ draft: false
+ prerelease: false
+ make_latest: true
+ generate_release_notes: true
+ body: |
+ This release was created by: ${{ github.event.sender.login }}
+ Release of RISC-V ISA, built from commit ${{ env.SHORT_SHA }}, is now available.
+ files: |
+ ${{ github.workspace }}/build/priv-isa-asciidoc.pdf
+ ${{ github.workspace }}/build/priv-isa-asciidoc.html
+ ${{ github.workspace }}/build/unpriv-isa-asciidoc.pdf
+ ${{ github.workspace }}/build/unpriv-isa-asciidoc.html
diff --git a/.gitignore b/.gitignore
index e61db2e..0253b91 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
.DS_Store
.*.swp
+.vscode
+src/.asciidoctor
+src/diag*
diff --git a/build/Makefile b/build/Makefile
index eef7da5..fad4fbc 100644
--- a/build/Makefile
+++ b/build/Makefile
@@ -91,3 +91,4 @@ clean:
echo "Removing unpriv-isa-asciidoc.html"; \
rm -f unpriv-isa-asciidoc.html; \
fi
+
diff --git a/dependencies/Gemfile b/dependencies/Gemfile
index 8cf7a50..f347221 100644
--- a/dependencies/Gemfile
+++ b/dependencies/Gemfile
@@ -2,6 +2,7 @@ source 'https://rubygems.org'
gem 'asciidoctor'
gem 'asciidoctor-bibtex'
gem 'asciidoctor-diagram'
+gem 'mathematical'
gem 'asciidoctor-mathematical'
gem 'asciidoctor-pdf'
gem 'citeproc-ruby'
diff --git a/marchid.md b/marchid.md
index 79f5e6d..82af726 100644
--- a/marchid.md
+++ b/marchid.md
@@ -61,3 +61,4 @@ ApogeoRV | Gabriele Tripi | [Gabriele Tripi](mailto:tripi.
MicroRV32 | AGRA, Group of Computer Architecture, University of Bremen | [RISC-V @ AGRA](mailto:riscv@informatik.uni-bremen.de) | 41 | https://github.com/agra-uni-bremen/microrv32
QEMU | qemu.org | [QEMU Mailing List](mailto:qemu-riscv@nongnu.org) | 42 | https://qemu.org
KianV | Hirosh Dabui | [Hirosh Dabui](mailto:hirosh@dabui.de) | 43 | https://github.com/splinedrive/kianRiscV
+Coreblocks | Kuźnia Rdzeni, University of Wrocław | [Coreblocks Team](mailto:coreblocks@cs.uni.wroc.pl) | 44 | https://github.com/kuznia-rdzeni/coreblocks
diff --git a/src/a-st-ext.adoc b/src/a-st-ext.adoc
index 396d135..9fae7ab 100644
--- a/src/a-st-ext.adoc
+++ b/src/a-st-ext.adoc
@@ -62,10 +62,11 @@ if the reservation is still valid and the reservation set contains the
bytes being written. If the SC.W succeeds, the instruction writes the
word in _rs2_ to memory, and it writes zero to _rd_. If the SC.W fails,
the instruction does not write to memory, and it writes a nonzero value
-to _rd_. Regardless of success or failure, executing an SC.W instruction
-invalidates any reservation held by this hart. LR.D and SC.D act
-analogously on doublewords and are only available on RV64. For RV64,
-LR.W and SC.W sign-extend the value placed in _rd_.
+to _rd_. For the purposes of memory protection, a failed SC.W may be
+treated like a store. Regardless of success or failure, executing an
+SC.W instruction invalidates any reservation held by this hart. LR.D and
+SC.D act analogously on doublewords and are only available on RV64. For
+RV64, LR.W and SC.W sign-extend the value placed in _rd_.
[NOTE]
====
diff --git a/src/b-st-ext.adoc b/src/b-st-ext.adoc
index 9240f6e..52beb61 100644
--- a/src/b-st-ext.adoc
+++ b/src/b-st-ext.adoc
@@ -1,18 +1,3901 @@
[[bits]]
-== "B" Standard Extension for Bit Manipulation, Version 0.0
+== "B" Standard Extension for Bit Manipulation, Version 1.0.0
-This chapter is a placeholder for a future standard extension to provide
-bit manipulation instructions, including instructions to insert,
-extract, and test bit fields, and for rotations, funnel shifts, and bit
-and byte permutations.
-[NOTE]
+[[preface]]
+=== Bit-manipulation a, b, c and s extensions grouped for public review and ratification
+
+The bit-manipulation (bitmanip) extension collection comprises several component extensions to the base RISC-V architecture that are intended to provide some combination of code size reduction, performance improvement, and energy reduction.
+While the instructions are intended to have general use, some instructions are more useful in some domains than others.
+Hence, several smaller bitmanip extensions are provided, rather than one large extension.
+Each of these smaller extensions is grouped by common function and use case, and each has its own Zb*-extension name.
+
+Each bitmanip extension includes a group of several bitmanip instructions that have similar purposes and that can often share the same logic. Some instructions are available in only one extension while others are available in several.
+The instructions have mnemonics and encodings that are independent of the extensions in which they appear.
+Thus, when implementing extensions with overlapping instructions, there is no redundancy in logic or encoding.
+
+The bitmanip extensions are defined for RV32 and RV64.
+Most of the instructions are expected to be forward compatible with RV128.
+While the shift-immediate instructions are defined to have at most a 6-bit immediate field, a 7th bit is available in the encoding space should this be needed for RV128.
+
+=== Word Instructions
+
+The bitmanip extension follows the convention in RV64 that _w_-suffixed instructions (without a dot before the _w_) ignore the upper 32 bits of their inputs, operate on the least-significant 32 bits as signed values, and produce a 32-bit signed result that is sign-extended to XLEN.
+
+Bitmanip instructions with the suffix _.uw_ have one operand that is an unsigned 32-bit value extracted from the least-significant 32 bits of the specified register. Other than that, these perform full XLEN operations.
+
+Bitmanip instructions with the suffixes _.b_, _.h_ and _.w_ only look at the least-significant 8, 16, and 32 bits of the input (respectively) and produce an XLEN-wide result that is sign-extended or zero-extended, based on the specific instruction.
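+
+A non-normative illustration of these conventions on RV64; the register values are examples chosen only for this sketch:
+
+[source,asm]
+--
+# assume a0 = 0x0000000180000000 (bit 31 set), a1 = 0
+addw   a2, a0, a1    # base .w convention: 32-bit result sign-extends,
+                     # so a2 = 0xffffffff80000000
+add.uw a3, a0, a1    # .uw convention: the low word of a0 is
+                     # zero-extended first, so a3 = 0x0000000080000000
+--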
+
+=== Pseudocode for instruction semantics
+
+The semantics of each instruction in <<#insns>> is expressed in a SAIL-like syntax.
+
+=== Extensions
+
+The first group of bitmanip extensions to be released for Public Review are:
+
+* <<#zba>>
+* <<#zbb>>
+* <<#zbc>>
+* <<#zbs>>
+
+Below is a list of all the instructions (and pseudoinstructions) included in these extensions,
+along with the extensions in which each appears:
+
+[%header,cols="^3,^3,10,16,^2,^2,^2,^2"]
+|====
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+|Zba
+|Zbb
+|Zbc
+|Zbs
+
+|
+|&#10003;
+|add.uw _rd_, _rs1_, _rs2_
+|<<#insns-add_uw>>
+|&#10003;
+|
+|
+|
+
+|&#10003;
+|&#10003;
+|andn _rd_, _rs1_, _rs2_
+|<<#insns-andn>>
+|
+|&#10003;
+|
+|
+
+
+|&#10003;
+|&#10003;
+|clmul _rd_, _rs1_, _rs2_
+|<<#insns-clmul>>
+|
+|
+|&#10003;
+|
+
+|&#10003;
+|&#10003;
+|clmulh _rd_, _rs1_, _rs2_
+|<<#insns-clmulh>>
+|
+|
+|&#10003;
+|
+
+|&#10003;
+|&#10003;
+|clmulr _rd_, _rs1_, _rs2_
+|<<#insns-clmulr>>
+|
+|
+|&#10003;
+|
+
+|&#10003;
+|&#10003;
+|clz _rd_, _rs_
+|<<#insns-clz>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|clzw _rd_, _rs_
+|<<#insns-clzw>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|cpop _rd_, _rs_
+|<<#insns-cpop>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|cpopw _rd_, _rs_
+|<<#insns-cpopw>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|ctz _rd_, _rs_
+|<<#insns-ctz>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|ctzw _rd_, _rs_
+|<<#insns-ctzw>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|max _rd_, _rs1_, _rs2_
+|<<#insns-max>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|maxu _rd_, _rs1_, _rs2_
+|<<#insns-maxu>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|min _rd_, _rs1_, _rs2_
+|<<#insns-min>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|minu _rd_, _rs1_, _rs2_
+|<<#insns-minu>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|orc.b _rd_, _rs_
+|<<#insns-orc_b>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|orn _rd_, _rs1_, _rs2_
+|<<#insns-orn>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|rev8 _rd_, _rs_
+|<<#insns-rev8>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|rol _rd_, _rs1_, _rs2_
+|<<#insns-rol>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|rolw _rd_, _rs1_, _rs2_
+|<<#insns-rolw>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|ror _rd_, _rs1_, _rs2_
+|<<#insns-ror>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|rori _rd_, _rs1_, _shamt_
+|<<#insns-rori>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|roriw _rd_, _rs1_, _shamt_
+|<<#insns-roriw>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|rorw _rd_, _rs1_, _rs2_
+|<<#insns-rorw>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|bclr _rd_, _rs1_, _rs2_
+|<<#insns-bclr>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|bclri _rd_, _rs1_, _imm_
+|<<#insns-bclri>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|bext _rd_, _rs1_, _rs2_
+|<<#insns-bext>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|bexti _rd_, _rs1_, _imm_
+|<<#insns-bexti>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|binv _rd_, _rs1_, _rs2_
+|<<#insns-binv>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|binvi _rd_, _rs1_, _imm_
+|<<#insns-binvi>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|bset _rd_, _rs1_, _rs2_
+|<<#insns-bset>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|bseti _rd_, _rs1_, _imm_
+|<<#insns-bseti>>
+|
+|
+|
+|&#10003;
+
+|&#10003;
+|&#10003;
+|sext.b _rd_, _rs_
+|<<#insns-sext_b>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|sext.h _rd_, _rs_
+|<<#insns-sext_h>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|sh1add _rd_, _rs1_, _rs2_
+|<<#insns-sh1add>>
+|&#10003;
+|
+|
+|
+
+|
+|&#10003;
+|sh1add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh1add_uw>>
+|&#10003;
+|
+|
+|
+
+|&#10003;
+|&#10003;
+|sh2add _rd_, _rs1_, _rs2_
+|<<#insns-sh2add>>
+|&#10003;
+|
+|
+|
+
+|
+|&#10003;
+|sh2add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh2add_uw>>
+|&#10003;
+|
+|
+|
+
+|&#10003;
+|&#10003;
+|sh3add _rd_, _rs1_, _rs2_
+|<<#insns-sh3add>>
+|&#10003;
+|
+|
+|
+
+|
+|&#10003;
+|sh3add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh3add_uw>>
+|&#10003;
+|
+|
+|
+
+|
+|&#10003;
+|slli.uw _rd_, _rs1_, _imm_
+|<<#insns-slli_uw>>
+|&#10003;
+|
+|
+|
+
+|&#10003;
+|&#10003;
+|xnor _rd_, _rs1_, _rs2_
+|<<#insns-xnor>>
+|
+|&#10003;
+|
+|
+
+|&#10003;
+|&#10003;
+|zext.h _rd_, _rs_
+|<<#insns-zext_h>>
+|
+|&#10003;
+|
+|
+
+|
+|&#10003;
+|zext.w _rd_, _rs_
+|<<#insns-add_uw>>
+|&#10003;
+|
+|
+|
+
+|====
+
+[#zba,reftext=Address generation instructions]
+==== Zba: Address generation
+
+[NOTE,caption=Frozen]
+====
+The Zba extension is frozen.
+====
+
+The Zba instructions can be used to accelerate the generation of addresses that index into arrays of basic types (halfword, word, doubleword) using both unsigned word-sized and XLEN-sized indices: a shifted index is added to a base address.
+
+The shift and add instructions do a left shift of 1, 2, or 3 because these are commonly found in real-world code and because they can be implemented with a minimal amount of additional hardware beyond that of the simple adder. This avoids lengthening the critical path in implementations.
+
+While the shift and add instructions are limited to a maximum left shift of 3, the slli instruction (from the base ISA) can be used to perform similar shifts for indexing into arrays of wider elements. The slli.uw -- added in this extension -- can be used when the index is to be interpreted as an unsigned word.
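+
+A non-normative example of the intended use on RV64: loading element _i_ of a word array, with the base address in a0 and an unsigned 32-bit index in a1 (register choices are illustrative):
+
+[source,asm]
+--
+# without Zba
+slli  t0, a1, 32      # zero-extend the word-sized index ...
+srli  t0, t0, 30      # ... and scale it by 4 bytes
+add   t0, t0, a0      # add the array base
+lw    t1, 0(t0)
+# with Zba, one instruction forms the address
+sh2add.uw t0, a1, a0  # t0 = a0 + (zext32(a1) << 2)
+lw    t1, 0(t0)
+--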
+
+The following instructions (and pseudoinstructions) comprise the Zba extension:
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|
+|&#10003;
+|add.uw _rd_, _rs1_, _rs2_
+|<<#insns-add_uw>>
+
+|&#10003;
+|&#10003;
+|sh1add _rd_, _rs1_, _rs2_
+|<<#insns-sh1add>>
+
+|
+|&#10003;
+|sh1add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh1add_uw>>
+
+|&#10003;
+|&#10003;
+|sh2add _rd_, _rs1_, _rs2_
+|<<#insns-sh2add>>
+
+|
+|&#10003;
+|sh2add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh2add_uw>>
+
+|&#10003;
+|&#10003;
+|sh3add _rd_, _rs1_, _rs2_
+|<<#insns-sh3add>>
+
+|
+|&#10003;
+|sh3add.uw _rd_, _rs1_, _rs2_
+|<<#insns-sh3add_uw>>
+
+|
+|&#10003;
+|slli.uw _rd_, _rs1_, _imm_
+|<<#insns-slli_uw>>
+
+|
+|&#10003;
+|zext.w _rd_, _rs_
+|<<#insns-add_uw>>
+
+|===
+
+[#zbb,reftext="Basic bit-manipulation"]
+==== Zbb: Basic bit-manipulation
+
+[NOTE,caption=Frozen]
====
-Although bit manipulation instructions are very effective in some
-application domains, particularly when dealing with externally packed
-data structures, we excluded them from the base ISAs as they are not
-useful in all domains and can add additional complexity or instruction
-formats to supply all needed operands.
+The Zbb extension is frozen.
+====
+===== Logical with negate
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|andn _rd_, _rs1_, _rs2_
+|<<#insns-andn>>
+
+|&#10003;
+|&#10003;
+|orn _rd_, _rs1_, _rs2_
+|<<#insns-orn>>
+
+|&#10003;
+|&#10003;
+|xnor _rd_, _rs1_, _rs2_
+|<<#insns-xnor>>
+|===
+
+.Implementation Hint
+[NOTE, caption="Imp" ]
+===============================================================
+The Logical with Negate instructions can be implemented by inverting the _rs2_ inputs to the base-required AND, OR, and XOR logic instructions.
+In some implementations, the inverter on rs2 used for subtraction can be reused for this purpose.
+===============================================================
+
+===== Count leading/trailing zero bits
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|clz _rd_, _rs_
+|<<#insns-clz>>
+
+|
+|&#10003;
+|clzw _rd_, _rs_
+|<<#insns-clzw>>
+
+|&#10003;
+|&#10003;
+|ctz _rd_, _rs_
+|<<#insns-ctz>>
+
+|
+|&#10003;
+|ctzw _rd_, _rs_
+|<<#insns-ctzw>>
+|===
+
+===== Count population
+
+These instructions count the number of set bits (1-bits). This is also
+commonly referred to as population count.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|cpop _rd_, _rs_
+|<<#insns-cpop>>
+
+|
+|&#10003;
+|cpopw _rd_, _rs_
+|<<#insns-cpopw>>
+|===
+
+===== Integer minimum/maximum
+
+The integer minimum/maximum instructions are arithmetic R-type
+instructions that return the smaller/larger of two operands.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|max _rd_, _rs1_, _rs2_
+|<<#insns-max>>
+
+|&#10003;
+|&#10003;
+|maxu _rd_, _rs1_, _rs2_
+|<<#insns-maxu>>
+
+|&#10003;
+|&#10003;
+|min _rd_, _rs1_, _rs2_
+|<<#insns-min>>
+
+|&#10003;
+|&#10003;
+|minu _rd_, _rs1_, _rs2_
+|<<#insns-minu>>
+|===
+
+===== Sign- and zero-extension
+
+These instructions perform the sign-extension or zero-extension of the least significant 8 bits or 16 bits of the source register.
+
+These instructions replace the generalized idioms `slli rD,rS,(XLEN-<size>) + srli` (for zero-extension) or `slli + srai` (for sign-extension) for the sign-extension of 8-bit and 16-bit quantities, and for the zero-extension of 16-bit quantities.
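+
+For example (non-normative), sign-extending the low byte of a0 on RV64:
+
+[source,asm]
+--
+# base-ISA idiom
+slli a1, a0, 56
+srai a1, a1, 56
+# Zbb equivalent
+sext.b a1, a0
+--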
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|sext.b _rd_, _rs_
+|<<#insns-sext_b>>
+
+|&#10003;
+|&#10003;
+|sext.h _rd_, _rs_
+|<<#insns-sext_h>>
+
+|&#10003;
+|&#10003;
+|zext.h _rd_, _rs_
+|<<#insns-zext_h>>
+|===
+
+===== Bitwise rotation
+
+Bitwise rotation instructions are similar to the shift-logical operations from the base spec. However, where the shift-logical
+instructions shift in zeros, the rotate instructions shift in the bits that were shifted out of the other side of the value.
+Such operations are also referred to as ‘circular shifts’.
+
+
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|rol _rd_, _rs1_, _rs2_
+|<<#insns-rol>>
+
+|
+|&#10003;
+|rolw _rd_, _rs1_, _rs2_
+|<<#insns-rolw>>
+
+|&#10003;
+|&#10003;
+|ror _rd_, _rs1_, _rs2_
+|<<#insns-ror>>
+
+|&#10003;
+|&#10003;
+|rori _rd_, _rs1_, _shamt_
+|<<#insns-rori>>
+
+|
+|&#10003;
+|roriw _rd_, _rs1_, _shamt_
+|<<#insns-roriw>>
+
+|
+|&#10003;
+|rorw _rd_, _rs1_, _rs2_
+|<<#insns-rorw>>
+|===
+
+.Architecture Explanation
+[NOTE, caption="AE" ]
+===============================================================
+The rotate instructions were included to replace a common
+four-instruction sequence to achieve the same effect (neg; sll/srl; srl/sll; or)
+===============================================================
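+
+A non-normative sketch of the replaced sequence, rotating a0 left by the amount held in a1 (temporaries are illustrative):
+
+[source,asm]
+--
+neg  t0, a1          # t0 = XLEN - a1, modulo XLEN
+sll  t1, a0, a1      # bits shifted up
+srl  t2, a0, t0      # bits wrapped around from the top
+or   a0, t1, t2
+# with Zbb, simply:
+rol  a0, a0, a1
+--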
+
+===== OR Combine
+
+*orc.b* sets the bits of each byte in the result _rd_ to all zeros if no bit within the respective byte of _rs_ is set, or to all ones if any bit within the respective byte of _rs_ is set.
+
+One use-case is string-processing functions, such as *strlen* and *strcpy*, which can use *orc.b* to test for the terminating zero byte by counting the set bits in leading non-zero bytes in a word.
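+
+A non-normative sketch of this idiom on RV64 (registers and the label are illustrative): after *orc.b*, the result is all-ones exactly when every byte of the loaded word is non-zero.
+
+[source,asm]
+--
+ld     t0, 0(a0)          # load 8 bytes of the string
+orc.b  t1, t0             # 0xff per non-zero byte, 0x00 per zero byte
+li     t2, -1
+bne    t1, t2, found_nul  # some byte in this word is the terminator
+--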
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|orc.b _rd_, _rs_
+|<<#insns-orc_b>>
+|===
+
+===== Byte-reverse
+
+*rev8* reverses the byte-ordering of _rs_.
+
+[%header,cols="^1,^1,4,8"]
+|====
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|rev8 _rd_, _rs_
+|<<#insns-rev8>>
+
+|====
+
+[#zbc,reftext="Carry-less multiplication"]
+==== Zbc: Carry-less multiplication
+
+[NOTE,caption=Frozen]
+====
+The Zbc extension is frozen.
+====
+
+Carry-less multiplication is the multiplication in the polynomial ring over GF(2).
+
+*clmul* produces the lower half of the carry-less product and *clmulh* produces the upper half of the 2&#x2715;XLEN carry-less product.
+
+*clmulr* produces bits 2&#x2715;XLEN−2:XLEN-1 of the 2&#x2715;XLEN carry-less product.
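+
+For example (non-normative), on RV32 the two instructions pair to produce the full 64-bit carry-less product of two 32-bit polynomials:
+
+[source,asm]
+--
+clmul  t0, a0, a1    # product bits 31..0
+clmulh t1, a0, a1    # product bits 63..32
+--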
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|clmul _rd_, _rs1_, _rs2_
+|<<#insns-clmul>>
+
+|&#10003;
+|&#10003;
+|clmulh _rd_, _rs1_, _rs2_
+|<<#insns-clmulh>>
+
+|&#10003;
+|&#10003;
+|clmulr _rd_, _rs1_, _rs2_
+|<<#insns-clmulr>>
+
+|===
+
+[#zbs,reftext="Single-bit instructions"]
+==== Zbs: Single-bit instructions
+
+[NOTE,caption=Frozen]
+====
+The Zbs extension is frozen.
+====
+
+The single-bit instructions provide a mechanism to set, clear, invert, or extract
+a single bit in a register. The bit is specified by its index.
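+
+For example (non-normative), testing and then setting flag bit _n_ (index in a1) of a flag word in a0:
+
+[source,asm]
+--
+bext t0, a0, a1      # t0 = (a0 >> a1) & 1, the old flag value
+bset a0, a0, a1      # a0 = a0 | (1 << a1)
+--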
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|bclr _rd_, _rs1_, _rs2_
+|<<#insns-bclr>>
+
+|&#10003;
+|&#10003;
+|bclri _rd_, _rs1_, _imm_
+|<<#insns-bclri>>
+
+|&#10003;
+|&#10003;
+|bext _rd_, _rs1_, _rs2_
+|<<#insns-bext>>
+
+|&#10003;
+|&#10003;
+|bexti _rd_, _rs1_, _imm_
+|<<#insns-bexti>>
+
+|&#10003;
+|&#10003;
+|binv _rd_, _rs1_, _rs2_
+|<<#insns-binv>>
+
+|&#10003;
+|&#10003;
+|binvi _rd_, _rs1_, _imm_
+|<<#insns-binvi>>
+
+|&#10003;
+|&#10003;
+|bset _rd_, _rs1_, _rs2_
+|<<#insns-bset>>
+
+|&#10003;
+|&#10003;
+|bseti _rd_, _rs1_, _imm_
+|<<#insns-bseti>>
+
+|===
+
+[#zbkc,reftext="Carry-less multiplication for Cryptography"]
+==== Zbkc: Carry-less multiplication for Cryptography
+
+[NOTE,caption=Frozen]
+====
+The Zbkc extension is frozen.
+====
+
+Carry-less multiplication is the multiplication in the polynomial ring over
+GF(2). This is a critical operation in some cryptographic workloads,
+particularly the AES-GCM authenticated encryption scheme.
+This extension provides only the instructions needed to
+efficiently implement the GHASH operation, which is part of this workload.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|clmul _rd_, _rs1_, _rs2_
+|<<#insns-clmul>>
+
+|&#10003;
+|&#10003;
+|clmulh _rd_, _rs1_, _rs2_
+|<<#insns-clmulh>>
+
+|===
+
+[#zbkx,reftext="Crossbar permutations"]
+==== Zbkx: Crossbar permutations
+
+[NOTE,caption=Frozen]
+====
+The Zbkx extension is frozen.
+====
+
+These instructions implement a "lookup table" for 4 and 8 bit elements
+inside the general purpose registers.
+_rs1_ is used as a vector of N-bit words, and _rs2_ as a vector of N-bit
+indices into _rs1_.
+Each element of the result is the element of _rs1_ selected by the
+corresponding index from _rs2_, or zero if that index is out of bounds.
+
+These instructions are useful for expressing N-bit to N-bit boolean
+operations, and for implementing cryptographic code (particularly SBoxes)
+without secret-dependent memory accesses, such that the execution
+latency does not depend on the (secret) data being operated on.
-We anticipate the B extension will be a brownfield encoding within the
-base 30-bit instruction space.
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|xperm.n _rd_, _rs1_, _rs2_
+|<<#insns-xpermn>>
+
+|&#10003;
+|&#10003;
+|xperm.b _rd_, _rs1_, _rs2_
+|<<#insns-xpermb>>
+
+|===
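+
+A non-normative sketch: on RV64, a 16-entry 4-bit S-box occupies exactly the 16 nibbles of one register, so every index is in bounds and *xperm.n* substitutes all 16 nibbles of an input at once.
+
+[source,asm]
+--
+# a0 = S-box, entry i packed in bits 4*i+3 .. 4*i (assumed preloaded)
+# a1 = 16 input nibbles to be substituted
+xperm.n a2, a0, a1   # nibble i of a2 = nibble (a1.nibble[i]) of a0
+--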
+
+[#zbkb,reftext="Bit-manipulation for Cryptography"]
+==== Zbkb: Bit-manipulation for Cryptography
+
+[NOTE,caption=Frozen]
+====
+The Zbkb extension is frozen.
====
+
+This extension contains instructions essential for implementing
+common operations in cryptographic workloads.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+
+| &#10003;
+| &#10003;
+| rol
+| <<insns-rol>>
+
+|
+| &#10003;
+| rolw
+| <<insns-rolw>>
+
+| &#10003;
+| &#10003;
+| ror
+| <<insns-ror>>
+
+| &#10003;
+| &#10003;
+| rori
+| <<insns-rori>>
+
+|
+| &#10003;
+| roriw
+| <<insns-roriw>>
+
+|
+| &#10003;
+| rorw
+| <<insns-rorw>>
+
+| &#10003;
+| &#10003;
+| andn
+| <<insns-andn>>
+
+| &#10003;
+| &#10003;
+| orn
+| <<insns-orn>>
+
+| &#10003;
+| &#10003;
+| xnor
+| <<insns-xnor>>
+
+| &#10003;
+| &#10003;
+| pack
+| <<insns-pack>>
+
+| &#10003;
+| &#10003;
+| packh
+| <<insns-packh>>
+
+|
+| &#10003;
+| packw
+| <<insns-packw>>
+
+| &#10003;
+| &#10003;
+| rev.b
+| <<insns-revb>>
+
+| &#10003;
+| &#10003;
+| rev8
+| <<insns-rev8>>
+
+| &#10003;
+|
+| zip
+| <<insns-zip>>
+
+| &#10003;
+|
+| unzip
+| <<insns-unzip>>
+
+|===
+
+<<<
+
+[#insns,reftext="Instructions (in alphabetical order)"]
+=== Instructions (in alphabetical order)
+
+[#insns-add_uw,reftext=Add unsigned word]
+==== add.uw
+
+Synopsis::
+Add unsigned word
+
+Mnemonic::
+add.uw _rd_, _rs1_, _rs2_
+
+
+Pseudoinstructions::
+zext.w _rd_, _rs1_ &#8594; add.uw _rd_, _rs1_, zero
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x0, attr: ['ADD.UW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x04, attr: ['ADD.UW'] },
+]}
+....
+
+Description::
+This instruction performs an XLEN-wide addition between _rs2_ and the zero-extended least-significant word of _rs1_.
+
+Operation::
+[source,sail]
+--
+let base = X(rs2);
+let index = EXTZ(X(rs1)[31..0]);
+
+X(rd) = base + index;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-andn,reftext="AND with inverted operand"]
+==== andn
+
+Synopsis::
+AND with inverted operand
+
+Mnemonic::
+andn _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x7, attr: ['ANDN']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x20, attr: ['ANDN'] },
+]}
+....
+
+Description::
+This instruction performs the bitwise logical AND operation between _rs1_ and the bitwise inversion of _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = X(rs1) & ~X(rs2);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-bclr,reftext="Single-Bit Clear (Register)"]
+==== bclr
+
+Synopsis::
+Single-Bit Clear (Register)
+
+Mnemonic::
+bclr _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BCLR'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x24, attr: ['BCLR/BEXT'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit cleared at the index specified in _rs2_.
+The index is read from the lower log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let index = X(rs2) & (XLEN - 1);
+X(rd) = X(rs1) & ~(1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-bclri,reftext="Single-Bit Clear (Immediate)"]
+==== bclri
+
+Synopsis::
+Single-Bit Clear (Immediate)
+
+Mnemonic::
+bclri _rd_, _rs1_, _shamt_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BCLRI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x24, attr: ['BCLRI'] },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BCLRI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x12, attr: ['BCLRI'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit cleared at the index specified in _shamt_.
+The index is read from the lower log2(XLEN) bits of _shamt_.
+For RV32, the encodings corresponding to shamt[5]=1 are reserved.
+
+Operation::
+[source,sail]
+--
+let index = shamt & (XLEN - 1);
+X(rd) = X(rs1) & ~(1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-bext,reftext="Single-Bit Extract (Register)"]
+==== bext
+
+Synopsis::
+Single-Bit Extract (Register)
+// Should we describe this as a Set-if-bit-is-set?
+
+Mnemonic::
+bext _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['BEXT'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x24, attr: ['BCLR/BEXT'] },
+]}
+....
+
+Description::
+This instruction returns a single bit extracted from _rs1_ at the index specified in _rs2_.
+The index is read from the lower log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let index = X(rs2) & (XLEN - 1);
+X(rd) = (X(rs1) >> index) & 1;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-bexti,reftext="Single-Bit Extract (Immediate)"]
+==== bexti
+
+Synopsis::
+Single-Bit Extract (Immediate)
+
+Mnemonic::
+bexti _rd_, _rs1_, _shamt_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['BEXTI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x24, attr: ['BEXTI/BCLRI'] },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['BEXTI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x12, attr: ['BEXTI/BCLRI'] },
+]}
+....
+
+Description::
+This instruction returns a single bit extracted from _rs1_ at the index specified in _shamt_.
+The index is read from the lower log2(XLEN) bits of _shamt_.
+For RV32, the encodings corresponding to shamt[5]=1 are reserved.
+
+Operation::
+[source,sail]
+--
+let index = shamt & (XLEN - 1);
+X(rd) = (X(rs1) >> index) & 1;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-binv,reftext="Single-Bit Invert (Register)"]
+==== binv
+
+Synopsis::
+Single-Bit Invert (Register)
+
+Mnemonic::
+binv _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BINV'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x34, attr: ['BINV'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit inverted at the index specified in _rs2_.
+The index is read from the lower log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let index = X(rs2) & (XLEN - 1);
+X(rd) = X(rs1) ^ (1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-binvi,reftext="Single-Bit Invert (Immediate)"]
+==== binvi
+
+Synopsis::
+Single-Bit Invert (Immediate)
+
+Mnemonic::
+binvi _rd_, _rs1_, _shamt_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BINV'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x34, attr: ['BINVI'] },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BINV'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x1a, attr: ['BINVI'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit inverted at the index specified in _shamt_.
+The index is read from the lower log2(XLEN) bits of _shamt_.
+For RV32, the encodings corresponding to shamt[5]=1 are reserved.
+
+Operation::
+[source,sail]
+--
+let index = shamt & (XLEN - 1);
+X(rd) = X(rs1) ^ (1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-bset,reftext="Single-Bit Set (Register)"]
+==== bset
+
+Synopsis::
+Single-Bit Set (Register)
+
+Mnemonic::
+bset _rd_, _rs1_,_rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BSET'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x14, attr: ['BSET'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit set at the index specified in _rs2_.
+The index is read from the lower log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let index = X(rs2) & (XLEN - 1);
+X(rd) = X(rs1) | (1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-bseti,reftext="Single-Bit Set (Immediate)"]
+==== bseti
+
+Synopsis::
+Single-Bit Set (Immediate)
+
+Mnemonic::
+bseti _rd_, _rs1_,_shamt_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BSETI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x14, attr: ['BSETI'] },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['BSETI'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x0a, attr: ['BSETI'] },
+]}
+....
+
+Description::
+This instruction returns _rs1_ with a single bit set at the index specified in _shamt_.
+The index is read from the lower log2(XLEN) bits of _shamt_.
+For RV32, the encodings corresponding to shamt[5]=1 are reserved.
+
+Operation::
+[source,sail]
+--
+let index = shamt & (XLEN - 1);
+X(rd) = X(rs1) | (1 << index)
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbs (<<#zbs>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-clmul,reftext="Carry-less multiply (low-part)"]
+==== clmul
+
+Synopsis::
+Carry-less multiply (low-part)
+
+Mnemonic::
+clmul _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CLMUL'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x5, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+clmul produces the lower half of the 2·XLEN carry-less product.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+let output : xlenbits = 0;
+
+foreach (i from 0 to (xlen - 1) by 1) {
+ output = if ((rs2_val >> i) & 1)
+           then output ^ (rs1_val << i)
+ else output;
+}
+
+X[rd] = output
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbc (<<#zbc>>)
+|0.93
+|Frozen
+
+|Zbkc (<<#zbkc>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-clmulh,reftext="Carry-less multiply (high-part)"]
+==== clmulh
+
+Synopsis::
+Carry-less multiply (high-part)
+
+Mnemonic::
+clmulh _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x3, attr: ['CLMULH'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x5, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+clmulh produces the upper half of the 2·XLEN carry-less product.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+let output : xlenbits = 0;
+
+foreach (i from 1 to xlen by 1) {
+ output = if ((rs2_val >> i) & 1)
+           then output ^ (rs1_val >> (xlen - i))
+ else output;
+}
+
+X[rd] = output
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbc (<<#zbc>>)
+|0.93
+|Frozen
+
+|Zbkc (<<#zbkc>>)
+|v0.9.4
+|Frozen
+|===
+
+
+<<<
+[#insns-clmulr,reftext="Carry-less multiply (reversed)"]
+==== clmulr
+
+Synopsis::
+Carry-less multiply (reversed)
+
+Mnemonic::
+clmulr _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x2, attr: ['CLMULR'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x5, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+*clmulr* produces bits 2·XLEN−2:XLEN-1 of the 2·XLEN carry-less
+product.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+let output : xlenbits = 0;
+
+foreach (i from 0 to (xlen - 1) by 1) {
+ output = if ((rs2_val >> i) & 1)
+           then output ^ (rs1_val >> (xlen - i - 1))
+ else output;
+}
+
+X[rd] = output
+--
+
+.Note
+[NOTE, caption="A" ]
+===============================================================
+The *clmulr* instruction is used to accelerate CRC calculations.
+The *r* in the instruction's mnemonic stands for _reversed_, as the
+instruction is equivalent to bit-reversing the inputs, performing
+a *clmul*, then bit-reversing the output.
+===============================================================
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbc (<<#zbc>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-clz,reftext="Count leading zero bits"]
+==== clz
+
+Synopsis::
+Count leading zero bits
+
+Mnemonic::
+clz _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CLZ'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x0, attr: ['CLZ'] },
+ { bits: 7, name: 0x30, attr: ['CLZ'] },
+]}
+....
+
+Description::
+This instruction counts the number of 0's before the first 1, starting at the most-significant bit (i.e., XLEN-1) and progressing to bit 0. Accordingly, if the input is 0, the output is XLEN, and if the most-significant bit of the input is a 1, the output is 0.
+
+Operation::
+[source,sail]
+--
+val HighestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
+
+function HighestSetBit x = {
+ foreach (i from (xlen - 1) to 0 by 1 in dec)
+ if [x[i]] == 0b1 then return(i) else ();
+ return -1;
+}
+
+let rs = X(rs);
+X[rd] = (xlen - 1) - HighestSetBit(rs);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-clzw,reftext="Count leading zero bits in word"]
+==== clzw
+
+Synopsis::
+Count leading zero bits in word
+
+Mnemonic::
+clzw _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x1b, attr: ['OP-IMM-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CLZW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x0, attr: ['CLZW'] },
+ { bits: 7, name: 0x30, attr: ['CLZW'] },
+]}
+....
+
+Description::
+This instruction counts the number of 0's before the first 1 starting at bit 31 and progressing to bit 0.
+Accordingly, if the least-significant word is 0, the output is 32, and if the most-significant bit of the word (i.e., bit 31) is a 1, the output is 0.
+
+Operation::
+[source,sail]
+--
+val HighestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
+
+function HighestSetBit32 x = {
+ foreach (i from 31 to 0 by 1 in dec)
+ if [x[i]] == 0b1 then return(i) else ();
+ return -1;
+}
+
+let rs = X(rs);
+X[rd] = 31 - HighestSetBit32(rs);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-cpop,reftext="Count set bits"]
+==== cpop
+
+Synopsis::
+Count set bits
+
+Mnemonic::
+cpop _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CPOP'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x2, attr: ['CPOP'] },
+ { bits: 7, name: 0x30, attr: ['CPOP'] },
+]}
+....
+Description::
+This instruction counts the number of 1's (i.e., set bits) in the source register.
+
+Operation::
+[source,sail]
+--
+let bitcount = 0;
+let rs = X(rs);
+
+foreach (i from 0 to (xlen - 1) in inc)
+ if rs[i] == 0b1 then bitcount = bitcount + 1 else ();
+
+X[rd] = bitcount
+--
+
+.Software Hint
+[NOTE, caption="SH" ]
+===============================================================
+This operation is known as population count, popcount, sideways sum, bit summation, or Hamming weight.
+
+The GCC builtin function `+__builtin_popcount (unsigned int x)+` is implemented by *cpop* on RV32 and by *cpopw* on RV64.
+The GCC builtin function `+__builtin_popcountl (unsigned long x)+` for LP64 is implemented by *cpop* on RV64.
+===============================================================
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-cpopw,reftext="Count set bits in word"]
+==== cpopw
+
+Synopsis::
+Count set bits in word
+
+Mnemonic::
+cpopw _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x1b, attr: ['OP-IMM-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CPOPW'] },
+ { bits: 5, name: 'rs' },
+ { bits: 5, name: 0x2, attr: ['CPOPW'] },
+ { bits: 7, name: 0x30, attr: ['CPOPW'] },
+]}
+....
+Description::
+This instruction counts the number of 1's (i.e., set bits) in the least-significant word of the source register.
+
+Operation::
+[source,sail]
+--
+let bitcount = 0;
+let val = X(rs);
+
+foreach (i from 0 to 31 in inc)
+ if val[i] == 0b1 then bitcount = bitcount + 1 else ();
+
+X[rd] = bitcount
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-ctz,reftext="Count trailing zero bits"]
+==== ctz
+
+Synopsis::
+Count trailing zero bits
+
+Mnemonic::
+ctz _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CTZ/CTZW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x1, attr: ['CTZ/CTZW'] },
+ { bits: 7, name: 0x30, attr: ['CTZ/CTZW'] },
+]}
+....
+
+Description::
+This instruction counts the number of 0's before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit (i.e., XLEN-1).
+Accordingly, if the input is 0, the output is XLEN, and if the least-significant bit of the input is a 1, the output is 0.
+
+Operation::
+[source,sail]
+--
+val LowestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
+
+function LowestSetBit x = {
+ foreach (i from 0 to (xlen - 1) by 1 in dec)
+ if [x[i]] == 0b1 then return(i) else ();
+ return xlen;
+}
+
+let rs = X(rs);
+X[rd] = LowestSetBit(rs);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-ctzw,reftext="Count trailing zero bits in word"]
+==== ctzw
+
+Synopsis::
+Count trailing zero bits in word
+
+Mnemonic::
+ctzw _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x1b, attr: ['OP-IMM-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['CTZ/CTZW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x1, attr: ['CTZ/CTZW'] },
+ { bits: 7, name: 0x30, attr: ['CTZ/CTZW'] },
+]}
+....
+
+Description::
+This instruction counts the number of 0's before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit of the least-significant word (i.e., 31). Accordingly, if the least-significant word is 0, the output is 32, and if the least-significant bit of the input is a 1, the output is 0.
+
+Operation::
+[source,sail]
+--
+val LowestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
+
+function LowestSetBit32 x = {
+ foreach (i from 0 to 31 by 1 in dec)
+ if [x[i]] == 0b1 then return(i) else ();
+ return 32;
+}
+
+let rs = X(rs);
+X[rd] = LowestSetBit32(rs);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-max,reftext="Maximum"]
+==== max
+
+Synopsis::
+Maximum
+
+Mnemonic::
+max _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x6, attr: ['MAX']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x05, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+This instruction returns the larger of two signed integers.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+
+let result = if rs1_val <_s rs2_val
+ then rs2_val
+ else rs1_val;
+
+X(rd) = result;
+--
+
+.Software Hint
+[NOTE, caption="SW"]
+===============================================================
+Calculating the absolute value of a signed integer can be performed
+using the following sequence: *neg rD,rS* followed by *max
+rD,rS,rD*. When using this common sequence, it is suggested that the two
+instructions be scheduled with no intervening instructions, so that
+implementations so optimized can fuse them together.
+===============================================================
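+
+The hinted sequence, written out (non-normative; note that the most-negative value wraps to itself):
+
+[source,asm]
+--
+neg t0, a0           # t0 = -a0
+max t0, a0, t0       # t0 = max(a0, -a0) = |a0|
+--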
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-maxu,reftext="Unsigned maximum"]
+==== maxu
+
+Synopsis::
+Unsigned maximum
+
+Mnemonic::
+maxu _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x7, attr: ['MAXU']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x05, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+This instruction returns the larger of two unsigned integers.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+
+let result = if rs1_val <_u rs2_val
+ then rs2_val
+ else rs1_val;
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-min,reftext="Minimum"]
+==== min
+
+Synopsis::
+Minimum
+
+Mnemonic::
+min _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['MIN']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x05, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+This instruction returns the smaller of two signed integers.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+
+let result = if rs1_val <_s rs2_val
+ then rs1_val
+ else rs2_val;
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-minu,reftext="Unsigned minimum"]
+==== minu
+
+Synopsis::
+Unsigned minimum
+
+Mnemonic::
+minu _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['MINU']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x05, attr: ['MINMAX/CLMUL'] },
+]}
+....
+
+Description::
+This instruction returns the smaller of two unsigned integers.
+
+Operation::
+[source,sail]
+--
+let rs1_val = X(rs1);
+let rs2_val = X(rs2);
+
+let result = if rs1_val <_u rs2_val
+ then rs1_val
+ else rs2_val;
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-orc_b,reftext="Bitwise OR-Combine, byte granule"]
+==== orc.b
+
+Synopsis::
+Bitwise OR-Combine, byte granule
+
+Mnemonic::
+orc.b _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5 },
+ { bits: 5, name: 'rs' },
+ { bits: 12, name: 0x287 }
+]}
+....
+
+Description::
+Combines the bits within each byte using bitwise logical OR.
+This sets the bits of each byte in the result _rd_ to all zeros if no bit within the respective byte of _rs_ is set, or to all ones if any bit within the respective byte of _rs_ is set.
+
+Operation::
+[source,sail]
+--
+let input = X(rs);
+let output : xlenbits = 0;
+
+foreach (i from 0 to (xlen - 8) by 8) {
+ output[(i + 7)..i] = if input[(i + 7)..i] == 0
+ then 0b00000000
+ else 0b11111111;
+}
+
+X[rd] = output;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-orn,reftext="OR with inverted operand"]
+==== orn
+
+Synopsis::
+OR with inverted operand
+
+Mnemonic::
+orn _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x6, attr: ['ORN']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x20, attr: ['ORN'] },
+]}
+....
+
+Description::
+This instruction performs the bitwise logical OR operation between _rs1_ and the bitwise inversion of _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = X(rs1) | ~X(rs2);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-pack,reftext="Pack low halves of registers"]
+==== pack
+
+Synopsis::
+Pack the low halves of _rs1_ and _rs2_ into _rd_.
+
+Mnemonic::
+pack _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ {bits: 7, name: 0x33, attr: ['OP'] },
+ {bits: 5, name: 'rd'},
+ {bits: 3, name: 0x4, attr:['PACK']},
+ {bits: 5, name: 'rs1'},
+ {bits: 5, name: 'rs2'},
+ {bits: 7, name: 0x4, attr:['PACK']},
+]}
+....
+
+Description::
+The pack instruction packs the XLEN/2-bit lower halves of _rs1_ and _rs2_ into
+_rd_, with _rs1_ in the lower half and _rs2_ in the upper half.
+
+Operation::
+[source,sail]
+--
+let lo_half : bits(xlen/2) = X(rs1)[xlen/2-1..0];
+let hi_half : bits(xlen/2) = X(rs2)[xlen/2-1..0];
+X(rd) = EXTZ(hi_half @ lo_half);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-packh,reftext="Pack low bytes of registers"]
+==== packh
+
+Synopsis::
+Pack the low bytes of _rs1_ and _rs2_ into _rd_.
+
+Mnemonic::
+packh _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ {bits: 7, name: 0x33, attr: ['OP'] },
+ {bits: 5, name: 'rd'},
+ {bits: 3, name: 0x7, attr: ['PACKH']},
+ {bits: 5, name: 'rs1'},
+ {bits: 5, name: 'rs2'},
+ {bits: 7, name: 0x4, attr: ['PACKH']},
+]}
+....
+
+Description::
+The packh instruction packs the least-significant bytes of
+_rs1_ and _rs2_ into the 16 least-significant bits of _rd_,
+zero-extending the rest of _rd_.
+
+Operation::
+[source,sail]
+--
+let lo_half : bits(8) = X(rs1)[7..0];
+let hi_half : bits(8) = X(rs2)[7..0];
+X(rd) = EXTZ(hi_half @ lo_half);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-packw,reftext="Pack low 16-bits of registers (RV64)"]
+==== packw
+
+Synopsis::
+Pack the low 16-bits of _rs1_ and _rs2_ into _rd_ on RV64.
+
+Mnemonic::
+packw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 2, name: 0x3},
+{bits: 5, name: 0xe},
+{bits: 5, name: 'rd'},
+{bits: 3, name: 0x4},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 'rs2'},
+{bits: 7, name: 0x4},
+]}
+....
+
+Description::
+This instruction packs the low 16 bits of
+_rs1_ and _rs2_ into the 32 least-significant bits of _rd_,
+sign-extending the 32-bit result to the rest of _rd_.
+This instruction is available only on RV64.
+
+Operation::
+[source,sail]
+--
+let lo_half : bits(16) = X(rs1)[15..0];
+let hi_half : bits(16) = X(rs2)[15..0];
+X(rd) = EXTS(hi_half @ lo_half);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-rev8,reftext="Byte-reverse register"]
+==== rev8
+
+Synopsis::
+Byte-reverse register
+
+Mnemonic::
+rev8 _rd_, _rs_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5 },
+ { bits: 5, name: 'rs' },
+ { bits: 12, name: 0x698 }
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5 },
+ { bits: 5, name: 'rs' },
+ { bits: 12, name: 0x6b8 }
+]}
+....
+
+Description::
+This instruction reverses the order of the bytes in _rs_.
+
+Operation::
+[source,sail]
+--
+let input = X(rs);
+let output : xlenbits = 0;
+let j = xlen - 1;
+
+foreach (i from 0 to (xlen - 8) by 8) {
+ output[i..(i + 7)] = input[(j - 7)..j];
+ j = j - 8;
+}
+
+X(rd) = output;
+--
+
+.Note
+[NOTE, caption="A" ]
+===============================================================
+The *rev8* mnemonic corresponds to different instruction encodings in RV32 and RV64.
+===============================================================
+
+.Software Hint
+[NOTE, caption="SH" ]
+===============================================================
+The byte-reverse operation is only available for the full register
+width. To emulate word-sized and halfword-sized byte-reversal,
+perform a `rev8 rd,rs` followed by a `srai rd,rd,K`, where K is
+XLEN-32 and XLEN-16, respectively.
+===============================================================
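+
+For example (a non-normative sketch; register choice is arbitrary), a
+word-sized byte-reversal on RV64:
+
+[source,asm]
+--
+    rev8 a0, a0      # reverse all eight bytes of a0
+    srai a0, a0, 32  # keep the reversed low word, sign-extended
+--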
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-revb,reftext="Reverse bits in bytes"]
+==== rev.b
+
+Synopsis::
+Reverse the bits in each byte of a source register.
+
+Mnemonic::
+rev.b _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5 },
+ { bits: 5, name: 'rs' },
+ { bits: 12, name: 0x687 }
+]}
+....
+
+Description::
+This instruction reverses the order of the bits in every byte of a register.
+
+Operation::
+[source,sail]
+--
+result : xlenbits = EXTZ(0b0);
+foreach (i from 0 to (xlen - 8) by 8) {
+  result[i+7..i] = reverse_bits_in_byte(X(rs)[i+7..i]);
+};
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-rol,reftext="Rotate left (Register)"]
+==== rol
+
+Synopsis::
+Rotate Left (Register)
+
+Mnemonic::
+rol _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['ROL']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x30, attr: ['ROL'] },
+]}
+....
+
+Description::
+This instruction performs a rotate left of _rs1_ by the amount in the least-significant log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let shamt = if xlen == 32
+ then X(rs2)[4..0]
+ else X(rs2)[5..0];
+let result = (X(rs1) << shamt) | (X(rs1) >> (xlen - shamt));
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-rolw,reftext="Rotate Left Word (Register)"]
+==== rolw
+
+Synopsis::
+Rotate Left Word (Register)
+
+Mnemonic::
+rolw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['ROLW']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x30, attr: ['ROLW'] },
+]}
+....
+
+Description::
+This instruction performs a rotate left on the least-significant word of _rs1_ by the amount in the least-significant 5 bits of _rs2_.
+The resulting word value is sign-extended by copying bit 31 to all of the more-significant bits.
+
+Operation::
+[source,sail]
+--
+let rs1 = EXTZ(X(rs1)[31..0]);
+let shamt = X(rs2)[4..0];
+let result = (rs1 << shamt) | (rs1 >> (32 - shamt));
+X(rd) = EXTS(result[31..0]);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-ror,reftext="Rotate right (Register)"]
+==== ror
+
+Synopsis::
+Rotate Right
+
+Mnemonic::
+ror _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['ROR']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x30, attr: ['ROR'] },
+]}
+....
+
+Description::
+This instruction performs a rotate right of _rs1_ by the amount in the least-significant log2(XLEN) bits of _rs2_.
+
+Operation::
+[source,sail]
+--
+let shamt = if xlen == 32
+ then X(rs2)[4..0]
+ else X(rs2)[5..0];
+let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-rori,reftext="Rotate right (Immediate)"]
+==== rori
+
+Synopsis::
+Rotate Right (Immediate)
+
+Mnemonic::
+rori _rd_, _rs1_, _shamt_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['RORI']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x30, attr: ['RORI'] },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['RORI']},
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x18, attr: ['RORI'] },
+]}
+....
+
+Description::
+This instruction performs a rotate right of _rs1_ by the amount in the least-significant log2(XLEN) bits of _shamt_.
+For RV32, the encodings corresponding to shamt[5]=1 are reserved.
+
+Operation::
+[source,sail]
+--
+let shamt = if xlen == 32
+ then shamt[4..0]
+ else shamt[5..0];
+let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
+
+X(rd) = result;
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-roriw,reftext="Rotate right Word (Immediate)"]
+==== roriw
+
+Synopsis::
+Rotate Right Word by Immediate
+
+Mnemonic::
+roriw _rd_, _rs1_, _shamt_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x1b, attr: ['OP-IMM-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['RORIW']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'shamt' },
+ { bits: 7, name: 0x30, attr: ['RORIW'] },
+]}
+....
+
+Description::
+This instruction performs a rotate right on the least-significant word
+of _rs1_ by the amount in the least-significant 5 bits of
+_shamt_.
+The resulting word value is sign-extended by copying bit 31 to all of
+the more-significant bits.
+
+
+Operation::
+[source,sail]
+--
+let rs1_data = EXTZ(X(rs1)[31..0]);
+let result = (rs1_data >> shamt) | (rs1_data << (32 - shamt));
+X(rd) = EXTS(result[31..0]);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-rorw,reftext="Rotate right Word (Register)"]
+==== rorw
+
+Synopsis::
+Rotate Right Word (Register)
+
+Mnemonic::
+rorw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x5, attr: ['RORW']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x30, attr: ['RORW'] },
+]}
+....
+
+Description::
+This instruction performs a rotate right on the least-significant word of _rs1_ by the amount in the least-significant 5 bits of _rs2_.
+The resulting word value is sign-extended by copying bit 31 to all of the more-significant bits.
+
+Operation::
+[source,sail]
+--
+let rs1 = EXTZ(X(rs1)[31..0]);
+let shamt = X(rs2)[4..0];
+let result = (rs1 >> shamt) | (rs1 << (32 - shamt));
+X(rd) = EXTS(result[31..0]);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-sext_b,reftext="Sign-extend byte"]
+==== sext.b
+
+Synopsis::
+Sign-extend byte
+
+Mnemonic::
+sext.b _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['SEXT.B/SEXT.H'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x04, attr: ['SEXT.B'] },
+ { bits: 7, name: 0x30 },
+]}
+....
+
+Description::
+This instruction sign-extends the least-significant byte in the source to XLEN by copying the most-significant bit in the byte (i.e., bit 7) to all of the more-significant bits.
+
+Operation::
+[source,sail]
+--
+X(rd) = EXTS(X(rs)[7..0]);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-sext_h,reftext="Sign-extend halfword"]
+==== sext.h
+
+Synopsis::
+Sign-extend halfword
+
+Mnemonic::
+sext.h _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['SEXT.B/SEXT.H'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 0x05, attr: ['SEXT.H'] },
+ { bits: 7, name: 0x30 },
+]}
+....
+
+Description::
+This instruction sign-extends the least-significant halfword in _rs_ to XLEN by copying the most-significant bit in the halfword (i.e., bit 15) to all of the more-significant bits.
+
+Operation::
+[source,sail]
+--
+X(rd) = EXTS(X(rs)[15..0]);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+
+<<<
+[#insns-sh1add,reftext=Shift left by 1 and add]
+==== sh1add
+
+Synopsis::
+Shift left by 1 and add
+
+Mnemonic::
+sh1add _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x2, attr: ['SH1ADD'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH1ADD'] },
+]}
+....
+
+Description::
+This instruction shifts _rs1_ to the left by 1 bit and adds it to _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = X(rs2) + (X(rs1) << 1);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
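+
+As a non-normative illustration (register assignments are hypothetical),
+*sh1add* forms the address of a 16-bit array element in one instruction:
+
+[source,asm]
+--
+    # a0 = base of an array of 16-bit elements, a1 = element index
+    sh1add t0, a1, a0    # t0 = a0 + (a1 << 1)
+    lh     t1, 0(t0)     # load the indexed element
+--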
+
+// We have decided that this and all other instructions will not have reserved encodings for "useless encodings"
+// We could follow suit of the base ISA and create HINTs if there is some recognized value for doing so
+
+<<<
+[#insns-sh1add_uw,reftext=Shift unsigned word left by 1 and add]
+==== sh1add.uw
+
+Synopsis::
+Shift unsigned word left by 1 and add
+
+Mnemonic::
+sh1add.uw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x2, attr: ['SH1ADD.UW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH1ADD.UW'] },
+]}
+....
+
+Description::
+This instruction performs an XLEN-wide addition of two addends.
+The first addend is _rs2_. The second addend is the unsigned value formed by extracting the least-significant word of _rs1_ and shifting it left by 1 place.
+
+Operation::
+[source,sail]
+--
+let base = X(rs2);
+let index = EXTZ(X(rs1)[31..0]);
+
+X(rd) = base + (index << 1);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
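+
+A non-normative sketch of the same idiom on RV64 when the index is an
+unsigned 32-bit value whose upper register bits may be stale:
+
+[source,asm]
+--
+    # a0 = array base, a1 = unsigned 32-bit index in its low word
+    sh1add.uw t0, a1, a0   # t0 = a0 + (ZEXT(a1[31:0]) << 1)
+    lh        t1, 0(t0)    # load the indexed element
+--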
+
+<<<
+[#insns-sh2add,reftext=Shift left by 2 and add]
+==== sh2add
+
+Synopsis::
+Shift left by 2 and add
+
+Mnemonic::
+sh2add _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['SH2ADD'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH2ADD'] },
+]}
+....
+
+Description::
+This instruction shifts _rs1_ to the left by 2 places and adds it to _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = X(rs2) + (X(rs1) << 2);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-sh2add_uw,reftext=Shift unsigned word left by 2 and add]
+==== sh2add.uw
+
+Synopsis::
+Shift unsigned word left by 2 and add
+
+Mnemonic::
+sh2add.uw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['SH2ADD.UW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH2ADD.UW'] },
+]}
+....
+
+Description::
+This instruction performs an XLEN-wide addition of two addends.
+The first addend is _rs2_.
+The second addend is the unsigned value formed by extracting the least-significant word of _rs1_ and shifting it left by 2 places.
+
+Operation::
+[source,sail]
+--
+let base = X(rs2);
+let index = EXTZ(X(rs1)[31..0]);
+
+X(rd) = base + (index << 2);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-sh3add,reftext=Shift left by 3 and add]
+==== sh3add
+
+Synopsis::
+Shift left by 3 and add
+
+Mnemonic::
+sh3add _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x6, attr: ['SH3ADD'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH3ADD'] },
+]}
+....
+
+Description::
+This instruction shifts _rs1_ to the left by 3 places and adds it to _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = X(rs2) + (X(rs1) << 3);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-sh3add_uw,reftext=Shift unsigned word left by 3 and add]
+==== sh3add.uw
+
+Synopsis::
+Shift unsigned word left by 3 and add
+
+Mnemonic::
+sh3add.uw _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x6, attr: ['SH3ADD.UW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x10, attr: ['SH3ADD.UW'] },
+]}
+....
+
+Description::
+This instruction performs an XLEN-wide addition of two addends. The first addend is _rs2_. The second addend is the unsigned value formed by extracting the least-significant word of _rs1_ and shifting it left by 3 places.
+
+Operation::
+[source,sail]
+--
+let base = X(rs2);
+let index = EXTZ(X(rs1)[31..0]);
+
+X(rd) = base + (index << 3);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-slli_uw,reftext="Shift-left unsigned word (Immediate)"]
+==== slli.uw
+
+Synopsis::
+Shift-left unsigned word (Immediate)
+
+Mnemonic::
+slli.uw _rd_, _rs1_, _shamt_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x1b, attr: ['OP-IMM-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x1, attr: ['SLLI.UW'] },
+ { bits: 5, name: 'rs1' },
+ { bits: 6, name: 'shamt' },
+ { bits: 6, name: 0x02, attr: ['SLLI.UW'] },
+]}
+....
+
+Description::
+This instruction takes the least-significant word of _rs1_, zero-extends it, and shifts it left by the immediate.
+
+Operation::
+[source,sail]
+--
+X(rd) = (EXTZ(X(rs1)[31..0]) << shamt);
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zba (<<#zba>>)
+|0.93
+|Frozen
+|===
+
+.Architecture Explanation
+[NOTE, caption="A" ]
+===============================================================
+This instruction is the same as *slli* with *zext.w* performed on _rs1_ before shifting.
+===============================================================
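+
+A non-normative sketch (hypothetical registers): indexing an array of
+16-byte records by an unsigned 32-bit index, a scale for which no
+shNadd.uw encoding exists:
+
+[source,asm]
+--
+    # a0 = base of 16-byte records, a1 = unsigned 32-bit index
+    slli.uw t0, a1, 4    # t0 = ZEXT(a1[31:0]) << 4
+    add     t0, a0, t0   # t0 = address of record a1
+--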
+
+<<<
+[#insns-unzip,reftext="Bit deinterleave"]
+==== unzip
+
+Synopsis::
+Implements the inverse of the zip instruction.
+
+Mnemonic::
+unzip _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 0x13, attr: ['OP-IMM']},
+{bits: 5, name: 'rd'},
+{bits: 3, name: 0x5},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 0x1f},
+{bits: 7, name: 0x4},
+]}
+....
+
+Description::
+This instruction gathers the bits in the even and odd positions of the
+source word into the low and high halves of the destination word,
+respectively.
+It is the inverse of the <<insns-zip,zip>> instruction.
+This instruction is available only on RV32.
+
+Operation::
+[source,sail]
+--
+foreach (i from 0 to xlen/2-1) {
+  X(rd)[i] = X(rs1)[2*i];
+  X(rd)[i+xlen/2] = X(rs1)[2*i+1];
+}
+--
+
+.Software Hint
+[NOTE, caption="SH" ]
+===============================================================
+This instruction is useful for implementing the SHA3 cryptographic
+hash function on a 32-bit architecture, as it implements the
+bit-interleaving operation used to speed up the 64-bit rotations
+directly.
+===============================================================
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>) (RV32)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-xnor,reftext="Exclusive NOR"]
+==== xnor
+
+Synopsis::
+Exclusive NOR
+
+Mnemonic::
+xnor _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['XNOR']},
+ { bits: 5, name: 'rs1' },
+ { bits: 5, name: 'rs2' },
+ { bits: 7, name: 0x20, attr: ['XNOR'] },
+]}
+....
+
+Description::
+This instruction performs the bit-wise exclusive-NOR operation on _rs1_ and _rs2_.
+
+Operation::
+[source,sail]
+--
+X(rd) = ~(X(rs1) ^ X(rs2));
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+
+|Zbkb (<<#zbkb>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-xpermb,reftext="Crossbar permutation (bytes)"]
+==== xperm.b
+
+Synopsis::
+Byte-wise lookup of indices into a vector in registers.
+
+Mnemonic::
+xperm.b _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 2, name: 0x3},
+{bits: 5, name: 0xc},
+{bits: 5, name: 'rd'},
+{bits: 3, name: 0x4},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 'rs2'},
+{bits: 7, name: 0x14},
+]}
+....
+
+Description::
+The xperm.b instruction operates on bytes.
+The _rs1_ register contains a vector of XLEN/8 8-bit elements.
+The _rs2_ register contains a vector of XLEN/8 8-bit indexes.
+The result is each element in _rs2_ replaced by the element of _rs1_ it
+indexes, or zero if the index is out of bounds.
+
+Operation::
+[source,sail]
+--
+val xpermb_lookup : (bits(8), xlenbits) -> bits(8)
+function xpermb_lookup (idx, lut) = {
+ (lut >> (idx @ 0b000))[7..0]
+}
+
+function clause execute ( XPERM_B (rs2,rs1,rd)) = {
+ result : xlenbits = EXTZ(0b0);
+  foreach(i from 0 to (xlen - 8) by 8) {
+    result[i+7..i] = xpermb_lookup(X(rs2)[i+7..i], X(rs1));
+ };
+ X(rd) = result;
+ RETIRE_SUCCESS
+}
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkx (<<#zbkx>>)
+|v0.9.4
+|Frozen
+|===
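+
+As a non-normative illustration (register contents hypothetical), on RV64
+*xperm.b* performs a parallel lookup into an 8-entry table held in a
+register:
+
+[source,asm]
+--
+    # a0 = 8-byte lookup table (element i in byte i)
+    # a1 = vector of eight byte-wide indices
+    xperm.b a2, a0, a1   # byte i of a2 = byte a1[i] of a0,
+                         # or zero when index a1[i] >= 8
+--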
+
+<<<
+[#insns-xpermn,reftext="Crossbar permutation (nibbles)"]
+==== xperm.n
+
+Synopsis::
+Nibble-wise lookup of indices into a vector.
+
+Mnemonic::
+xperm.n _rd_, _rs1_, _rs2_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 2, name: 0x3},
+{bits: 5, name: 0xc},
+{bits: 5, name: 'rd'},
+{bits: 3, name: 0x2},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 'rs2'},
+{bits: 7, name: 0x14},
+]}
+....
+
+Description::
+The xperm.n instruction operates on nibbles.
+The _rs1_ register contains a vector of XLEN/4 4-bit elements.
+The _rs2_ register contains a vector of XLEN/4 4-bit indexes.
+The result is each element in _rs2_ replaced by the element of _rs1_ it
+indexes, or zero if the index is out of bounds.
+
+Operation::
+[source,sail]
+--
+val xpermn_lookup : (bits(4), xlenbits) -> bits(4)
+function xpermn_lookup (idx, lut) = {
+ (lut >> (idx @ 0b00))[3..0]
+}
+
+function clause execute ( XPERM_N (rs2,rs1,rd)) = {
+ result : xlenbits = EXTZ(0b0);
+  foreach(i from 0 to (xlen - 4) by 4) {
+ result[i+3..i] = xpermn_lookup(X(rs2)[i+3..i], X(rs1));
+ };
+ X(rd) = result;
+ RETIRE_SUCCESS
+}
+--
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkx (<<#zbkx>>)
+|v0.9.4
+|Frozen
+|===
+
+<<<
+[#insns-zext_h,reftext="Zero-extend halfword"]
+==== zext.h
+
+Synopsis::
+Zero-extend halfword
+
+Mnemonic::
+zext.h _rd_, _rs_
+
+Encoding (RV32)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x33, attr: ['OP'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['ZEXT.H']},
+ { bits: 5, name: 'rs' },
+ { bits: 5, name: 0x00 },
+ { bits: 7, name: 0x04 },
+]}
+....
+
+Encoding (RV64)::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x3b, attr: ['OP-32'] },
+ { bits: 5, name: 'rd' },
+ { bits: 3, name: 0x4, attr: ['ZEXT.H']},
+ { bits: 5, name: 'rs' },
+ { bits: 5, name: 0x00 },
+ { bits: 7, name: 0x04 },
+]}
+....
+
+Description::
+This instruction zero-extends the least-significant halfword of the source to XLEN by inserting zeros into all of the bits more significant than 15.
+
+Operation::
+[source,sail]
+--
+X(rd) = EXTZ(X(rs)[15..0]);
+--
+
+.Note
+[NOTE, caption="A" ]
+===============================================================
+The *zext.h* mnemonic corresponds to different instruction encodings in RV32 and RV64.
+===============================================================
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbb (<<#zbb>>)
+|0.93
+|Frozen
+|===
+
+<<<
+[#insns-zip,reftext="Bit interleave"]
+==== zip
+
+Synopsis::
+Interleave the low and high halves of the source word into the even and
+odd bits of the destination.
+
+Mnemonic::
+zip _rd_, _rs_
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 0x13, attr: ['OP-IMM']},
+{bits: 5, name: 'rd'},
+{bits: 3, name: 0x1},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 0x1e},
+{bits: 7, name: 0x4},
+]}
+....
+
+Description::
+This instruction scatters the bits of the low and high halves of a source
+word into the even and odd bit positions of a destination word,
+respectively.
+It is the inverse of the <<insns-unzip,unzip>> instruction.
+This instruction is available only on RV32.
+
+Operation::
+[source,sail]
+--
+foreach (i from 0 to xlen/2-1) {
+  X(rd)[2*i] = X(rs1)[i];
+  X(rd)[2*i+1] = X(rs1)[i+xlen/2];
+}
+--
+
+.Software Hint
+[NOTE, caption="SH" ]
+===============================================================
+This instruction is useful for implementing the SHA3 cryptographic
+hash function on a 32-bit architecture, as it implements the
+bit-interleaving operation used to speed up the 64-bit rotations
+directly.
+===============================================================
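+
+Beyond SHA3, bit interleaving is also the core of Morton-order (Z-order)
+encoding; a non-normative RV32 sketch with hypothetical registers:
+
+[source,asm]
+--
+    # a1 = x coordinate (16 bits), a2 = y coordinate (16 bits)
+    pack a0, a1, a2      # a0 = {y[15:0], x[15:0]}
+    zip  a0, a0          # interleave: x bits in even positions, y bits
+                         # in odd positions, i.e. a 32-bit Morton code
+--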
+
+Included in::
+[%header,cols="4,2,2"]
+|===
+|Extension
+|Minimum version
+|Lifecycle state
+
+|Zbkb (<<#zbkb>>) (RV32)
+|v0.9.4
+|Frozen
+|===
+
+
+=== Software optimization guide
+
+==== strlen
+
+The *orc.b* instruction allows for the efficient detection of *NUL* bytes in an XLEN-sized chunk of data:
+
+ * the result of *orc.b* on a chunk that does not contain any *NUL* bytes will be all-ones, and
+ * after a bitwise-negation of the result of *orc.b*, the number of data bytes before the first *NUL* byte (if any) can be detected by *ctz*/*clz* (depending on the endianness of data).
+
+A full example of a *strlen* function, which uses these techniques and also demonstrates how to handle unaligned/partial data, is the following:
+
+[source,asm]
+--
+#include <sys/asm.h>
+
+ .text
+ .globl strlen
+ .type strlen, @function
+strlen:
+ andi a3, a0, (SZREG-1) // offset
+ andi a1, a0, -SZREG // align pointer
+.Lprologue:
+ li a4, SZREG
+ sub a4, a4, a3 // XLEN - offset
+ slli a3, a3, 3 // offset * 8
+ REG_L a2, 0(a1) // chunk
+ /*
+ * Shift the partial/unaligned chunk we loaded to remove the bytes
+ * from before the start of the string, adding NUL bytes at the end.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ srl a2, a2 ,a3 // chunk >> (offset * 8)
+#else
+ sll a2, a2, a3
+#endif
+ orc.b a2, a2
+ not a2, a2
+ /*
+ * Non-NUL bytes in the string have been expanded to 0x00, while
+ * NUL bytes have become 0xff. Search for the first set bit
+ * (corresponding to a NUL byte in the original chunk).
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ ctz a2, a2
+#else
+ clz a2, a2
+#endif
+ /*
+ * The first chunk is special: compare against the number of valid
+ * bytes in this chunk.
+ */
+ srli a0, a2, 3
+ bgtu a4, a0, .Ldone
+ addi a3, a1, SZREG
+ li a4, -1
+ .align 2
+ /*
+ * Our critical loop is 4 instructions and processes data in 4 byte
+ * or 8 byte chunks.
+ */
+.Lloop:
+ REG_L a2, SZREG(a1)
+ addi a1, a1, SZREG
+ orc.b a2, a2
+ beq a2, a4, .Lloop
+
+.Lepilogue:
+ not a2, a2
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ ctz a2, a2
+#else
+ clz a2, a2
+#endif
+ sub a1, a1, a3
+ add a0, a0, a1
+ srli a2, a2, 3
+ add a0, a0, a2
+.Ldone:
+ ret
+--
+
+==== strcmp
+
+The same *orc.b* NUL-detection technique accelerates *strcmp*: aligned
+strings are compared a chunk at a time, with a byte-by-byte fallback for
+misaligned strings.
+
+[source,asm]
+--
+#include <sys/asm.h>
+
+ .text
+ .globl strcmp
+ .type strcmp, @function
+strcmp:
+ or a4, a0, a1
+ li t2, -1
+ and a4, a4, SZREG-1
+ bnez a4, .Lsimpleloop
+
+ # Main loop for aligned strings
+.Lloop:
+ REG_L a2, 0(a0)
+ REG_L a3, 0(a1)
+ orc.b t0, a2
+ bne t0, t2, .Lfoundnull
+ addi a0, a0, SZREG
+ addi a1, a1, SZREG
+ beq a2, a3, .Lloop
+
+ # Words don't match, and no null byte in first word.
+ # Get bytes in big-endian order and compare.
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 a2, a2
+ rev8 a3, a3
+#endif
+ # Synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence.
+ sltu a0, a2, a3
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+.Lfoundnull:
+ # Found a null byte.
+ # If words don't match, fall back to simple loop.
+ bne a2, a3, .Lsimpleloop
+
+ # Otherwise, strings are equal.
+ li a0, 0
+ ret
+
+ # Simple loop for misaligned strings
+.Lsimpleloop:
+ lbu a2, 0(a0)
+ lbu a3, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne a2, a3, 1f
+ bnez a2, .Lsimpleloop
+
+1:
+ sub a0, a2, a3
+ ret
+
+.size strcmp, .-strcmp
+-- \ No newline at end of file
diff --git a/src/c-st-ext.adoc b/src/c-st-ext.adoc
index cfd9538..4cc36cd 100644
--- a/src/c-st-ext.adoc
+++ b/src/c-st-ext.adoc
@@ -298,7 +298,7 @@ registers.
==== Stack-Pointer-Based Loads and Stores
include::images/wavedrom/c-sp-load-store.adoc[]
-[c-sp-load-store]
+[[c-sp-load-store]]
//.Stack-Pointer-Based Loads and Stores--these instructions use the CI format.
These instructions use the CI format.
@@ -306,8 +306,7 @@ These instructions use the CI format.
C.LWSP loads a 32-bit value from memory into register _rd_. It computes
an effective address by adding the _zero_-extended offset, scaled by 4,
to the stack pointer, `x2`. It expands to `lw rd, offset(x2)`. C.LWSP is
-only valid when _rd_&#x2260;x0 the code
-points with _rd_=x0 are reserved.
+only valid when _rd_&#x2260;x0 the code points with _rd_=x0 are reserved.
C.LDSP is an RV64C/RV128C-only instruction that loads a 64-bit value
from memory into register _rd_. It computes its effective address by
@@ -336,7 +335,7 @@ _zero_-extended offset, scaled by 8, to the stack pointer, `x2`. It
expands to `fld rd, offset(x2)`.
include::images/wavedrom/c-sp-load-store-css.adoc[]
-[c-sp-load-store-css]
+[[c-sp-load-store-css]]
//.Stack-Pointer-Based Loads and Stores--these instructions use the CSS format.
These instructions use the CSS format.
diff --git a/src/calling-convention.adoc b/src/calling-convention.adoc
new file mode 100644
index 0000000..f5cb079
--- /dev/null
+++ b/src/calling-convention.adoc
@@ -0,0 +1,29 @@
+[appendix]
+== Calling Convention for Vector State (Not authoritative - Placeholder Only)
+
+NOTE: This Appendix is only a placeholder to help explain the
+conventions used in the code examples, and is not considered frozen or
+part of the ratification process. The official RISC-V psABI document
+is being expanded to specify the vector calling conventions.
+
+In the RISC-V psABI, the vector registers `v0`-`v31` are all caller-saved.
+The `vl` and `vtype` CSRs are also caller-saved.
+
+Procedures may assume that `vstart` is zero upon entry. Procedures may
+assume that `vstart` is zero upon return from a procedure call.
+
+NOTE: Application software should normally not write `vstart` explicitly.
+Any procedure that does explicitly write `vstart` to a nonzero value must
+zero `vstart` before either returning or calling another procedure.
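+For example (non-normative), `csrwi vstart, 0` clears `vstart` in a single instruction.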
+
+The `vxrm` and `vxsat` fields of `vcsr` have thread storage duration.
+
+Executing a system call causes all caller-saved vector registers
+(`v0`-`v31`, `vl`, `vtype`) and `vstart` to become unspecified.
+
+NOTE: This scheme allows system calls that cause context switches to avoid
+saving and later restoring the vector registers.
+
+NOTE: Most OSes will choose to either leave these registers intact or reset
+them to their initial state to avoid leaking information across process
+boundaries.
diff --git a/src/cmo.adoc b/src/cmo.adoc
new file mode 100644
index 0000000..705166a
--- /dev/null
+++ b/src/cmo.adoc
@@ -0,0 +1,1130 @@
+[[cmo]]
+== Base Cache Management Operation ISA Extensions
+
+=== Pseudocode for instruction semantics
+
+The semantics of each instruction in the <<#insns>> chapter is expressed in a
+SAIL-like syntax.
+
+[#intro,reftext="Introduction"]
+=== Introduction
+
+_Cache-management operation_ (or _CMO_) instructions perform operations on
+copies of data in the memory hierarchy. In general, CMO instructions operate on
+cached copies of data, but in some cases, a CMO instruction may operate on
+memory locations directly. Furthermore, CMO instructions are grouped by
+operation into the following classes:
+
+* A _management_ instruction manipulates cached copies of data with respect to a
+ set of agents that can access the data
+* A _zero_ instruction zeros out a range of memory locations, potentially
+ allocating cached copies of data in one or more caches
+* A _prefetch_ instruction indicates to hardware that data at a given memory
+ location may be accessed in the near future, potentially allocating cached
+ copies of data in one or more caches
+
+This document introduces a base set of CMO ISA extensions that operate
+specifically on cache blocks or the memory locations corresponding to a cache
+block; these are known as _cache-block operation_ (or _CBO_) instructions. Each
+of the above classes of instructions represents an extension in this
+specification:
+
+* The _Zicbom_ extension defines a set of cache-block management instructions:
+ `CBO.INVAL`, `CBO.CLEAN`, and `CBO.FLUSH`
+* The _Zicboz_ extension defines a cache-block zero instruction: `CBO.ZERO`
+* The _Zicbop_ extension defines a set of cache-block prefetch instructions:
+ `PREFETCH.R`, `PREFETCH.W`, and `PREFETCH.I`
+
+The execution behavior of the above instructions is also modified by CSR state
+added by this specification.
+
+The remainder of this document provides general background information on CMO
+instructions and describes each of the above ISA extensions.
+
+[NOTE]
+====
+_The term CMO encompasses all operations on caches or resources related to
+caches. The term CBO represents a subset of CMOs that operate only on cache
+blocks. The first CMO extensions only define CBOs._
+====
+
+[#background,reftext="Background"]
+=== Background
+
+This chapter provides information common to all CMO extensions.
+
+[#memory-caches,reftext="Memory and Caches"]
+==== Memory and Caches
+
+A _memory location_ is a physical resource in a system uniquely identified by a
+_physical address_. An _agent_ is a logic block, such as a RISC-V hart,
+accelerator, I/O device, etc., that can access a given memory location.
+
+[NOTE]
+====
+_A given agent may not be able to access all memory locations in a system, and
+two different agents may or may not be able to access the same set of memory
+locations._
+====
+
+A _load operation_ (or _store operation_) is performed by an agent to consume
+(or modify) the data at a given memory location. Load and store operations are
+performed as a result of explicit memory accesses to that memory location.
+Additionally, a _read transfer_ from memory fetches the data at the memory
+location, while a _write transfer_ to memory updates the data at the memory
+location.
+
+A _cache_ is a structure that buffers copies of data to reduce average memory
+latency. Any number of caches may be interspersed between an agent and a memory
+location, and load and store operations from an agent may be satisfied by a
+cache instead of the memory location.
+
+[NOTE]
+====
+_Load and store operations are decoupled from read and write transfers by
+caches. For example, a load operation may be satisfied by a cache without
+performing a read transfer from memory, or a store operation may be satisfied by
+a cache that first performs a read transfer from memory._
+====
+
+Caches organize copies of data into _cache blocks_, each of which represents a
+contiguous, naturally aligned power-of-two (or _NAPOT_) range of memory
+locations. A cache block is identified by a physical address corresponding to
+the underlying memory locations. The capacity and organization of a cache and
+the size of a cache block are both _implementation-specific_, and the execution
+environment provides software a means to discover information about the caches
+and cache blocks in a system. In the initial set of CMO extensions, the size of
+a cache block shall be uniform throughout the system.
+
+[NOTE]
+====
+_In future CMO extensions, the requirement for a uniform cache block size may be
+relaxed._
+====
+
+Implementation techniques such as speculative execution or hardware prefetching
+may cause a given cache to allocate or deallocate a copy of a cache block at any
+time, provided the corresponding physical addresses are accessible according to
+the supported access type PMA and are cacheable according to the cacheability
+PMA. Allocating a copy of a cache block results in a read transfer from another
+cache or from memory, while deallocating a copy of a cache block may result in a
+write transfer to another cache or to memory depending on whether the data in
+the copy were modified by a store operation. Additional details are discussed in
+<<#coherent-agents-caches>>.
+
+==== Cache-Block Operations
+
+A CBO instruction causes one or more operations to be performed on the cache
+blocks identified by the instruction. In general, a CBO instruction may identify
+one or more cache blocks; however, in the initial set of CMO extensions, CBO
+instructions identify a single cache block only.
+
+A cache-block management instruction performs one of the following operations,
+relative to the copy of a given cache block allocated in a given cache:
+
+* An _invalidate operation_ deallocates the copy of the cache block
+
+* A _clean operation_ performs a write transfer to another cache or to memory if
+ the data in the copy of the cache block have been modified by a store
+ operation
+
+* A _flush operation_ atomically performs a clean operation followed by an
+ invalidate operation
+
+Additional details, including the actual operation performed by a given
+cache-block management instruction, are described in <<#Zicbom>>.
+
+A cache-block zero instruction performs a set of store operations that write
+zeros to the set of bytes corresponding to a cache block. Unless specified
+otherwise, the store operations generated by a cache-block zero instruction have
+the same general properties and behaviors that other store instructions in the
+architecture have. An implementation may or may not update the entire set of
+bytes atomically with a single store operation. Additional details are described
+in <<#Zicboz>>.
+
+A cache-block prefetch instruction is a HINT to the hardware that software
+expects to perform a particular type of memory access in the near future.
+Additional details are described in <<#Zicbop>>.
+
+[#coherent-agents-caches,reftext="Coherent Agents and Caches"]
+=== Coherent Agents and Caches
+
+For a given memory location, a _set of coherent agents_ consists of the agents
+for which all of the following hold:
+
+* Store operations from all agents in the set appear to be serialized with
+ respect to each other
+* Store operations from all agents in the set eventually appear to all other
+ agents in the set
+* A load operation from an agent in the set returns data from a store operation
+ from an agent in the set (or from the initial data in memory)
+
+The coherent agents within such a set shall access a given memory location with
+the same physical address and the same physical memory attributes; however, if
+the coherence PMA for a given agent indicates a given memory location is not
+coherent, that agent shall not be a member of a set of coherent agents with any
+other agent for that memory location and shall be the sole member of a set of
+coherent agents consisting of itself.
+
+An agent that is a member of a set of coherent agents is said to be _coherent_
+with respect to the other agents in the set. On the other hand, an agent that
+is _not_ a member is said to be _non-coherent_ with respect to the agents in
+the set.
+
+Caches introduce the possibility that multiple copies of a given cache block may
+be present in a system at the same time. An _implementation-specific_ mechanism
+keeps these copies coherent with respect to the load and store operations from
+the agents in the set of coherent agents. Additionally, if a coherent agent in
+the set executes a CBO instruction that specifies the cache block, the resulting
+operation shall apply to any and all of the copies in the caches that can be
+accessed by the load and store operations from the coherent agents.
+
+[NOTE]
+====
+_An operation from a CBO instruction is defined to operate only on the copies of
+a cache block that are cached in the caches accessible by the explicit memory
+accesses performed by the set of coherent agents. This includes copies of a
+cache block in caches that are accessed only indirectly by load and store
+operations, e.g. coherent instruction caches._
+====
+
+The set of caches subject to the above mechanism form a _set of coherent
+caches_, and each coherent cache has the following behaviors, assuming all
+operations are performed by the agents in a set of coherent agents:
+
+* A coherent cache is permitted to allocate and deallocate copies of a cache
+ block and perform read and write transfers as described in <<#memory-caches>>
+
+* A coherent cache is permitted to perform a write transfer to memory provided
+ that a store operation has modified the data in the cache block since the most
+ recent invalidate, clean, or flush operation on the cache block
+
+* At least one coherent cache is responsible for performing a write transfer to
+ memory once a store operation has modified the data in the cache block until
+ the next invalidate, clean, or flush operation on the cache block, after which
+ no coherent cache is responsible (or permitted) to perform a write transfer to
+ memory until the next store operation has modified the data in the cache block
+
+* A coherent cache is required to perform a write transfer to memory if a store
+ operation has modified the data in the cache block since the most recent
+ invalidate, clean, or flush operation on the cache block and if the next clean
+ or flush operation requires a write transfer to memory
+
+[NOTE]
+====
+_The above restrictions ensure that a "clean" copy of a cache block, fetched by
+a read transfer from memory and unmodified by a store operation, cannot later
+overwrite the copy of the cache block in memory updated by a write transfer to
+memory from a non-coherent agent._
+====
+
+A non-coherent agent may initiate a cache-block operation that operates on the
+set of coherent caches accessed by a set of coherent agents. The mechanism to
+perform such an operation is _implementation-specific_.
+
+==== Memory Ordering
+
+===== Preserved Program Order
+
+The preserved program order (abbreviated _PPO_) rules are defined by the RVWMO
+memory ordering model. How the operations resulting from CMO instructions fit
+into these rules is described below.
+
+For cache-block management instructions, the resulting invalidate, clean, and
+flush operations behave as stores in the PPO rules subject to one additional
+overlapping address rule. Specifically, if _a_ precedes _b_ in program order,
+then _a_ will precede _b_ in the global memory order if:
+
+* _a_ is an invalidate, clean, or flush, _b_ is a load, and _a_ and _b_ access
+ overlapping memory addresses
+
+[NOTE]
+====
+_The above rule ensures that a subsequent load in program order never appears
+in the global memory order before a preceding invalidate, clean, or flush
+operation to an overlapping address._
+====
+
+Additionally, invalidate, clean, and flush operations are classified as W or O
+(depending on the physical memory attributes for the corresponding physical
+addresses) for the purposes of predecessor and successor sets in `FENCE`
+instructions. These operations are _not_ ordered by other instructions that
+order stores, e.g. `FENCE.I` and `SFENCE.VMA`.
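+
+For example (a non-normative sketch; register assignments are hypothetical,
+and assembler operand syntax for CBO instructions may vary), a `FENCE` can
+order a clean operation before an I/O store that signals a device:
+
+[source,asm]
+--
+    cbo.clean (a0)    # clean the block holding freshly written data
+    fence w,o         # the clean operation (classified as W) is in the
+                      # predecessor set; the I/O store is in the successor set
+    sw t0, 0(a1)      # doorbell store; the device may now read the data
+--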
+
+For cache-block zero instructions, the resulting store operations behave as
+stores in the PPO rules and are ordered by other instructions that order stores.
+
+Finally, for cache-block prefetch instructions, the resulting operations are
+_not_ ordered by the PPO rules nor are they ordered by any other ordering
+instructions.
+
+===== Load Values
+
+An invalidate operation may change the set of values that can be returned by a
+load. In particular, an additional condition is added to the Load Value Axiom:
+
+* If an invalidate operation _i_ precedes a load _r_ and operates on a byte _x_
+ returned by _r_, and no store to _x_ appears between _i_ and _r_ in program
+ order or in the global memory order, then _r_ returns any of the following
+ values for _x_:
+
+. If no clean or flush operations on _x_ precede _i_ in the global memory order,
+ either the initial value of _x_ or the value of any store to _x_ that precedes
+ _i_
+
+. If no store to _x_ precedes a clean or flush operation on _x_ in the global
+ memory order and if the clean or flush operation on _x_ precedes _i_ in the
+ global memory order, either the initial value of _x_ or the value of any store
+ to _x_ that precedes _i_
+
+. If a store to _x_ precedes a clean or flush operation on _x_ in the global
+ memory order and if the clean or flush operation on _x_ precedes _i_ in the
+ global memory order, either the value of the latest store to _x_ that precedes
+ the latest clean or flush operation on _x_ or the value of any store to _x_
+ that both precedes _i_ and succeeds the latest clean or flush operation on _x_
+ that precedes _i_
+
+. The value of any store to _x_ by a non-coherent agent regardless of the above
+ conditions
+
+[NOTE]
+====
+_The first three cases describe the possible load values at different points
+in the global memory order relative to clean or flush operations. The final
+bullet implies that the load value may be produced by a non-coherent agent at
+any time._
+====
+
+==== Traps
+
+Execution of certain CMO instructions may result in traps due to CSR state,
+described in the <<#csr_state>> section, or due to the address translation and
+protection mechanisms. The trapping behavior of CMO instructions is described in
+the following sections.
+
+===== Illegal Instruction and Virtual Instruction Exceptions
+
+Cache-block management instructions and cache-block zero instructions may raise
+illegal instruction exceptions or virtual instruction exceptions depending on
+the current privilege mode and the state of the CMO control registers described
+in the <<#csr_state>> section.
+
+Cache-block prefetch instructions raise neither illegal instruction exceptions
+nor virtual instruction exceptions.
+
+===== Page Fault, Guest-Page Fault, and Access Fault Exceptions
+
+Similar to load and store instructions, CMO instructions are explicit memory
+access instructions that compute an effective address. The effective address is
+ultimately translated into a physical address based on the privilege mode and
+the enabled translation mechanisms, and the CMO extensions impose the following
+constraints on the physical addresses in a given cache block:
+
+* The PMP access control bits shall be the same for _all_ physical addresses in
+ the cache block, and if write permission is granted by the PMP access control
+ bits, read permission shall also be granted
+
+* The PMAs shall be the same for _all_ physical addresses in the cache block,
+ and if write permission is granted by the supported access type PMAs, read
+ permission shall also be granted
+
+If the above constraints are not met, the behavior of a CBO instruction is
+UNSPECIFIED.
+
+[NOTE]
+====
+_This specification assumes that the above constraints will typically be met for
+main memory regions and may be met for certain I/O regions._
+====
+
+The Zicboz extension introduces an additional supported access type PMA for
+cache-block zero instructions. Main memory regions are required to support
+accesses by cache-block zero instructions; however, I/O regions may specify
+whether accesses by cache-block zero instructions are supported.
+
+A cache-block management instruction is permitted to access the specified cache
+block whenever a load instruction or store instruction is permitted to access
+the corresponding physical addresses. If neither a load instruction nor store
+instruction is permitted to access the physical addresses, but an instruction
+fetch is permitted to access the physical addresses, whether a cache-block
+management instruction is permitted to access the cache block is UNSPECIFIED. If
+access to the cache block is not permitted, a cache-block management instruction
+raises a store page fault or store guest-page fault exception if address
+translation does not permit any access or raises a store access fault exception
+otherwise. During address translation, the instruction also checks the accessed
+bit and may either raise an exception or set the bit as required.
+
+[NOTE]
+====
+_The interaction between cache-block management instructions and instruction
+fetches will be specified in a future extension._
+
+_As implied by omission, a cache-block management instruction does not check the
+dirty bit and neither raises an exception nor sets the bit._
+====
+
+A cache-block zero instruction is permitted to access the specified cache block
+whenever a store instruction is permitted to access the corresponding physical
+addresses and when the PMAs indicate that cache-block zero instructions are a
+supported access type. If access to the cache block is not permitted, a
+cache-block zero instruction raises a store page fault or store guest-page fault
+exception if address translation does not permit write access or raises a store
+access fault exception otherwise. During address translation, the instruction
+also checks the accessed and dirty bits and may either raise an exception or set
+the bits as required.
+
+A cache-block prefetch instruction is permitted to access the specified cache
+block whenever a load instruction, store instruction, or instruction fetch is
+permitted to access the corresponding physical addresses. If access to the cache
+block is not permitted, a cache-block prefetch instruction does not raise any
+exceptions and shall not access any caches or memory. During address
+translation, the instruction does _not_ check the accessed and dirty bits and
+neither raises an exception nor sets the bits.
+
+[NOTE]
+====
+_Like a load or store instruction, a CMO instruction may or may not be permitted
+to access a cache block based on the states of the `MPRV`, `MPV`, and `MPP` bits
+in `mstatus` and the `SUM` and `MXR` bits in `mstatus`, `sstatus`, and
+`vsstatus`._
+
+_This specification expects that implementations will process cache-block
+management instructions like store/AMO instructions, so store/AMO exceptions are
+appropriate for these instructions, regardless of the permissions required._
+====
+
+===== Address Misaligned Exceptions
+
+CMO instructions do _not_ generate address misaligned exceptions.
+
+===== Breakpoint Exceptions and Debug Mode Entry
+
+Unless otherwise defined by the debug architecture specification, the behavior
+of trigger modules with respect to CMO instructions is UNSPECIFIED.
+
+[NOTE]
+====
+_For the Zicbom, Zicboz, and Zicbop extensions, this specification recommends
+the following common trigger module behaviors:_
+
+* Type 6 address match triggers, i.e. `tdata1.type=6` and `mcontrol6.select=0`,
+ should be supported
+
+* Type 2 address/data match triggers, i.e. `tdata1.type=2`, should be
+ unsupported
+
+* The size of a memory access equals the size of the cache block accessed, and
+ the compare values follow from the addresses of the NAPOT memory region
+ corresponding to the cache block containing the effective address
+
+* Unless an encoding for a cache block is added to the `mcontrol6.size` field,
+ an address trigger should only match a memory access from a CBO instruction if
+ `mcontrol6.size=0`
+
+_If the Zicbom extension is implemented, this specification recommends the
+following additional trigger module behaviors:_
+
+* Implementing address match triggers should be optional
+
+* Type 6 data match triggers, i.e. `tdata1.type=6` and `mcontrol6.select=1`,
+ should be unsupported
+
+* Memory accesses are considered to be stores, i.e. an address trigger matches
+ only if `mcontrol6.store=1`
+
+_If the Zicboz extension is implemented, this specification recommends the
+following additional trigger module behaviors:_
+
+* Implementing address match triggers should be mandatory
+
+* Type 6 data match triggers, i.e. `tdata1.type=6` and `mcontrol6.select=1`,
+ should be supported, and implementing these triggers should be optional
+
+* Memory accesses are considered to be stores, i.e. an address trigger matches
+ only if `mcontrol6.store=1`
+
+_If the Zicbop extension is implemented, this specification recommends the
+following additional trigger module behaviors:_
+
+* Implementing address match triggers should be optional
+
+* Type 6 data match triggers, i.e. `tdata1.type=6` and `mcontrol6.select=1`,
+ should be unsupported
+
+* Memory accesses may be considered to be loads or stores depending on the
+ implementation, i.e. whether an address trigger matches on these instructions
+ when `mcontrol6.load=1` or `mcontrol6.store=1` is _implementation-specific_
+
+_This specification also recommends that the behavior of trigger modules with
+respect to the Zicboz extension should be defined in version 1.0 of the debug
+architecture specification. The behavior of trigger modules with respect to the
+Zicbom and Zicbop extensions is expected to be defined in future extensions._
+====
+
+===== Hypervisor Extension
+
+For the purposes of writing the `mtinst` or `htinst` register on a trap, the
+following standard transformation is defined for cache-block management
+instructions and cache-block zero instructions:
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 'opcode'},
+ { bits: 5, name: 0x0 },
+ { bits: 3, name: 'funct3'},
+ { bits: 5, name: 0x0},
+ { bits: 12, name: 'operation'},
+]}
+....
+
+The `operation` field corresponds to the 12 most significant bits of the
+trapping instruction.
+
+[NOTE]
+====
+_As described in the hypervisor extension, a zero may be written into `mtinst`
+or `htinst` instead of the standard transformation defined above._
+====
+
+==== Effects on Constrained LR/SC Loops
+
+The following event is added to the list of events that satisfy the eventuality
+guarantee provided by constrained LR/SC loops, as defined in the A extension:
+
+* Some other hart executes a cache-block management instruction or a cache-block
+ zero instruction to the reservation set of the LR instruction in _H_'s
+ constrained LR/SC loop.
+
+[NOTE]
+====
+_The above event has been added to accommodate cache coherence protocols that
+cannot distinguish between invalidations for stores and invalidations for
+cache-block management operations._
+
+_Aside from the above event, CMO instructions neither change the properties of
+constrained LR/SC loops nor modify the eventuality guarantee provided by them.
+For example, executing a CMO instruction may cause a constrained LR/SC loop on
+any hart to fail periodically or may cause an unconstrained LR/SC sequence on the
+same hart to fail always. Additionally, executing a cache-block prefetch
+instruction does not impact the eventuality guarantee provided by constrained
+LR/SC loops executed on any hart._
+====
+
+==== Software Discovery
+
+The initial set of CMO extensions requires the following information to be
+discovered by software:
+
+* The size of the cache block for management and prefetch instructions
+* The size of the cache block for zero instructions
+* CBIE support at each privilege level
+
+Other general cache characteristics may also be specified in the discovery
+mechanism.
+
+[#csr_state,reftext="Control and Status Register State"]
+=== Control and Status Register State
+
+[NOTE]
+====
+_The CMO extensions rely on state in {csrname} CSRs that will be defined in a
+future update to the privileged architecture. If this CSR update is not
+ratified, the CMO extension will define its own CSRs._
+====
+
+Three CSRs control the execution of CMO instructions:
+
+* `m{csrname}`
+* `s{csrname}`
+* `h{csrname}`
+
+The `s{csrname}` register is used by all supervisor modes, including VS-mode. A
+hypervisor is responsible for saving and restoring `s{csrname}` on guest context
+switches. The `h{csrname}` register is only present if the H-extension is
+implemented and enabled.
+
+Each `x{csrname}` register (where `x` is `m`, `s`, or `h`) has the following
+generic format:
+
+.Generic Format for x{csrname} CSRs
+[cols="^10,^10,80a"]
+|===
+| Bits | Name | Description
+
+| [5:4] | `CBIE` | Cache Block Invalidate instruction Enable
+
+Enables the execution of the cache block invalidate instruction, `CBO.INVAL`, in
+a lower privilege mode:
+
+* `00`: The instruction raises an illegal instruction or virtual instruction
+ exception
+* `01`: The instruction is executed and performs a flush operation
+* `10`: _Reserved_
+* `11`: The instruction is executed and performs an invalidate operation
+
+| [6] | `CBCFE` | Cache Block Clean and Flush instruction Enable
+
+Enables the execution of the cache block clean instruction, `CBO.CLEAN`, and the
+cache block flush instruction, `CBO.FLUSH`, in a lower privilege mode:
+
+* `0`: The instruction raises an illegal instruction or virtual instruction
+ exception
+* `1`: The instruction is executed
+
+| [7] | `CBZE` | Cache Block Zero instruction Enable
+
+Enables the execution of the cache block zero instruction, `CBO.ZERO`, in a
+lower privilege mode:
+
+* `0`: The instruction raises an illegal instruction or virtual instruction
+ exception
+* `1`: The instruction is executed
+
+|===
+
+The `x{csrname}` registers control CBO instruction execution based on the current
+privilege mode and the state of the appropriate CSRs, as detailed below.
+
+A `CBO.INVAL` instruction executes or raises either an illegal instruction
+exception or a virtual instruction exception based on the state of the
+`x{csrname}.CBIE` fields:
+
+[source,sail,subs="attributes+"]
+--
+
+// illegal instruction exceptions
+if (((priv_mode != M) && (m{csrname}.CBIE == 00)) ||
+ ((priv_mode == U) && (s{csrname}.CBIE == 00)))
+{
+ <raise illegal instruction exception>
+}
+// virtual instruction exceptions
+else if (((priv_mode == VS) && (h{csrname}.CBIE == 00)) ||
+ ((priv_mode == VU) && ((h{csrname}.CBIE == 00) || (s{csrname}.CBIE == 00))))
+{
+ <raise virtual instruction exception>
+}
+// execute instruction
+else
+{
+ if (((priv_mode != M) && (m{csrname}.CBIE == 01)) ||
+ ((priv_mode == U) && (s{csrname}.CBIE == 01)) ||
+ ((priv_mode == VS) && (h{csrname}.CBIE == 01)) ||
+ ((priv_mode == VU) && ((h{csrname}.CBIE == 01) || (s{csrname}.CBIE == 01))))
+ {
+ <execute CBO.INVAL and perform flush operation>
+ }
+ else
+ {
+ <execute CBO.INVAL and perform invalidate operation>
+ }
+}
+
+--
+
+[NOTE]
+====
+_Until a modified cache block has updated memory, a `CBO.INVAL` instruction may
+expose stale data values in memory if the CSRs are programmed to perform an
+invalidate operation. This behavior may result in a security hole if lower
+privileged level software performs an invalidate operation and accesses
+sensitive information in memory._
+
+_To avoid such holes, higher privileged level software must perform either a
+clean or flush operation on the cache block before permitting lower privileged
+level software to perform an invalidate operation on the block. Alternatively,
+higher privileged level software may program the CSRs so that `CBO.INVAL`
+either traps or performs a flush operation in a lower privileged level._
+====
+
+A `CBO.CLEAN` or `CBO.FLUSH` instruction executes or raises an illegal
+instruction or virtual instruction exception based on the state of the
+`x{csrname}.CBCFE` bits:
+
+[source,sail,subs="attributes+"]
+--
+
+// illegal instruction exceptions
+if (((priv_mode != M) && !m{csrname}.CBCFE) ||
+ ((priv_mode == U) && !s{csrname}.CBCFE))
+{
+ <raise illegal instruction exception>
+}
+// virtual instruction exceptions
+else if (((priv_mode == VS) && !h{csrname}.CBCFE) ||
+ ((priv_mode == VU) && !(h{csrname}.CBCFE && s{csrname}.CBCFE)))
+{
+ <raise virtual instruction exception>
+}
+// execute instruction
+else
+{
+ <execute CBO.CLEAN or CBO.FLUSH>
+}
+
+--
+
+Finally, a `CBO.ZERO` instruction executes or raises an illegal instruction or
+virtual instruction exception based on the state of the `x{csrname}.CBZE` bits:
+
+[source,sail,subs="attributes+"]
+--
+
+// illegal instruction exceptions
+if (((priv_mode != M) && !m{csrname}.CBZE) ||
+ ((priv_mode == U) && !s{csrname}.CBZE))
+{
+ <raise illegal instruction exception>
+}
+// virtual instruction exceptions
+else if (((priv_mode == VS) && !h{csrname}.CBZE) ||
+ ((priv_mode == VU) && !(h{csrname}.CBZE && s{csrname}.CBZE)))
+{
+ <raise virtual instruction exception>
+}
+// execute instruction
+else
+{
+ <execute CBO.ZERO>
+}
+
+--
+
+Each `x{csrname}` register is WARL; however, software should determine the legal
+values from the execution environment discovery mechanism.
+
+[#extensions,reftext="Extensions"]
+=== Extensions
+
+CMO instructions are defined in the following extensions:
+
+* <<#Zicbom>>
+* <<#Zicboz>>
+* <<#Zicbop>>
+
+[#Zicbom,reftext="Cache-Block Management Instructions"]
+==== Cache-Block Management Instructions
+
+Cache-block management instructions enable software running on a set of coherent
+agents to communicate with a set of non-coherent agents by performing one of the
+following operations:
+
+* An invalidate operation makes data from store operations performed by a set of
+ non-coherent agents visible to the set of coherent agents at a point common to
+ both sets by deallocating all copies of a cache block from the set of coherent
+ caches up to that point
+
+* A clean operation makes data from store operations performed by the set of
+ coherent agents visible to a set of non-coherent agents at a point common to
+ both sets by performing a write transfer of a copy of a cache block to that
+ point provided a coherent agent performed a store operation that modified the
+ data in the cache block since the previous invalidate, clean, or flush
+ operation on the cache block
+
+* A flush operation atomically performs a clean operation followed by an
+ invalidate operation
+
+In the Zicbom extension, the instructions operate to a point common to _all_
+agents in the system. In other words, an invalidate operation ensures that
+store operations from all non-coherent agents are visible to agents in the set
+of coherent agents, and a clean operation ensures that store operations from
+coherent agents are visible to all non-coherent agents.
+
+[NOTE]
+====
+_The Zicbom extension does not prohibit agents that fall outside of the above
+architectural definition; however, software cannot rely on the defined cache
+operations to have the desired effects with respect to those agents._
+
+_Future extensions may define different sets of agents for the purposes of
+performance optimization._
+====
+
+These instructions operate on the cache block whose effective address is
+specified in _rs1_. The effective address is translated into a corresponding
+physical address by the appropriate translation mechanisms.
+
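+[NOTE]
+====
+_The following is a non-normative sketch of using these operations to share a
+buffer with a non-coherent device. It assumes the cache-block size has already
+been discovered and is held in `t2`, and that `a0` and `a1` delimit a
+block-aligned buffer; the register assignments and the conservative full FENCE
+are illustrative only._
+
+----
+    # a0 = buffer base, a1 = buffer limit, t2 = cache-block size
+clean_loop:
+    cbo.clean (a0)          # Write back any modified data in this block
+    add  a0, a0, t2         # Advance to the next cache block
+    bltu a0, a1, clean_loop
+    fence                   # Conservatively order the writebacks before
+                            # notifying the device
+    # ... program the non-coherent device to read the buffer ...
+----
+====
+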
+The following instructions comprise the Zicbom extension:
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|cbo.clean _base_
+|<<#insns-cbo_clean>>
+
+|&#10003;
+|&#10003;
+|cbo.flush _base_
+|<<#insns-cbo_flush>>
+
+|&#10003;
+|&#10003;
+|cbo.inval _base_
+|<<#insns-cbo_inval>>
+
+|===
+
+[#Zicboz,reftext="Cache-Block Zero Instructions"]
+==== Cache-Block Zero Instructions
+
+Cache-block zero instructions store zeros to the set of bytes corresponding to a
+cache block. An implementation may update the bytes in any order and with any
+granularity and atomicity, including individual bytes.
+
+[NOTE]
+====
+_Cache-block zero instructions store zeros independently of whether data from
+the underlying memory locations are cacheable. In addition, this specification
+does not constrain how the bytes are written._
+====
+
+These instructions operate on the cache block, or the memory locations
+corresponding to the cache block, whose effective address is specified in _rs1_.
+The effective address is translated into a corresponding physical address by the
+appropriate translation mechanisms.
+
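+[NOTE]
+====
+_The following is a non-normative sketch of zeroing a block-aligned region with
+`cbo.zero`. It assumes the Zicboz block size has been discovered and is held in
+`t2`; the register assignments are illustrative only._
+
+----
+    # a0 = region base (block-aligned), a1 = region limit, t2 = block size
+zero_loop:
+    cbo.zero (a0)           # Store zeros to the full cache block
+    add  a0, a0, t2         # Advance to the next block
+    bltu a0, a1, zero_loop
+----
+====
+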
+The following instructions comprise the Zicboz extension:
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|cbo.zero _base_
+|<<#insns-cbo_zero>>
+
+|===
+
+[#Zicbop,reftext="Cache-Block Prefetch Instructions"]
+==== Cache-Block Prefetch Instructions
+
+Cache-block prefetch instructions are HINTs to the hardware to indicate that
+software intends to perform a particular type of memory access in the near
+future. The types of memory accesses are instruction fetch, data read (i.e.
+load), and data write (i.e. store).
+
+These instructions operate on the cache block whose effective address is the sum
+of the base address specified in _rs1_ and the sign-extended offset encoded in
+_imm[11:0]_, where _imm[4:0]_ shall equal `0b00000`. The effective address is
+translated into a corresponding physical address by the appropriate translation
+mechanisms.
+
+[NOTE]
+====
+_Cache-block prefetch instructions are encoded as ORI instructions with rd equal
+to `0b00000`; however, for the purposes of effective address calculation, this
+field is also interpreted as imm[4:0] like a store instruction._
+====
+
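+[NOTE]
+====
+_The following is a non-normative sketch of software prefetching in a streaming
+read loop on RV64. The 256-byte prefetch distance and the register assignments
+are illustrative; the offset is a multiple of 32 so that imm[4:0] equals
+`0b00000` as required._
+
+----
+    # a0 = source pointer, a1 = source limit
+stream_loop:
+    prefetch.r 256(a0)      # Hint that a block ahead will be read soon
+    ld   t0, 0(a0)          # Consume the current element
+    # ... process t0 ...
+    addi a0, a0, 8
+    bltu a0, a1, stream_loop
+----
+====
+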
+The following instructions comprise the Zicbop extension:
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|&#10003;
+|&#10003;
+|prefetch.i _offset_(_base_)
+|<<#insns-prefetch_i>>
+
+|&#10003;
+|&#10003;
+|prefetch.r _offset_(_base_)
+|<<#insns-prefetch_r>>
+
+|&#10003;
+|&#10003;
+|prefetch.w _offset_(_base_)
+|<<#insns-prefetch_w>>
+
+|===
+
+[#insns,reftext="Instructions"]
+=== Instructions
+
+[#insns-cbo_clean,reftext="Cache Block Clean"]
+==== cbo.clean
+
+Synopsis::
+Perform a clean operation on a cache block
+
+Mnemonic::
+cbo.clean _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0xF, attr: ['MISC-MEM'] },
+ { bits: 5, name: 0x0 },
+ { bits: 3, name: 0x2, attr: ['CBO'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 12, name: 0x001, attr: ['CBO.CLEAN'] },
+]}
+....
+
+Description::
+
+A *cbo.clean* instruction performs a clean operation on the cache block whose
+effective address is the base address specified in _rs1_. The offset operand may
+be omitted; otherwise, any expression that computes the offset shall evaluate to
+zero. The instruction operates on the set of coherent caches accessed by the
+agent executing the instruction.
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-cbo_flush,reftext="Cache Block Flush"]
+==== cbo.flush
+
+Synopsis::
+Perform a flush operation on a cache block
+
+Mnemonic::
+cbo.flush _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0xF, attr: ['MISC-MEM'] },
+ { bits: 5, name: 0x0 },
+ { bits: 3, name: 0x2, attr: ['CBO'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 12, name: 0x002, attr: ['CBO.FLUSH'] },
+]}
+....
+
+Description::
+
+A *cbo.flush* instruction performs a flush operation on the cache block whose
+effective address is the base address specified in _rs1_. The offset operand may
+be omitted; otherwise, any expression that computes the offset shall evaluate to
+zero. The instruction operates on the set of coherent caches accessed by the
+agent executing the instruction.
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-cbo_inval,reftext="Cache Block Invalidate"]
+==== cbo.inval
+
+Synopsis::
+Perform an invalidate operation on a cache block
+
+Mnemonic::
+cbo.inval _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0xF, attr: ['MISC-MEM'] },
+ { bits: 5, name: 0x0 },
+ { bits: 3, name: 0x2, attr: ['CBO'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 12, name: 0x000, attr: ['CBO.INVAL'] },
+]}
+....
+
+Description::
+
+A *cbo.inval* instruction performs an invalidate operation on the cache block
+whose effective address is the base address specified in _rs1_. The offset
+operand may be omitted; otherwise, any expression that computes the offset shall
+evaluate to zero. The instruction operates on the set of coherent caches
+accessed by the agent executing the instruction. Depending on CSR programming,
+the instruction may perform a flush operation instead of an invalidate
+operation.
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-cbo_zero,reftext="Cache Block Zero"]
+==== cbo.zero
+
+Synopsis::
+Store zeros to the full set of bytes corresponding to a cache block
+
+Mnemonic::
+cbo.zero _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0xF, attr: ['MISC-MEM'] },
+ { bits: 5, name: 0x0 },
+ { bits: 3, name: 0x2, attr: ['CBO'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 12, name: 0x004, attr: ['CBO.ZERO'] },
+]}
+....
+
+Description::
+
+A *cbo.zero* instruction performs stores of zeros to the full set of bytes
+corresponding to the cache block whose effective address is the base address
+specified in _rs1_. The offset operand may be omitted; otherwise, any expression
+that computes the offset shall evaluate to zero. An implementation may or may
+not update the entire set of bytes atomically.
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-prefetch_i,reftext="Cache Block Prefetch for Instruction Fetch"]
+==== prefetch.i
+
+Synopsis::
+Provide a HINT to hardware that a cache block is likely to be accessed by an
+instruction fetch in the near future
+
+Mnemonic::
+prefetch.i _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 0x0, attr: ['offset[4:0]'] },
+ { bits: 3, name: 0x6, attr: ['ORI'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 5, name: 0x0, attr: ['PREFETCH.I'] },
+ { bits: 7, name: 'imm[11:5]', attr: ['offset[11:5]'] },
+]}
+....
+
+Description::
+
+A *prefetch.i* instruction indicates to hardware that the cache block whose
+effective address is the sum of the base address specified in _rs1_ and the
+sign-extended offset encoded in _imm[11:0]_, where _imm[4:0]_ equals `0b00000`,
+is likely to be accessed by an instruction fetch in the near future.
+
+[NOTE]
+====
+_An implementation may opt to cache a copy of the cache block in a cache
+accessed by an instruction fetch in order to improve memory access latency, but
+this behavior is not required._
+====
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-prefetch_r,reftext="Cache Block Prefetch for Data Read"]
+==== prefetch.r
+
+Synopsis::
+Provide a HINT to hardware that a cache block is likely to be accessed by a data
+read in the near future
+
+Mnemonic::
+prefetch.r _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 0x0, attr: ['offset[4:0]'] },
+ { bits: 3, name: 0x6, attr: ['ORI'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 5, name: 0x1, attr: ['PREFETCH.R'] },
+ { bits: 7, name: 'imm[11:5]', attr: ['offset[11:5]'] },
+]}
+....
+
+Description::
+
+A *prefetch.r* instruction indicates to hardware that the cache block whose
+effective address is the sum of the base address specified in _rs1_ and the
+sign-extended offset encoded in _imm[11:0]_, where _imm[4:0]_ equals `0b00000`,
+is likely to be accessed by a data read (i.e. load) in the near future.
+
+[NOTE]
+====
+_An implementation may opt to cache a copy of the cache block in a cache
+accessed by a data read in order to improve memory access latency, but this
+behavior is not required._
+====
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
+[#insns-prefetch_w,reftext="Cache Block Prefetch for Data Write"]
+==== prefetch.w
+
+Synopsis::
+Provide a HINT to hardware that a cache block is likely to be accessed by a data
+write in the near future
+
+Mnemonic::
+prefetch.w _offset_(_base_)
+
+Encoding::
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 7, name: 0x13, attr: ['OP-IMM'] },
+ { bits: 5, name: 0x0, attr: ['offset[4:0]'] },
+ { bits: 3, name: 0x6, attr: ['ORI'] },
+ { bits: 5, name: 'rs1', attr: ['base'] },
+ { bits: 5, name: 0x3, attr: ['PREFETCH.W'] },
+ { bits: 7, name: 'imm[11:5]', attr: ['offset[11:5]'] },
+]}
+....
+
+Description::
+
+A *prefetch.w* instruction indicates to hardware that the cache block whose
+effective address is the sum of the base address specified in _rs1_ and the
+sign-extended offset encoded in _imm[11:0]_, where _imm[4:0]_ equals `0b00000`,
+is likely to be accessed by a data write (i.e. store) in the near future.
+
+[NOTE]
+====
+_An implementation may opt to cache a copy of the cache block in a cache
+accessed by a data write in order to improve memory access latency, but this
+behavior is not required._
+====
+
+Operation::
+[source,sail]
+--
+TODO
+--
+
diff --git a/src/example/memcpy.s b/src/example/memcpy.s
new file mode 100644
index 0000000..5f6318a
--- /dev/null
+++ b/src/example/memcpy.s
@@ -0,0 +1,17 @@
+ .text
+ .balign 4
+ .global memcpy
+ # void *memcpy(void* dest, const void* src, size_t n)
+ # a0=dest, a1=src, a2=n
+ #
+ memcpy:
+ mv a3, a0 # Copy destination
+ loop:
+ vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
+ vle8.v v0, (a1) # Load bytes
+ add a1, a1, t0 # Bump pointer
+ sub a2, a2, t0 # Decrement count
+ vse8.v v0, (a3) # Store bytes
+ add a3, a3, t0 # Bump pointer
+ bnez a2, loop # Any more?
+ ret # Return
diff --git a/src/example/saxpy.s b/src/example/saxpy.s
new file mode 100644
index 0000000..de7f224
--- /dev/null
+++ b/src/example/saxpy.s
@@ -0,0 +1,29 @@
+ .text
+ .balign 4
+ .global saxpy
+# void
+# saxpy(size_t n, const float a, const float *x, float *y)
+# {
+# size_t i;
+# for (i=0; i<n; i++)
+# y[i] = a * x[i] + y[i];
+# }
+#
+# register arguments:
+# a0 n
+# fa0 a
+# a1 x
+# a2 y
+
+saxpy:
+ vsetvli a4, a0, e32, m8, ta, ma
+ vle32.v v0, (a1)
+ sub a0, a0, a4
+ slli a4, a4, 2
+ add a1, a1, a4
+ vle32.v v8, (a2)
+ vfmacc.vf v8, fa0, v0
+ vse32.v v8, (a2)
+ add a2, a2, a4
+ bnez a0, saxpy
+ ret
diff --git a/src/example/sgemm.S b/src/example/sgemm.S
new file mode 100644
index 0000000..e29cc8d
--- /dev/null
+++ b/src/example/sgemm.S
@@ -0,0 +1,221 @@
+ .text
+ .balign 4
+ .global sgemm_nn
+# RV64IDV system
+#
+# void
+# sgemm_nn(size_t n,
+# size_t m,
+# size_t k,
+# const float*a, // m * k matrix
+# size_t lda,
+# const float*b, // k * n matrix
+# size_t ldb,
+# float*c, // m * n matrix
+# size_t ldc)
+#
+# c += a*b (alpha=1, no transpose on input matrices)
+# matrices stored in C row-major order
+
+#define n a0
+#define m a1
+#define k a2
+#define ap a3
+#define astride a4
+#define bp a5
+#define bstride a6
+#define cp a7
+#define cstride t0
+#define kt t1
+#define nt t2
+#define bnp t3
+#define cnp t4
+#define akp t5
+#define bkp s0
+#define nvl s1
+#define ccp s2
+#define amp s3
+
+# Use args as additional temporaries
+#define ft12 fa0
+#define ft13 fa1
+#define ft14 fa2
+#define ft15 fa3
+
+# This version holds a 16*VLMAX block of the C matrix in vector registers
+# in the inner loop, but otherwise performs no cache or TLB tiling.
+
+sgemm_nn:
+ addi sp, sp, -FRAMESIZE
+ sd s0, OFFSET(sp)
+ sd s1, OFFSET(sp)
+ sd s2, OFFSET(sp)
+
+ # Check for zero size matrices
+ beqz n, exit
+ beqz m, exit
+ beqz k, exit
+
+ # Convert elements strides to byte strides.
+ ld cstride, OFFSET(sp) # Get arg from stack frame
+ slli astride, astride, 2
+ slli bstride, bstride, 2
+ slli cstride, cstride, 2
+
+ slti t6, m, 16
+ bnez t6, end_rows
+
+c_row_loop: # Loop across rows of C blocks
+
+ mv nt, n # Initialize n counter for next row of C blocks
+
+ mv bnp, bp # Initialize B n-loop pointer to start
+ mv cnp, cp # Initialize C n-loop pointer
+
+c_col_loop: # Loop across one row of C blocks
+ vsetvli nvl, nt, e32, ta, ma # 32-bit vectors, LMUL=1
+
+ mv akp, ap # reset pointer into A to beginning
+ mv bkp, bnp # step to next column in B matrix
+
+ # Initialize current C submatrix block from memory.
+ vle32.v v0, (cnp); add ccp, cnp, cstride;
+ vle32.v v1, (ccp); add ccp, ccp, cstride;
+ vle32.v v2, (ccp); add ccp, ccp, cstride;
+ vle32.v v3, (ccp); add ccp, ccp, cstride;
+ vle32.v v4, (ccp); add ccp, ccp, cstride;
+ vle32.v v5, (ccp); add ccp, ccp, cstride;
+ vle32.v v6, (ccp); add ccp, ccp, cstride;
+ vle32.v v7, (ccp); add ccp, ccp, cstride;
+ vle32.v v8, (ccp); add ccp, ccp, cstride;
+ vle32.v v9, (ccp); add ccp, ccp, cstride;
+ vle32.v v10, (ccp); add ccp, ccp, cstride;
+ vle32.v v11, (ccp); add ccp, ccp, cstride;
+ vle32.v v12, (ccp); add ccp, ccp, cstride;
+ vle32.v v13, (ccp); add ccp, ccp, cstride;
+ vle32.v v14, (ccp); add ccp, ccp, cstride;
+ vle32.v v15, (ccp)
+
+
+ mv kt, k # Initialize inner loop counter
+
+ # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
+ # Software pipeline loads
+ flw ft0, (akp); add amp, akp, astride;
+ flw ft1, (amp); add amp, amp, astride;
+ flw ft2, (amp); add amp, amp, astride;
+ flw ft3, (amp); add amp, amp, astride;
+ # Get vector from B matrix
+ vle32.v v16, (bkp)
+
+ # Loop on inner dimension for current C block
+ k_loop:
+ vfmacc.vf v0, ft0, v16
+ add bkp, bkp, bstride
+ flw ft4, (amp)
+ add amp, amp, astride
+ vfmacc.vf v1, ft1, v16
+ addi kt, kt, -1 # Decrement k counter
+ flw ft5, (amp)
+ add amp, amp, astride
+ vfmacc.vf v2, ft2, v16
+ flw ft6, (amp)
+ add amp, amp, astride
+ flw ft7, (amp)
+ vfmacc.vf v3, ft3, v16
+ add amp, amp, astride
+ flw ft8, (amp)
+ add amp, amp, astride
+ vfmacc.vf v4, ft4, v16
+ flw ft9, (amp)
+ add amp, amp, astride
+ vfmacc.vf v5, ft5, v16
+ flw ft10, (amp)
+ add amp, amp, astride
+ vfmacc.vf v6, ft6, v16
+ flw ft11, (amp)
+ add amp, amp, astride
+ vfmacc.vf v7, ft7, v16
+ flw ft12, (amp)
+ add amp, amp, astride
+ vfmacc.vf v8, ft8, v16
+ flw ft13, (amp)
+ add amp, amp, astride
+ vfmacc.vf v9, ft9, v16
+ flw ft14, (amp)
+ add amp, amp, astride
+ vfmacc.vf v10, ft10, v16
+ flw ft15, (amp)
+ add amp, amp, astride
+ addi akp, akp, 4 # Move to next column of a
+ vfmacc.vf v11, ft11, v16
+ beqz kt, 1f # Don't load past end of matrix
+ flw ft0, (akp)
+ add amp, akp, astride
+1: vfmacc.vf v12, ft12, v16
+ beqz kt, 1f
+ flw ft1, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v13, ft13, v16
+ beqz kt, 1f
+ flw ft2, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v14, ft14, v16
+ beqz kt, 1f # Exit out of loop
+ flw ft3, (amp)
+ add amp, amp, astride
+ vfmacc.vf v15, ft15, v16
+ vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
+ j k_loop
+
+1: vfmacc.vf v15, ft15, v16
+
+ # Save C matrix block back to memory
+ vse32.v v0, (cnp); add ccp, cnp, cstride;
+ vse32.v v1, (ccp); add ccp, ccp, cstride;
+ vse32.v v2, (ccp); add ccp, ccp, cstride;
+ vse32.v v3, (ccp); add ccp, ccp, cstride;
+ vse32.v v4, (ccp); add ccp, ccp, cstride;
+ vse32.v v5, (ccp); add ccp, ccp, cstride;
+ vse32.v v6, (ccp); add ccp, ccp, cstride;
+ vse32.v v7, (ccp); add ccp, ccp, cstride;
+ vse32.v v8, (ccp); add ccp, ccp, cstride;
+ vse32.v v9, (ccp); add ccp, ccp, cstride;
+ vse32.v v10, (ccp); add ccp, ccp, cstride;
+ vse32.v v11, (ccp); add ccp, ccp, cstride;
+ vse32.v v12, (ccp); add ccp, ccp, cstride;
+ vse32.v v13, (ccp); add ccp, ccp, cstride;
+ vse32.v v14, (ccp); add ccp, ccp, cstride;
+ vse32.v v15, (ccp)
+
+ # Following tail instructions should be scheduled earlier in free slots during C block save.
+ # Leaving here for clarity.
+
+ # Bump pointers for loop across blocks in one row
+ slli t6, nvl, 2
+ add cnp, cnp, t6 # Move C block pointer over
+ add bnp, bnp, t6 # Move B block pointer over
+ sub nt, nt, nvl # Decrement element count in n dimension
+ bnez nt, c_col_loop # Any more to do?
+
+ # Move to next set of rows
+ addi m, m, -16 # Did 16 rows above
+ slli t6, astride, 4 # Multiply astride by 16
+ add ap, ap, t6 # Move A matrix pointer down 16 rows
+ slli t6, cstride, 4 # Multiply cstride by 16
+ add cp, cp, t6 # Move C matrix pointer down 16 rows
+
+ slti t6, m, 16
+ beqz t6, c_row_loop
+
+ # Handle end of matrix with fewer than 16 rows.
+ # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
+end_rows:
+ # Not done.
+
+exit:
+ ld s0, OFFSET(sp)
+ ld s1, OFFSET(sp)
+ ld s2, OFFSET(sp)
+ addi sp, sp, FRAMESIZE
+ ret
diff --git a/src/example/strcmp.s b/src/example/strcmp.s
new file mode 100644
index 0000000..c657703
--- /dev/null
+++ b/src/example/strcmp.s
@@ -0,0 +1,34 @@
+ .text
+ .balign 4
+ .global strcmp
+ # int strcmp(const char *src1, const char* src2)
+strcmp:
+ ## Using LMUL=2, but same register names work for larger LMULs
+ li t1, 0 # Initial pointer bump
+loop:
+ vsetvli t0, x0, e8, m2, ta, ma # Max length vectors of bytes
+ add a0, a0, t1 # Bump src1 pointer
+ vle8ff.v v8, (a0) # Get src1 bytes
+ add a1, a1, t1 # Bump src2 pointer
+ vle8ff.v v16, (a1) # Get src2 bytes
+
+ vmseq.vi v0, v8, 0 # Flag zero bytes in src1
+ vmsne.vv v1, v8, v16 # Flag if src1 != src2
+ vmor.mm v0, v0, v1 # Combine exit conditions
+
+ vfirst.m a2, v0 # ==0 or != ?
+ csrr t1, vl # Get number of bytes fetched
+
+ bltz a2, loop # Loop if all same and no zero byte
+
+ add a0, a0, a2 # Get src1 element address
+ lbu a3, (a0) # Get src1 byte from memory
+
+ add a1, a1, a2 # Get src2 element address
+ lbu a4, (a1) # Get src2 byte from memory
+
+ sub a0, a3, a4 # Return value.
+
+ ret
+
+
diff --git a/src/example/strcpy.s b/src/example/strcpy.s
new file mode 100644
index 0000000..109112d
--- /dev/null
+++ b/src/example/strcpy.s
@@ -0,0 +1,20 @@
+ .text
+ .balign 4
+ .global strcpy
+ # char* strcpy(char *dst, const char* src)
+strcpy:
+ mv a2, a0 # Copy dst
+ li t0, -1 # Infinite AVL
+loop:
+ vsetvli x0, t0, e8, m8, ta, ma # Max length vectors of bytes
+ vle8ff.v v8, (a1) # Get src bytes
+ csrr t1, vl # Get number of bytes fetched
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ vfirst.m a3, v1 # Zero found?
+ add a1, a1, t1 # Bump pointer
+ vmsif.m v0, v1 # Set mask up to and including zero byte.
+ vse8.v v8, (a2), v0.t # Write out bytes
+ add a2, a2, t1 # Bump pointer
+ bltz a3, loop # Zero byte not found, so loop
+
+ ret
diff --git a/src/example/strlen.s b/src/example/strlen.s
new file mode 100644
index 0000000..1c3af4b
--- /dev/null
+++ b/src/example/strlen.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global strlen
+# size_t strlen(const char *str)
+# a0 holds *str
+
+strlen:
+ mv a3, a0 # Save start
+loop:
+ vsetvli a1, x0, e8, m8, ta, ma # Vector of bytes of maximum length
+ vle8ff.v v8, (a3) # Load bytes
+ csrr a1, vl # Get bytes read
+ vmseq.vi v0, v8, 0 # Set v0[i] where v8[i] = 0
+ vfirst.m a2, v0 # Find first set bit
+ add a3, a3, a1 # Bump pointer
+ bltz a2, loop # Not found?
+
+ add a0, a0, a1 # Sum start + bump
+ add a3, a3, a2 # Add index
+ sub a0, a3, a0 # Subtract start address+bump
+
+ ret
diff --git a/src/example/strncpy.s b/src/example/strncpy.s
new file mode 100644
index 0000000..87e5410
--- /dev/null
+++ b/src/example/strncpy.s
@@ -0,0 +1,36 @@
+ .text
+ .balign 4
+ .global strncpy
+ # char* strncpy(char *dst, const char* src, size_t n)
+strncpy:
+ mv a3, a0 # Copy dst
+loop:
+ vsetvli x0, a2, e8, m8, ta, ma # Vectors of bytes.
+ vle8ff.v v8, (a1) # Get src bytes
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ csrr t1, vl # Get number of bytes fetched
+ vfirst.m a4, v1 # Zero found?
+ vmsbf.m v0, v1 # Set mask up to before zero byte.
+ vse8.v v8, (a3), v0.t # Write out non-zero bytes
+ bgez a4, zero_tail # Zero remaining bytes.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump dest pointer
+ add a1, a1, t1 # Bump src pointer
+ bnez a2, loop # Any more?
+
+ ret
+
+zero_tail:
+ sub a2, a2, a4 # Subtract count on non-zero bytes.
+ add a3, a3, a4 # Advance past non-zero bytes.
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ vmv.v.i v0, 0 # Splat zero.
+
+zero_loop:
+ vse8.v v0, (a3) # Store zero.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump pointer
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ bnez a2, zero_loop # Any more?
+
+ ret
diff --git a/src/example/vvaddint32.s b/src/example/vvaddint32.s
new file mode 100644
index 0000000..22305d9
--- /dev/null
+++ b/src/example/vvaddint32.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global vvaddint32
+ # vector-vector add routine of 32-bit integers
+ # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+ # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+ #
+ # a0 = n, a1 = x, a2 = y, a3 = z
+ # Non-vector instructions are indented
+vvaddint32:
+ vsetvli t0, a0, e32, ta, ma # Set vector length based on 32-bit vectors
+ vle32.v v0, (a1) # Get first vector
+ sub a0, a0, t0 # Decrement number done
+ slli t0, t0, 2 # Multiply number done by 4 bytes
+ add a1, a1, t0 # Bump pointer
+ vle32.v v1, (a2) # Get second vector
+ add a2, a2, t0 # Bump pointer
+ vadd.vv v2, v0, v1 # Sum vectors
+ vse32.v v2, (a3) # Store result
+ add a3, a3, t0 # Bump pointer
+ bnez a0, vvaddint32 # Loop back
+ ret # Finished
diff --git a/src/f-st-ext.adoc b/src/f-st-ext.adoc
index 54d43ca..24941ed 100644
--- a/src/f-st-ext.adoc
+++ b/src/f-st-ext.adoc
@@ -37,7 +37,7 @@ floating-point register file state can reduce context-switch overhead.
[[fprs]]
.RISC-V standard F extension single-precision floating-point state
-[col[s="<|^|>"|option[s="header",width="50%",align="center"grid="rows"]
+[cols="<,^,>",options="header",width="50%",align="center",grid="rows"]
|===
| [.small]#FLEN-1#| >| [.small]#0#
3+^| [.small]#f0#
diff --git a/src/fraclmul.adoc b/src/fraclmul.adoc
new file mode 100644
index 0000000..6f12f58
--- /dev/null
+++ b/src/fraclmul.adoc
@@ -0,0 +1,174 @@
+=== Fractional LMUL Example
+
+This appendix presents a non-normative example to help explain where
+compilers can make good use of the fractional LMUL feature.
+
+Consider the following (admittedly contrived) loop written in C:
+
+----
+void add_ref(long N,
+ signed char *restrict c_c, signed char *restrict c_a, signed char *restrict c_b,
+ long *restrict l_c, long *restrict l_a, long *restrict l_b,
+ long *restrict l_d, long *restrict l_e, long *restrict l_f,
+ long *restrict l_g, long *restrict l_h, long *restrict l_i,
+ long *restrict l_j, long *restrict l_k, long *restrict l_l,
+ long *restrict l_m) {
+ long i;
+ for (i = 0; i < N; i++) {
+ c_c[i] = c_a[i] + c_b[i]; // Note: this 'char' addition creates a mixed-type situation
+ l_c[i] = l_a[i] + l_b[i];
+ l_f[i] = l_d[i] + l_e[i];
+ l_i[i] = l_g[i] + l_h[i];
+ l_l[i] = l_k[i] + l_j[i];
+ l_m[i] += l_m[i] + l_c[i] + l_f[i] + l_i[i] + l_l[i];
+ }
+}
+----
+
+The example loop has high register pressure due to the many input variables
+and temporaries required. The compiler recognizes two datatypes within the
+loop: an 8-bit 'char' and a 64-bit 'long'. Without fractional LMUL, the
+compiler would be forced to use LMUL=1 for the 8-bit computation and LMUL=8 for
+the 64-bit computations, in order to keep an equal number of elements across
+all computations within the same loop iteration. Under LMUL=8, only 4 vector
+register groups are available to the register allocator. Given the large number
+of 64-bit variables and temporaries required in this loop, the compiler ends up
+generating a lot of spill code. The code below demonstrates this effect:
+
+----
+.LBB0_4: # %vector.body
+ # =>This Inner Loop Header: Depth=1
+ add s9, a2, s6
+ vsetvli s1, zero, e8,m1,ta,mu
+ vle8.v v25, (s9)
+ add s1, a3, s6
+ vle8.v v26, (s1)
+ vadd.vv v25, v26, v25
+ add s1, a1, s6
+ vse8.v v25, (s1)
+ add s9, a5, s10
+ vsetvli s1, zero, e64,m8,ta,mu
+ vle64.v v8, (s9)
+ add s1, a6, s10
+ vle64.v v16, (s1)
+ add s1, a7, s10
+ vle64.v v24, (s1)
+ add s1, s3, s10
+ vle64.v v0, (s1)
+ sd a0, -112(s0)
+ ld a0, -128(s0)
+ vs8r.v v0, (a0) # Spill LMUL=8
+ add s9, t6, s10
+ add s11, t5, s10
+ add ra, t2, s10
+ add s1, t3, s10
+ vle64.v v0, (s9)
+ ld s9, -136(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (s11)
+ ld s9, -144(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (ra)
+ ld s9, -160(s0)
+ vs8r.v v0, (s9) # Spill LMUL=8
+ vle64.v v0, (s1)
+ ld s1, -152(s0)
+ vs8r.v v0, (s1) # Spill LMUL=8
+ vadd.vv v16, v16, v8
+ ld s1, -128(s0)
+ vl8r.v v8, (s1) # Reload LMUL=8
+ vadd.vv v8, v8, v24
+ ld s1, -136(s0)
+ vl8r.v v24, (s1) # Reload LMUL=8
+ ld s1, -144(s0)
+ vl8r.v v0, (s1) # Reload LMUL=8
+ vadd.vv v24, v0, v24
+ ld s1, -128(s0)
+ vs8r.v v24, (s1) # Spill LMUL=8
+ ld s1, -152(s0)
+ vl8r.v v0, (s1) # Reload LMUL=8
+ ld s1, -160(s0)
+ vl8r.v v24, (s1) # Reload LMUL=8
+ vadd.vv v0, v0, v24
+ add s1, a4, s10
+ vse64.v v16, (s1)
+ add s1, s2, s10
+ vse64.v v8, (s1)
+ vadd.vv v8, v8, v16
+ add s1, t4, s10
+ ld s9, -128(s0)
+ vl8r.v v16, (s9) # Reload LMUL=8
+ vse64.v v16, (s1)
+ add s9, t0, s10
+ vadd.vv v8, v8, v16
+ vle64.v v16, (s9)
+ add s1, t1, s10
+ vse64.v v0, (s1)
+ vadd.vv v8, v8, v0
+ vsll.vi v16, v16, 1
+ vadd.vv v8, v8, v16
+ vse64.v v8, (s9)
+ add s6, s6, s7
+ add s10, s10, s8
+ bne s6, s4, .LBB0_4
+----
+
+If, instead of using LMUL=1 for the 8-bit computation, the compiler is allowed
+to use a fractional LMUL=1/2, then the 64-bit computations can be performed
+using LMUL=4 (note that the same ratio of 64-bit elements to 8-bit elements is
+preserved as in the previous example). Now the compiler has 8 register groups
+available for register allocation, resulting in no spill code, as shown in the
+loop below:
+
+----
+.LBB0_4: # %vector.body
+ # =>This Inner Loop Header: Depth=1
+ add s9, a2, s6
+ vsetvli s1, zero, e8,mf2,ta,mu // LMUL=1/2 !
+ vle8.v v25, (s9)
+ add s1, a3, s6
+ vle8.v v26, (s1)
+ vadd.vv v25, v26, v25
+ add s1, a1, s6
+ vse8.v v25, (s1)
+ add s9, a5, s10
+ vsetvli s1, zero, e64,m4,ta,mu // LMUL=4
+ vle64.v v28, (s9)
+ add s1, a6, s10
+ vle64.v v8, (s1)
+ vadd.vv v28, v8, v28
+ add s1, a7, s10
+ vle64.v v8, (s1)
+ add s1, s3, s10
+ vle64.v v12, (s1)
+ add s1, t6, s10
+ vle64.v v16, (s1)
+ add s1, t5, s10
+ vle64.v v20, (s1)
+ add s1, a4, s10
+ vse64.v v28, (s1)
+ vadd.vv v8, v12, v8
+ vadd.vv v12, v20, v16
+ add s1, t2, s10
+ vle64.v v16, (s1)
+ add s1, t3, s10
+ vle64.v v20, (s1)
+ add s1, s2, s10
+ vse64.v v8, (s1)
+ add s9, t4, s10
+ vadd.vv v16, v20, v16
+ add s11, t0, s10
+ vle64.v v20, (s11)
+ vse64.v v12, (s9)
+ add s1, t1, s10
+ vse64.v v16, (s1)
+ vsll.vi v20, v20, 1
+ vadd.vv v28, v8, v28
+ vadd.vv v28, v28, v12
+ vadd.vv v28, v28, v16
+ vadd.vv v28, v28, v20
+ vse64.v v28, (s11)
+ add s6, s6, s7
+ add s10, s10, s8
+ bne s6, s4, .LBB0_4
+----
diff --git a/src/hypervisor.adoc b/src/hypervisor.adoc
index 6d3d226..e4775b5 100644
--- a/src/hypervisor.adoc
+++ b/src/hypervisor.adoc
@@ -2334,7 +2334,7 @@ nonzero value (the faulting guest physical address) is written to
<<pseudoinsts>>; zero is not allowed.
[[pseudoinsts]]
-.Special pseudoinstruction values for guest-page faults. The RV32 values are used when VSXLEN=32, and the TV64 values when VSXLEN=64.
+.Special pseudoinstruction values for guest-page faults. The RV32 values are used when VSXLEN=32, and the RV64 values when VSXLEN=64.
[%autowidth,float="center",align="center",cols="<,<",options="header"]
|===
|Value |Meaning
diff --git a/src/images/bytefield/hstatusreg-rv32.edn b/src/images/bytefield/hstatusreg-rv32.edn
index 02db585..2762ce6 100644
--- a/src/images/bytefield/hstatusreg-rv32.edn
+++ b/src/images/bytefield/hstatusreg-rv32.edn
@@ -51,9 +51,9 @@
(draw-box "6" {:span 5 :borders {}})
(draw-box "2" {:span 2 :borders {}})
(draw-box "1" {:borders {}})
-(draw-box "2" {:span 2 :borders {}})
+(draw-box "1" {:span 2 :borders {}})
(draw-box "1" {:span 3 :borders {}})
(draw-box "1" {:span 3 :borders {}})
-(draw-box "2" {:span 2 :borders {}})
+(draw-box "1" {:span 2 :borders {}})
(draw-box "5" {:span 2 :borders {}})
----
diff --git a/src/images/bytefield/hstatusreg.edn b/src/images/bytefield/hstatusreg.edn
index cff75db..cce601e 100644
--- a/src/images/bytefield/hstatusreg.edn
+++ b/src/images/bytefield/hstatusreg.edn
@@ -8,7 +8,7 @@
(def boxes-per-row 32)
(draw-box nil {:span 3 :borders {}})
-(draw-box "HSXLEN-1" {:span 8 :borders {} :text-anchor "start"})
+(draw-box "63" {:span 8 :borders {} :text-anchor "start"})
(draw-box "34" {:borders {}})
(draw-box "33" {:span 2 :borders {} :text-anchor "start"})
(draw-box "32" {:span 2 :borders {} :text-anchor "end"})
@@ -31,7 +31,7 @@
(draw-box nil {:span 3 :borders {}})
(draw-box nil {:span 3 :borders {}})
-(draw-box "HSXLEN-34" {:span 9 :borders {}})
+(draw-box "30" {:span 9 :borders {}})
(draw-box "2" {:span 4 :borders {}})
(draw-box "9" {:span 6 :borders {}})
(draw-box "1" {:span 2 :borders {}})
diff --git a/src/images/bytefield/hypv-mstatus.edn b/src/images/bytefield/hypv-mstatus.edn
index 2ed4a4d..885dc00 100644
--- a/src/images/bytefield/hypv-mstatus.edn
+++ b/src/images/bytefield/hypv-mstatus.edn
@@ -7,8 +7,8 @@
(def right-margin 30)
(def boxes-per-row 32)
-(draw-box "MSXLEN-1" {:span 3 :borders {}})
-(draw-box "MXLEN-2" {:span 4 :text-anchor "start" :borders {}})
+(draw-box "63" {:span 3 :borders {}})
+(draw-box "62" {:span 4 :text-anchor "start" :borders {}})
(draw-box "40" {:span 4 :text-anchor "end" :borders {}})
(draw-box "39" {:span 3 :borders {}})
(draw-box "38" {:span 3 :borders {}})
@@ -31,7 +31,7 @@
(draw-box nil {:borders {:top :border-unrelated :bottom :border-unrelated}})
(draw-box "1" {:span 3 :borders {}})
-(draw-box "MXLEN-41" {:span 8 :borders {}})
+(draw-box "23" {:span 8 :borders {}})
(draw-box "1" {:span 3 :borders {}})
(draw-box "1" {:span 3 :borders {}})
(draw-box "1" {:span 3 :borders {}})
diff --git a/src/images/bytefield/miereg-standard.adoc b/src/images/bytefield/miereg-standard.adoc
index d4affab..680fb1c 100644
--- a/src/images/bytefield/miereg-standard.adoc
+++ b/src/images/bytefield/miereg-standard.adoc
@@ -11,6 +11,8 @@
(draw-box "0" {:span 2})
(draw-box (text "LCOFIE" {:font-size 10}) {:span 1})
(draw-box "0" {:span 1})
+(draw-box "MEIE" {:span 1})
+(draw-box "0" {:span 1})
(draw-box "SEIE" {:span 1})
(draw-box "0" {:span 1})
(draw-box "MTIE" {:span 1})
@@ -36,4 +38,5 @@
(draw-box "1" {:span 1 :borders {}})
(draw-box "1" {:span 1 :borders {}})
(draw-box "1" {:span 1 :borders {}})
+(draw-box "1" {:span 1 :borders {}})
----
diff --git a/src/images/bytefield/mncause.edn b/src/images/bytefield/mncause.edn
index 5323f24..0b56e9b 100644
--- a/src/images/bytefield/mncause.edn
+++ b/src/images/bytefield/mncause.edn
@@ -8,9 +8,9 @@
(def boxes-per-row 32)
(draw-column-headers {:height 24 :font-size 24 :labels (reverse ["0" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "MXLEN-2" "" "" "" "MXLEN-1" ""])})
-(draw-box "1" {:span 4})
-(draw-box (text "NMI Cause" {:font-size 24}) {:span 14 :text-anchor "end" :borders {:top :border-unrelated :bottom :border-unrelated :left :border-unrelated}})
+(draw-box "Interrupt" {:span 4})
+(draw-box (text "Exception Code" {:font-size 24}) {:span 14 :text-anchor "end" :borders {:top :border-unrelated :bottom :border-unrelated :left :border-unrelated}})
(draw-box (text "(WARL)" {:font-weight "bold" :font-size 24}) {:span 14 :text-anchor "start" :borders {:top :border-unrelated :right :border-unrelated :bottom :border-unrelated}})
(draw-box "1" {:span 4 :borders {}})
(draw-box "MXLEN-1" {:font-size 24 :span 28 :borders {}})
-----
\ No newline at end of file
+----
diff --git a/src/images/bytefield/mnstatus.edn b/src/images/bytefield/mnstatus.edn
index 186bfb8..8a5f39d 100644
--- a/src/images/bytefield/mnstatus.edn
+++ b/src/images/bytefield/mnstatus.edn
@@ -5,25 +5,29 @@
(def row-header-fn nil)
(def left-margin 30)
(def right-margin 30)
-(def boxes-per-row 32)
-(draw-column-headers {:height 24 :font-size 24 :labels (reverse ["0" "" "" "2" "" "3" "4" "" "" "6" "" "" "" "7" "" "" "8" "" "" "10" "11" "" "" "12" "13" "" "" "" "" "" "MXLEN-1" ""])})
+(def boxes-per-row 35)
+(draw-column-headers {:height 24 :font-size 24 :labels (reverse ["0" "" "" "2" "" "3" "4" "" "" "6" "" "" "7" "" "" "8" "" "" "9" "" "" "10" "" "11" "" "" "12" "13" "" "" "" "" "" "MXLEN-1" ""])})
(draw-box (text "Reserved" {:font-style "italic" :font-size 24}) {:span 8})
(draw-box (text "MNPP" {:font-size 24}) {:span 2 :text-anchor "end" :borders {:top :border-unrelated :bottom :border-unrelated :left :border-unrelated}})
(draw-box (text "(WARL)" {:font-weight "bold" :font-size 20}) {:span 2 :text-anchor "start" :borders {:top :border-unrelated :right :border-unrelated :bottom :border-unrelated}})
-(draw-box (text "Reserved" {:font-style "italic" :font-size 24}) {:span 4})
-(draw-box (text "MNPV" {:font-size 24}) {:span 3 :text-anchor "end" :borders {:top :border-unrelated :bottom :border-unrelated :left :border-unrelated}})
-(draw-box (text "(WARL)" {:font-weight "bold" :font-size 24}) {:span 3 :text-anchor "start" :borders {:top :border-unrelated :right :border-unrelated :bottom :border-unrelated}})
+(draw-box (text "Reserved" {:font-style "italic" :font-size 24}) {:span 3})
+(draw-box (text "MNPELP" {:font-style "italic" :font-size 20}) {:span 3})
+(draw-box (text "Reserved" {:font-style "italic" :font-size 20}) {:span 3})
+(draw-box (text "MNPV" {:font-size 24}) {:span 2 :text-anchor "end" :borders {:top :border-unrelated :bottom :border-unrelated :left :border-unrelated}})
+(draw-box (text "(WARL)" {:font-weight "bold" :font-size 20}) {:span 2 :text-anchor "start" :borders {:top :border-unrelated :right :border-unrelated :bottom :border-unrelated}})
(draw-box (text "Reserved" {:font-style "italic" :font-size 24}) {:span 4})
(draw-box "NMIE" {:span 2})
(draw-box (text "Reserved" {:font-style "italic" :font-size 24}) {:span 4})
(draw-box "MXLEN-13" {:span 8 :borders {}})
(draw-box "2" {:span 4 :borders {}})
-(draw-box "3" {:span 4 :borders {}})
-(draw-box "1" {:span 6 :borders {}})
+(draw-box "1" {:span 3 :borders {}})
+(draw-box "1" {:span 3 :borders {}})
+(draw-box "1" {:span 3 :borders {}})
+(draw-box "1" {:span 4 :borders {}})
(draw-box "3" {:span 4 :borders {}})
(draw-box "1" {:span 2 :borders {}})
(draw-box "3" {:span 4 :borders {}})
-----
\ No newline at end of file
+----
diff --git a/src/images/bytefield/vsstatusreg.edn b/src/images/bytefield/vsstatusreg.edn
index 87f4725..95780a6 100644
--- a/src/images/bytefield/vsstatusreg.edn
+++ b/src/images/bytefield/vsstatusreg.edn
@@ -7,8 +7,8 @@
(def right-margin 30)
(def boxes-per-row 32)
-(draw-box "VSXLEN-1" {:span 3 :borders {}})
-(draw-box "VSXLEN-2" {:span 5 :text-anchor "start" :borders {}})
+(draw-box "63" {:span 3 :borders {}})
+(draw-box "62" {:span 5 :text-anchor "start" :borders {}})
(draw-box "34" {:span 5 :text-anchor "end" :borders {}})
(draw-box "33" {:span 2 :text-anchor "start" :borders {}})
(draw-box "32" {:span 2 :text-anchor "end" :borders {}})
@@ -30,7 +30,7 @@
(draw-box nil {:span 2 :borders {}})
(draw-box "1" {:span 3 :borders {}})
-(draw-box "VSXLEN-35" {:span 10 :borders {}})
+(draw-box "29" {:span 10 :borders {}})
(draw-box "2" {:span 4 :borders {}})
(draw-box "12" {:span 6 :borders {}})
(draw-box "1" {:span 2 :borders {}})
diff --git a/src/images/smepmp-visual-representation.png b/src/images/smepmp-visual-representation.png
new file mode 100644
index 0000000..9502271
--- /dev/null
+++ b/src/images/smepmp-visual-representation.png
Binary files differ
diff --git a/src/images/wavedrom/ct-unconditional-2.adoc b/src/images/wavedrom/ct-unconditional-2.adoc
index ef33a9e..4dda824 100644
--- a/src/images/wavedrom/ct-unconditional-2.adoc
+++ b/src/images/wavedrom/ct-unconditional-2.adoc
@@ -4,7 +4,7 @@
....
{reg: [
{bits: 7, name: 'opcode', attr: ['7', 'JALR'], type: 8},
- {bits: 5, name: 'rd', attr: ['6', 'dest'], type: 2},
+ {bits: 5, name: 'rd', attr: ['5', 'dest'], type: 2},
{bits: 3, name: 'funct3', attr: ['3', '0'], type: 8},
{bits: 5, name: 'rs1', attr: ['5', 'base'], type: 4},
{bits: 12, name: 'imm[11:0]', attr: ['12', 'offset[11:0]'], type: 3},
diff --git a/src/images/wavedrom/v-inst-table.adoc b/src/images/wavedrom/v-inst-table.adoc
new file mode 100644
index 0000000..0c02220
--- /dev/null
+++ b/src/images/wavedrom/v-inst-table.adoc
@@ -0,0 +1,210 @@
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+[cols="<,<,<,<,<,<,<,<,<,<,<,<,<",options="header"]
+|===
+5+| Integer 4+| Integer 4+| FP
+
+| funct3 | | | | | funct3 | | | | funct3 | | |
+| OPIVV |V| | | | OPMVV{nbsp} |V| | | OPFVV |V| |
+| OPIVX | |X| | | OPMVX{nbsp} | |X| | OPFVF | |F|
+| OPIVI | | |I| | | | | | | | |
+|===
+
+[cols="<,<,<,<,<,<,<,<,<,<,<,<,<",options="header"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 000000 |V|X|I| vadd | 000000 |V| | vredsum | 000000 |V|F| vfadd
+| 000001 | | | | | 000001 |V| | vredand | 000001 |V| | vfredusum
+| 000010 |V|X| | vsub | 000010 |V| | vredor | 000010 |V|F| vfsub
+| 000011 | |X|I| vrsub | 000011 |V| | vredxor | 000011 |V| | vfredosum
+| 000100 |V|X| | vminu | 000100 |V| | vredminu | 000100 |V|F| vfmin
+| 000101 |V|X| | vmin | 000101 |V| | vredmin | 000101 |V| | vfredmin
+| 000110 |V|X| | vmaxu | 000110 |V| | vredmaxu | 000110 |V|F| vfmax
+| 000111 |V|X| | vmax | 000111 |V| | vredmax | 000111 |V| | vfredmax
+| 001000 | | | | | 001000 |V|X| vaaddu | 001000 |V|F| vfsgnj
+| 001001 |V|X|I| vand | 001001 |V|X| vaadd | 001001 |V|F| vfsgnjn
+| 001010 |V|X|I| vor | 001010 |V|X| vasubu | 001010 |V|F| vfsgnjx
+| 001011 |V|X|I| vxor | 001011 |V|X| vasub | 001011 | | |
+| 001100 |V|X|I| vrgather | 001100 | | | | 001100 | | |
+| 001101 | | | | | 001101 | | | | 001101 | | |
+| 001110 | |X|I| vslideup | 001110 | |X| vslide1up | 001110 | |F| vfslide1up
+| 001110 |V| | |vrgatherei16| | | | | | | |
+| 001111 | |X|I| vslidedown | 001111 | |X| vslide1down | 001111 | |F| vfslide1down
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 010000 |V|X|I| vadc | 010000 |V| | VWXUNARY0 | 010000 |V| | VWFUNARY0
+| | | | | | 010000 | |X| VRXUNARY0 | 010000 | |F| VRFUNARY0
+| 010001 |V|X|I| vmadc | 010001 | | | | 010001 | | |
+| 010010 |V|X| | vsbc | 010010 |V| | VXUNARY0 | 010010 |V| | VFUNARY0
+| 010011 |V|X| | vmsbc | 010011 | | | | 010011 |V| | VFUNARY1
+| 010100 | | | | | 010100 |V| | VMUNARY0 | 010100 | | |
+| 010101 | | | | | 010101 | | | | 010101 | | |
+| 010110 | | | | | 010110 | | | | 010110 | | |
+| 010111 |V|X|I| vmerge/vmv | 010111 |V| | vcompress | 010111 | |F| vfmerge/vfmv
+| 011000 |V|X|I| vmseq | 011000 |V| | vmandn | 011000 |V|F| vmfeq
+| 011001 |V|X|I| vmsne | 011001 |V| | vmand | 011001 |V|F| vmfle
+| 011010 |V|X| | vmsltu | 011010 |V| | vmor | 011010 | | |
+| 011011 |V|X| | vmslt | 011011 |V| | vmxor | 011011 |V|F| vmflt
+| 011100 |V|X|I| vmsleu | 011100 |V| | vmorn | 011100 |V|F| vmfne
+| 011101 |V|X|I| vmsle | 011101 |V| | vmnand | 011101 | |F| vmfgt
+| 011110 | |X|I| vmsgtu | 011110 |V| | vmnor | 011110 | | |
+| 011111 | |X|I| vmsgt | 011111 |V| | vmxnor | 011111 | |F| vmfge
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 100000 |V|X|I| vsaddu | 100000 |V|X| vdivu | 100000 |V|F| vfdiv
+| 100001 |V|X|I| vsadd | 100001 |V|X| vdiv | 100001 | |F| vfrdiv
+| 100010 |V|X| | vssubu | 100010 |V|X| vremu | 100010 | | |
+| 100011 |V|X| | vssub | 100011 |V|X| vrem | 100011 | | |
+| 100100 | | | | | 100100 |V|X| vmulhu | 100100 |V|F| vfmul
+| 100101 |V|X|I| vsll | 100101 |V|X| vmul | 100101 | | |
+| 100110 | | | | | 100110 |V|X| vmulhsu | 100110 | | |
+| 100111 |V|X| | vsmul | 100111 |V|X| vmulh | 100111 | |F| vfrsub
+| 100111 | | |I| vmv<nr>r | | | | | | | |
+| 101000 |V|X|I| vsrl | 101000 | | | | 101000 |V|F| vfmadd
+| 101001 |V|X|I| vsra | 101001 |V|X| vmadd | 101001 |V|F| vfnmadd
+| 101010 |V|X|I| vssrl | 101010 | | | | 101010 |V|F| vfmsub
+| 101011 |V|X|I| vssra | 101011 |V|X| vnmsub | 101011 |V|F| vfnmsub
+| 101100 |V|X|I| vnsrl | 101100 | | | | 101100 |V|F| vfmacc
+| 101101 |V|X|I| vnsra | 101101 |V|X| vmacc | 101101 |V|F| vfnmacc
+| 101110 |V|X|I| vnclipu | 101110 | | | | 101110 |V|F| vfmsac
+| 101111 |V|X|I| vnclip | 101111 |V|X| vnmsac | 101111 |V|F| vfnmsac
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+|===
+5+| funct6 4+| funct6 4+| funct6
+
+| 110000 |V| | | vwredsumu | 110000 |V|X| vwaddu | 110000 |V|F| vfwadd
+| 110001 |V| | | vwredsum | 110001 |V|X| vwadd | 110001 |V| | vfwredusum
+| 110010 | | | | | 110010 |V|X| vwsubu | 110010 |V|F| vfwsub
+| 110011 | | | | | 110011 |V|X| vwsub | 110011 |V| | vfwredosum
+| 110100 | | | | | 110100 |V|X| vwaddu.w | 110100 |V|F| vfwadd.w
+| 110101 | | | | | 110101 |V|X| vwadd.w | 110101 | | |
+| 110110 | | | | | 110110 |V|X| vwsubu.w | 110110 |V|F| vfwsub.w
+| 110111 | | | | | 110111 |V|X| vwsub.w | 110111 | | |
+| 111000 | | | | | 111000 |V|X| vwmulu | 111000 |V|F| vfwmul
+| 111001 | | | | | 111001 | | | | 111001 | | |
+| 111010 | | | | | 111010 |V|X| vwmulsu | 111010 | | |
+| 111011 | | | | | 111011 |V|X| vwmul | 111011 | | |
+| 111100 | | | | | 111100 |V|X| vwmaccu | 111100 |V|F| vfwmacc
+| 111101 | | | | | 111101 |V|X| vwmacc | 111101 |V|F| vfwnmacc
+| 111110 | | | | | 111110 | |X| vwmaccus | 111110 |V|F| vfwmsac
+| 111111 | | | | | 111111 |V|X| vwmaccsu | 111111 |V|F| vfwnmsac
+|===
+
+<<<
+
+.VRXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs2 |
+
+| 00000 | vmv.s.x
+|===
+
+.VWXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00000 | vmv.x.s
+| 10000 | vcpop
+| 10001 | vfirst
+|===
+
+.VXUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00010 | vzext.vf8
+| 00011 | vsext.vf8
+| 00100 | vzext.vf4
+| 00101 | vsext.vf4
+| 00110 | vzext.vf2
+| 00111 | vsext.vf2
+|===
+
+.VRFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs2 |
+
+| 00000 | vfmv.s.f
+|===
+
+.VWFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00000 | vfmv.f.s
+|===
+
+.VFUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 | name
+
+2+| single-width converts
+| 00000 | vfcvt.xu.f.v
+| 00001 | vfcvt.x.f.v
+| 00010 | vfcvt.f.xu.v
+| 00011 | vfcvt.f.x.v
+| 00110 | vfcvt.rtz.xu.f.v
+| 00111 | vfcvt.rtz.x.f.v
+| |
+2+| widening converts
+| 01000 | vfwcvt.xu.f.v
+| 01001 | vfwcvt.x.f.v
+| 01010 | vfwcvt.f.xu.v
+| 01011 | vfwcvt.f.x.v
+| 01100 | vfwcvt.f.f.v
+| 01110 | vfwcvt.rtz.xu.f.v
+| 01111 | vfwcvt.rtz.x.f.v
+| |
+2+| narrowing converts
+| 10000 | vfncvt.xu.f.w
+| 10001 | vfncvt.x.f.w
+| 10010 | vfncvt.f.xu.w
+| 10011 | vfncvt.f.x.w
+| 10100 | vfncvt.f.f.w
+| 10101 | vfncvt.rod.f.f.w
+| 10110 | vfncvt.rtz.xu.f.w
+| 10111 | vfncvt.rtz.x.f.w
+|===
+
+.VFUNARY1 encoding space
+[cols="2,14"]
+|===
+| vs1 | name
+
+| 00000 | vfsqrt.v
+| 00100 | vfrsqrt7.v
+| 00101 | vfrec7.v
+| 10000 | vfclass.v
+|===
+
+
+.VMUNARY0 encoding space
+[cols="2,14"]
+|===
+| vs1 |
+
+| 00001 | vmsbf
+| 00010 | vmsof
+| 00011 | vmsif
+| 10000 | viota
+| 10001 | vid
+|===
+
+
diff --git a/src/images/wavedrom/valu-format.adoc b/src/images/wavedrom/valu-format.adoc
new file mode 100644
index 0000000..cdd3447
--- /dev/null
+++ b/src/images/wavedrom/valu-format.adoc
@@ -0,0 +1,104 @@
+Formats for Vector Arithmetic Instructions under OP-V major opcode
+
+////
+31 26 25 24 20 19 15 14 12 11 7 6 0
+ funct6 | vm | vs2 | vs1 | 0 0 0 | vd |1010111| OP-V (OPIVV)
+ funct6 | vm | vs2 | vs1 | 0 0 1 | vd/rd |1010111| OP-V (OPFVV)
+ funct6 | vm | vs2 | vs1 | 0 1 0 | vd/rd |1010111| OP-V (OPMVV)
+ funct6 | vm | vs2 | imm[4:0] | 0 1 1 | vd |1010111| OP-V (OPIVI)
+ funct6 | vm | vs2 | rs1 | 1 0 0 | vd |1010111| OP-V (OPIVX)
+ funct6 | vm | vs2 | rs1 | 1 0 1 | vd |1010111| OP-V (OPFVF)
+ funct6 | vm | vs2 | rs1 | 1 1 0 | vd/rd |1010111| OP-V (OPMVX)
+ 6 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPIVV'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 0},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPFVV'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 1},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPMVV'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 2},
+ {bits: 5, name: 'vs1', type: 2},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: ['OPIVI']},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 3},
+ {bits: 5, name: 'imm[4:0]', type: 5},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPIVX'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 4},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPFVF'},
+ {bits: 5, name: 'vd', type: 2},
+ {bits: 3, name: 5},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'OPMVX'},
+ {bits: 5, name: 'vd / rd', type: 7},
+ {bits: 3, name: 6},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'vs2', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 6, name: 'funct6'},
+]}
+....
diff --git a/src/images/wavedrom/vcfg-format.adoc b/src/images/wavedrom/vcfg-format.adoc
new file mode 100644
index 0000000..ac0353c
--- /dev/null
+++ b/src/images/wavedrom/vcfg-format.adoc
@@ -0,0 +1,47 @@
+Formats for Vector Configuration Instructions under OP-V major opcode
+
+////
+ 31 30 25 24 20 19 15 14 12 11 7 6 0
+ 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli
+ 1 | 1| zimm[ 9:0] | uimm[4:0]| 1 1 1 | rd |1010111| vsetivli
+ 1 | 000000 | rs2 | rs1 | 1 1 1 | rd |1010111| vsetvl
+ 1 6 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetvli'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 11, name: 'vtypei[10:0]', type: 5},
+ {bits: 1, name: '0'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetivli'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'uimm[4:0]', type: 5},
+ {bits: 10, name: 'vtypei[9:0]', type: 5},
+ {bits: 1, name: '1'},
+ {bits: 1, name: '1'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x57, attr: 'vsetvl'},
+ {bits: 5, name: 'rd', type: 4},
+ {bits: 3, name: 7},
+ {bits: 5, name: 'rs1', type: 4},
+ {bits: 5, name: 'rs2', type: 4},
+ {bits: 6, name: 0x00},
+ {bits: 1, name: 1},
+]}
+....
diff --git a/src/images/wavedrom/vfrec7.adoc b/src/images/wavedrom/vfrec7.adoc
new file mode 100644
index 0000000..d33f44e
--- /dev/null
+++ b/src/images/wavedrom/vfrec7.adoc
@@ -0,0 +1,136 @@
+.vfrec7.v common-case lookup table contents
+[%autowidth,float="center",align="center",options="header"]
+|===
+
+| sig[MSB -: 7] | sig_out[MSB -: 7]
+
+| 0 | 127
+| 1 | 125
+| 2 | 123
+| 3 | 121
+| 4 | 119
+| 5 | 117
+| 6 | 116
+| 7 | 114
+| 8 | 112
+| 9 | 110
+| 10 | 109
+| 11 | 107
+| 12 | 105
+| 13 | 104
+| 14 | 102
+| 15 | 100
+| 16 | 99
+| 17 | 97
+| 18 | 96
+| 19 | 94
+| 20 | 93
+| 21 | 91
+| 22 | 90
+| 23 | 88
+| 24 | 87
+| 25 | 85
+| 26 | 84
+| 27 | 83
+| 28 | 81
+| 29 | 80
+| 30 | 79
+| 31 | 77
+| 32 | 76
+| 33 | 75
+| 34 | 74
+| 35 | 72
+| 36 | 71
+| 37 | 70
+| 38 | 69
+| 39 | 68
+| 40 | 66
+| 41 | 65
+| 42 | 64
+| 43 | 63
+| 44 | 62
+| 45 | 61
+| 46 | 60
+| 47 | 59
+| 48 | 58
+| 49 | 57
+| 50 | 56
+| 51 | 55
+| 52 | 54
+| 53 | 53
+| 54 | 52
+| 55 | 51
+| 56 | 50
+| 57 | 49
+| 58 | 48
+| 59 | 47
+| 60 | 46
+| 61 | 45
+| 62 | 44
+| 63 | 43
+| 64 | 42
+| 65 | 41
+| 66 | 40
+| 67 | 40
+| 68 | 39
+| 69 | 38
+| 70 | 37
+| 71 | 36
+| 72 | 35
+| 73 | 35
+| 74 | 34
+| 75 | 33
+| 76 | 32
+| 77 | 31
+| 78 | 31
+| 79 | 30
+| 80 | 29
+| 81 | 28
+| 82 | 28
+| 83 | 27
+| 84 | 26
+| 85 | 25
+| 86 | 25
+| 87 | 24
+| 88 | 23
+| 89 | 23
+| 90 | 22
+| 91 | 21
+| 92 | 21
+| 93 | 20
+| 94 | 19
+| 95 | 19
+| 96 | 18
+| 97 | 17
+| 98 | 17
+| 99 | 16
+| 100 | 15
+| 101 | 15
+| 102 | 14
+| 103 | 14
+| 104 | 13
+| 105 | 12
+| 106 | 12
+| 107 | 11
+| 108 | 11
+| 109 | 10
+| 110 | 9
+| 111 | 9
+| 112 | 8
+| 113 | 8
+| 114 | 7
+| 115 | 7
+| 116 | 6
+| 117 | 5
+| 118 | 5
+| 119 | 4
+| 120 | 4
+| 121 | 3
+| 122 | 3
+| 123 | 2
+| 124 | 2
+| 125 | 1
+| 126 | 1
+| 127 | 0
+
+|===
diff --git a/src/images/wavedrom/vfrsqrt7.adoc b/src/images/wavedrom/vfrsqrt7.adoc
new file mode 100644
index 0000000..8ebc621
--- /dev/null
+++ b/src/images/wavedrom/vfrsqrt7.adoc
@@ -0,0 +1,137 @@
+.vfrsqrt7.v common-case lookup table contents
+[%autowidth,float=center,align=center,options="header"]
+|===
+
+|exp[0] | sig[MSB -: 6] | sig_out[MSB -: 7]
+
+| 0| 0 | 52
+| 0| 1 | 51
+| 0| 2 | 50
+| 0| 3 | 48
+| 0| 4 | 47
+| 0| 5 | 46
+| 0| 6 | 44
+| 0| 7 | 43
+| 0| 8 | 42
+| 0| 9 | 41
+| 0| 10 | 40
+| 0| 11 | 39
+| 0| 12 | 38
+| 0| 13 | 36
+| 0| 14 | 35
+| 0| 15 | 34
+| 0| 16 | 33
+| 0| 17 | 32
+| 0| 18 | 31
+| 0| 19 | 30
+| 0| 20 | 30
+| 0| 21 | 29
+| 0| 22 | 28
+| 0| 23 | 27
+| 0| 24 | 26
+| 0| 25 | 25
+| 0| 26 | 24
+| 0| 27 | 23
+| 0| 28 | 23
+| 0| 29 | 22
+| 0| 30 | 21
+| 0| 31 | 20
+| 0| 32 | 19
+| 0| 33 | 19
+| 0| 34 | 18
+| 0| 35 | 17
+| 0| 36 | 16
+| 0| 37 | 16
+| 0| 38 | 15
+| 0| 39 | 14
+| 0| 40 | 14
+| 0| 41 | 13
+| 0| 42 | 12
+| 0| 43 | 12
+| 0| 44 | 11
+| 0| 45 | 10
+| 0| 46 | 10
+| 0| 47 | 9
+| 0| 48 | 9
+| 0| 49 | 8
+| 0| 50 | 7
+| 0| 51 | 7
+| 0| 52 | 6
+| 0| 53 | 6
+| 0| 54 | 5
+| 0| 55 | 4
+| 0| 56 | 4
+| 0| 57 | 3
+| 0| 58 | 3
+| 0| 59 | 2
+| 0| 60 | 2
+| 0| 61 | 1
+| 0| 62 | 1
+| 0| 63 | 0
+
+| 1| 0 | 127
+| 1| 1 | 125
+| 1| 2 | 123
+| 1| 3 | 121
+| 1| 4 | 119
+| 1| 5 | 118
+| 1| 6 | 116
+| 1| 7 | 114
+| 1| 8 | 113
+| 1| 9 | 111
+| 1| 10 | 109
+| 1| 11 | 108
+| 1| 12 | 106
+| 1| 13 | 105
+| 1| 14 | 103
+| 1| 15 | 102
+| 1| 16 | 100
+| 1| 17 | 99
+| 1| 18 | 97
+| 1| 19 | 96
+| 1| 20 | 95
+| 1| 21 | 93
+| 1| 22 | 92
+| 1| 23 | 91
+| 1| 24 | 90
+| 1| 25 | 88
+| 1| 26 | 87
+| 1| 27 | 86
+| 1| 28 | 85
+| 1| 29 | 84
+| 1| 30 | 83
+| 1| 31 | 82
+| 1| 32 | 80
+| 1| 33 | 79
+| 1| 34 | 78
+| 1| 35 | 77
+| 1| 36 | 76
+| 1| 37 | 75
+| 1| 38 | 74
+| 1| 39 | 73
+| 1| 40 | 72
+| 1| 41 | 71
+| 1| 42 | 70
+| 1| 43 | 70
+| 1| 44 | 69
+| 1| 45 | 68
+| 1| 46 | 67
+| 1| 47 | 66
+| 1| 48 | 65
+| 1| 49 | 64
+| 1| 50 | 63
+| 1| 51 | 63
+| 1| 52 | 62
+| 1| 53 | 61
+| 1| 54 | 60
+| 1| 55 | 59
+| 1| 56 | 59
+| 1| 57 | 58
+| 1| 58 | 57
+| 1| 59 | 56
+| 1| 60 | 56
+| 1| 61 | 55
+| 1| 62 | 54
+| 1| 63 | 53
+
+|===
\ No newline at end of file
diff --git a/src/images/wavedrom/vmem-format.adoc b/src/images/wavedrom/vmem-format.adoc
new file mode 100644
index 0000000..f9b25ee
--- /dev/null
+++ b/src/images/wavedrom/vmem-format.adoc
@@ -0,0 +1,108 @@
+Format for Vector Load Instructions under LOAD-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| mop | vm | lumop | rs1 | width | vd |0000111| VL* unit-stride
+ nf | mew| mop | vm | rs2 | rs1 | width | vd |0000111| VLS* strided
+ nf | mew| mop | vm | vs2 | rs1 | width | vd |0000111| VLX* indexed
+ 3 1 2 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VL* unit-stride'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'lumop'},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VLS* strided'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'rs2', attr: 'stride', type: 4},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x7, attr: 'VLX* indexed'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'vs2', attr: 'address offsets', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+Format for Vector Store Instructions under STORE-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| mop | vm | sumop | rs1 | width | vs3 |0100111| VS* unit-stride
+ nf | mew| mop | vm | rs2 | rs1 | width | vs3 |0100111| VSS* strided
+ nf | mew| mop | vm | vs2 | rs1 | width | vs3 |0100111| VSX* indexed
+ 3 1 2 1 5 5 3 5 7
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VS* unit-stride'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'sumop'},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VSS* strided'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'rs2', attr: 'stride', type: 4},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VSX* indexed'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 'vs2', attr: 'address offsets', type: 2},
+ {bits: 1, name: 'vm'},
+ {bits: 2, name: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
diff --git a/src/images/wavedrom/vtype-format.adoc b/src/images/wavedrom/vtype-format.adoc
new file mode 100644
index 0000000..9e6ab34
--- /dev/null
+++ b/src/images/wavedrom/vtype-format.adoc
@@ -0,0 +1,28 @@
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 3, name: 'vlmul[2:0]'},
+ {bits: 3, name: 'vsew[2:0]'},
+ {bits: 1, name: 'vta'},
+ {bits: 1, name: 'vma'},
+ {bits: 23, name: 'reserved'},
+ {bits: 1, name: 'vill'},
+]}
+....
+
+NOTE: This diagram shows the layout for RV32 systems; in general, `vill` is at bit XLEN-1.
+
+.`vtype` register layout
+[cols=">2,4,10",float="center",align="center",options="header"]
+|===
+| Bits | Name | Description
+
+| XLEN-1 | vill | Illegal value if set
+| XLEN-2:8 | 0 | Reserved if non-zero
+| 7 | vma | Vector mask agnostic
+| 6 | vta | Vector tail agnostic
+| 5:3 | vsew[2:0] | Selected element width (SEW) setting
+| 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting
+|===
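+
+As a (non-normative) illustration of the encoding above, the `vsetvli` below
+requests SEW=32 (vsew=010), LMUL=8 (vlmul=011), and tail-/mask-agnostic
+policies (vta=vma=1), which corresponds to a `vtype` value of 0xD3:
+
+----
+    # a0 holds the application vector length (AVL)
+    vsetvli t0, a0, e32, m8, ta, ma   # t0 receives the granted vl
+----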
diff --git a/src/intro.adoc b/src/intro.adoc
index 53379e7..78d7a34 100644
--- a/src/intro.adoc
+++ b/src/intro.adoc
@@ -195,7 +195,7 @@ environment but must do so in a way that guest harts operate like
independent hardware threads. In particular, if there are more guest
harts than host harts then the execution environment must be able to
preempt the guest harts and must not wait indefinitely for guest
-software on a guest hart to “yield" control of the guest hart.
+software on a guest hart to "yield" control of the guest hart.
====
=== RISC-V ISA Overview
diff --git a/src/machine.adoc b/src/machine.adoc
index d9e9042..640a794 100644
--- a/src/machine.adoc
+++ b/src/machine.adoc
@@ -2631,7 +2631,7 @@ progress is detected.
The misaligned atomicity granule PMA provides constrained support for
misaligned AMOs.
This PMA, if present, specifies the size of a _misaligned atomicity granule_,
-a power-of-two number of bytes.
+a naturally aligned power-of-two number of bytes.
Specific supported values for this PMA are represented by MAG__NN__, e.g.,
MAG16 indicates the misaligned atomicity granule is at least 16 bytes.
diff --git a/src/mm-eplan.adoc b/src/mm-eplan.adoc
index 1243b1d..470a3ab 100644
--- a/src/mm-eplan.adoc
+++ b/src/mm-eplan.adoc
@@ -922,7 +922,7 @@ instruction will be followed by a conditional branch checking whether
the outcome was successful; this implies that there will be a control
dependency from the store operation generated by the SC instruction to
any memory operations following the branch. PPO
-rule <<ppo-ctrl>> in turn implies that any subsequent store
+rule <<ppo, 11>> in turn implies that any subsequent store
operations will appear later in the global memory order than the store
operation generated by the SC. However, since control, address, and data
dependencies are defined over memory operations, and since an
diff --git a/src/mm-formal.adoc b/src/mm-formal.adoc
index 2a49696..fb89914 100644
--- a/src/mm-formal.adoc
+++ b/src/mm-formal.adoc
@@ -525,7 +525,7 @@ a construction of the post-transition model state for each.
Transitions for all instructions:
-latexmath:[$\bullet$] <<fetch, Fetch instruction>>: This transition represents a fetch and decode of a new instruction instance, as a program order successor of a previously fetched
+* <<fetch, Fetch instruction>>: This transition represents a fetch and decode of a new instruction instance, as a program order successor of a previously fetched
instruction instance (or the initial fetch address).
The model assumes the instruction memory is fixed; it does not describe
@@ -534,16 +534,17 @@ not generate memory load operations, and the shared memory is not
involved in the transition. Instead, the model depends on an external
oracle that provides an opcode when given a memory location.
-latexmath:[$\circ$] <<reg_write, Register write>>: This is a write of a register value.
+[circle]
+* <<reg_write, Register write>>: This is a write of a register value.
-latexmath:[$\circ$] <<reg_read, Register read>>: This is a read of a register value from the most recent
+* <<reg_read, Register read>>: This is a read of a register value from the most recent
program-order-predecessor instruction instance that writes to that
register.
-latexmath:[$\circ$] <<sail_interp, Pseudocode internal step>>: This covers pseudocode internal computation: arithmetic, function
+* <<sail_interp, Pseudocode internal step>>: This covers pseudocode internal computation: arithmetic, function
calls, etc.
-latexmath:[$\circ$] <<finish, Finish instruction>>: At this point the instruction pseudocode is done, the instruction cannot be restarted, memory accesses cannot be discarded, and all memory
+* <<finish, Finish instruction>>: At this point the instruction pseudocode is done, the instruction cannot be restarted, memory accesses cannot be discarded, and all memory
effects have taken place. For conditional branch and indirect jump
instructions, any program order successors that were fetched from an
address that is not the one that was written to the _pc_ register are
@@ -552,15 +553,20 @@ them.
Transitions specific to load instructions:
-latexmath:[$\circ$] <<initiate_load, Initiate memory load operations>>: At this point the memory footprint of the load instruction is
+[circle]
+* <<initiate_load, Initiate memory load operations>>: At this point the memory footprint of the load instruction is
provisionally known (it could change if earlier instructions are
restarted) and its individual memory load operations can start being
satisfied.
-latexmath:[$\bullet$] <<sat_from_forwarding, Satisfy memory load operation by forwarding from unpropogated stores>>: This partially or entirely satisfies a single memory load operation
-by forwarding, from program-order-previous memory store operations.
-latexmath:[$\bullet$] <<sat_from_mem, Satisfy memory load operation from memory>>: This entirely satisfies the outstanding slices of a single memory
+
+[disc]
+* <<sat_from_forwarding, Satisfy memory load operation by forwarding from unpropagated stores>>: This partially or entirely satisfies a single memory load operation by forwarding from program-order-previous memory store operations.
+
+* <<sat_from_mem, Satisfy memory load operation from memory>>: This entirely satisfies the outstanding slices of a single memory
load operation, from memory.
-latexmath:[$\circ$] <<complete_loads, Complete load operations>>: At this point all the memory load operations of the instruction have
+
+[circle]
+* <<complete_loads, Complete load operations>>: At this point all the memory load operations of the instruction have
been entirely satisfied and the instruction pseudocode can continue
executing. A load instruction can be subject to being restarted until
the transition. But, under some conditions, the model might treat a load
@@ -568,44 +574,56 @@ instruction as non-restartable even before it is finished (e.g. see ).
Transitions specific to store instructions:
-latexmath:[$\circ$] <<initiate_store_footprint, Initiate memory store operation footprints>>: At this point the memory footprint of the store is provisionally
+[circle]
+* <<initiate_store_footprint, Initiate memory store operation footprints>>: At this point the memory footprint of the store is provisionally
known.
-latexmath:[$\circ$] <<instantiate_store_value, Instantiate memory store operation values>>: At this point the memory store operations have their values and
+
+* <<instantiate_store_value, Instantiate memory store operation values>>: At this point the memory store operations have their values and
program-order-successor memory load operations can be satisfied by
forwarding from them.
-latexmath:[$\circ$] <<commit_stores, Commit store instruction>>: At this point the store operations are guaranteed to happen (the
+
+* <<commit_stores, Commit store instruction>>: At this point the store operations are guaranteed to happen (the
instruction can no longer be restarted or discarded), and they can start
being propagated to memory.
-latexmath:[$\bullet$] <<prop_store, Propagate store operation>>: This propagates a single memory store operation to memory.
-latexmath:[$\circ$] <<complete_stores, Complete store operations>>: At this point all the memory store operations of the instruction
+
+[disc]
+* <<prop_store, Propagate store operation>>: This propagates a single memory store operation to memory.
+
+[circle]
+* <<complete_stores, Complete store operations>>: At this point all the memory store operations of the instruction
have been propagated to memory, and the instruction pseudocode can
continue executing.
Transitions specific to `sc` instructions:
-latexmath:[$\bullet$] <<early_sc_fail, Early sc fail>>: This causes the `sc` to fail, either a spontaneous fail or because
-it is not paired with a program-order-previous `lr`.
-latexmath:[$\bullet$] <<paired_sc, Paired sc>>: This transition indicates the `sc` is paired with an `lr` and might
+[disc]
+* <<early_sc_fail, Early sc fail>>: This causes the `sc` to fail, either a spontaneous fail or because it is not paired with a program-order-previous `lr`.
+
+* <<paired_sc, Paired sc>>: This transition indicates the `sc` is paired with an `lr` and might
succeed.
-latexmath:[$\bullet$] <<commit_sc, Commit and propagate store operation of an sc>>: This is an atomic execution of the transitions <<commit_stores, Commit store instruction>> and <<prop_store, Propagate store operation>>, it is enabled
+
+* <<commit_sc, Commit and propagate store operation of an sc>>: This is an atomic execution of the transitions <<commit_stores, Commit store instruction>> and <<prop_store, Propagate store operation>>; it is enabled
only if the stores from which the `lr` read from have not been
overwritten.
-latexmath:[$\bullet$] <<late_sc_fail, Late sc fail>>: This causes the `sc` to fail, either a spontaneous fail or because
+
+* <<late_sc_fail, Late sc fail>>: This causes the `sc` to fail, either a spontaneous fail or because
the stores from which the `lr` read from have been overwritten.
Transitions specific to AMO instructions:
-latexmath:[$\bullet$] <<do_amo, Satisfy, commit and propagate operations of an AMO>>: This is an atomic execution of all the transitions needed to satisfy
+[disc]
+* <<do_amo, Satisfy, commit and propagate operations of an AMO>>: This is an atomic execution of all the transitions needed to satisfy
the load operation, do the required arithmetic, and propagate the store
operation.
Transitions specific to fence instructions:
-latexmath:[$\circ$] <<commit_fence, Commit fence>>
+[circle]
+* <<commit_fence, Commit fence>>
The transitions labeled latexmath:[$\circ$] can always be taken eagerly,
as soon as their precondition is satisfied, without excluding other
-behavior; the latexmath:[$\bullet$] cannot. Although is marked with a
+behavior; the latexmath:[$\bullet$] cannot. Although <<fetch, Fetch instruction>> is marked with a
latexmath:[$\bullet$], it can be taken eagerly as long as it is not
taken infinitely many times.
@@ -1214,7 +1232,7 @@ time if:
. every memory store operation that has been forwarded to
latexmath:[$i'$] is propagated;
. the conditions of <<commit_stores, Commit store instruction>> is satisfied;
-. the conditions of <<prop_stores, Commit store instruction>> is satisfied (notice that an `sc` instruction can
+. the conditions of <<prop_store, Propagate store operation>> is satisfied (notice that an `sc` instruction can
only have one memory store operation); and
. for every store slice latexmath:[$msos$] from latexmath:[$msoss$],
latexmath:[$msos$] has not been overwritten, in the shared memory, by a
@@ -1224,7 +1242,7 @@ since latexmath:[$msos$] was propagated to memory.
Action:
. apply the actions of <<commit_stores, Commit store instruction>>; and
-. apply the action of <<prop_stores, Commit store instruction>>.
+. apply the action of <<prop_store, Propagate store operation>>.
[[late_sc_fail]]
===== Late `sc` fail
diff --git a/src/resources/themes/riscv-spec.yml b/src/resources/themes/riscv-spec.yml
index 4aa9535..e8332fc 100644
--- a/src/resources/themes/riscv-spec.yml
+++ b/src/resources/themes/riscv-spec.yml
@@ -164,14 +164,17 @@ admonition:
padding: [0, $horizontal_rhythm, 0, $horizontal_rhythm]
icon:
note:
- name: pencil-square-o
+ # name: pencil-square-o
+ name: far-edit
stroke_color: 6489b3
tip:
- name: comments-o
+ #name: comments-o
+ name: far-comments
stroke_color: 646b74
size: 24
important:
- name: info
+ #name: info
+ name: fas-info-circle
stroke_color: 5f8c8b
warning:
stroke_color: 9c4d4b
@@ -247,6 +250,7 @@ figure:
align: center
table:
background_color: $page_background_color
+ font-size: 9
#head_background_color: #2596be
#head_font_color: $base_font_color
head_font_style: bold
diff --git a/src/riscv-privileged.adoc b/src/riscv-privileged.adoc
index 410aeab..7ca9ad1 100644
--- a/src/riscv-privileged.adoc
+++ b/src/riscv-privileged.adoc
@@ -51,6 +51,11 @@ endif::[]
:hide-uri-scheme:
:stem: latexmath
:footnote:
+:le: &#8804;
+:ge: &#8805;
+:ne: &#8800;
+:approx: &#8776;
+:inf: &#8734;
_Contributors to all versions of the spec in alphabetical order (please contact
editors to suggest corrections): Krste Asanović, Peter Ashenden, Rimas
@@ -82,12 +87,16 @@ include::priv-intro.adoc[]
include::priv-csrs.adoc[]
//machine.tex
include::machine.adoc[]
+include::smstateen.adoc[]
+include::smepmp.adoc[]
//rnmi.tex
include::rnmi.adoc[]
//supervisor.tex
include::supervisor.adoc[]
+include::sscofpmt.adoc[]
//hypervisor.tex
include::hypervisor.adoc[]
+include::sstc.adoc[]
//priv-insns.tex
include::priv-insns.adoc[]
//priv-history.tex
diff --git a/src/riscv-unprivileged.adoc b/src/riscv-unprivileged.adoc
index da5cbfe..4cd2b1d 100644
--- a/src/riscv-unprivileged.adoc
+++ b/src/riscv-unprivileged.adoc
@@ -47,20 +47,27 @@ endif::[]
:hide-uri-scheme:
:stem: latexmath
:footnote:
+:le: &#8804;
+:ge: &#8805;
+:ne: &#8800;
+:approx: &#8776;
+:inf: &#8734;
+:csrname: envcfg
_Contributors to all versions of the spec in alphabetical order (please contact editors to suggest
-corrections): Derek Atkins, Arvind, Krste Asanović, Rimas Avižienis, Jacob Bachmeyer, Christopher F. Batten,
-Allen J. Baum, Alex Bradbury, Scott Beamer, Preston Briggs, Christopher Celio, Chuanhua
-Chang, David Chisnall, Paul Clayton, Palmer Dabbelt, L Peter Deutsch, Ken Dockser, Roger Espasa, Greg Favor,
+corrections): Derek Atkins, Arvind, Krste Asanović, Rimas Avižienis, Jacob Bachmeyer, Christopher F. Batten,
+Allen J. Baum, Scott Beamer, Abel Bernabeu, Alex Bradbury, Preston Briggs, Christopher Celio, Chuanhua
+Chang, David Chisnall, Paul Clayton, Palmer Dabbelt, L Peter Deutsch, Ken Dockser, Paul Donahue, Aaron Durbin, Roger Espasa,
+Greg Favor,
Shaked Flur, Stefan Freudenberger, Marc Gauthier, Andy Glew, Jan Gray, Michael Hamburg, John
-Hauser, David Horner, Bruce Hoult, Bill Huffman, Alexandre Joannou, Olof Johansson, Ben Keller,
-David Kruckemyer, Yunsup Lee, Paul Loewenstein, Daniel Lustig, Yatin Manerkar, Luc Maranget,
-Ben Marshall, Margaret Martonosi, Nathan Menhorn, Joseph Myers, Vijayanand Nagarajan, Richard Newell, Rishiyur Nikhil, Jonas Oberhauser,
+Hauser, David Horner, Bruce Hoult, Bill Huffman, John Ingalls, Alexandre Joannou, Olof Johansson, Ben Keller,
+David Kruckemyer, Tariq Kurd, Yunsup Lee, Paul Loewenstein, Daniel Lustig, Yatin Manerkar, Luc Maranget,
+Ben Marshall, Margaret Martonosi, Phil McCoy, Nathan Menhorn, Christoph Müllner, Joseph Myers,
+Vijayanand Nagarajan, Richard Newell, Rishiyur Nikhil, Jonas Oberhauser,
Stefan O'Rear, Markku-Juhani O. Saarinen, Albert Ou, John Ousterhout, Daniel Page, David Patterson, Christopher Pulte, Jose Renau,
-Susmit Sarkar, Josh Scheid, Colin Schmidt, Peter Sewell, Brent Spinney, Michael Taylor, Wesley Terpstra, Matt
-Thomas, Tommy Thorn, Caroline Trippel, Ray VanDeWalker, Muralidaran Vijayaraghavan, Megan
-Wachs, Andrew Waterman, Robert Watson, Derek Williams, Claire Wolf, Andrew Wright, Reinoud Zandijk, Alexander Zeh
-and Sizhuo Zhang._
+Susmit Sarkar, Josh Scheid, Colin Schmidt, Peter Sewell, Ved Shanbhogue, Brent Spinney, Michael Taylor, Wesley Terpstra, Matt
+Thomas, Tommy Thorn, Philipp Tomsich, Caroline Trippel, Ray VanDeWalker, Muralidaran Vijayaraghavan, Megan
+Wachs, Paul Wamsley, Andrew Waterman, Robert Watson, David Weaver, Derek Williams, Claire Wolf, Andrew Wright, Reinoud Zandijk, Alexander Zeh and Sizhuo Zhang._
_This document is released under a Creative Commons Attribution 4.0 International License._
@@ -126,6 +133,9 @@ include::zfa.adoc[]
include::ztso-st-ext.adoc[]
//ztso.tex
include::scalar-crypto.adoc[]
+include::cmo.adoc[]
+include::zawrs.adoc[]
+include::zc.adoc[]
include::rv-32-64g.adoc[]
//gmaps.tex
include::extending.adoc[]
@@ -138,6 +148,11 @@ include::mm-eplan.adoc[]
//memory.tex
include::mm-formal.adoc[]
//end of memory.tex, memory-model-alloy.tex, memory-model-herd.tex
+//Appendices for Vector
+include::vector-examples.adoc[]
+include::calling-convention.adoc[]
+//include::fraclmul.adoc[]
+//End of Vector appendices
include::index.adoc[]
// this is generated generated from index markers.
include::bibliography.adoc[]
diff --git a/src/rnmi.adoc b/src/rnmi.adoc
index 9938917..f505f56 100644
--- a/src/rnmi.adoc
+++ b/src/rnmi.adoc
@@ -1,9 +1,9 @@
[[rnmi]]
-== "Smrnmi" Standard Extension for Resumable Non-Maskable Interrupts, Version 0.4
+== "Smrnmi" Standard Extension for Resumable Non-Maskable Interrupts, Version 0.5
[WARNING]
====
-*Warning! This draft specification may change before being accepted as
+*Warning! This frozen specification may change before being accepted as
standard by RISC-V International.*
====
@@ -71,20 +71,30 @@ of holding.
.Resumable NMI cause `mncause`.
include::images/bytefield/mncause.edn[]
-The `mncause` CSR holds the reason for the NMI, with bit MXLEN-1 set to
-1, and the NMI cause encoded in the least-significant bits or zero if
-NMI causes are not supported.
+The `mncause` CSR holds the reason for the NMI.
+If the reason is an interrupt, bit MXLEN-1 is set to 1, and the NMI
+cause is encoded in the least-significant bits.
+If the reason is an interrupt and NMI causes are not supported, bit MXLEN-1 is
+set to 1, and zero is written to the least-significant bits.
+If the reason is an exception within M-mode that results in a double trap as
+specified in the Smdbltrp extension, bit MXLEN-1 is set to 0 and the
+least-significant bits are set to the cause code corresponding to the
+exception that precipitated the double trap.
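+
+For example, on RV64 an NMI whose (implementation-defined) cause code is 3
+would read back from `mncause` as 0x8000000000000003, while a double trap
+precipitated by a store/AMO access fault (cause code 7) would read back as
+0x7, with bit 63 clear. These values are illustrative; NMI cause codes
+themselves are implementation-defined.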
.Resumable NMI status register `mnstatus`.
include::images/bytefield/mnstatus.edn[]
The `mnstatus` CSR holds a two-bit field, MNPP, which on entry to the
-trap handler holds the privilege mode of the interrupted context,
+RNMI trap handler holds the privilege mode of the interrupted context,
encoded in the same manner as `mstatus`.MPP. It also holds a one-bit
-field, MNPV, which on entry to the trap handler holds the virtualization
+field, MNPV, which on entry to the RNMI trap handler holds the virtualization
mode of the interrupted context, encoded in the same manner as
`mstatus`.MPV.
+If the Zicfilp extension is implemented, `mnstatus` also holds the MNPELP
+field, which on entry to the RNMI trap handler holds the previous `ELP` state.
+When an RNMI trap is taken, MNPELP is set to `ELP` and `ELP` is set to 0.
+
`mnstatus` also holds the NMIE bit. When NMIE=1, nonmaskable interrupts
are enabled. When NMIE=0, _all_ interrupts are disabled.
@@ -131,6 +141,8 @@ MNRET is an M-mode-only instruction that uses the values in `mnepc` and
`mnstatus` to return to the program counter, privilege mode, and
virtualization mode of the interrupted context. This instruction also
sets `mnstatus`.NMIE. If MNRET changes the privilege mode to a mode less privileged than M, it also sets `mstatus`.MPRV to 0.
+If the Zicfilp extension is implemented, then if `mnstatus`.MNPP holds the
+value __y__, MNRET sets `ELP` to the logical AND of __y__LPE and `mnstatus`.MNPELP.
=== RNMI Operation
diff --git a/src/rv-32-64g.adoc b/src/rv-32-64g.adoc
index 7714436..1818ddf 100644
--- a/src/rv-32-64g.adoc
+++ b/src/rv-32-64g.adoc
@@ -442,6 +442,15 @@ ISA.
2+|1101010 |00011 |rs1 |rm |rd |1010011 |FCVT.H.LU
|===
+[%autowidth.stretch,float="center",align="center",cols="^2m,^2m,^2m,^2m,<2m,>3m, <4m, >4m, <4m, >4m, <4m, >4m, <4m, >4m, <6m"]
+|===
+15+^|Zawrs Standard Extension
+
+6+^|000000001101 2+^|00000 2+^|000 2+^|00000 2+^|1110011 <|WRS.NTO
+6+^|000000011101 2+^|00000 2+^|000 2+^|00000 2+^|1110011 <|WRS.STO
+|===
+
+
<<rvgcsrnames>> lists the CSRs that have currently been
allocated CSR addresses. The timers, counters, and floating-point CSRs
are the only CSRs defined in this specification.
diff --git a/src/rv32.adoc b/src/rv32.adoc
index 9ce3fb0..bd38ac8 100644
--- a/src/rv32.adoc
+++ b/src/rv32.adoc
@@ -50,7 +50,7 @@ holds the address of the current instruction.
[[gprs]]
.RISC-V base unprivileged integer register state.
-[col[s="<|^|>"|option[s="header",width="50%",align="center"grid="rows"]
+[cols="<,^,>",options="header",width="50%",align="center",grid="rows"]
|===
<| [.small]#XLEN-1#| >| [.small]#0#
3+^| [.small]#x0/zero#
diff --git a/src/smepmp.adoc b/src/smepmp.adoc
new file mode 100644
index 0000000..547f723
--- /dev/null
+++ b/src/smepmp.adoc
@@ -0,0 +1,171 @@
+[[smepmp]]
+== PMP Enhancements for memory access and execution prevention on Machine mode (Smepmp)
+=== Introduction
+
+Being able to access the memory of a process running in a higher-privileged execution mode, such as Supervisor or Machine mode, from a lower-privileged mode such as User mode, introduces an obvious attack vector, since it allows an attacker to escalate privilege and tamper with the code and/or data of that process. A less obvious attack vector exists when the reverse happens: instead of tampering with code and/or data that belong to a high-privileged process, an attacker can tamper with the memory of an unprivileged / less-privileged process and trick the high-privileged process into using or executing it.
+
+To prevent this attack vector, two mechanisms known as Supervisor Memory Access Prevention (SMAP) and Supervisor Memory Execution Prevention (SMEP) were introduced in recent systems. The first prevents the OS from accessing the memory of an unprivileged process unless a specific code path is followed, and the second prevents the OS from executing the memory of an unprivileged process at all times. RISC-V already includes support for SMAP, through the ``sstatus.SUM`` bit, and for SMEP, by always denying execution of virtual memory pages marked with the U bit while running with Supervisor mode (OS) privileges, as mandated by the Privileged Spec.
+
+
+[NOTE]
+====
+Terms:
+
+* *PMP Entry*: A pair of ``pmpcfg[i]`` / ``pmpaddr[i]`` registers.
+* *PMP Rule*: The contents of a ``pmpcfg`` register and its associated ``pmpaddr`` register(s) that encode a valid protected physical memory region, where ``pmpcfg[i].A != OFF``, and if ``pmpcfg[i].A == TOR``, ``pmpaddr[i-1] < pmpaddr[i]``.
+* *Ignored*: Any permissions set by a matching PMP rule are ignored, and _all_ accesses to the requested address range are allowed.
+* *Enforced*: Only access types configured in the PMP rule matching the requested address range are allowed; failures will cause an access-fault exception.
+* *Denied*: Any permissions set by a matching PMP rule are ignored, and _no_ accesses to the requested address range are allowed; accesses will cause an access-fault exception.
+* *Locked*: A PMP rule/entry where the ``pmpcfg.L`` bit is set.
+* *PMP reset*: A reset process where all PMP settings of the hart, including locked rules/settings, are re-initialized to a set of safe defaults, before releasing the hart (back) to the firmware / OS / application.
+====
+
+==== Threat model
+
+However, there are no such mechanisms available in Machine mode in the current (v1.11) Privileged Spec. It is not possible for a PMP rule to be *enforced* only on non-Machine modes and *denied* on Machine mode, so as to allow access to a memory region only by less-privileged modes. It is only possible to have a *locked* rule that will be *enforced* on all modes, or a rule that will be *enforced* on non-Machine modes and *ignored* by Machine mode. So for any physical memory region not protected by a Locked rule, Machine mode has unlimited access, including the ability to execute it.
+
+Without being able to protect less-privileged modes from Machine mode, it is not possible to prevent the mentioned attack vector. This becomes even more important for RISC-V than for other architectures, since implementations are allowed where a hart only has Machine and User modes available, so the whole OS will run in Machine mode instead of the non-existent Supervisor mode. In such implementations the attack surface is greatly increased, and the same kind of attacks performed on Supervisor mode and mitigated through SMAP/SMEP can be performed on Machine mode without any available mitigations. Even on implementations with Supervisor mode present, attacks are still possible against the Firmware and/or the Secure Monitor running in M-mode.
+
+[[proposal]]
+=== Proposal
+
+. *Machine Security Configuration (mseccfg)* is a new RW Machine-mode CSR, used for configuring various security mechanisms present on the hart, and only accessible to Machine mode. It is 64 bits wide, and is at address *0x747 on RV64* and *0x747 (low 32 bits), 0x757 (high 32 bits) on RV32*. All mseccfg fields defined in this proposal are WARL, and the remaining bits are reserved for future standard use and should always read zero. The reset value of mseccfg is implementation-specific; if backwards compatibility is a requirement, it should reset to zero on hard reset.
+
+. On ``mseccfg`` we introduce a field in bit 2 called *Rule Locking Bypass (mseccfg.RLB)* with the following functionality:
++
+.. When ``mseccfg.RLB`` is 1, *locked* PMP rules may be removed/modified and *locked* PMP entries may be edited.
+
+.. When ``mseccfg.RLB`` is 0 and ``pmpcfg.L`` is 1 in any rule or entry (including disabled entries), then ``mseccfg.RLB`` remains 0 and any further modifications to ``mseccfg.RLB`` are ignored until a *PMP reset*.
++
+[CAUTION]
+====
+Note that this feature is intended to be used as a debug mechanism, or as a temporary workaround during the boot process for simplifying software, and optimizing the allocation of memory and PMP rules. Using this functionality under normal operation, after the boot process is completed, should be avoided since it weakens the protection of _M-mode-only_ rules. Vendors who don’t need this functionality may hardwire this field to 0.
+====
+
+. On ``mseccfg`` we introduce a field in bit 1 called *Machine Mode Whitelist Policy (mseccfg.MMWP)*. This is a sticky bit, meaning that once set it cannot be unset until a *PMP reset*. When set, it changes the default PMP policy for M-mode accesses to memory regions that don’t have a matching PMP rule from *ignored* to *denied*.
+
+. On ``mseccfg`` we introduce a field in bit 0 called *Machine Mode Lockdown (mseccfg.MML)*. This is a sticky bit, meaning that once set it cannot be unset until a *PMP reset*. When ``mseccfg.MML`` is set, the system's behavior changes in the following way:
+
+.. The meaning of ``pmpcfg.L`` changes: Instead of marking a rule as *locked* and *enforced* in all modes, it now marks a rule as *M-mode-only* when set and *S/U-mode-only* when unset. The formerly reserved encoding of ``pmpcfg.RW=01``, and the encoding ``pmpcfg.LRWX=1111``, now encode a *Shared-Region*.
++
+An _M-mode-only_ rule is *enforced* on Machine mode and *denied* in Supervisor or User mode. It also remains *locked* so that any further modifications to its associated configuration or address registers are ignored until a *PMP reset*, unless ``mseccfg.RLB`` is set.
++
+An _S/U-mode-only_ rule is *enforced* on Supervisor and User modes and *denied* on Machine mode.
++
+A _Shared-Region_ rule is *enforced* on all modes, with restrictions depending on the ``pmpcfg.L`` and ``pmpcfg.X`` bits:
++
+* A _Shared-Region_ rule where ``pmpcfg.L`` is not set can be used for sharing data between M-mode and S/U-mode, so is not executable. M-mode has read/write access to that region, and S/U-mode has read access if ``pmpcfg.X`` is not set, or read/write access if ``pmpcfg.X`` is set.
++
+* A _Shared-Region_ rule where ``pmpcfg.L`` is set can be used for sharing code between M-mode and S/U-mode, so is not writable. Both M-mode and S/U-mode have execute access on the region, and M-mode also has read access if ``pmpcfg.X`` is set. The rule remains *locked* so that any further modifications to its associated configuration or address registers are ignored until a *PMP reset*, unless ``mseccfg.RLB`` is set.
++
+* The encoding ``pmpcfg.LRWX=1111`` can be used for sharing data between M-mode and S/U mode, where both modes only have read-only access to the region. The rule remains *locked* so that any further modifications to its associated configuration or address registers are ignored until a *PMP reset*, unless ``mseccfg.RLB`` is set.
+
+
+.. Adding a rule with executable privileges that is either *M-mode-only* or a *locked* *Shared-Region* is not possible, and such ``pmpcfg`` writes are ignored, leaving ``pmpcfg`` unchanged. This restriction can be temporarily lifted by setting ``mseccfg.RLB``, e.g., during the boot process.
+
+.. Executing code with Machine mode privileges is only possible from memory regions with a matching *M-mode-only* rule or a *locked* *Shared-Region* rule with executable privileges. Executing code from a region without a matching rule or with a matching _S/U-mode-only_ rule is *denied*.
+
+.. If ``mseccfg.MML`` is not set, the combination of ``pmpcfg.RW=01`` remains reserved for future standard use.
+
+
+==== Truth table when mseccfg.MML is set
+
+[cols="^1,^1,^1,^1,^3,^3",stripes=even,options="header"]
+|===
+4+|Bits on _pmpcfg_ register {set:cellbgcolor:green} 2+|Result
+|L|R|W|X|M Mode|S/U Mode
+|{set:cellbgcolor:!} 0|0|0|0 2+|Inaccessible region (Access Exception)
+|0|0|0|1|Access Exception|Execute-only region
+|0|0|1|0 2+|Shared data region: Read/write on M mode, read-only on S/U mode
+|0|0|1|1 2+|Shared data region: Read/write for both M and S/U mode
+|0|1|0|0|Access Exception|Read-only region
+|0|1|0|1|Access Exception|Read/Execute region
+|0|1|1|0|Access Exception|Read/Write region
+|0|1|1|1|Access Exception|Read/Write/Execute region
+|1|0|0|0 2+|Locked inaccessible region* (Access Exception)
+|1|0|0|1|Locked Execute-only region*|Access Exception
+|1|0|1|0 2+|Locked Shared code region: Execute only on both M and S/U mode.*
+|1|0|1|1 2+|Locked Shared code region: Execute only on S/U mode, read/execute on M mode.*
+|1|1|0|0|Locked Read-only region*|Access Exception
+|1|1|0|1|Locked Read/Execute region*|Access Exception
+|1|1|1|0|Locked Read/Write region*|Access Exception
+|1|1|1|1 2+|Locked Shared data region: Read only on both M and S/U mode.*
+|===
+
+*: *Locked* rules cannot be removed or modified until a *PMP reset*, unless ``mseccfg.RLB`` is set.
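+
+As a (non-normative) illustration of these encodings, the sketch below locks a
+firmware text region as read/execute in M-mode only (truth-table row
+L=1, R=1, W=0, X=1) while ``mseccfg.MML`` is still 0, and only then sets MML;
+the symbol ``fw_text_napot`` and the use of PMP entry 0 are assumptions of the
+example:
+
+----
+    la    t0, fw_text_napot   # hypothetical NAPOT-encoded region (addr>>2 form)
+    csrw  pmpaddr0, t0
+    li    t1, 0x9d            # L=1, A=NAPOT, X=1, R=1
+    csrw  pmpcfg0, t1         # legal now; once MML=1 such writes are ignored
+    li    t2, 1
+    csrs  0x747, t2           # set mseccfg.MML (0x747 = mseccfg address)
+----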
+
+==== Visual representation of the proposal
+
+image::smepmp-visual-representation.png[]
+
+=== Smepmp software discovery
+
+Since all fields defined on ``mseccfg`` as part of this proposal are locked when set (``MMWP``/``MML``) or locked when cleared (``RLB``), software can't poll them to determine the presence of Smepmp. It is expected that the BootROM will set ``mseccfg.MMWP`` and/or ``mseccfg.MML`` during early boot, before jumping to the firmware, so that the firmware can determine the presence of Smepmp by reading ``mseccfg`` and checking the state of ``mseccfg.MMWP`` and ``mseccfg.MML``.
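+
+A minimal (non-normative) detection sketch along these lines; the label
+``no_smepmp`` is an assumption of the example:
+
+----
+    csrr  t0, 0x747          # read mseccfg (numeric address, in case the
+                             # toolchain predates the mseccfg name)
+    andi  t0, t0, 3          # isolate MML (bit 0) and MMWP (bit 1)
+    beqz  t0, no_smepmp      # neither set: treat Smepmp as absent
+    # fall through: Smepmp is implemented
+----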
+
+[[rationale]]
+=== Rationale
+
+. Since a CSR for security and / or global PMP behavior settings is not available in the current spec, we needed to define a new one. This new CSR will allow us to add further security configuration options in the future, and it also allows developers to verify the existence of the new mechanisms defined in this proposal.
+
+. There are use cases where developers want to enforce PMP rules in M-mode during the boot process that they can also modify, merge, and / or remove later on. Since a rule that is enforced in M-mode also needs to be locked (or else badly written or malicious M-mode software can remove it at any time), the only way for developers to approach this is to keep adding PMP rules to the chain and rely on rule priority. This is a waste of PMP rules, and since it’s only needed during boot, ``mseccfg.RLB`` is a simple workaround that can be used temporarily and then disabled and locked down (a sketch of this flow appears at the end of this chapter).
++
+Also when ``mseccfg.MML`` is set, according to 4b it’s not possible to add a _Shared-Region_ rule with executable privileges. So RLB can be set temporarily during the boot process to register such regions. Note that it’s still possible to register executable _Shared-Region_ rules using initial register settings (that may include ``mseccfg.MML`` being set and the rule being set on PMP registers) on *PMP reset*, without using RLB.
++
+[WARNING]
+====
+*Be aware that RLB introduces a security vulnerability if left set after the boot process is over, and in general it should be used with caution, even when used temporarily.* Having editable PMP rules in M-mode gives a false sense of security since it only takes a few malicious instructions to lift any PMP restrictions this way. It doesn’t make sense to have a security control in place and leave it unprotected. Rule Locking Bypass is only meant as a way to optimize the allocation of PMP rules, catch errors during debugging, and allow the BootROM/firmware to register executable _Shared-Region_ rules. If developers / vendors have no use for such functionality, they should never set ``mseccfg.RLB``, and if possible hard-wire it to 0. In any case, *RLB should be disabled and locked as soon as possible*.
+====
++
+[NOTE]
+====
+If ``mseccfg.RLB`` is not used and left unset, it will be locked as soon as a PMP rule/entry with the ``pmpcfg.L`` bit set is configured.
+====
++
+[IMPORTANT]
+====
+Since PMP rules with a higher priority override rules with a lower priority, locked rules must precede non-locked rules.
+====
+
+. With the current spec, M-mode can access any memory region unless restricted by a PMP rule with the ``pmpcfg.L`` bit set. There are cases where this approach is overly permissive, and although it’s possible to restrict M-mode by adding PMP rules during the boot process, this can also be seen as a waste of PMP rules. Having the option to block anything by default and use PMP as a whitelist for M-mode is considered a safer approach. This functionality may be used during the boot process or upon *PMP reset*, using initial register settings.
+
+. The current dual meaning of the ``pmpcfg.L`` bit, which marks a rule as Locked and *enforced* on all modes, is neither flexible nor clean. With the introduction of _Machine Mode Lock-down_, the ``pmpcfg.L`` bit distinguishes between rules that are *enforced* *only* in M-mode (_M-mode-only_) and rules that are *enforced* *only* in S/U-modes (_S/U-mode-only_). Rule locking becomes part of the definition of an _M-mode-only_ rule, since a rule added in M-mode, if not locked, can be modified or removed in a few instructions. On the other hand, S/U modes can’t modify PMP rules anyway, so locking them doesn’t make sense.
+
+.. This separation between _M-mode-only_ and _S/U-mode-only_ rules also allows us to distinguish which regions are to be used by processes in Machine mode (``pmpcfg.L == 1``) and which by Supervisor or User mode processes (``pmpcfg.L == 0``), in the same way the U bit on the Virtual Memory’s PTEs marks which Virtual Memory pages are to be used by User mode applications (U=1) and which by the Supervisor / OS (U=0). With this distinction in place we are able to implement memory access and execution prevention in M-mode for any physical memory region that is not _M-mode-only_.
++
+An attacker that manages to tamper with a memory region used by S/U mode, even after successfully tricking a process running in M-mode to use or execute that region, will fail to perform a successful attack since that region will be _S/U-mode-only_ hence any access when in M-mode will trigger an access exception.
++
+[NOTE]
+====
+In order to support zero-copy transfers between M-mode and S/U-mode, we need either to allow shared memory regions, or to introduce a mechanism similar to the ``sstatus.SUM`` bit that temporarily allows the high-privileged mode (in this case M-mode) to perform loads and stores on the region of a less-privileged process (in this case S/U-mode). In our case, after discussion within the group, it seemed better to follow the first approach and encode this functionality on a per-rule basis, to avoid the risk of leaving a temporary global bypass active when exiting M-mode, which would render memory access prevention useless.
+====
++
+
+[NOTE]
+====
+Although it’s possible to use ``mstatus.MPRV`` in M-mode to read/write data on an _S/U-mode-only_ region using general purpose registers for copying, this will happen with S/U-mode permissions, honoring any MMU restrictions put in place by S-mode. Of course it’s still possible for M-mode to tamper with the page tables and / or add _S/U-mode-only_ rules and bypass the protections put in place by S-mode but if an attacker has managed to compromise M-mode to such extent, no security guarantees are possible in any way. *Also note that the threat model we present here assumes buggy software in M-mode, not compromised software*. We considered disabling ``mstatus.MPRV`` but it seemed too much and out of scope.
+====
++
+_Shared-region_ rules can be used both for zero-copy data transfers and for sharing code segments. The latter may be used, for example, to allow S/U-mode to execute code provided by the vendor that makes use of some vendor-specific ISA extension, without having to go through the firmware with an ecall. This is similar to the vDSO approach followed on Linux, which allows userspace code to execute kernel-provided code without having to perform a system call.
++
+To make sure that shared data regions can’t be executed and shared code regions can’t be modified, the encoding changes the meaning of the ``pmpcfg.X`` bit. In the case of shared data regions, with the exception of the ``pmpcfg.LRWX=1111`` encoding, the ``pmpcfg.X`` bit marks the capability of S/U-mode to write to that region, so it’s not possible to encode an executable shared data region. In the case of shared code regions, the ``pmpcfg.X`` bit marks the capability of M-mode to read from that region, and since ``pmpcfg.RW=01`` is used for encoding the shared region, it’s not possible to encode a shared writable code region.
++
+[NOTE]
+====
+For adding _Shared-region_ rules with executable privileges to share code segments between M-mode and S/U-mode, ``mseccfg.RLB`` needs to be implemented, or else such rules can only be added together with ``mseccfg.MML`` being set on *PMP Reset*. That's because the reserved encoding ``pmpcfg.RW=01`` being used for _Shared-region_ rules is only defined when ``mseccfg.MML`` is set, and 4b prevents the addition of rules with executable privileges in M-mode after ``mseccfg.MML`` is set unless ``mseccfg.RLB`` is also set.
+====
++
+[NOTE]
+====
+Using the ``pmpcfg.LRWX=1111`` encoding for a locked shared read-only data region was decided later on; its initial meaning was an M-mode-only read/write/execute region. The reason for that change was that the already-defined shared data regions were not locked, so r/w access from M-mode couldn’t be restricted. In the same way we have execute-only shared code regions for both modes, it was decided to also allow a least-privileged shared data region for both modes. This approach allows, for example, sharing the .text section of an ELF with a shared code region and the .rodata section with a locked shared data region, without allowing M-mode to modify .rodata. We also decided that having a locked read/write/execute region in M-mode doesn’t make much sense and could be dangerous, since M-mode won’t be able to add further restrictions there (as in the case of S/U-mode, where S-mode can further limit access to a ``pmpcfg.LRWX=0111`` region through the MMU), leaving open the possibility of modifying an executable region in M-mode.
+====
++
+[NOTE]
+====
+For encoding Shared-region rules we initially used one of the two reserved bits on ``pmpcfg`` (bit 5), but in order to avoid allocating an extra bit, since those bits are a very limited resource, it was decided to use the reserved R=0,W=1 combination.
+====
+.. The idea behind this restriction is that after the Firmware or the OS running in M-mode is initialized and ``mseccfg.MML`` is set, no new code regions are expected to be added, since nothing else is expected to run in M-mode (everything else will run in S/U-mode). Since we want to limit the attack surface of the system as much as possible, it makes sense to disallow adding or executing in M-mode any new code regions, which may include malicious code.
+
+.. In case ``mseccfg.MMWP`` is not set, M-mode can still access and execute any region not covered by a PMP rule. Since we try to prevent M-mode from executing malicious code and since an attacker may manage to place code on some region not covered by PMP (e.g. a directly-addressable flash memory), we need to ensure that M-mode can only execute the code segments initialized during firmware / OS initialization.
+
+.. We only use the encoding ``pmpcfg.RW=01`` together with ``mseccfg.MML``; if ``mseccfg.MML`` is not set, the encoding remains reserved for future use.
+
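+The boot-time RLB flow referenced in item 2 above, as a (non-normative)
+sketch; the actual rule programming is elided:
+
+----
+    li    t0, 4
+    csrs  0x747, t0          # set mseccfg.RLB (bit 2)
+    # ... add, modify, merge, or remove locked PMP rules as needed ...
+    csrc  0x747, t0          # clear RLB; with any pmpcfg.L bit set, RLB
+                             # now stays 0 until a PMP reset
+----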
diff --git a/src/smstateen.adoc b/src/smstateen.adoc
new file mode 100644
index 0000000..f524581
--- /dev/null
+++ b/src/smstateen.adoc
@@ -0,0 +1,406 @@
+[[smstateen]]
+== "Smststeen" State Enable Extension, Version 1.0.0
+
+=== Motivation
+
+The implementation of optional RISC-V extensions has the potential to open
+covert channels between separate user threads, or between separate guest OSes
+running under a hypervisor. The problem occurs when an extension adds processor
+state---usually explicit registers, but possibly other forms of state---that
+the main OS or hypervisor is unaware of (and hence won't context-switch) but
+that can be modified/written by one user thread or guest OS and
+perceived/examined/read by another.
+
+For example, the proposed Advanced Interrupt Architecture (AIA) for RISC-V adds
+to a hart as many as ten supervisor-level CSRs (`siselect`, `sireg`, `stopi`,
+`sseteipnum`, `sclreipnum`, `sseteienum`, `sclreienum`, `sclaimei`, `sieh`, and `siph`) and
+provides also the option for hardware to be backward-compatible with older,
+pre-AIA software. Because an older hypervisor that is oblivious to the AIA will
+not know to swap any of the AIA's new CSRs on context switches, the registers may
+then be used as a covert channel between multiple guest OSes that run atop this
+hypervisor. Although traditional practices might consider such a communication
+channel harmless, the intense focus on security today argues that a means be
+offered to plug such channels.
+
+The `f` registers of the RISC-V floating-point extensions and the `v` registers of
+the vector extension would similarly be potential covert channels between user
+threads, except for the existence of the FS and VS fields in the `sstatus`
+register. Even if an OS is unaware of, say, the vector extension and its `v`
+registers, access to those registers is blocked when the VS field is
+initialized to zero, either at machine level or by the OS itself initializing
+`sstatus`.
+
+Obviously, one way to prevent the use of new user-level CSRs as covert channels
+would be to add to `mstatus` or `sstatus` an "XS" field for each relevant
+extension, paralleling the V extension's VS field. However, this is not
+considered a general solution to the problem due to the number of potential
+future extensions that may add small amounts of state. Even with a 64-bit
+`sstatus` (necessitating adding `sstatush` for RV32), it is not certain there are
+enough remaining bits in `sstatus` to accommodate all future user-level
+extensions. In any event, there is no need to strain `sstatus` (and add `sstatush`)
+for this purpose. The "enable" flags that are needed to plug covert channels
+are not generally expected to require swapping on context switches of user
+threads, making them a less-than-compelling candidate for inclusion in `sstatus`.
+Hence, a new place is proposed for them instead.
+
+=== Proposal
+
+For RV64 harts, this extension adds four new 64-bit CSRs at machine level,
+listed with their CSR addresses:
+
+`0x30C mstateen0` (Machine State Enable 0)
+
+`0x30D mstateen1`
+
+`0x30E mstateen2`
+
+`0x30F mstateen3`
+
+If supervisor mode is implemented, another four CSRs are defined at supervisor
+level:
+
+`0x10C sstateen0`
+
+`0x10D sstateen1`
+
+`0x10E sstateen2`
+
+`0x10F sstateen3`
+
+And if the hypervisor extension is implemented, another set of CSRs is added:
+
+`0x60C hstateen0`
+
+`0x60D hstateen1`
+
+`0x60E hstateen2`
+
+`0x60F hstateen3`
+
+For RV32, the registers listed above are 32-bit, and for the machine-level and
+hypervisor CSRs there is a corresponding set of high-half CSRs for the upper 32
+bits of each register:
+
+`0x31C mstateen0h`
+
+`0x31D mstateen1h`
+
+`0x31E mstateen2h`
+
+`0x31F mstateen3h`
+
+`0x61C hstateen0h`
+
+`0x61D hstateen1h`
+
+`0x61E hstateen2h`
+
+`0x61F hstateen3h`
+
+For the supervisor-level `sstateen` registers, high-half CSRs are not added at
+this time because it is expected the upper 32 bits of these registers will
+always be zeros, as explained later below.
+
+Each bit of a `stateen` CSR controls less-privileged access to an extension's
+state, for an extension that was not deemed "worthy" of a full XS field in
+`sstatus` like the FS and VS fields for the F and V extensions. The number of
+registers provided at each level is four because it is believed that 4 * 64 =
+256 bits for machine and hypervisor levels, and 4 * 32 = 128 bits for
+supervisor level, will be adequate for many years to come, perhaps for as long
+as the RISC-V ISA is in use. The exact number four is an attempted compromise
+between providing too few bits on the one hand and going overboard with CSRs
+that will never be used on the other. A possible future doubling of the number
+of `stateen` CSRs is covered later.
+
+The `stateen` registers at each level control access to state at all
+less-privileged levels, but not at its own level. This is analogous to how the
+existing `counteren` CSRs control access to performance counter registers. Just
+as with the `counteren` CSRs, when a `stateen` CSR prevents access to state by
+less-privileged levels, an attempt in one of those privilege modes to execute
+an instruction that would read or write the protected state raises an illegal
+instruction exception, or, if executing in VS or VU mode and the circumstances
+for a virtual instruction exception apply, raises a virtual instruction
+exception instead of an illegal instruction exception.
+
+When this extension is not implemented, all state added by an extension is
+accessible as defined by that extension.
+
+When a `stateen` CSR prevents access to state for a privilege mode, attempting to
+execute in that privilege mode an instruction that _implicitly_ updates the
+state without reading it may or may not raise an illegal instruction or virtual
+instruction exception. Such cases must be disambiguated by being explicitly
+specified one way or the other.
+
+In some cases, the bits of the `stateen` CSRs will have a dual purpose as enables
+for the ISA extensions that introduce the controlled state.
+
+Each bit of a supervisor-level `sstateen` CSR controls user-level access (from
+U-mode or VU-mode) to an extension's state. The intention is to allocate the
+bits of `sstateen` CSRs starting at the least-significant end, bit 0, through to
+bit 31, and then on to the next-higher-numbered `sstateen` CSR.
+
+For every bit with a defined purpose in an `sstateen` CSR, the same bit is
+defined in the matching `mstateen` CSR to control access below machine level to
+the same state. The upper 32 bits of an `mstateen` CSR (or for RV32, the
+corresponding high-half CSR) control access to state that is inherently
+inaccessible to user level, so no corresponding enable bits in the
+supervisor-level `sstateen` CSR are applicable. The intention is to allocate bits
+for this purpose starting at the most-significant end, bit 63, through to bit
+32, and then on to the next-higher `mstateen` CSR. If the rate that bits are
+being allocated from the least-significant end for `sstateen` CSRs is
+sufficiently low, allocation from the most-significant end of `mstateen` CSRs may
+be allowed to encroach on the lower 32 bits before jumping to the next-higher
+`mstateen` CSR. In that case, the bit positions of "encroaching" bits will remain
+forever read-only zeros in the matching `sstateen` CSRs.
+
+With the hypervisor extension, the `hstateen` CSRs have identical encodings to
+the `mstateen` CSRs, except controlling accesses for a virtual machine (from VS
+and VU modes).
+
+Each standard-defined bit of a `stateen` CSR is WARL and may be read-only zero or
+one, subject to the following conditions.
+
+Bits in any `stateen` CSR that are defined to control state that a hart doesn't
+implement are read-only zeros for that hart. Likewise, all reserved bits not
+yet given a defined meaning are also read-only zeros. For every bit in an
+`mstateen` CSR that is zero (whether read-only zero or set to zero), the same bit
+appears as read-only zero in the matching `hstateen` and `sstateen` CSRs. For every
+bit in an `hstateen` CSR that is zero (whether read-only zero or set to zero),
+the same bit appears as read-only zero in `sstateen` when accessed in VS-mode.
+
+A bit in a supervisor-level `sstateen` CSR cannot be read-only one unless the
+same bit is read-only one in the matching `mstateen` CSR and, if it exists, in
+the matching `hstateen` CSR. A bit in an `hstateen` CSR cannot be read-only one
+unless the same bit is read-only one in the matching `mstateen` CSR.
+
+On reset, all writable `mstateen` bits are initialized by the hardware to zeros.
+If machine-level software changes these values, it is responsible for
+initializing the corresponding writable bits of the `hstateen` and `sstateen` CSRs
+to zeros too. Software at each privilege level should set its respective
+`stateen` CSRs to indicate the state it is prepared to allow less-privileged
+software to access. For OSes and hypervisors, this usually means the state that
+the OS or hypervisor is prepared to swap on a context switch, or to manage in
+some other way.
+
+For each `mstateen` CSR, bit 63 is defined to control access to the
+matching `sstateen` and `hstateen` CSRs.
+That is, bit 63 of `mstateen0` controls access to `sstateen0` and `hstateen0`;
+bit 63 of `mstateen1` controls access to `sstateen1` and `hstateen1`; etc.
+Likewise, bit 63 of each `hstateen` correspondingly controls access to
+the matching `sstateen` CSR.
+A hypervisor may need this control over
+accesses to the `sstateen` CSRs if it ever must emulate for a virtual machine an
+extension that is supposed to be affected by a bit in an `sstateen` CSR. (Even if
+such emulation is uncommon, it should not be excluded.) Machine-level software
+needs identical control to be able to emulate the hypervisor extension. (That
+is, machine level needs control over accesses to the supervisor-level `sstateen`
+CSRs in order to emulate the `hstateen` CSRs, which have such control.)
+
+Bit 63 of each `mstateen` CSR may be read-only zero only if the hypervisor
+extension is not implemented and the matching supervisor-level `sstateen` CSR is
+all read-only zeros. In that case, machine-level software should emulate
+attempts to access the affected `sstateen` CSR from S-mode, ignoring writes and
+returning zero for reads. Bit 63 of each `hstateen` CSR is always writable (not
+read-only).
+
+[wavedrom, ,svg]
+....
+{reg: [
+{bits: 1, name: 'C'},
+{bits: 1, name: 'FCSR'},
+{bits: 1, name: 'JVT'},
+{bits: 61, name: 'WPRI'}
+], config:{bits: 64, lanes: 4, hspace:1024}}
+....
+
+The C bit controls access to any and all custom state.
+
+[NOTE]
+====
+Bit 0 of these registers is not custom state itself; it is a standard field of
+a standard CSR, either mstateen0, hstateen0, or sstateen0. The
+requirements that non-standard extensions must meet to be conforming are not
+relaxed due solely to changes in the value of this bit. In particular, if
+software sets this bit but does not execute any custom instructions or access
+any custom state, the software must continue to execute as specified by all
+relevant RISC-V standards, or the hardware is not standard-conforming.
+====
+
+The FCSR bit controls access to fcsr for the case when floating-point
+instructions operate on x registers instead of f registers, as specified by
+the Zfinx and related extensions (Zdinx, etc.). Whenever misa.F = 1, bit 1 of
+mstateen0 is read-only zero (and hence read-only zero in hstateen0 and
+sstateen0 too). For convenience, when the stateen CSRs are implemented and
+misa.F = 0, then if bit 1 of a controlling stateen0 CSR is zero, all
+floating-point instructions cause an illegal instruction trap (or virtual
+instruction trap, if relevant), as though they all access fcsr, regardless of
+whether they really do.
+
+The JVT bit controls access to the JVT CSR provided by the Zcmt extension.
+
+=== Machine State Enable Register (mstateen0)
+
+[wavedrom, ,svg]
+....
+{reg: [
+{bits: 1, name: 'C'},
+{bits: 1, name: 'FCSR'},
+{bits: 1, name: 'JVT'},
+{bits: 53, name: 'WPRI'},
+{bits: 1, name: 'P1P13'},
+{bits: 1, name: 'CONTEXT'},
+{bits: 1, name: 'IMSIC'},
+{bits: 1, name: 'AIA'},
+{bits: 1, name: 'CSRIND'},
+{bits: 1, name: 'WPRI'},
+{bits: 1, name: 'ENVCFG'},
+{bits: 1, name: 'SE0'},
+], config: {bits: 64, lanes: 4, hspace:1024}}
+....
+
+The C bit controls access to any and all custom state. The FCSR and the JVT
+bits control access to the same state as controlled by the same bits in the
+sstateen0 CSR.
+
+The SE0 bit in mstateen0 controls access to the hstateen0, hstateen0h,
+and the sstateen0 CSRs.
+
+The ENVCFG bit in mstateen0 controls access to the henvcfg, henvcfgh,
+and the senvcfg CSRs.
+
+The CSRIND bit in mstateen0 controls access to the siselect, sireg*,
+vsiselect, and vsireg* CSRs provided by the Sscsrind extension.
+
+The IMSIC bit in mstateen0 controls access to the IMSIC state, including
+CSRs stopei and vstopei, provided by the Ssaia extension.
+
+The AIA bit in mstateen0 controls access to all state introduced by the
+Ssaia extension and not controlled by either the CSRIND or the IMSIC
+bits.
+
+The CONTEXT bit in mstateen0 controls access to the scontext and
+hcontext CSRs provided by the Sdtrig ISA extension.
+
+The P1P13 bit in mstateen0 controls access to the hedelegh CSR introduced by
+Privileged Specification Version 1.13.
+
+=== Hypervisor State Enable Register (hstateen0)
+
+[wavedrom, ,svg]
+....
+{reg: [
+{bits: 1, name: 'C'},
+{bits: 1, name: 'FCSR'},
+{bits: 1, name: 'JVT'},
+{bits: 54, name: 'WPRI'},
+{bits: 1, name: 'CONTEXT'},
+{bits: 1, name: 'IMSIC'},
+{bits: 1, name: 'AIA'},
+{bits: 1, name: 'CSRIND'},
+{bits: 1, name: 'WPRI'},
+{bits: 1, name: 'ENVCFG'},
+{bits: 1, name: 'SE0'},
+], config: {bits: 64, lanes: 4, hspace:1024}}
+....
+
+The C bit controls access to any and all custom state. The FCSR and the JVT
+bits control access to the same state as controlled by the same bits in the
+sstateen0 CSR.
+
+The SE0 bit in hstateen0 controls access to the sstateen0 CSR.
+
+The ENVCFG bit in hstateen0 controls access to the senvcfg CSR.
+
+The CSRIND bit in hstateen0 controls access to the siselect and the
+sireg* (really vsiselect and vsireg*) CSRs provided by the
+Sscsrind extension.
+
+The IMSIC bit in hstateen0 controls access to the guest IMSIC state,
+including the stopei CSR (really vstopei), provided by the Ssaia extension.
+
+[NOTE]
+====
+Setting the IMSIC bit in hstateen0 to zero prevents a virtual machine from
+accessing the hart's IMSIC the same as setting hstatus.VGEIN = 0.
+====
+
+The AIA bit in hstateen0 controls access to all state introduced by the
+Ssaia extension and not controlled by either the CSRIND or the IMSIC
+bits of hstateen0.
+
+The CONTEXT bit in hstateen0 controls access to the scontext CSR
+provided by the Sdtrig ISA extension.
+
+=== Usage
+
+After the writable bits of the machine-level `mstateen` CSRs are initialized to
+zeros on reset, machine-level software can set bits in these registers to
+enable less-privileged access to the controlled state. This may be either
+because machine-level software knows how to swap the state or, more likely,
+because machine-level software isn't swapping supervisor-level environments.
+(Recall that the main reason the `mstateen` CSRs must exist is so machine level
+can emulate the hypervisor extension. When machine level isn't emulating the
+hypervisor extension, it is likely there will be no need to keep any
+implemented `mstateen` bits zero.)
+
+If machine level sets any writable `mstateen` bits to nonzero, it must initialize
+the matching `hstateen` CSRs, if they exist, by writing zeros to them. And if any
+`mstateen` bits that are set to one have matching bits in the `sstateen` CSRs,
+machine-level software must also initialize those `sstateen` CSRs by writing
+zeros to them. Ordinarily, machine-level software will want to set bit 63 of
+all `mstateen` CSRs, necessitating that it write zero to all `hstateen` CSRs.
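+
+As an illustration, a minimal M-mode reset-time sketch follows (RV64; it
+assumes an assembler that recognizes the Smstateen CSR names, and shows only
+the index-0 CSRs):
+
+----
+    li   t0, 1
+    slli t0, t0, 63        # bit 63 = SE0
+    csrw mstateen0, t0     # permit lower levels to access hstateen0/sstateen0
+    csrw hstateen0, zero   # all hypervisor-level enables off
+    csrw sstateen0, zero   # all supervisor-level enables off
+----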
+
+Software should ensure that all writable bits of `sstateen` CSRs are initialized
+to zeros when an OS at supervisor level is first entered. The OS can then set
+bits in these registers to enable user-level access to the controlled state,
+presumably because it knows how to context-swap the state.
+
+For the `sstateen` CSRs whose access by a guest OS is permitted by bit 63 of the
+corresponding `hstateen` CSRs, a hypervisor must include the `sstateen` CSRs in the
+context it swaps for a guest OS. When it starts a new guest OS, it must ensure
+the writable bits of those `sstateen` CSRs are initialized to zeros, and it must
+emulate accesses to any other `sstateen` CSRs.
+
+If software at any privilege level does not support multiple contexts for
+less-privileged levels, then it may choose to maximize less-privileged access to
+all state by writing a value of all ones to the `stateen` CSRs at its level (the
+`mstateen` CSRs for machine level, the `sstateen` CSRs for an OS, and the `hstateen`
+CSRs for a hypervisor), without knowing all the state to which it is granting
+access. This is justified because there is no risk of a covert channel between
+execution contexts at the less-privileged level when only one context exists
+at that level. This situation is expected to be common for machine level, and
+it might also arise, for example, for a type-1 hypervisor that hosts only a
+single guest virtual machine.
+
+=== Possible expansion
+
+If a need is anticipated, the set of `stateen` CSRs could in the future be
+doubled by adding these:
+
+`0x38C mstateen4` `0x39C mstateen4h`
+
+`0x38D mstateen5` `0x39D mstateen5h`
+
+`0x38E mstateen6` `0x39E mstateen6h`
+
+`0x38F mstateen7` `0x39F mstateen7h`
+
+`0x18C sstateen4`
+
+`0x18D sstateen5`
+
+`0x18E sstateen6`
+
+`0x18F sstateen7`
+
+`0x68C hstateen4` `0x69C hstateen4h`
+
+`0x68D hstateen5` `0x69D hstateen5h`
+
+`0x68E hstateen6` `0x69E hstateen6h`
+
+`0x68F hstateen7` `0x69F hstateen7h`
+
+These additional CSRs are not a definite part of the original proposal because
+it is unclear whether they will ever be needed, and it is believed the rate of
+consumption of bits in the first group, registers numbered 0-3, will be slow
+enough that any looming shortage will be perceptible many years in advance. At
+the moment, it is not even known how many years it may take to exhaust just
+`mstateen0`, `sstateen0`, and `hstateen0`. \ No newline at end of file
diff --git a/src/sscofpmt.adoc b/src/sscofpmt.adoc
new file mode 100644
index 0000000..101c15f
--- /dev/null
+++ b/src/sscofpmt.adoc
@@ -0,0 +1,189 @@
+[[Sscofpmf]]
+== "Sscofpmf" Count Overflow and Mode-Based Filtering Extension, Version 1.0.0
+
+The current Privileged specification defines mhpmevent CSRs to select and
+control event counting by the associated hpmcounter CSRs, but provides no
+standardization of any fields within these CSRs. For at least Linux-class
+rich-OS systems, it is desirable to standardize certain broadly desired basic
+features (which have come up repeatedly on RISC-V lists and have been the
+subject of past proposals). This enables standard upstream software support
+that eliminates the need for implementations to provide their own custom
+software support.
+
+This extension accomplishes exactly this within the existing mhpmevent
+CSRs (and correspondingly avoids creating whole new sets of CSRs - beyond
+just one new CSR).
+
+This extension addresses two basic, well-understood needs that have been
+requested by various people. To make it easy to understand the deltas from
+the current Priv 1.11/1.12 specs, this is written as the actual exact changes
+to be made to existing paragraphs of Priv spec text (or additional paragraphs
+within the existing text).
+
+The extension name is "Sscofpmf" ('Ss' for Privileged arch and Supervisor-level
+extensions, and 'cofpmf' for Count OverFlow and Privilege Mode Filtering).
+
+Note that the new count overflow interrupt will be treated as a standard local
+interrupt that is assigned to bit 13 in the mip/mie/sip/sie registers.
+
+=== Machine Level Additions
+
+==== Hardware Performance Monitor
+
+This extension expands the hardware performance monitor description and extends
+the mhpmevent registers to 64 bits (in RV32) as follows:
+
+The hardware performance monitor includes 29 additional 64-bit event counters and 29 associated 64-bit event selector registers - the mhpmcounter3–mhpmcounter31 and mhpmevent3–mhpmevent31 CSRs.
+
+The mhpmcounters are WARL registers that support up to 64 bits of precision on
+RV32 and RV64.
+
+The mhpmevent__n__ registers are WARL registers that control which event causes
+the corresponding counter to increment and what happens when the corresponding
+count overflows. Currently just a few bits are defined here. Beyond these, the
+actual selection and meaning of events is defined by the platform, but
+(mhpmevent == 0) is defined to mean "no event", i.e., the corresponding
+counter will never be incremented. Typically the lower bits of mhpmevent will
+be used for event selection purposes.
+
+On RV32 only, accesses to the mcycle, minstret, mhpmcounter__n__, and
+mhpmevent__n__ CSRs access the low 32 bits, while accesses to the mcycleh,
+minstreth, mhpmcounter__n__h, and mhpmevent__n__h CSRs access bits 63–32 of the
+corresponding counter or event selector. The proposed CSR numbers for
+mhpmevent__n__h are 0x723 - 0x73F.
+
+The following bits are added to mhpmevent:
+
+bit [63] +++OF+++ - Overflow status and interrupt disable bit that is set when the counter overflows
+
+bit [62] +++MINH+++ - If set, then counting of events in M-mode is inhibited
+
+bit [61] +++SINH+++ - If set, then counting of events in S/HS-mode is inhibited
+
+bit [60] +++UINH+++ - If set, then counting of events in U-mode is inhibited
+
+bit [59] +++VSINH+++ - If set, then counting of events in VS-mode is inhibited
+
+bit [58] +++VUINH+++ - If set, then counting of events in VU-mode is inhibited
+
+bit [57] 0 - Reserved for possible future modes
+
+bit [56] 0 - Reserved for possible future modes
+
+Each of the five ``x``INH bits, when set, inhibits counting of events while in
+privilege mode ``x``. All-zeroes for these bits results in counting of events in
+all modes.
+
+The OF bit is set when the corresponding hpmcounter overflows, and remains set
+until written by software. Since hpmcounter values are unsigned values,
+overflow is defined as unsigned overflow of the implemented counter bits. Note
+that there is no loss of information after an overflow since the counter wraps
+around and keeps counting while the sticky OF bit remains set.
+
+If supervisor mode is implemented, the 32-bit scountovf register contains
+read-only shadow copies of the OF bits in all 29 mhpmevent registers.
+
+If an hpmcounter overflows while the associated OF bit is zero, then a "count
+overflow interrupt request" is generated. If the OF bit is one, then no
+interrupt request is generated. Consequently the OF bit also functions as a
+count overflow interrupt disable for the associated hpmcounter.
+
+Count overflow never results from writes to the mhpmcounter__n__ or
+mhpmevent__n__ registers, only from hardware increments of counter registers.
+
+This "count overflow interrupt request" signal is treated as a standard local
+interrupt that corresponds to bit 13 in the mip/mie/sip/sie registers. The
+mip/sip LCOFIP and mie/sie LCOFIE bits are respectively the interrupt-pending
+and interrupt-enable bits for this interrupt. ('LCOFI' represents 'Local Count
+Overflow Interrupt'.)
+
+Generation of a "count overflow interrupt request" by an hpmcounter sets the
+LCOFIP bit in the mip/sip registers and sets the associated OF bit. The mideleg
+register controls the delegation of this interrupt to S-mode versus M-mode. The
+LCOFIP bit is cleared by software before servicing the count overflow interrupt
+resulting from one or more count overflows.
+
+[NOTE]
+.Non-normative
+====
+There are not separate overflow status and overflow interrupt enable bits. In
+practice, enabling overflow interrupt generation (by clearing the OF bit) is
+done in conjunction with initializing the counter to a starting value. Once a
+counter has overflowed, it and the OF bit must be reinitialized before another
+overflow interrupt can be generated.
+====
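+
+As a concrete illustration, the following hypothetical M-mode sequence arms
+mhpmcounter3 for overflow-driven sampling (RV64; the event number 0x42 and
+period 10000 are placeholders, and a full 64-bit counter is assumed):
+
+----
+    li   t0, 1
+    slli t1, t0, 62        # MINH: do not count events in M-mode
+    slli t2, t0, 61        # SINH: do not count events in S/HS-mode
+    or   t1, t1, t2
+    ori  t1, t1, 0x42      # placeholder platform-defined event number
+    csrw mhpmevent3, t1    # OF (bit 63) written as 0: overflow interrupt enabled
+    li   t0, -10000        # counter overflows after 10000 events
+    csrw mhpmcounter3, t0
+----
+
+Setting mie.LCOFIE (bit 13) and, if the interrupt is to be handled in S-mode,
+the corresponding mideleg bit would complete the setup.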
+
+[NOTE]
+.Non-normative
+====
+Software can distinguish newly overflowed counters (yet to be serviced by an
+overflow interrupt handler) from overflowed counters that have already been
+serviced or that are configured to not generate an interrupt on overflow, by
+maintaining a bit mask reflecting which counters are active and due to
+eventually overflow.
+====
+
+==== Machine Interrupt Registers (mip and mie)
+
+This extension adds the description of the LCOFIP/LCOFIE bits in these
+registers (and modifies related text) as follows:
+
+LCOFIP is added to mip in <<mipreg-standard>> as bit 13. LCOFIP is added to mie in
+<<miereg-standard>> as bit 13.
+
+If the Sscofpmf extension is implemented, bits mip.LCOFIP and mie.LCOFIE are
+the interrupt-pending and interrupt-enable bits for local count overflow
+interrupts. LCOFIP is read-write in mip and reflects the occurrence of a local
+count overflow interrupt request resulting from any of the mhpmevent__n__.OF
+bits being set. If the Sscofpmf extension is not implemented, these LCOFIP and
+LCOFIE bits are hardwired to zeros.
+
+Multiple simultaneous interrupts destined for different privilege modes are
+handled in decreasing order of destined privilege mode. Multiple simultaneous
+interrupts destined for the same privilege mode are handled in the following
+decreasing priority order: MEI, MSI, MTI, SEI, SSI, STI, LCOFI.
+
+=== Supervisor Level Additions
+
+==== Supervisor Interrupt Registers (sip and sie)
+
+This extension adds the description of the LCOFIP/LCOFIE bits in these
+registers (and modifies related text) as follows:
+
+LCOFIP is added to sip in <<sipreg-standard>> as bit 13. LCOFIP is added to sie in
+<<siereg-standard>> as bit 13.
+
+If the Sscofpmf extension is implemented, bits sip.LCOFIP and sie.LCOFIE are
+the interrupt-pending and interrupt-enable bits for local count overflow
+interrupts. LCOFIP is read-write in sip and reflects the occurrence of a local
+count overflow interrupt request resulting from any of the mhpmevent__n__.OF
+bits being set. If the Sscofpmf extension is not implemented, these LCOFIP and
+LCOFIE bits are hardwired to zeros.
+
+Each standard interrupt type (LCOFI, SEI, STI, or SSI) may not be implemented,
+in which case the corresponding interrupt-pending and interrupt-enable bits are
+hardwired to zeros. All bits in sip and sie are WARL fields.
+
+Multiple simultaneous interrupts destined for supervisor mode are handled in
+the following decreasing priority order: SEI, SSI, STI, LCOFI.
+
+==== Supervisor Count Overflow (scountovf)
+
+This extension adds this new CSR.
+
+The scountovf CSR is a 32-bit read-only register that contains shadow copies of
+the OF bits in the 29 mhpmevent CSRs (mhpmevent3 - mhpmevent31) - where
+scountovf bit _X_ corresponds to mhpmevent__X__. The proposed CSR number is
+0xDA0.
+
+This register enables supervisor-level overflow interrupt handler software to
+quickly and easily determine which counter(s) have overflowed (without needing
+to make an execution environment call or series of calls ultimately up to
+M-mode).
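+
+For instance, an S-mode overflow handler might scan scountovf as sketched
+below (hypothetical code; it assumes the relevant mcounteren bits are set so
+the scountovf bits are readable, and uses placeholder local labels):
+
+----
+    li    t1, 0x2000          # bit 13 = LCOFIP
+    csrc  sip, t1             # clear LCOFIP before servicing overflows
+    csrr  t0, scountovf       # bits 3-31 shadow the mhpmevent OF bits
+    srli  t0, t0, 3
+    li    t2, 3               # t2 = index of counter under examination
+1:  beqz  t0, 2f              # done when no overflow bits remain
+    andi  t3, t0, 1
+    beqz  t3, 3f
+    # counter t2 overflowed: record a sample here; rearming mhpmevent/
+    # mhpmcounter requires an environment call, as those are M-mode CSRs
+3:  srli  t0, t0, 1
+    addi  t2, t2, 1
+    j     1b
+2:
+----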
+
+Read access to bit _X_ is subject to the same mcounteren (or mcounteren and
+hcounteren) CSRs that mediate access to the hpmcounter CSRs by S-mode (or
+VS-mode). In M and S modes, scountovf bit _X_ is readable when mcounteren bit
+_X_ is set, and otherwise reads as zero. Similarly, in VS mode, scountovf bit
+_X_ is readable when mcounteren bit _X_ and hcounteren bit _X_ are both set,
+and otherwise reads as zero. \ No newline at end of file
diff --git a/src/sstc.adoc b/src/sstc.adoc
new file mode 100644
index 0000000..8e7a8e7
--- /dev/null
+++ b/src/sstc.adoc
@@ -0,0 +1,190 @@
+[[Sstc]]
+== "Stimecmp/Vstimecmp" Extension, Version 1.0.0
+
+The current Privileged arch specification only defines a hardware mechanism for
+generating machine-mode timer interrupts (based on the mtime and mtimecmp
+registers), with the resultant requirement that timer services for
+S-mode/HS-mode (and for VS-mode) all be provided by M-mode - via SBI
+calls from S/HS-mode up to M-mode (or VS-mode calls to HS-mode and then to
+M-mode). M-mode software then multiplexes these multiple logical timers onto
+its one physical M-mode timer facility, and the M-mode timer interrupt handler
+passes timer interrupts back down to the appropriate lower privilege mode.
+
+This extension serves to provide supervisor mode with its own CSR-based timer
+interrupt facility - in the form of its own stimecmp register - that it can
+directly manage to provide its own timer service, thus eliminating the large
+overheads of emulating S/HS-mode timers and timer interrupt generation up in
+M-mode. Further, this extension adds a similar facility to the Hypervisor
+extension for VS-mode.
+
+To make it easy to understand the deltas from the current Priv 1.11/1.12 specs,
+this is written as the actual exact changes to be made to existing paragraphs
+of Priv spec text (or additional paragraphs within the existing text).
+
+The extension name is "Sstc" ('Ss' for Privileged arch and Supervisor-level
+extensions, and 'tc' for timecmp). This extension adds the S-level stimecmp CSR
+and the VS-level vstimecmp CSR.
+
+=== Machine and Supervisor Level Additions
+
+==== *Supervisor Timer Register (stimecmp)*
+
+This extension adds this new CSR.
+
+The stimecmp CSR is a 64-bit register and has 64-bit precision on all RV32 and
+RV64 systems. In RV32 only, accesses to the stimecmp CSR access the low 32
+bits, while accesses to the stimecmph CSR access the high 32 bits of stimecmp.
+
+The CSR numbers for stimecmp / stimecmph are 0x14D / 0x15D (within the
+Supervisor Trap Setup block of CSRs).
+
+A supervisor timer interrupt becomes pending - as reflected in the STIP bit in
+the mip and sip registers - whenever time contains a value greater than or
+equal to stimecmp, treating the values as unsigned integers. Writes to stimecmp
+are guaranteed to be reflected in STIP eventually, but not necessarily
+immediately. The interrupt remains posted until stimecmp becomes greater than
+time - typically as a result of writing stimecmp. The interrupt will be taken
+based on the standard interrupt enable and delegation rules.
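+
+For example, on RV64 an S-mode kernel can schedule its next timer interrupt
+directly (a sketch; the delay of 10000 timebase ticks is a placeholder, and
+mcounteren.TM is assumed set so that time and stimecmp are accessible):
+
+----
+    rdtime t0              # current value of time
+    li     t1, 10000       # placeholder delay in timebase ticks
+    add    t0, t0, t1
+    csrw   stimecmp, t0    # STIP clears now, sets again when time catches up
+----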
+
+[NOTE]
+.Non-normative
+====
+A spurious timer interrupt might occur if an interrupt handler advances
+stimecmp then immediately returns, because STIP might not yet have fallen in
+the interim. All software should be written to assume this event is possible,
+but most software should assume this event is extremely unlikely. It is almost
+always more performant to incur an occasional spurious timer interrupt than to
+poll STIP until it falls.
+====
+
+[NOTE]
+.Non-normative
+====
+In systems in which a supervisor execution environment (SEE) provides timer
+facilities via an SBI function call, this SBI call will continue to support
+requests to schedule a timer interrupt. The SEE will simply make use of
+stimecmp, changing its value as appropriate. This ensures compatibility with
+existing S-mode software that uses this SEE facility, while new S-mode software
+takes advantage of stimecmp directly.
+====
+
+==== Machine Interrupt Registers (mip and mie)
+
+This extension modifies the description of the STIP/STIE bits in these
+registers as follows:
+
+If supervisor mode is implemented, its mip.STIP and mie.STIE are the
+interrupt-pending and interrupt-enable bits for supervisor-level timer
+interrupts. If the stimecmp register is not implemented, STIP is writable in
+mip, and may be written by M-mode software to deliver timer interrupts to
+S-mode. If the stimecmp (supervisor-mode timer compare) register is
+implemented, STIP is read-only in mip and reflects the supervisor-level timer
+interrupt signal resulting from stimecmp. This timer interrupt signal is
+cleared by writing stimecmp with a value greater than the current time value.
+
+==== Supervisor Interrupt Registers (sip and sie)
+
+This extension modifies the description of the STIP/STIE bits in these
+registers as follows:
+
+Bits sip.STIP and sie.STIE are the interrupt-pending and interrupt-enable bits
+for supervisor level timer interrupts. If implemented, STIP is read-only in
+sip, and is either set and cleared by the execution environment (if stimecmp is
+not implemented), or reflects the timer interrupt signal resulting from
+stimecmp (if stimecmp is implemented). The sip.STIP bit, in response to timer
+interrupts generated by stimecmp, is set and cleared by writing stimecmp with a
+value that respectively is less than or equal to, or greater than, the current
+time value.
+
+==== Machine Counter-Enable Register (mcounteren)
+
+This extension adds to the description of the TM bit in this register as
+follows:
+
+In addition, when the TM bit in the mcounteren register is clear, attempts to
+access the stimecmp or vstimecmp register while executing in a mode less
+privileged than M will cause an illegal instruction exception. When this bit
+is set, access to the stimecmp or vstimecmp register is permitted in S-mode if
+implemented, and access to the vstimecmp register (via stimecmp) is permitted
+in VS-mode if implemented and not otherwise prevented by the TM bit in
+hcounteren.
+
+=== Hypervisor Extension Additions
+
+==== *Virtual Supervisor Timer Register (vstimecmp)*
+
+This extension adds this new CSR.
+
+The vstimecmp CSR is a 64-bit register and has 64-bit precision on all RV32 and
+RV64 systems. In RV32 only, accesses to the vstimecmp CSR access the low 32
+bits, while accesses to the vstimecmph CSR access the high 32 bits of
+vstimecmp.
+
+The proposed CSR numbers for vstimecmp / vstimecmph are 0x24D / 0x25D (within
+the Virtual Supervisor Registers block of CSRs, and mirroring the CSR numbers
+for stimecmp/stimecmph).
+
+A virtual supervisor timer interrupt becomes pending - as reflected in the
+VSTIP bit in the hip register - whenever (time + htimedelta), truncated to 64
+bits, contains a value greater than or equal to vstimecmp, treating the values
+as unsigned integers. Writes to vstimecmp and htimedelta are guaranteed to be
+reflected in VSTIP eventually, but not necessarily immediately. The interrupt
+remains posted until vstimecmp becomes greater than (time + htimedelta) -
+typically as a result of writing vstimecmp. The interrupt will be taken based
+on the standard interrupt enable and delegation rules while V=1.
+
+[NOTE]
+.Non-normative
+====
+In systems in which a supervisor execution environment (SEE) implemented by an
+HS-mode hypervisor provides timer facilities via an SBI function call, this SBI
+call will continue to support requests to schedule a timer interrupt. The SEE
+will simply make use of vstimecmp, changing its value as appropriate. This
+ensures compatibility with existing guest VS-mode software that uses this SEE
+facility, while new VS-mode software takes advantage of vstimecmp directly.
+====
+
+==== Hypervisor Interrupt Registers (hvip, hip, and hie)
+
+This extension modifies the description of the VSTIP/VSTIE bits in the hip/hie
+registers as follows:
+
+Bits hip.VSTIP and hie.VSTIE are the interrupt-pending and interrupt-enable
+bits for VS-level timer interrupts. VSTIP is read-only in hip, and is the
+logical-OR of hvip.VSTIP and the timer interrupt signal resulting from
+vstimecmp (if vstimecmp is implemented). The hip.VSTIP bit, in response to
+timer interrupts generated by vstimecmp, is set and cleared by writing
+vstimecmp with a value that respectively is less than or equal to, or greater
+than, the current (time + htimedelta) value. The hip.VSTIP bit remains defined
+while V=0 as well as V=1.
+
+==== Hypervisor Counter-Enable Register (hcounteren)
+
+This extension adds to the description of the TM bit in this register as
+follows:
+
+In addition, when the TM bit in the hcounteren register is clear, attempts to
+access the vstimecmp register (via stimecmp) while executing in VS-mode will
+cause a virtual instruction exception if the same bit in mcounteren is set.
+When this bit and the same bit in mcounteren are both set, access to the
+vstimecmp register (if implemented) is permitted in VS-mode.
+
+=== Environment Config (menvcfg/henvcfg) Support
+
+Enable/disable bits for this extension are provided in the new menvcfg /
+henvcfg CSRs.
+
+Bit 63 of menvcfg (or bit 31 of menvcfgh) - named STCE (STimecmp Enable) -
+enables stimecmp for S-mode when set to one, and the same bit of henvcfg
+enables vstimecmp for VS-mode. These STCE bits are WARL and are hard-wired to 0
+when this extension is not implemented.
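+
+A hypothetical M-mode enable sequence (RV64) might look like:
+
+----
+    li    t0, 1
+    slli  t0, t0, 63
+    csrs  menvcfg, t0      # set STCE (bit 63): S-mode may use stimecmp
+    csrsi mcounteren, 0x2  # set TM (bit 1): permit time/stimecmp access
+----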
+
+When STCE in menvcfg is zero, an attempt to access stimecmp or vstimecmp in a
+mode other than M-mode raises an illegal instruction exception, STCE in henvcfg
+is read-only zero, and STIP in mip and sip reverts to its defined behavior as
+if this extension is not implemented.
+
+When STCE in menvcfg is one but STCE in henvcfg is zero, an attempt to access
+stimecmp (really vstimecmp) when V = 1 raises a virtual instruction exception,
+and VSTIP in hip reverts to its defined behavior as if this extension is not
+implemented. \ No newline at end of file
diff --git a/src/supervisor.adoc b/src/supervisor.adoc
index 2b30893..e9f2855 100644
--- a/src/supervisor.adoc
+++ b/src/supervisor.adoc
@@ -2021,6 +2021,12 @@ or VU-mode, or to execute SINVAL.VMA in VU-mode, raises a
virtual-instruction exception. When `hstatus`.VTVM=1, an attempt to execute
SINVAL.VMA in VS-mode also raises a virtual instruction exception.
+Attempting to execute SFENCE.W.INVAL or SFENCE.INVAL.IR in U-mode
+raises an illegal-instruction exception.
+Doing so in VU-mode raises a virtual-instruction exception.
+SFENCE.W.INVAL and SFENCE.INVAL.IR are unaffected by the `mstatus`.TVM and
+`hstatus`.VTVM fields and hence are always permitted in S-mode and VS-mode.
+
[NOTE]
====
SFENCE.W.INVAL and SFENCE.INVAL.IR instructions do not need to be
diff --git a/src/v-st-ext.adoc b/src/v-st-ext.adoc
index 88dcf8d..194e448 100644
--- a/src/v-st-ext.adoc
+++ b/src/v-st-ext.adoc
@@ -1,9 +1,6 @@
[[vector]]
== "V" Standard Extension for Vector Operations, Version 1.0
-The specification is currently hosted at
-https://github.com/riscv/riscv-v-spec.
-
[NOTE]
====
_The base vector extension is intended to provide general support for
@@ -12,3 +9,5185 @@ with later vector extensions supporting richer functionality for certain
domains._
====
+
+=== Introduction
+
+This document is version 1.1-draft of the RISC-V vector extension.
+
+NOTE: This version holds updates gathered after the start of the
+public review. The spec will have a final update to version 2.0 at
+time of ratification.
+
+This spec includes the complete set of currently frozen vector
+instructions. Other instructions that have been considered during
+development but are not present in this document are not included in
+the review and ratification process, and may be completely revised or
+abandoned. Section <<sec-vector-extensions>> lists the standard
+vector extensions and which instructions and element widths are
+supported by each extension.
+
+=== Implementation-defined Constant Parameters
+
+Each hart supporting a vector extension defines two parameters:
+
+. The maximum size in bits of a vector element that any operation can produce or consume, _ELEN_ {ge} 8, which
+must be a power of 2.
+. The number of bits in a single vector register, _VLEN_ {ge} ELEN, which must be a power of 2, and must be no greater than 2^16^.
+
+Standard vector extensions (Section <<sec-vector-extensions>>) and
+architecture profiles may set further constraints on _ELEN_ and _VLEN_.
+
+NOTE: Future extensions may allow ELEN {gt} VLEN by holding one
+element using bits from multiple vector registers, but this current
+proposal does not include this option.
+
+NOTE: The upper limit on VLEN allows software to know that indices
+will fit into 16 bits (largest VLMAX of 65,536 occurs for LMUL=8 and
+SEW=8 with VLEN=65,536). Any future extension beyond 64Kib per vector
+register will require new configuration instructions such that
+software using the old configuration instructions does not see greater
+vector lengths.
+
+The vector extension supports writing binary code that under certain
+constraints will execute portably on harts with different values for
+the VLEN parameter, provided the harts support the required element
+types and instructions.
+
+NOTE: Code can be written that will expose differences in
+implementation parameters.
+
+NOTE: In general, thread contexts with active vector state cannot be
+migrated during execution between harts that have any difference in
+VLEN or ELEN parameters.
+
+=== Vector Extension Programmer's Model
+
+The vector extension adds 32 vector registers and seven unprivileged
+CSRs (`vstart`, `vxsat`, `vxrm`, `vcsr`, `vtype`, `vl`, `vlenb`) to a
+base scalar RISC-V ISA.
+
+.New vector CSRs
+[cols="2,2,2,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Address | Privilege | Name | Description
+
+| 0x008 | URW | vstart | Vector start position
+| 0x009 | URW | vxsat | Fixed-Point Saturate Flag
+| 0x00A | URW | vxrm | Fixed-Point Rounding Mode
+| 0x00F | URW | vcsr | Vector control and status register
+| 0xC20 | URO | vl | Vector length
+| 0xC21 | URO | vtype | Vector data type register
+| 0xC22 | URO | vlenb | VLEN/8 (vector register length in bytes)
+|===
+
+NOTE: The four CSR numbers `0x00B`-`0x00E` are tentatively reserved
+for future vector CSRs, some of which may be mirrored into `vcsr`.
+
+==== Vector Registers
+
+The vector extension adds 32 architectural vector registers,
+`v0`-`v31` to the base scalar RISC-V ISA.
+
+Each vector register has a fixed VLEN bits of state.
+
+==== Vector Context Status in `mstatus`
+
+A vector context status field, `VS`, is added to `mstatus[10:9]` and shadowed
+in `sstatus[10:9]`. It is defined analogously to the floating-point context
+status field, `FS`.
+
+Attempts to execute any vector instruction, or to access the vector
+CSRs, raise an illegal-instruction exception when `mstatus.VS` is
+set to Off.
+
+When `mstatus.VS` is set to Initial or Clean, executing any
+instruction that changes vector state, including the vector CSRs, will
+change `mstatus.VS` to Dirty.
+Implementations may also change `mstatus.VS` from Initial or Clean to Dirty
+at any time, even when there is no change in vector state.
+
+NOTE: Accurate setting of `mstatus.VS` is an optimization. Software
+will typically use VS to reduce context-swap overhead.
+
+If `mstatus.VS` is Dirty, `mstatus.SD` is 1;
+otherwise, `mstatus.SD` is set in accordance with existing specifications.
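+
+For example, a context-switch path can use this field to skip the vector save
+when nothing has changed (a sketch with a placeholder local label; VS occupies
+mstatus[10:9], with Dirty encoded as 3):
+
+----
+    csrr t0, mstatus
+    srli t0, t0, 9
+    andi t0, t0, 3        # extract the VS field
+    li   t1, 3
+    bne  t0, t1, 1f       # not Dirty: skip the vector save
+    # ... save v0-v31 plus vtype, vl, vstart, vcsr here ...
+1:
+----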
+
+Implementations may have a writable `misa.V` field. Analogous to the
+way in which the floating-point unit is handled, the `mstatus.VS`
+field may exist even if `misa.V` is clear.
+
+NOTE: Allowing `mstatus.VS` to exist when `misa.V` is clear enables
+vector emulation and simplifies handling of `mstatus.VS` in systems
+with writable `misa.V`.
+
+==== Vector Context Status in `vsstatus`
+
+When the hypervisor extension is present, a vector context status field, `VS`,
+is added to `vsstatus[10:9]`.
+It is defined analogously to the floating-point context status field, `FS`.
+
+When V=1, both `vsstatus.VS` and `mstatus.VS` are in effect: attempts to
+execute any vector instruction, or to access the vector CSRs, raise an
+illegal-instruction exception when either field is set to Off.
+
+When V=1 and neither `vsstatus.VS` nor `mstatus.VS` is set to Off, executing
+any instruction that changes vector state, including the vector CSRs, will
+change both `mstatus.VS` and `vsstatus.VS` to Dirty.
+Implementations may also change `mstatus.VS` or `vsstatus.VS` from Initial or
+Clean to Dirty at any time, even when there is no change in vector state.
+
+If `vsstatus.VS` is Dirty, `vsstatus.SD` is 1;
+otherwise, `vsstatus.SD` is set in accordance with existing specifications.
+
+If `mstatus.VS` is Dirty, `mstatus.SD` is 1;
+otherwise, `mstatus.SD` is set in accordance with existing specifications.
+
+For implementations with a writable `misa.V` field,
+the `vsstatus.VS` field may exist even if `misa.V` is clear.
+
+==== Vector type register, `vtype`
+
+The read-only XLEN-wide _vector_ _type_ CSR, `vtype`, provides the
+default type used to interpret the contents of the vector register
+file, and can only be updated by `vset{i}vl{i}` instructions. The
+vector type determines the organization of elements in each
+vector register, and how multiple vector registers are grouped. The
+`vtype` register also indicates how masked-off elements and elements
+past the current vector length in a vector result are handled.
+
+NOTE: Allowing updates only via the `vset{i}vl{i}` instructions
+simplifies maintenance of the `vtype` register state.
+
+The `vtype` register has five fields, `vill`, `vma`, `vta`,
+`vsew[2:0]`, and `vlmul[2:0]`. Bits `vtype[XLEN-2:8]` should be
+written with zero, and non-zero values in this field are reserved.
+
+include::images/wavedrom/vtype-format.adoc[]
+
+NOTE: A small implementation supporting ELEN=32 requires only seven
+bits of state in `vtype`: two bits for `ma` and `ta`, two bits for
+`vsew[1:0]` and three bits for `vlmul[2:0]`. The illegal value
+represented by `vill` can be internally encoded using the illegal 64-bit
+combination in `vsew[1:0]` without requiring an additional storage
+bit to hold `vill`.
+
+NOTE: Further standard and custom vector extensions may extend these
+fields to support a greater variety of data types.
+
+NOTE: The primary motivation for the `vtype` CSR is to allow the
+vector instruction set to fit into a 32-bit instruction encoding
+space. A separate `vset{i}vl{i}` instruction can be used to set `vl`
+and/or `vtype` fields before execution of a vector instruction, and
+implementations may choose to fuse these two instructions into a single
+internal vector microop. In many cases, the `vl` and `vtype` values
+can be reused across multiple instructions, reducing the static and
+dynamic instruction overhead from the `vset{i}vl{i}` instructions. It
+is anticipated that a future extended 64-bit instruction encoding
+would allow these fields to be specified statically in the instruction
+encoding.
+
+===== Vector selected element width `vsew[2:0]`
+
+The value in `vsew` sets the dynamic _selected_ _element_ _width_
+(SEW). By default, a vector register is viewed as being divided into
+VLEN/SEW elements.
+
+.vsew[2:0] (selected element width) encoding
+[cols="1,1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+3+| vsew[2:0] | SEW
+
+| 0 | 0 | 0 | 8
+| 0 | 0 | 1 | 16
+| 0 | 1 | 0 | 32
+| 0 | 1 | 1 | 64
+| 1 | X | X | Reserved
+|===
+
+NOTE: While it is anticipated the larger `vsew[2:0]` encodings
+(`100`-`111`) will be used to encode larger SEW, the encodings are
+formally _reserved_ at this point.
+
+.Example VLEN = 128 bits
+[cols=">,>"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| SEW | Elements per vector register
+
+| 64 | 2
+| 32 | 4
+| 16 | 8
+| 8 | 16
+|===
+
+The supported element width may vary with LMUL.
+
+NOTE: The current set of standard vector extensions does not vary
+supported element width with LMUL. Some future extensions may support
+larger SEWs only when bits from multiple vector registers are combined
+using LMUL. In this case, software that relies on large SEW should
+attempt to use the largest LMUL, and hence the fewest vector register
+groups, to increase the number of implementations on which the code
+will run. The `vill` bit in `vtype` should be checked after setting
+`vtype` to see if the configuration is supported, and an alternate
+code path should be provided if it is not. Alternatively, a profile
+can mandate the minimum SEW at each LMUL setting.
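+
+For example, code that requires SEW=64 can probe support as sketched here
+(a hypothetical fragment; `fallback` is a placeholder label):
+
+----
+    vsetvli t0, a0, e64, m2, ta, ma  # request SEW=64, LMUL=2
+    csrr    t1, vtype
+    bltz    t1, fallback             # vill is the sign bit; branch if unsupported
+----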
+
+===== Vector Register Grouping (`vlmul[2:0]`)
+
+Multiple vector registers can be grouped together, so that a single
+vector instruction can operate on multiple vector registers. The term
+_vector_ _register_ _group_ is used herein to refer to one or more
+vector registers used as a single operand to a vector instruction.
+Vector register groups can be used to provide greater execution
+efficiency for longer application vectors, but the main reason for
+their inclusion is to allow double-width or larger elements to be
+operated on with the same vector length as single-width elements. The
+vector length multiplier, _LMUL_, when greater than 1, represents the
+default number of vector registers that are combined to form a vector
+register group. Implementations must support LMUL integer values of
+1, 2, 4, and 8.
+
+
+NOTE: The vector architecture includes instructions that take multiple
+source and destination vector operands with different element widths,
+but the same number of elements. The effective LMUL (EMUL) of each
+vector operand is determined by the number of registers required to
+hold the elements. For example, for a widening add operation, such as
+add 32-bit values to produce 64-bit results, a double-width result
+requires twice the LMUL of the single-width inputs.
+
+LMUL can also be a fractional value, reducing the number of bits used
+in a single vector register. Fractional LMUL is used to increase the
+number of effective usable vector register groups when operating on
+mixed-width values.
+
+NOTE: With only integer LMUL values, a loop operating on a range of
+sizes would have to allocate at least one whole vector register
+(LMUL=1) for the narrowest data type and then would consume multiple
+vector registers (LMUL>1) to form a vector register group for each
+wider vector operand. This can limit the number of vector register groups
+available. With fractional LMUL, the widest values need occupy only a
+single vector register while narrower values can occupy a fraction of
+a single vector register, allowing all 32 architectural vector
+register names to be used for different values in a vector loop even
+when handling mixed-width values. Fractional LMUL implies portions of
+vector registers are unused, but in some cases, having more shorter
+register-resident vectors improves efficiency relative to fewer longer
+register-resident vectors.
+
+Implementations must provide fractional LMUL settings that allow the
+narrowest supported type to occupy a fraction of a vector register
+corresponding to the ratio of the narrowest supported type's width to
+that of the largest supported type's width. In general, the
+requirement is to support LMUL {ge} SEW~MIN~/ELEN, where SEW~MIN~ is
+the narrowest supported SEW value and ELEN is the widest supported SEW
+value. In the standard extensions, SEW~MIN~=8. For
+standard vector extensions with ELEN=32, fractional LMULs of 1/2 and
+1/4 must be supported. For standard vector extensions with ELEN=64,
+fractional LMULs of 1/2, 1/4, and 1/8 must be supported.
+
+NOTE: When LMUL < SEW~MIN~/ELEN, there is no guarantee
+an implementation would have enough bits in the fractional vector
+register to store at least one element, as VLEN=ELEN is a
+valid implementation choice. For example, with VLEN=ELEN=32,
+and SEW~MIN~=8, an LMUL of 1/8 would only provide four bits of
+storage in a vector register.
+
+For a given supported fractional LMUL setting, implementations must support
+SEW settings between SEW~MIN~ and LMUL * ELEN, inclusive.
+
+The use of `vtype` encodings with LMUL < SEW~MIN~/ELEN is
+__reserved__, but implementations can set `vill` if they do not
+support these configurations.
+
+NOTE: Requiring all implementations to set `vill` in this case would
+prohibit future use of this case in an extension, so to allow for a
+future definition of LMUL<SEW~MIN~/ELEN behavior, we
+consider the use of this case to be __reserved__.
+
+NOTE: It is recommended that assemblers provide a warning (not an
+error) if a `vsetvli` instruction attempts to write an LMUL < SEW~MIN~/ELEN.
+
+LMUL is set by the signed `vlmul` field in `vtype` (i.e., LMUL =
+2^`vlmul[2:0]`^).
+
+The derived value VLMAX = LMUL*VLEN/SEW represents the maximum number
+of elements that can be operated on with a single vector instruction
+given the current SEW and LMUL settings as shown in the table below.
+
+[cols="1,1,1,2,2,5,5"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+ 3+| vlmul[2:0] | LMUL | #groups | VLMAX | Registers grouped with register __n__
+
+| 1 | 0 | 0 | - | - | - | reserved
+| 1 | 0 | 1 | 1/8| 32 | VLEN/SEW/8 | `v` __n__ (single register in group)
+| 1 | 1 | 0 | 1/4| 32 | VLEN/SEW/4 | `v` __n__ (single register in group)
+| 1 | 1 | 1 | 1/2| 32 | VLEN/SEW/2 | `v` __n__ (single register in group)
+| 0 | 0 | 0 | 1 | 32 | VLEN/SEW | `v` __n__ (single register in group)
+| 0 | 0 | 1 | 2 | 16 | 2*VLEN/SEW | `v` __n__, `v` __n__+1
+| 0 | 1 | 0 | 4 | 8 | 4*VLEN/SEW | `v` __n__, ..., `v` __n__+3
+| 0 | 1 | 1 | 8 | 4 | 8*VLEN/SEW | `v` __n__, ..., `v` __n__+7
+|===
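+
+The role of VLMAX is easiest to see in a stripmining loop; the following
+sketch (hypothetical fragment) processes `a0` 32-bit elements in place at
+address `a1`, VLMAX elements at a time:
+
+----
+loop:
+    vsetvli t0, a0, e32, m2, ta, ma  # t0 = vl = min(a0, VLMAX)
+    vle32.v v4, (a1)                 # load vl elements into group v4/v5
+    # ... compute on the v4/v5 register group ...
+    vse32.v v4, (a1)                 # store vl results
+    sub     a0, a0, t0               # elements remaining
+    slli    t1, t0, 2                # bytes consumed = vl * 4
+    add     a1, a1, t1
+    bnez    a0, loop
+----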
+
+When LMUL=2, the vector register group contains vector register `v`
+__n__ and vector register `v` __n__+1, providing twice the vector
+length in bits. Instructions specifying an LMUL=2 vector register group
+with an odd-numbered vector register are reserved.
+
+When LMUL=4, the vector register group contains four vector registers,
+and instructions specifying an LMUL=4 vector register group using vector
+register numbers that are not multiples of four are reserved.
+
+When LMUL=8, the vector register group contains eight vector
+registers, and instructions specifying an LMUL=8 vector register group
+using register numbers that are not multiples of eight are reserved.
+
+Mask registers are always contained in a single vector register,
+regardless of LMUL.
+
+[[sec-agnostic]]
+===== Vector Tail Agnostic and Vector Mask Agnostic `vta` and `vma`
+
+These two bits modify the behavior of destination tail elements and
+destination inactive masked-off elements respectively during the
+execution of vector instructions. The tail and inactive sets contain
+element positions that are not receiving new results during a vector
+operation, as defined in Section <<sec-inactive-defs>>.
+
+All systems must support all four options:
+
+[cols="1,1,3,3"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| `vta` | `vma` | Tail Elements | Inactive Elements
+
+| 0 | 0 | undisturbed | undisturbed
+| 0 | 1 | undisturbed | agnostic
+| 1 | 0 | agnostic | undisturbed
+| 1 | 1 | agnostic | agnostic
+|===
+
+Mask destination tail elements are always treated as tail-agnostic,
+regardless of the setting of `vta`.
+
+When a set is marked undisturbed, the corresponding set of destination
+elements in a vector register group retain the value they previously
+held.
+
+When a set is marked agnostic, the corresponding set of destination
+elements in any vector destination operand can either retain the value
+they previously held, or be overwritten with 1s. Within a single vector
+instruction, each destination element can be either left undisturbed
+or overwritten with 1s, in any combination, and the pattern of
+undisturbed or overwritten with 1s is not required to be deterministic
+when the instruction is executed with the same inputs.
+
+NOTE: The agnostic policy was added to accommodate machines with
+vector register renaming. With an undisturbed policy, all elements
+would have to be read from the old physical destination vector
+register to be copied into the new physical destination vector
+register. This causes an inefficiency when these inactive or tail
+values are not required for subsequent calculations.
+
+NOTE: The value of all 1s instead of all 0s was chosen for the
+overwrite value to discourage software developers from depending on
+the value written.
+
+NOTE: A simple in-order implementation can ignore the settings and
+simply execute all vector instructions using the undisturbed
+policy. The `vta` and `vma` state bits must still be provided in
+`vtype` for compatibility and to support thread migration.
+
+NOTE: An out-of-order implementation can choose to implement
+tail-agnostic + mask-agnostic using tail-agnostic + mask-undisturbed
+to reduce implementation complexity.
+
+NOTE: The definition of agnostic result policy is left loose to
+accommodate migrating application threads between harts on a small
+in-order core (which probably leaves agnostic regions undisturbed) and
+harts on a larger out-of-order core with register renaming (which
+probably overwrites agnostic elements with 1s). As it might be
+necessary to restart in the middle, we allow arbitrary mixing of
+agnostic policies within a single vector instruction. This allowed
+mixing of policies also enables implementations that might change
+policies for different granules of a vector register, for example,
+using undisturbed within a granule that is actively operated on but
+renaming to all 1s for granules in the tail.
+
+In addition, except for mask load instructions, any element in the
+tail of a mask result can also be written with the value the
+mask-producing operation would have calculated with `vl`=VLMAX.
+Furthermore, for mask-logical instructions and `vmsbf.m`, `vmsif.m`,
+`vmsof.m` mask-manipulation instructions, any element in the tail of
+the result can be written with the value the mask-producing operation
+would have calculated with `vl`=VLEN, SEW=8, and LMUL=8 (i.e., all
+bits of the mask register can be overwritten).
+
+NOTE: Mask tails are always treated as agnostic to reduce complexity
+of managing mask data, which can be written at bit granularity. There
+appears to be little software need to support tail-undisturbed for
+mask register values. Allowing mask-generating instructions to write
+back the result of the instruction avoids the need for logic to mask
+out the tail, except mask loads cannot write memory values to
+destination mask tails as this would imply accessing memory past
+software intent.
+
+The assembly syntax adds two mandatory flags to the `vsetvli` instruction:
+
+----
+ ta # Tail agnostic
+ tu # Tail undisturbed
+ ma # Mask agnostic
+ mu # Mask undisturbed
+
+ vsetvli t0, a0, e32, m4, ta, ma # Tail agnostic, mask agnostic
+ vsetvli t0, a0, e32, m4, tu, ma # Tail undisturbed, mask agnostic
+ vsetvli t0, a0, e32, m4, ta, mu # Tail agnostic, mask undisturbed
+ vsetvli t0, a0, e32, m4, tu, mu # Tail undisturbed, mask undisturbed
+----
+
+NOTE: Prior to v0.9, when these flags were not specified on a
+`vsetvli`, they defaulted to mask-undisturbed/tail-undisturbed. The
+use of `vsetvli` without these flags is deprecated, however, and
+specifying a flag setting is now mandatory. The default should
+perhaps be tail-agnostic/mask-agnostic, so software has to specify
+when it cares about the non-participating elements, but given the
+historical meaning of the instruction prior to introduction of these
+flags, it was decided to always require them in future assembly code.
+
+===== Vector Type Illegal `vill`
+
+The `vill` bit is used to encode that a previous `vset{i}vl{i}`
+instruction attempted to write an unsupported value to `vtype`.
+
+NOTE: The `vill` bit is held in bit XLEN-1 of the CSR to support
+checking for illegal values with a branch on the sign bit.
+
+If the `vill` bit is set, then any attempt to execute a vector instruction
+that depends upon `vtype` will raise an illegal-instruction exception.
+
+NOTE: `vset{i}vl{i}` and whole register loads and stores do not depend
+upon `vtype`.
+
+When the `vill` bit is set, the other XLEN-1 bits in `vtype` shall be
+zero.
+
+==== Vector Length Register `vl`
+
+The _XLEN_-bit-wide read-only `vl` CSR can only be updated by the
+`vset{i}vl{i}` instructions, and the _fault-only-first_ vector load
+instruction variants.
+
+The `vl` register holds an unsigned integer specifying the number of
+elements to be updated with results from a vector instruction, as
+further detailed in Section <<sec-inactive-defs>>.
+
+NOTE: The number of bits implemented in `vl` depends on the
+implementation's maximum vector length of the smallest supported
+type. The smallest vector implementation with VLEN=32 and supporting
+SEW=8 would need at least six bits in `vl` to hold the values 0-32
+(VLEN=32, with LMUL=8 and SEW=8, yields VLMAX=32).
+
+==== Vector Byte Length `vlenb`
+
+The _XLEN_-bit-wide read-only CSR `vlenb` holds the value VLEN/8,
+i.e., the vector register length in bytes.
+
+NOTE: The value in `vlenb` is a design-time constant in any
+implementation.
+
+NOTE: Without this CSR, several instructions are needed to calculate
+VLEN in bytes, and the code has to disturb current `vl` and `vtype`
+settings which require them to be saved and restored.
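+
+For example, VLEN in bits follows from a single CSR read (a two-instruction
+sketch):
+
+----
+    csrr t0, vlenb     # t0 = VLEN/8, the bytes per vector register
+    slli t1, t0, 3     # t1 = VLEN in bits
+----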
+
+==== Vector Start Index CSR `vstart`
+
+The _XLEN_-bit-wide read-write `vstart` CSR specifies the index of the
+first element to be executed by a vector instruction, as described in
+Section <<sec-inactive-defs>>.
+
+Normally, `vstart` is only written by hardware on a trap on a vector
+instruction, with the `vstart` value representing the element on which
+the trap was taken (either a synchronous exception or an asynchronous
+interrupt), and at which execution should resume after a resumable
+trap is handled.
+
+All vector instructions are defined to begin execution with the
+element number given in the `vstart` CSR, leaving earlier elements in
+the destination vector undisturbed, and to reset the `vstart` CSR to
+zero at the end of execution.
+
+NOTE: All vector instructions, including `vset{i}vl{i}`, reset the `vstart`
+CSR to zero.
+
+`vstart` is not modified by vector instructions that raise illegal-instruction
+exceptions.
+
+The `vstart` CSR is defined to have only enough writable bits to hold
+the largest element index (one less than the maximum VLMAX).
+
+NOTE: The maximum vector length is obtained with the largest LMUL
+setting (8) and the smallest SEW setting (8), so VLMAX_max = 8*VLEN/8 = VLEN. For example, for VLEN=256, `vstart` would have 8 bits to
+represent indices from 0 through 255.
+
+The use of `vstart` values greater than the largest element index for
+the current `vtype` setting is reserved.
+
+NOTE: It is recommended that implementations trap if `vstart` is out
+of bounds. It is not required to trap, as a possible future use of
+upper `vstart` bits is to store imprecise trap information.
+
+The `vstart` CSR is writable by unprivileged code, but non-zero
+`vstart` values may cause vector instructions to run substantially
+slower on some implementations, so `vstart` should not be used by
+application programmers. A few vector instructions cannot be
+executed with a non-zero `vstart` value and will raise an illegal
+instruction exception as defined below.
+
+NOTE: Making `vstart` visible to unprivileged code supports user-level
+threading libraries.
+
+Implementations are permitted to raise illegal instruction exceptions when
+attempting to execute a vector instruction with a value of `vstart` that the
+implementation can never produce when executing that same instruction with
+the same `vtype` setting.
+
+NOTE: For example, some implementations will never take interrupts during
+execution of a vector arithmetic instruction, instead waiting until the
+instruction completes to take the interrupt. Such implementations are
+permitted to raise an illegal instruction exception when attempting to execute
+a vector arithmetic instruction when `vstart` is nonzero.
+
+NOTE: When migrating a software thread between two harts with
+different microarchitectures, the `vstart` value might not be
+supported by the new hart microarchitecture. The runtime on the
+receiving hart might then have to emulate instruction execution up to the
+next supported `vstart` element position. Alternatively, migration events
+can be constrained to only occur at mutually supported `vstart`
+locations.
+
+==== Vector Fixed-Point Rounding Mode Register `vxrm`
+
+The vector fixed-point rounding-mode register holds a two-bit
+read-write rounding-mode field in the least-significant bits
+(`vxrm[1:0]`). The upper bits, `vxrm[XLEN-1:2]`, should be written as
+zeros.
+
+The vector fixed-point rounding-mode is given a separate CSR address
+to allow independent access, but is also reflected as a field in
+`vcsr`.
+
+NOTE: A new rounding mode can be set while saving the original
+rounding mode using a single `csrwi` instruction.
+
+The fixed-point rounding algorithm is specified as follows.
+Suppose the pre-rounding result is `v`, and `d` bits of that result are to be
+rounded off.
+Then the rounded result is `(v >> d) + r`, where `r` depends on the rounding
+mode as specified in the following table.
+
+.vxrm encoding
+//[cols="1,1,4,10,5"]
+[%autowidth,float="center",align="center",cols="<,<,<,<,<",options="header"]
+|===
+2+| `vxrm[1:0]` | Abbreviation | Rounding Mode | Rounding increment, `r`
+
+| 0 | 0 | rnu | round-to-nearest-up (add +0.5 LSB) | `v[d-1]`
+| 0 | 1 | rne | round-to-nearest-even | `v[d-1] & (v[d-2:0]{ne}0 \| v[d])`
+| 1 | 0 | rdn | round-down (truncate) | `0`
+| 1 | 1 | rod | round-to-odd (OR bits into LSB, aka "jam") | `!v[d] & v[d-1:0]{ne}0`
+|===
+
+The rounding functions:
+----
+roundoff_unsigned(v, d) = (unsigned(v) >> d) + r
+roundoff_signed(v, d) = (signed(v) >> d) + r
+----
+are used to represent this operation in the instruction descriptions below.
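+
+For example, applying each mode to the unsigned value `v` = 0b10111 (23) with
+`d` = 2 (so `v`/2^`d`^ = 5.75 and `v >> d` = 0b101 = 5) gives:
+
+----
+rnu: r = v[1] = 1                        -> 5 + 1 = 6
+rne: r = v[1] & (v[0] != 0 | v[2]) = 1   -> 5 + 1 = 6
+rdn: r = 0                               -> 5
+rod: r = !v[2] & (v[1:0] != 0) = 0       -> 5  (LSB is already odd)
+----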
+
+==== Vector Fixed-Point Saturation Flag `vxsat`
+
+The `vxsat` CSR has a single read-write least-significant bit
+(`vxsat[0]`) that indicates if a fixed-point instruction has had to
+saturate an output value to fit into a destination format.
+Bits `vxsat[XLEN-1:1]` should be written as zeros.
+
+The `vxsat` bit is mirrored in `vcsr`.
+
+==== Vector Control and Status Register `vcsr`
+
+The separate `vxrm` and `vxsat` CSRs can also be accessed via fields
+in the _XLEN_-bit-wide vector control and status CSR, `vcsr`.
+
+.vcsr layout
+[cols=">2,4,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Bits | Name | Description
+
+| XLEN-1:3 | | Reserved
+| 2:1 | vxrm[1:0] | Fixed-point rounding mode
+| 0 | vxsat | Fixed-point accrued saturation flag
+|===
+
+==== State of Vector Extension at Reset
+
+The vector extension must have a consistent state at reset. In
+particular, `vtype` and `vl` must have values that can be read and
+then restored with a single `vsetvl` instruction.
+
+NOTE: It is recommended that at reset, `vtype.vill` is set, the
+remaining bits in `vtype` are zero, and `vl` is set to zero.
+
+The `vstart`, `vxrm`, and `vxsat` CSRs can have arbitrary values at reset.
+
+NOTE: Most uses of the vector unit will require an initial `vset{i}vl{i}`,
+which will reset `vstart`. The `vxrm` and `vxsat` fields should be
+reset explicitly in software before use.
+
+The vector registers can have arbitrary values at reset.
+
+=== Mapping of Vector Elements to Vector Register State
+
+The following diagrams illustrate how different width elements are
+packed into the bytes of a vector register depending on the current
+SEW and LMUL settings, as well as implementation VLEN. Elements are
+packed into each vector register with the least-significant byte in
+the lowest-numbered bits.
+
+The mapping was chosen to provide the simplest and most portable model
+for software, but might appear to incur large wiring cost for wider
+vector datapaths on certain operations. The vector instruction set
+was expressly designed to support implementations that internally
+rearrange vector data for different SEW to reduce datapath wiring
+costs, while externally preserving the simple software model.
+
+NOTE: For example, microarchitectures can track the EEW with which a
+vector register was written, and then insert additional scrambling
+operations to rearrange data if the register is accessed with a
+different EEW.
+
+==== Mapping for LMUL = 1
+
+When LMUL=1, elements are simply packed in order from the
+least-significant to most-significant bits of the vector register.
+
+NOTE: To increase readability, vector register layouts are drawn with
+bytes ordered from right to left with increasing byte address. Bits
+within an element are numbered in a little-endian format with
+increasing bit index from right to left corresponding to increasing
+magnitude.
+
+----
+LMUL=1 examples.
+
+The element index is given in hexadecimal and is shown placed at the
+least-significant byte of the stored element.
+
+
+ VLEN=32b
+
+ Byte 3 2 1 0
+
+ SEW=8b 3 2 1 0
+ SEW=16b 1 0
+ SEW=32b 0
+
+ VLEN=64b
+
+ Byte 7 6 5 4 3 2 1 0
+
+ SEW=8b 7 6 5 4 3 2 1 0
+ SEW=16b 3 2 1 0
+ SEW=32b 1 0
+ SEW=64b 0
+
+ VLEN=128b
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=16b 7 6 5 4 3 2 1 0
+ SEW=32b 3 2 1 0
+ SEW=64b 1 0
+
+ VLEN=256b
+
+ Byte 1F1E1D1C1B1A19181716151413121110 F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b 1F1E1D1C1B1A19181716151413121110 F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=16b F E D C B A 9 8 7 6 5 4 3 2 1 0
+ SEW=32b 7 6 5 4 3 2 1 0
+ SEW=64b 3 2 1 0
+----
+
+==== Mapping for LMUL < 1
+
+When LMUL < 1, only the first LMUL*VLEN/SEW elements in the vector
+register are used. The remaining space in the vector register is
+treated as part of the tail, and hence must obey the `vta` setting.
+
+----
+ Example, VLEN=128b, LMUL=1/4
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+
+ SEW=8b - - - - - - - - - - - - 3 2 1 0
+ SEW=16b - - - - - - 1 0
+ SEW=32b - - - 0
+----
+
+==== Mapping for LMUL > 1
+
+When vector registers are grouped, the elements of the vector register
+group are packed contiguously in element order beginning with the
+lowest-numbered vector register and moving to the
+next-highest-numbered vector register in the group once each vector
+register is filled.
+
+----
+ LMUL > 1 examples
+
+ VLEN=32b, SEW=8b, LMUL=2
+
+ Byte 3 2 1 0
+ v2*n 3 2 1 0
+ v2*n+1 7 6 5 4
+
+ VLEN=32b, SEW=16b, LMUL=2
+
+ Byte 3 2 1 0
+ v2*n 1 0
+ v2*n+1 3 2
+
+ VLEN=32b, SEW=16b, LMUL=4
+
+ Byte 3 2 1 0
+ v4*n 1 0
+ v4*n+1 3 2
+ v4*n+2 5 4
+ v4*n+3 7 6
+
+ VLEN=32b, SEW=32b, LMUL=4
+
+ Byte 3 2 1 0
+ v4*n 0
+ v4*n+1 1
+ v4*n+2 2
+ v4*n+3 3
+
+ VLEN=64b, SEW=32b, LMUL=2
+
+ Byte 7 6 5 4 3 2 1 0
+ v2*n 1 0
+ v2*n+1 3 2
+
+ VLEN=64b, SEW=32b, LMUL=4
+
+ Byte 7 6 5 4 3 2 1 0
+ v4*n 1 0
+ v4*n+1 3 2
+ v4*n+2 5 4
+ v4*n+3 7 6
+
+ VLEN=128b, SEW=32b, LMUL=2
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+ v2*n 3 2 1 0
+ v2*n+1 7 6 5 4
+
+ VLEN=128b, SEW=32b, LMUL=4
+
+ Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+ v4*n 3 2 1 0
+ v4*n+1 7 6 5 4
+ v4*n+2 B A 9 8
+ v4*n+3 F E D C
+----
+
+[[sec-mapping-mixed]]
+==== Mapping across Mixed-Width Operations
+
+The vector ISA is designed to support mixed-width operations without
+requiring additional explicit rearrangement instructions. The
+recommended software strategy when operating on multiple vectors with
+different precision values is to modify `vtype` dynamically to keep
+SEW/LMUL constant (and hence VLMAX constant).
+
+The following example shows four different packed element widths (8b,
+16b, 32b, 64b) in a VLEN=128b implementation. The vector register
+grouping factor (LMUL) is increased by the relative element size such
+that each group can hold the same number of vector elements (VLMAX=8
+in this example) to simplify stripmining code.
+
+----
+Example VLEN=128b, with SEW/LMUL=16
+
+Byte F E D C B A 9 8 7 6 5 4 3 2 1 0
+vn - - - - - - - - 7 6 5 4 3 2 1 0 SEW=8b, LMUL=1/2
+
+vn 7 6 5 4 3 2 1 0 SEW=16b, LMUL=1
+
+v2*n 3 2 1 0 SEW=32b, LMUL=2
+v2*n+1 7 6 5 4
+
+v4*n 1 0 SEW=64b, LMUL=4
+v4*n+1 3 2
+v4*n+2 5 4
+v4*n+3 7 6
+----
+
+The following table shows each possible constant SEW/LMUL operating
+point for loops with mixed-width operations. Each column represents a
+constant SEW/LMUL operating point. Entries in the table are the LMUL
+values that yield that column's SEW/LMUL value for the data width on
+that row. In each column, any data widths that have an LMUL setting
+can be used together, as all will have the same VLMAX.
+
+|===
+| 7+^| SEW/LMUL
+| | 1 | 2 | 4 | 8 | 16 | 32 | 64
+
+| SEW= 8 | 8 | 4 | 2 | 1 | 1/2 | 1/4 | 1/8
+| SEW= 16 | | 8 | 4 | 2 | 1 | 1/2 | 1/4
+| SEW= 32 | | | 8 | 4 | 2 | 1 | 1/2
+| SEW= 64 | | | | 8 | 4 | 2 | 1
+|===
+
+Larger LMUL settings can also be used simply to increase vector length
+to reduce instruction fetch and dispatch overheads in cases where
+fewer vector register groups are needed.
+
+[[sec-mask-register-layout]]
+==== Mask Register Layout
+
+A vector mask occupies only one vector register regardless of SEW and
+LMUL.
+
+Each element is allocated a single mask bit in a mask vector register.
+The mask bit for element _i_ is located in bit _i_ of the mask
+register, independent of SEW or LMUL.
+
+=== Vector Instruction Formats
+
+The instructions in the vector extension fit under two existing major
+opcodes (LOAD-FP and STORE-FP) and one new major opcode (OP-V).
+
+Vector loads and stores are encoded within the scalar floating-point
+load and store major opcodes (LOAD-FP/STORE-FP). The vector load and
+store encodings repurpose a portion of the standard scalar
+floating-point load/store 12-bit immediate field to provide further
+vector instruction encoding, with bit 25 holding the standard vector
+mask bit (see <<sec-vector-mask-encoding>>).
+
+include::images/wavedrom/vmem-format.adoc[]
+
+include::images/wavedrom/valu-format.adoc[]
+
+include::images/wavedrom/vcfg-format.adoc[]
+
+Vector instructions can have scalar or vector source operands and
+produce scalar or vector results, and most vector instructions can be
+performed either unconditionally or conditionally under a mask.
+
+Vector loads and stores move bit patterns between vector register
+elements and memory. Vector arithmetic instructions operate on values
+held in vector register elements.
+
+==== Scalar Operands
+
+Scalar operands can be immediates, or taken from the `x` registers,
+the `f` registers, or element 0 of a vector register. Scalar results
+are written to an `x` or `f` register or to element 0 of a vector
+register. Any vector register can be used to hold a scalar regardless
+of the current LMUL setting.
+
+NOTE: Zfinx ("F in X") is a new ISA extension where
+floating-point instructions take their arguments from the integer
+register file. The vector extension is also compatible with Zfinx,
+where the Zfinx vector extension has vector-scalar floating-point
+instructions taking their scalar argument from the `x` registers.
+
+NOTE: We considered but did not pursue overlaying the `f` registers on
+`v` registers. The adopted approach reduces vector register pressure,
+avoids interactions with the standard calling convention, simplifies
+high-performance scalar floating-point design, and provides
+compatibility with the Zfinx ISA option. Overlaying `f` with `v`
+would provide the advantage of lowering the number of state bits in
+some implementations, but complicates high-performance designs and
+would prevent compatibility with the Zfinx ISA option.
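+
+For example (non-normative), each kind of scalar operand in assembly:
+
+----
+    vadd.vx  v4, v8, t0     # Scalar operand from an x register
+    vfadd.vf v4, v8, ft0    # Scalar operand from an f register
+    vmv.s.x  v4, t0         # Write an x register scalar to element 0 of v4
+----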
+
+[[sec-vec-operands]]
+==== Vector Operands
+
+Each vector operand has an _effective_ _element_ _width_ (EEW) and an
+_effective_ LMUL (EMUL) that is used to determine the size and
+location of all the elements within a vector register group. By
+default, for most operands of most instructions, EEW=SEW and
+EMUL=LMUL.
+
+Some vector instructions have source and destination vector operands
+with the same number of elements but different widths, so that EEW and
+EMUL differ from SEW and LMUL respectively but EEW/EMUL = SEW/LMUL.
+For example, most widening arithmetic instructions have a source group
+with EEW=SEW and EMUL=LMUL but have a destination group with EEW=2*SEW and
+EMUL=2*LMUL. Narrowing instructions have a source operand that has
+EEW=2*SEW and EMUL=2*LMUL but with a destination where EEW=SEW and EMUL=LMUL.
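+
+For example, a non-normative illustration with SEW=32 and LMUL=2
+(register numbers chosen to satisfy the alignment rules described
+below):
+
+----
+    vsetvli t0, a0, e32, m2, ta, ma
+    vwadd.vv v8, v4, v6     # Sources: EEW=32, EMUL=2 (v4-v5, v6-v7)
+                            # Destination: EEW=64, EMUL=4 (v8-v11)
+    vnsrl.wi v4, v8, 0      # Source: EEW=64, EMUL=4 (v8-v11)
+                            # Destination: EEW=32, EMUL=2 (v4-v5)
+----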
+
+Vector operands or results may occupy one or more vector registers
+depending on EMUL, but are always specified using the lowest-numbered
+vector register in the group. Using other than the lowest-numbered
+vector register to specify a vector register group is a reserved
+encoding.
+
+A vector register cannot be used to provide source operands with more
+than one EEW for a single instruction. A mask register source is
+considered to have EEW=1 for this constraint. An encoding that would
+result in the same vector register being read with two or more
+different EEWs, including when the vector register appears at
+different positions within two or more vector register groups, is
+reserved.
+
+NOTE: In practice, there is no software benefit to reading the same
+register with different EEW in the same instruction, and this
+constraint reduces complexity for implementations that internally
+rearrange data dependent on EEW.
+
+A destination vector register group can overlap a source vector register
+group only if one of the following holds:
+
+- The destination EEW equals the source EEW.
+- The destination EEW is smaller than the source EEW and the overlap is in
+ the lowest-numbered part of the source register group (e.g., when LMUL=1,
+ `vnsrl.wi v0, v0, 3` is legal, but a destination of `v1` is not).
+- The destination EEW is greater than the source EEW, the source EMUL is
+ at least 1, and the overlap is in the highest-numbered part of the
+ destination register group (e.g., when LMUL=8, `vzext.vf4 v0, v6` is legal,
+ but a source of `v0`, `v2`, or `v4` is not).
+
+For the purpose of determining register group overlap constraints,
+mask elements have EEW=1.
+
+NOTE: The overlap constraints are designed to support resumable
+exceptions in machines without register renaming.
+
+Any instruction encoding that violates the overlap constraints is reserved.
+
+When source and destination registers overlap and have different EEW, the
+instruction is mask- and tail-agnostic, regardless of the setting of the
+`vta` and `vma` bits in `vtype`.
+
+The largest vector register group used by an instruction cannot be
+greater than 8 vector registers (i.e., EMUL{le}8), and if a vector
+instruction would require more than 8 vector registers in a group,
+the instruction encoding is reserved. For example, a widening
+operation that produces a widened vector register group result when
+LMUL=8 is reserved as this would imply a result EMUL=16.
+
+Widened scalar values, e.g., input and output to a widening reduction
+operation, are held in the first element of a vector register and
+have EMUL=1.
+
+==== Vector Masking
+
+Masking is supported on many vector instructions. Element operations
+that are masked off (inactive) never generate exceptions. The
+destination vector register elements corresponding to masked-off
+elements are handled with either a mask-undisturbed or mask-agnostic
+policy depending on the setting of the `vma` bit in `vtype` (Section
+<<sec-agnostic>>).
+
+The mask value used to control execution of a masked vector
+instruction is always supplied by vector register `v0`.
+
+NOTE: Masks are held in vector registers, rather than in a separate mask
+register file, to reduce total architectural state and to simplify the ISA.
+
+NOTE: Future vector extensions may provide longer instruction
+encodings with space for a full mask register specifier.
+
+The destination vector register group for a masked vector instruction
+cannot overlap the source mask register (`v0`), unless the destination
+vector register is being written with a mask value (e.g., compares)
+or the scalar result of a reduction. These instruction encodings are
+reserved.
+
+NOTE: This constraint supports restart with a non-zero `vstart` value.
+
+Other vector registers can be used to hold working mask values, and
+mask vector logical operations are provided to perform predicate
+calculations (see <<sec-mask-vector-logical>>).
+
+As specified in Section <<sec-agnostic>>, mask destination values are
+always treated as tail-agnostic, regardless of the setting of `vta`.
+
+[[sec-vector-mask-encoding]]
+===== Mask Encoding
+
+Where available, masking is encoded in a single-bit `vm` field in the
+ instruction (`inst[25]`).
+
+[cols="1,15"]
+|===
+| vm | Description
+
+| 0 | vector result, only where v0.mask[i] = 1
+| 1 | unmasked
+|===
+
+Vector masking is represented in assembler code as another vector
+operand, with `.t` indicating that the operation occurs when
+`v0.mask[i]` is `1` (`t` for "true"). If no masking operand is
+specified, unmasked vector execution (`vm=1`) is assumed.
+
+----
+ vop.v* v1, v2, v3, v0.t # enabled where v0.mask[i]=1, vm=0
+ vop.v* v1, v2, v3 # unmasked vector operation, vm=1
+----
+
+NOTE: Even though the current vector extensions only support one vector
+mask register `v0` and only the true form of predication, the assembly
+syntax writes it out in full to be compatible with future extensions
+that might add a mask register specifier and support both true and
+complement mask values. The `.t` suffix on the masking operand also helps
+to visually encode the use of a mask.
+
+NOTE: The `.mask` suffix is not part of the assembly syntax.
+We only append it in contexts where a mask vector is subscripted,
+e.g., `v0.mask[i]`.
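+
+For example (non-normative), a compare generates a mask in `v0` that
+then predicates a subsequent arithmetic operation:
+
+----
+    vmslt.vx v0, v8, t0        # v0.mask[i] = (v8[i] < t0)
+    vadd.vi  v4, v4, 1, v0.t   # v4[i] += 1 only where v0.mask[i]=1
+----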
+
+[[sec-inactive-defs]]
+==== Prestart, Active, Inactive, Body, and Tail Element Definitions
+
+The destination element indices operated on during a vector
+instruction's execution can be divided into three disjoint subsets.
+
+* The _prestart_ elements are those whose element index is less than the
+initial value in the `vstart` register. The prestart elements do not
+raise exceptions and do not update the destination vector register.
+
+* The _body_ elements are those whose element index is greater than or equal
+to the initial value in the `vstart` register, and less than the current
+vector length setting in `vl`. The body can be split into two disjoint subsets:
+
+** The _active_ elements during a vector instruction's execution are the
+elements within the body and where the current mask is enabled at that element
+position. The active elements can raise exceptions and update the destination
+vector register group.
+
+** The _inactive_ elements are the elements within the body
+but where the current mask is disabled at that element
+position. The inactive elements do not raise exceptions and do not
+update any destination vector register group unless masked agnostic is
+specified (`vtype.vma`=1), in which case inactive elements may be
+overwritten with 1s.
+
+* The _tail_ elements during a vector instruction's execution are the
+elements past the current vector length setting specified in `vl`.
+The tail elements do not raise exceptions, and do not update any
+destination vector register group unless tail agnostic is specified
+(`vtype.vta`=1), in which case tail elements may be overwritten with
+1s, or with the result of the instruction in the case of
+mask-producing instructions except for mask loads. When LMUL < 1, the
+tail includes the elements past VLMAX that are held in the same vector
+register.
+
+----
+ for element index x
+ prestart(x) = (0 <= x < vstart)
+ body(x) = (vstart <= x < vl)
+ tail(x) = (vl <= x < max(VLMAX,VLEN/SEW))
+ mask(x) = unmasked || v0.mask[x] == 1
+ active(x) = body(x) && mask(x)
+ inactive(x) = body(x) && !mask(x)
+----
+
+When `vstart` {ge} `vl`, there are no body elements, and no elements
+are updated in any destination vector register group; in particular,
+no tail elements are updated with agnostic values.
+
+NOTE: As a consequence, when `vl`=0, no elements, including agnostic
+elements, are updated in the destination vector register group
+regardless of `vstart`.
+
+Instructions that write an `x` register or `f` register
+do so even when `vstart` {ge} `vl`, including when `vl`=0.
+
+NOTE: Some instructions such as `vslidedown` and `vrgather` may read
+indices past `vl` or even VLMAX in source vector register groups. The
+general policy is to return the value 0 when the index is greater than
+VLMAX in the source vector register group.
+
+[[sec-vector-config]]
+=== Configuration-Setting Instructions (`vsetvli`/`vsetivli`/`vsetvl`)
+
+One of the common approaches to handling a large number of elements is
+"stripmining" where each iteration of a loop handles some number of elements,
+and the iterations continue until all elements have been processed. The RISC-V
+vector specification provides direct, portable support for this approach.
+The application specifies the total number of elements to be processed
+(the application vector length or AVL) as a
+candidate value for `vl`, and the hardware responds via a general-purpose
+register with the (frequently smaller) number of elements that the hardware
+will handle per iteration (stored in `vl`), based on the microarchitectural
+implementation and the `vtype` setting. A straightforward loop structure,
+shown in <<example-stripmine-sew>>, depicts the ease with which the code keeps
+track of the remaining number of elements and the amount per iteration handled
+by hardware.
+
+A set of instructions is provided to allow rapid configuration of the
+values in `vl` and `vtype` to match application needs. The
+`vset{i}vl{i}` instructions set the `vtype` and `vl` CSRs based on
+their arguments, and write the new value of `vl` into `rd`.
+
+----
+ vsetvli rd, rs1, vtypei # rd = new vl, rs1 = AVL, vtypei = new vtype setting
+ vsetivli rd, uimm, vtypei # rd = new vl, uimm = AVL, vtypei = new vtype setting
+ vsetvl rd, rs1, rs2 # rd = new vl, rs1 = AVL, rs2 = new vtype value
+----
+
+include::images/wavedrom/vcfg-format.adoc[]
+
+==== `vtype` encoding
+
+include::images/wavedrom/vtype-format.adoc[]
+
+The new `vtype` value is encoded in the immediate fields of `vsetvli`
+and `vsetivli`, and in the `rs2` register for `vsetvl`.
+
+----
+ Suggested assembler names used for vset{i}vli vtypei immediate
+
+ e8 # SEW=8b
+ e16 # SEW=16b
+ e32 # SEW=32b
+ e64 # SEW=64b
+
+ mf8 # LMUL=1/8
+ mf4 # LMUL=1/4
+ mf2 # LMUL=1/2
+ m1 # LMUL=1, assumed if m setting absent
+ m2 # LMUL=2
+ m4 # LMUL=4
+ m8 # LMUL=8
+
+Examples:
+ vsetvli t0, a0, e8, ta, ma # SEW= 8, LMUL=1
+ vsetvli t0, a0, e8, m2, ta, ma # SEW= 8, LMUL=2
+ vsetvli t0, a0, e32, mf2, ta, ma # SEW=32, LMUL=1/2
+----
+
+The `vsetvl` variant operates similarly to `vsetvli` except that it
+takes a `vtype` value from `rs2` and can be used for context restore.
+
+===== Unsupported `vtype` Values
+
+If the `vtype` value is not supported by the implementation, then
+the `vill` bit is set in `vtype`, the remaining bits in `vtype` are
+set to zero, and the `vl` register is also set to zero.
+
+NOTE: Earlier drafts required a trap when setting `vtype` to an
+illegal value. However, this would have added the first
+data-dependent trap on a CSR write to the ISA. Implementations could
+choose to trap when illegal values are written to `vtype` instead of
+setting `vill`, to allow emulation to support new configurations for
+forward-compatibility. The current scheme supports light-weight
+runtime interrogation of the supported vector unit configurations by
+checking if `vill` is clear for a given setting.
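+
+For example, a non-normative probe for SEW=64 support (the branch
+target label is illustrative):
+
+----
+    vsetvli t0, x0, e64, m1, ta, ma  # Request SEW=64
+    csrr t1, vtype                   # vtype reads negative if vill is set
+    bltz t1, no_sew64                # Branch if SEW=64 is unsupported
+----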
+
+A `vtype` value with `vill` set is treated as an unsupported
+configuration.
+
+Implementations must consider all bits of the `vtype` value to
+determine if the configuration is supported. An unsupported value in
+any location within the `vtype` value must result in `vill` being set.
+
+NOTE: In particular, all XLEN bits of the register `vtype` argument to
+the `vsetvl` instruction must be checked. Implementations cannot
+ignore fields they do not implement. All bits must be checked to
+ensure that new code assuming unsupported vector features in `vtype`
+traps instead of executing incorrectly on an older implementation.
+
+==== AVL encoding
+
+The new vector length setting is based on AVL, which for `vsetvli`
+and `vsetvl` is encoded in the `rs1` and `rd` fields as follows:
+
+.AVL used in `vsetvli` and `vsetvl` instructions
+[cols="2,2,10,10"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| `rd` | `rs1` | AVL value | Effect on `vl`
+| - | !x0 | Value in `x[rs1]` | Normal stripmining
+| !x0 | x0 | ~0 | Set `vl` to VLMAX
+| x0 | x0 | Value in `vl` register | Keep existing `vl` (of course, `vtype` may change)
+|===
+
+When `rs1` is not `x0`, the AVL is an unsigned integer held in the `x`
+register specified by `rs1`, and the new `vl` value is also written to
+the `x` register specified by `rd`.
+
+When `rs1=x0` but `rd!=x0`, the maximum unsigned integer value (`~0`)
+is used as the AVL, and the resulting VLMAX is written to `vl` and
+also to the `x` register specified by `rd`.
+
+When `rs1=x0` and `rd=x0`, the instruction operates as if the current
+vector length in `vl` is used as the AVL, and the resulting value is
+written to `vl`, but not to a destination register. This form can
+only be used when VLMAX and hence `vl` is not actually changed by the
+new SEW/LMUL ratio. Use of the instruction with a new SEW/LMUL ratio
+that would result in a change of VLMAX is reserved.
+Use of the instruction is also reserved if `vill` was 1 beforehand.
+Implementations may set `vill` in either case.
+
+NOTE: This last form of the instructions allows the `vtype` register to
+be changed while maintaining the current `vl`, provided VLMAX is not
+reduced. This design was chosen to ensure `vl` would always hold a
+legal value for current `vtype` setting. The current `vl` value can
+be read from the `vl` CSR. The `vl` value could be reduced by this
+instruction if the new SEW/LMUL ratio causes VLMAX to shrink, and so
+this case has been reserved as it is not clear this is a generally
+useful operation, and implementations can otherwise assume `vl` is not
+changed by this instruction to optimize their microarchitecture.
+
+For the `vsetivli` instruction, the AVL is encoded as a 5-bit
+zero-extended immediate (0--31) in the `rs1` field.
+
+NOTE: The encoding of AVL for `vsetivli` is the same as for regular
+CSR immediate values.
+
+NOTE: The `vsetivli` instruction provides more compact code when the
+dimensions of vectors are small and known to fit inside the vector
+registers, in which case there is no stripmining overhead.
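+
+For example (non-normative), operating on a fixed four-element vector,
+assuming VLMAX {ge} 4 so that `vl` is set to exactly 4:
+
+----
+    vsetivli x0, 4, e32, m1, ta, ma  # vl = 4
+    vle32.v v8, (a0)                 # Load four 32-bit elements
+    vadd.vv v8, v8, v8               # Double them
+    vse32.v v8, (a0)                 # Store four 32-bit elements
+----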
+
+==== Constraints on Setting `vl`
+
+The `vset{i}vl{i}` instructions first set VLMAX according to their `vtype`
+argument, then set `vl` obeying the following constraints:
+
+. `vl = AVL` if `AVL {le} VLMAX`
+. `ceil(AVL / 2) {le} vl {le} VLMAX` if `AVL < (2 * VLMAX)`
+. `vl = VLMAX` if `AVL {ge} (2 * VLMAX)`
+. Deterministic on any given implementation for same input AVL and VLMAX values
+. These specific properties follow from the prior rules:
+.. `vl = 0` if `AVL = 0`
+.. `vl > 0` if `AVL > 0`
+.. `vl {le} VLMAX`
+.. `vl {le} AVL`
+.. a value read from `vl` when used as the AVL argument to `vset{i}vl{i}` results in the same
+value in `vl`, provided the resultant VLMAX equals the value of VLMAX at the time that `vl` was read
+
+[NOTE]
+--
+The `vl` setting rules are designed to be sufficiently strict to
+preserve `vl` behavior across register spills and context swaps for
+`AVL {le} VLMAX`, yet flexible enough to enable implementations to improve
+vector lane utilization for `AVL > VLMAX`.
+
+For example, this permits an implementation to set `vl = ceil(AVL / 2)`
+for `VLMAX < AVL < 2*VLMAX` in order to evenly distribute work over the
+last two iterations of a stripmine loop.
+Requirement 2 ensures that the first stripmine iteration of reduction
+loops uses the largest vector length of all iterations, even in the case
+of `AVL < 2*VLMAX`.
+This allows software to avoid needing to explicitly calculate a running
+maximum of vector lengths observed during a stripmined loop.
+Requirement 2 also allows an implementation to set `vl` to VLMAX for `VLMAX < AVL < 2*VLMAX`.
+--
+
+[[example-stripmine-sew]]
+==== Example of stripmining and changes to SEW
+
+The SEW and LMUL settings can be changed dynamically to provide high
+throughput on mixed-width operations in a single loop.
+----
+# Example: Load 16-bit values, widen multiply to 32b, shift 32b result
+# right by 3, store 32b values.
+# On entry:
+# a0 holds the total number of elements to process
+# a1 holds the address of the source array
+# a2 holds the address of the destination array
+
+loop:
+ vsetvli a3, a0, e16, m4, ta, ma # vtype = 16-bit integer vectors;
+ # also update a3 with vl (# of elements this iteration)
+ vle16.v v4, (a1) # Get 16b vector
+ slli t1, a3, 1 # Multiply # elements this iteration by 2 bytes/source element
+ add a1, a1, t1 # Bump pointer
+ vwmul.vx v8, v4, x10 # Widening multiply into 32b in <v8--v15>
+
+ vsetvli x0, x0, e32, m8, ta, ma # Operate on 32b values
+ vsrl.vi v8, v8, 3
+ vse32.v v8, (a2) # Store vector of 32b elements
+ slli t1, a3, 2 # Multiply # elements this iteration by 4 bytes/destination element
+ add a2, a2, t1 # Bump pointer
+ sub a0, a0, a3 # Decrement count by vl
+ bnez a0, loop # Any more?
+----
+
+[[sec-vector-memory]]
+=== Vector Loads and Stores
+
+Vector loads and stores move values between vector registers and
+memory.
+Vector loads and stores can be masked, and they only access memory or raise
+exceptions for active elements.
+Masked vector loads do not update inactive elements in the destination vector
+register group, unless masked agnostic is specified (`vtype.vma`=1).
+All vector loads and stores may
+generate and accept a non-zero `vstart` value.
+
+==== Vector Load/Store Instruction Encoding
+
+Vector loads and stores are encoded within the scalar floating-point
+load and store major opcodes (LOAD-FP/STORE-FP). The vector load and
+store encodings repurpose a portion of the standard scalar
+floating-point load/store 12-bit immediate field to provide further
+vector instruction encoding, with bit 25 holding the standard vector
+mask bit (see <<sec-vector-mask-encoding>>).
+
+include::images/wavedrom/vmem-format.adoc[]
+
+[cols="4,12"]
+|===
+| Field | Description
+
+| rs1[4:0] | specifies x register holding base address
+| rs2[4:0] | specifies x register holding stride
+| vs2[4:0] | specifies v register holding address offsets
+| vs3[4:0] | specifies v register holding store data
+| vd[4:0] | specifies v register destination of load
+| vm | specifies whether vector masking is enabled (0 = mask enabled, 1 = mask disabled)
+| width[2:0] | specifies size of memory elements, and distinguishes from FP scalar
+| mew | extended memory element width. See <<sec-vector-loadstore-width-encoding>>
+| mop[1:0] | specifies memory addressing mode
+| nf[2:0] | specifies the number of fields in each segment, for segment load/stores
+| lumop[4:0]/sumop[4:0] | are additional fields encoding variants of unit-stride instructions
+|===
+
+Vector memory unit-stride and constant-stride operations directly
+encode EEW of the data to be transferred statically in the instruction
+to reduce the number of `vtype` changes when accessing memory in a
+mixed-width routine. Indexed operations use the explicit EEW encoding
+in the instruction to set the size of the indices used, and use
+SEW/LMUL to specify the data width.
+
+==== Vector Load/Store Addressing Modes
+
+The vector extension supports unit-stride, strided, and
+indexed (scatter/gather) addressing modes. Vector load/store base
+registers and strides are taken from the GPR `x` registers.
+
+The base effective address for all vector accesses is given by the
+contents of the `x` register named in `rs1`.
+
+Vector unit-stride operations access elements stored contiguously in
+memory starting from the base effective address.
+
+Vector constant-strided operations access the first memory element at the base
+effective address, and then access subsequent elements at address
+increments given by the byte offset contained in the `x` register
+specified by `rs2`.
+
+Vector indexed operations add the contents of each element of the
+vector offset operand specified by `vs2` to the base effective address
+to give the effective address of each element. The data vector
+register group has EEW=SEW, EMUL=LMUL, while the offset vector
+register group has EEW encoded in the instruction and
+EMUL=(EEW/SEW)*LMUL.
+
+The vector offset operand is treated as a vector of byte-address
+offsets.
+
+NOTE: The indexed operations can also be used to access fields within
+a vector of objects, where the `vs2` vector holds pointers to the base
+of the objects and the scalar `x` register holds the offset of the
+member field in each object. Supporting this case is why the indexed
+operations were not defined to scale the element indices by the data
+EEW.
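+
+For example, a non-normative sketch of this pattern on RV64 (the field
+offset and register choices are illustrative):
+
+----
+    # v8[i] holds the address of object i; a0 holds the byte offset
+    # of a 64-bit member field within each object; a1 = object count.
+    vsetvli t0, a1, e64, m1, ta, ma
+    vluxei64.v v16, (a0), v8     # v16[i] = 64-bit value at v8[i] + a0
+----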
+
+If the vector offset elements are narrower than XLEN, they are
+zero-extended to XLEN before adding to the base effective address. If
+the vector offset elements are wider than XLEN, the least-significant
+XLEN bits are used in the address calculation. An implementation must
+raise an illegal instruction exception if the EEW is not supported for
+offset elements.
+
+NOTE: A profile may place an upper limit on the maximum supported index
+EEW (e.g., only up to XLEN) smaller than ELEN.
+
+The vector addressing modes are encoded using the 2-bit `mop[1:0]`
+field.
+
+.encoding for loads
+[cols="1,1,7,6"]
+|===
+2+| mop [1:0] | Description | Opcodes
+
+| 0 | 0 | unit-stride | VLE<EEW>
+| 0 | 1 | indexed-unordered | VLUXEI<EEW>
+| 1 | 0 | strided | VLSE<EEW>
+| 1 | 1 | indexed-ordered | VLOXEI<EEW>
+|===
+
+.encoding for stores
+[cols="1,1,7,6"]
+|===
+2+| mop [1:0] | Description | Opcodes
+
+| 0 | 0 | unit-stride | VSE<EEW>
+| 0 | 1 | indexed-unordered | VSUXEI<EEW>
+| 1 | 0 | strided | VSSE<EEW>
+| 1 | 1 | indexed-ordered | VSOXEI<EEW>
+|===
+
+Vector unit-stride and constant-stride memory accesses do not
+guarantee ordering between individual element accesses. The vector
+indexed load and store memory operations have two forms, ordered and
+unordered. The indexed-ordered variants preserve element ordering on
+memory accesses.
+
+For unordered instructions (`mop[1:0]`!=11) there is no guarantee on
+element access order. If the accesses are to a strongly ordered IO
+region, the element accesses can be initiated in any order.
+
+NOTE: To provide ordered vector accesses to a strongly ordered IO
+region, the ordered indexed instructions should be used.
+
+For implementations with precise vector traps, exceptions on
+indexed-unordered stores must also be precise.
+
+Additional unit-stride vector addressing modes are encoded using the
+5-bit `lumop` and `sumop` fields in the unit-stride load and store
+instruction encodings respectively.
+
+.lumop
+[cols="1,1,1,1,1,11"]
+|===
+5+| lumop[4:0] | Description
+
+| 0 | 0 | 0 | 0 | 0 | unit-stride load
+| 0 | 1 | 0 | 0 | 0 | unit-stride, whole register load
+| 0 | 1 | 0 | 1 | 1 | unit-stride, mask load, EEW=8
+| 1 | 0 | 0 | 0 | 0 | unit-stride fault-only-first
+| x | x | x | x | x | other encodings reserved
+|===
+
+.sumop
+[cols="1,1,1,1,1,11"]
+|===
+5+| sumop[4:0] | Description
+
+| 0 | 0 | 0 | 0 | 0 | unit-stride store
+| 0 | 1 | 0 | 0 | 0 | unit-stride, whole register store
+| 0 | 1 | 0 | 1 | 1 | unit-stride, mask store, EEW=8
+| x | x | x | x | x | other encodings reserved
+|===
+
+The `nf[2:0]` field encodes the number of fields in each segment. For
+regular vector loads and stores, `nf`=0, indicating that a single
+value is moved between a vector register group and memory at each
+element position. Larger values in the `nf` field are used to access
+multiple contiguous fields within a segment as described below in
+Section <<sec-aos>>.
+
+The `nf[2:0]` field also encodes the number of whole vector registers
+to transfer for the whole vector register load/store instructions.
+
+[[sec-vector-loadstore-width-encoding]]
+==== Vector Load/Store Width Encoding
+
+Vector loads and stores have an EEW encoded directly in the
+instruction. The corresponding EMUL is calculated as EMUL =
+(EEW/SEW)*LMUL. If the EMUL would be out of range (EMUL>8 or
+EMUL<1/8), the instruction encoding is reserved. The vector register
+groups must have legal register specifiers for the selected EMUL,
+otherwise the instruction encoding is reserved.
+
+Vector unit-stride and constant-stride use the EEW/EMUL encoded in the
+instruction for the data values, while vector indexed loads and stores
+use the EEW/EMUL encoded in the instruction for the index values and
+the SEW/LMUL encoded in `vtype` for the data values.
+
+Vector loads and stores are encoded using width values that are not
+claimed by the standard scalar floating-point loads and stores.
+
+Implementations must provide vector loads and stores with EEWs
+corresponding to all supported SEW settings. Vector load/store
+encodings for unsupported EEW widths must raise an illegal
+instruction exception.
+
+.Width encoding for vector loads and stores.
+[cols="5,1,1,1,1,>3,>3,>3,3"]
+|===
+| | mew 3+| width [2:0] | Mem bits | Data Reg bits | Index bits | Opcodes
+
+| Standard scalar FP | x | 0 | 0 | 1 | 16| FLEN | - | FLH/FSH
+| Standard scalar FP | x | 0 | 1 | 0 | 32| FLEN | - | FLW/FSW
+| Standard scalar FP | x | 0 | 1 | 1 | 64| FLEN | - | FLD/FSD
+| Standard scalar FP | x | 1 | 0 | 0 | 128| FLEN | - | FLQ/FSQ
+| Vector 8b element | 0 | 0 | 0 | 0 | 8| 8 | - | VLxE8/VSxE8
+| Vector 16b element | 0 | 1 | 0 | 1 | 16| 16 | - | VLxE16/VSxE16
+| Vector 32b element | 0 | 1 | 1 | 0 | 32| 32 | - | VLxE32/VSxE32
+| Vector 64b element | 0 | 1 | 1 | 1 | 64| 64 | - | VLxE64/VSxE64
+| Vector 8b index | 0 | 0 | 0 | 0 | SEW | SEW | 8 | VLxEI8/VSxEI8
+| Vector 16b index | 0 | 1 | 0 | 1 | SEW | SEW | 16 | VLxEI16/VSxEI16
+| Vector 32b index | 0 | 1 | 1 | 0 | SEW | SEW | 32 | VLxEI32/VSxEI32
+| Vector 64b index | 0 | 1 | 1 | 1 | SEW | SEW | 64 | VLxEI64/VSxEI64
+| Reserved | 1 | X | X | X | - | - | - |
+|===
+
+Mem bits is the size of each element accessed in memory.
+
+Data reg bits is the size of each data element accessed in register.
+
+Index bits is the size of each index accessed in register.
+
+The `mew` bit (`inst[28]`) when set is expected to be used to encode
+expanded memory sizes of 128 bits and above, but these encodings are
+currently reserved.
+
+==== Vector Unit-Stride Instructions
+
+----
+ # Vector unit-stride loads and stores
+
+ # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vle8.v vd, (rs1), vm # 8-bit unit-stride load
+ vle16.v vd, (rs1), vm # 16-bit unit-stride load
+ vle32.v vd, (rs1), vm # 32-bit unit-stride load
+ vle64.v vd, (rs1), vm # 64-bit unit-stride load
+
+ # vs3 store data, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vse8.v vs3, (rs1), vm # 8-bit unit-stride store
+ vse16.v vs3, (rs1), vm # 16-bit unit-stride store
+ vse32.v vs3, (rs1), vm # 32-bit unit-stride store
+ vse64.v vs3, (rs1), vm # 64-bit unit-stride store
+----
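+
+For example, a non-normative stripmined byte-copy loop built from
+unit-stride accesses (`a0` = destination, `a1` = source, `a2` = count):
+
+----
+copy_loop:
+    vsetvli t0, a2, e8, m8, ta, ma  # t0 = bytes handled this iteration
+    vle8.v v8, (a1)                 # Load bytes from source
+    add a1, a1, t0
+    vse8.v v8, (a0)                 # Store bytes to destination
+    add a0, a0, t0
+    sub a2, a2, t0                  # Decrement remaining count
+    bnez a2, copy_loop
+----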
+
+Additional unit-stride mask load and store instructions are
+provided to transfer mask values to/from memory. These
+operate similarly to unmasked byte loads or stores (EEW=8), except that
+the effective vector length is ``evl``=ceil(``vl``/8) (i.e., EMUL=1),
+and the destination register is always written with a tail-agnostic
+policy.
+
+----
+ # Vector unit-stride mask load
+ vlm.v vd, (rs1) # Load byte vector of length ceil(vl/8)
+
+ # Vector unit-stride mask store
+ vsm.v vs3, (rs1) # Store byte vector of length ceil(vl/8)
+----
+
+`vlm.v` and `vsm.v` are encoded with the same `width[2:0]`=0 encoding as
+`vle8.v` and `vse8.v`, but are distinguished by different
+`lumop` and `sumop` encodings. Since `vlm.v` and `vsm.v` operate as byte loads and stores,
+`vstart` is in units of bytes for these instructions.
+
+NOTE: `vlm.v` and `vsm.v` respect the `vill` field in `vtype`, as
+they depend on `vtype` indirectly through its constraints on `vl`.
+
+NOTE: The previous assembler mnemonics `vle1.v` and `vse1.v` were
+confusing as length was handled differently for these instructions
+versus other element load/store instructions. To avoid software
+churn, these older assembly mnemonics are being retained as aliases.
+
+NOTE: The primary motivation to provide mask load and store is to
+support machines that internally rearrange data to reduce
+cross-datapath wiring. However, these instructions also provide a convenient
+mechanism to use packed bit vectors in memory as mask values,
+and also reduce the cost of mask spill/fill by reducing need to change
+`vl`.
+
+==== Vector Strided Instructions
+
+----
+ # Vector strided loads and stores
+
+ # vd destination, rs1 base address, rs2 byte stride
+ vlse8.v vd, (rs1), rs2, vm # 8-bit strided load
+ vlse16.v vd, (rs1), rs2, vm # 16-bit strided load
+ vlse32.v vd, (rs1), rs2, vm # 32-bit strided load
+ vlse64.v vd, (rs1), rs2, vm # 64-bit strided load
+
+ # vs3 store data, rs1 base address, rs2 byte stride
+ vsse8.v vs3, (rs1), rs2, vm # 8-bit strided store
+ vsse16.v vs3, (rs1), rs2, vm # 16-bit strided store
+ vsse32.v vs3, (rs1), rs2, vm # 32-bit strided store
+ vsse64.v vs3, (rs1), rs2, vm # 64-bit strided store
+----
+
+Negative and zero strides are supported.
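+
+For example (non-normative), a strided load can gather one column of a
+row-major matrix of 32-bit elements (`a0` = address of the first
+element of the column, `a1` = row size in bytes, `a2` = number of rows):
+
+----
+    vsetvli t0, a2, e32, m4, ta, ma
+    vlse32.v v8, (a0), a1     # v8[i] = 32-bit element at a0 + i*a1
+----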
+
+Element accesses within a strided instruction are unordered with
+respect to each other.
+
+When `rs2`=`x0`, then an implementation is allowed, but not required,
+to perform fewer memory operations than the number of active elements,
+and may perform different numbers of memory operations across
+different dynamic executions of the same static instruction.
+
+NOTE: Compilers must be careful not to use the `x0` form for `rs2`
+when the stride is a constant `0` and the intent is that all memory
+accesses be performed.
+
+When `rs2!=x0` and the value of `x[rs2]=0`, the implementation must
+perform one memory access for each active element (but these accesses
+will not be ordered).
+
+NOTE: As with other architectural mandates, implementations must
+_appear_ to perform each memory access. Microarchitectures are
+free to optimize away accesses that would not be observed by another
+agent, for example, in idempotent memory regions obeying RVWMO. For
+non-idempotent memory regions, where by definition each access can be
+observed by a device, the optimization would not be possible.
+
+NOTE: When repeating ordered vector accesses to the same memory
+address are required, then an ordered indexed operation can be used.
+
+==== Vector Indexed Instructions
+
+----
+ # Vector indexed loads and stores
+
+ # Vector indexed-unordered load instructions
+ # vd destination, rs1 base address, vs2 byte offsets
+ vluxei8.v vd, (rs1), vs2, vm # unordered 8-bit indexed load of SEW data
+ vluxei16.v vd, (rs1), vs2, vm # unordered 16-bit indexed load of SEW data
+ vluxei32.v vd, (rs1), vs2, vm # unordered 32-bit indexed load of SEW data
+ vluxei64.v vd, (rs1), vs2, vm # unordered 64-bit indexed load of SEW data
+
+ # Vector indexed-ordered load instructions
+ # vd destination, rs1 base address, vs2 byte offsets
+ vloxei8.v vd, (rs1), vs2, vm # ordered 8-bit indexed load of SEW data
+ vloxei16.v vd, (rs1), vs2, vm # ordered 16-bit indexed load of SEW data
+ vloxei32.v vd, (rs1), vs2, vm # ordered 32-bit indexed load of SEW data
+ vloxei64.v vd, (rs1), vs2, vm # ordered 64-bit indexed load of SEW data
+
+ # Vector indexed-unordered store instructions
+ # vs3 store data, rs1 base address, vs2 byte offsets
+ vsuxei8.v vs3, (rs1), vs2, vm # unordered 8-bit indexed store of SEW data
+ vsuxei16.v vs3, (rs1), vs2, vm # unordered 16-bit indexed store of SEW data
+ vsuxei32.v vs3, (rs1), vs2, vm # unordered 32-bit indexed store of SEW data
+ vsuxei64.v vs3, (rs1), vs2, vm # unordered 64-bit indexed store of SEW data
+
+ # Vector indexed-ordered store instructions
+ # vs3 store data, rs1 base address, vs2 byte offsets
+ vsoxei8.v vs3, (rs1), vs2, vm # ordered 8-bit indexed store of SEW data
+ vsoxei16.v vs3, (rs1), vs2, vm # ordered 16-bit indexed store of SEW data
+ vsoxei32.v vs3, (rs1), vs2, vm # ordered 32-bit indexed store of SEW data
+ vsoxei64.v vs3, (rs1), vs2, vm # ordered 64-bit indexed store of SEW data
+
+----
+
+NOTE: The assembler syntax for indexed loads and stores uses
+``ei``__x__ instead of ``e``__x__ to indicate the statically encoded EEW
+is of the index not the data.
+
+NOTE: The mnemonics for the indexed operations include a "U" or "O" to
+distinguish between unordered and ordered variants, while the other
+vector addressing modes have no such character. While this is perhaps
+a little less consistent, this approach minimizes disruption to
+existing software, as VSXEI previously meant "ordered", and that
+opcode can be retained as an alias during the transition to reduce
+software churn.
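+
+For example, a non-normative gather of 32-bit data through a table of
+byte offsets (`a0` = base address, `a1` = address of the offset table,
+`a2` = element count):
+
+----
+    vsetvli t0, a2, e32, m1, ta, ma
+    vle32.v v4, (a1)            # Load 32-bit byte offsets
+    vluxei32.v v8, (a0), v4     # v8[i] = 32-bit value at a0 + v4[i]
+----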
+
+==== Unit-stride Fault-Only-First Loads
+
+The unit-stride fault-only-first load instructions are used to
+vectorize loops with data-dependent exit conditions ("while" loops).
+These instructions execute as a regular load except that they will
+only take a trap caused by a synchronous exception on element 0. If
+element 0 raises an exception, `vl` is not modified, and the trap is
+taken. If an element > 0 raises an exception, the corresponding trap
+is not taken, and the vector length `vl` is reduced to the index of
+the element that would have raised an exception.
+
+Load instructions may overwrite active destination vector register
+group elements past the element index at which the trap is reported.
+Similarly, fault-only-first load instructions may update active destination
+elements past the element that causes trimming of the vector length
+(but not past the original vector length). The values of these
+spurious updates do not have to correspond to the values in memory at
+the addressed memory locations. Non-idempotent memory locations can
+only be accessed when it is known the corresponding element load
+operation will not be restarted due to a trap or vector-length
+trimming.
+
+----
+ # Vector unit-stride fault-only-first loads
+
+ # vd destination, rs1 base address, vm is mask encoding (v0.t or <missing>)
+ vle8ff.v vd, (rs1), vm # 8-bit unit-stride fault-only-first load
+ vle16ff.v vd, (rs1), vm # 16-bit unit-stride fault-only-first load
+ vle32ff.v vd, (rs1), vm # 32-bit unit-stride fault-only-first load
+ vle64ff.v vd, (rs1), vm # 64-bit unit-stride fault-only-first load
+----
+
+----
+strlen example using unit-stride fault-only-first instruction
+
+include::example/strlen.s[lines=4..-1]
+----
+
+NOTE: There is a security concern with fault-only-first loads, as they
+can be used to probe for valid effective addresses. The unit-stride
+versions only allow probing a region immediately contiguous to a known
+region, and so reduce the security impact when used in unprivileged
+code. However, code running in S-mode can establish arbitrary page
+translations that allow probing of random guest physical addresses
+provided by a hypervisor. Strided and scatter/gather fault-only-first
+instructions are not provided due to lack of encoding space, but they
+can also represent a larger security hole, allowing even unprivileged
+software to easily check multiple random pages for accessibility
+without experiencing a trap. This standard does not address possible
+security mitigations for fault-only-first instructions.
+
+Even when an exception is not raised, implementations are permitted to process
+fewer than `vl` elements and reduce `vl` accordingly, but if `vstart`=0 and
+`vl`>0, then at least one element must be processed.
+
+When the fault-only-first instruction takes a trap due to an
+interrupt, implementations should not reduce `vl` and should instead
+set a `vstart` value.
+
+NOTE: When the fault-only-first instruction would trigger a debug
+data-watchpoint trap on an element after the first, implementations
+should not reduce `vl` but instead should trigger the debug trap as
+otherwise the event might be lost.
+
+[[sec-aos]]
+==== Vector Load/Store Segment Instructions
+
+The vector load/store segment instructions move multiple contiguous
+fields in memory to and from consecutively numbered vector registers.
+
+NOTE: The name "segment" reflects that the items moved are subarrays
+with homogeneous elements. These operations can be used to transpose
+arrays between memory and registers, and can support operations on
+"array-of-structures" datatypes by unpacking each field in a structure
+into a separate vector register.
+
+The three-bit `nf` field in the vector instruction encoding is an
+unsigned integer that contains one less than the number of fields per
+segment, _NFIELDS_.
+
+[[fig-nf]]
+.NFIELDS Encoding
+[cols="1,1,1,13"]
+|===
+3+| nf[2:0] | NFIELDS
+
+| 0 | 0 | 0 | 1
+| 0 | 0 | 1 | 2
+| 0 | 1 | 0 | 3
+| 0 | 1 | 1 | 4
+| 1 | 0 | 0 | 5
+| 1 | 0 | 1 | 6
+| 1 | 1 | 0 | 7
+| 1 | 1 | 1 | 8
+|===
+
+The EMUL setting must be such that EMUL * NFIELDS {le} 8, otherwise
+the instruction encoding is reserved.
+
+NOTE: The product ceil(EMUL) * NFIELDS represents the number of underlying
+vector registers that will be touched by a segmented load or store
+instruction. This constraint makes this total no larger than 1/4 of
+the architectural register file, and the same as for regular
+operations with EMUL=8.
+
+Each field will be held in successively numbered vector register
+groups. When EMUL>1, each field will occupy a vector register group
+held in multiple successively numbered vector registers, and the
+vector register group for each field must follow the usual vector
+register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, each
+field's vector register group must start at an even vector register,
+but does not have to start at a multiple of 8 vector register number).
+
+If the vector register numbers accessed by the segment load or store
+would increment past 31, then the instruction encoding is reserved.
+
+NOTE: This constraint is to help allow for forward-compatibility with
+a possible future longer instruction encoding that has more
+addressable vector registers.
+
+The `vl` register gives the number of segments to move, which is
+equal to the number of elements transferred to each vector register
+group. Masking is also applied at the level of whole segments.
+
+For segment loads and stores, the individual memory accesses used to
+access fields within each segment are unordered with respect to each
+other even for ordered indexed segment loads and stores.
+
+The `vstart` value is in units of whole segments. If a trap occurs
+during access to a segment, it is implementation-defined whether a
+subset of the faulting segment's accesses is performed before the
+trap is taken.
+
+===== Vector Unit-Stride Segment Loads and Stores
+
+The vector unit-stride load and store segment instructions move packed
+contiguous segments into multiple destination vector register groups.
+
+NOTE: Where the segments hold structures with heterogeneous-sized
+fields, software can later unpack individual structure fields using
+additional instructions after the segment load brings data into the
+vector registers.
+
+The assembler prefixes `vlseg`/`vsseg` are used for unit-stride
+segment loads and stores respectively.
+
+----
+ # Format
+ vlseg<nf>e<eew>.v vd, (rs1), vm # Unit-stride segment load template
+ vsseg<nf>e<eew>.v vs3, (rs1), vm # Unit-stride segment store template
+
+ # Examples
+    vlseg8e8.v vd, (rs1), vm # Load eight vector registers, each with one byte field per segment.
+
+ vsseg3e32.v vs3, (rs1), vm # Store packed vector of 3*4-byte segments from vs3,vs3+1,vs3+2 to memory
+----
+
+For loads, the `vd` register will hold the first field loaded from the
+segment. For stores, the `vs3` register is read to provide the first
+field to be stored to each segment.
+
+----
+ # Example 1
+ # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp)
+ vsetvli a1, t0, e8, ta, ma
+ vlseg3e8.v v8, (a0), vm
+ # v8 holds the red pixels
+ # v9 holds the green pixels
+ # v10 holds the blue pixels
+
+ # Example 2
+ # Memory structure holds complex values, 32b for real and 32b for imaginary
+ vsetvli a1, t0, e32, ta, ma
+ vlseg2e32.v v8, (a0), vm
+ # v8 holds real
+ # v9 holds imaginary
+----
+
+There are also fault-only-first versions of the unit-stride instructions.
+
+----
+ # Template for vector fault-only-first unit-stride segment loads.
+ vlseg<nf>e<eew>ff.v vd, (rs1), vm # Unit-stride fault-only-first segment loads
+----
+
+For fault-only-first segment loads, if an exception is detected partway
+through accessing a segment, regardless of whether the element index is zero,
+it is implementation-defined whether a subset of the segment is loaded.
+
+These instructions may overwrite destination vector register group
+elements past the point at which a trap is reported or past the point
+at which vector length is trimmed.
+
+===== Vector Strided Segment Loads and Stores
+
+Vector strided segment loads and stores move contiguous segments where
+each segment is separated by the byte-stride offset given in the `rs2`
+GPR argument.
+
+NOTE: Negative and zero strides are supported.
+
+----
+ # Format
+ vlsseg<nf>e<eew>.v vd, (rs1), rs2, vm # Strided segment loads
+ vssseg<nf>e<eew>.v vs3, (rs1), rs2, vm # Strided segment stores
+
+ # Examples
+ vsetvli a1, t0, e8, ta, ma
+ vlsseg3e8.v v4, (x5), x6 # Load bytes at addresses x5+i*x6 into v4[i],
+ # and bytes at addresses x5+i*x6+1 into v5[i],
+ # and bytes at addresses x5+i*x6+2 into v6[i].
+
+ # Examples
+ vsetvli a1, t0, e32, ta, ma
+ vssseg2e32.v v2, (x5), x6 # Store words from v2[i] to address x5+i*x6
+ # and words from v3[i] to address x5+i*x6+4
+----
+
+Accesses to the fields within each segment can occur in any order,
+including the case where the byte stride is such that segments overlap
+in memory.
+
+===== Vector Indexed Segment Loads and Stores
+
+Vector indexed segment loads and stores move contiguous segments where
+each segment is located at an address given by adding the scalar base
+address in the `rs1` field to byte offsets in vector register `vs2`.
+Both ordered and unordered forms are provided, where the ordered forms
+access segments in element order. However, even for the ordered form,
+accesses to the fields within an individual segment are not ordered
+with respect to each other.
+
+The data vector register group has EEW=SEW, EMUL=LMUL, while the index
+vector register group has EEW encoded in the instruction with
+EMUL=(EEW/SEW)*LMUL.
+The EMUL * NFIELDS {le} 8 constraint applies to the data vector register group.
+
+----
+ # Format
+ vluxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-unordered segment loads
+ vloxseg<nf>ei<eew>.v vd, (rs1), vs2, vm # Indexed-ordered segment loads
+ vsuxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-unordered segment stores
+ vsoxseg<nf>ei<eew>.v vs3, (rs1), vs2, vm # Indexed-ordered segment stores
+
+ # Examples
+ vsetvli a1, t0, e8, ta, ma
+ vluxseg3ei8.v v4, (x5), v3 # Load bytes at addresses x5+v3[i] into v4[i],
+ # and bytes at addresses x5+v3[i]+1 into v5[i],
+ # and bytes at addresses x5+v3[i]+2 into v6[i].
+
+ # Examples
+ vsetvli a1, t0, e32, ta, ma
+ vsuxseg2ei32.v v2, (x5), v5 # Store words from v2[i] to address x5+v5[i]
+ # and words from v3[i] to address x5+v5[i]+4
+----
+
+For vector indexed segment loads, the destination vector register
+groups cannot overlap the source vector register group (specified by
+`vs2`), else the instruction encoding is reserved.
+
+NOTE: This constraint supports restart of indexed segment loads
+that raise exceptions partway through loading a structure.
+
+==== Vector Load/Store Whole Register Instructions
+
+Format for Vector Load Whole Register Instructions under LOAD-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | mew| 00 | 1| 01000 | rs1 | width | vd |0000111| VL<nf>R
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x07, attr: 'VL*R*'},
+ {bits: 5, name: 'vd', attr: 'destination of load', type: 2},
+ {bits: 3, name: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 8, attr: 'lumop'},
+ {bits: 1, name: 1, attr: 'vm'},
+  {bits: 2, name: 0, attr: 'mop'},
+ {bits: 1, name: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+Format for Vector Store Whole Register Instructions under STORE-FP major opcode
+
+////
+31 29 28 27 26 25 24 20 19 15 14 12 11 7 6 0
+ nf | 0 | 00 | 1| 01000 | rs1 | 000 | vs3 |0100111| VS<nf>R
+////
+
+[wavedrom,,svg]
+....
+{reg: [
+ {bits: 7, name: 0x27, attr: 'VS*R*'},
+ {bits: 5, name: 'vs3', attr: 'store data', type: 2},
+  {bits: 3, name: 0, attr: 'width'},
+ {bits: 5, name: 'rs1', attr: 'base address', type: 4},
+ {bits: 5, name: 8, attr: 'sumop'},
+ {bits: 1, name: 1, attr: 'vm'},
+  {bits: 2, name: 0, attr: 'mop'},
+  {bits: 1, name: 0, attr: 'mew'},
+ {bits: 3, name: 'nf'},
+]}
+....
+
+These instructions load and store whole vector register groups.
+
+NOTE: These instructions are intended to be used to save and restore
+vector registers when the type or length of the current contents of
+the vector register is not known, or where modifying `vl` and `vtype`
+would be costly. Examples include compiler register spills, vector
+function calls where values are passed in vector registers, interrupt
+handlers, and OS context switches. Software can determine the number
+of bytes transferred by reading the `vlenb` register.
+
+The load instructions have an EEW encoded in the `mew` and `width`
+fields following the pattern of regular unit-stride loads.
+
+NOTE: Because in-register byte layouts are identical to in-memory byte
+layouts, the same data is written to the destination register group
+regardless of EEW.
+Hence, it would have sufficed to provide only EEW=8 variants.
+The full set of EEW variants is provided so that the encoded EEW can be used
+as a hint to indicate the destination register group will next be accessed
+with this EEW, which aids implementations that rearrange data internally.
+
+The vector whole register store instructions are encoded similarly to
+an unmasked unit-stride store of elements with EEW=8.
+
+The `nf` field encodes how many vector registers to load and store using the NFIELDS encoding (see <<fig-nf>>).
+The encoded number of registers must be a power of 2 and the vector
+register numbers must be aligned as with a vector register group,
+otherwise the instruction encoding is reserved. NFIELDS
+indicates the number of vector registers to transfer, numbered
+successively after the base. Only NFIELDS values of 1, 2, 4, 8 are
+supported, with other values reserved. When multiple registers are
+transferred, the lowest-numbered vector register is held in the
+lowest-numbered memory addresses and successive vector register
+numbers are placed contiguously in memory.
+
+The instructions operate with an effective vector length,
+`evl`=NFIELDS*VLEN/EEW, regardless of current settings in `vtype` and
+`vl`. The usual property that no elements are written if `vstart`
+{ge} `vl` does not apply to these instructions. Instead, no elements
+are written if `vstart` {ge} `evl`.
+
+The instructions operate similarly to unmasked unit-stride load and
+store instructions, with the base address passed in the scalar `x`
+register specified by `rs1`.
+
+Implementations are allowed to raise a misaligned address exception on
+whole register loads and stores if the base address is not naturally
+aligned to the larger of the size of the encoded EEW in bytes (EEW/8)
+or the implementation's smallest supported SEW size in bytes
+(SEW~MIN~/8).
+
+NOTE: Allowing misaligned exceptions to be raised based on
+non-alignment to the encoded EEW simplifies the implementation of these
+instructions. Some subset implementations might not support smaller
+SEW widths, so are allowed to report misaligned exceptions for the
+smallest supported SEW even if larger than encoded EEW. An extreme
+non-standard implementation might have SEW~MIN~>XLEN for example. Software
+environments can mandate the minimum alignment requirements to support
+an ABI.
+
+----
+ # Format of whole register load and store instructions.
+ vl1r.v v3, (a0) # Pseudoinstruction equal to vl1re8.v
+
+ vl1re8.v v3, (a0) # Load v3 with VLEN/8 bytes held at address in a0
+ vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords held at address in a0
+ vl1re32.v v3, (a0) # Load v3 with VLEN/32 words held at address in a0
+ vl1re64.v v3, (a0) # Load v3 with VLEN/64 doublewords held at address in a0
+
+ vl2r.v v2, (a0) # Pseudoinstruction equal to vl2re8.v
+
+ vl2re8.v v2, (a0) # Load v2-v3 with 2*VLEN/8 bytes from address in a0
+ vl2re16.v v2, (a0) # Load v2-v3 with 2*VLEN/16 halfwords held at address in a0
+ vl2re32.v v2, (a0) # Load v2-v3 with 2*VLEN/32 words held at address in a0
+ vl2re64.v v2, (a0) # Load v2-v3 with 2*VLEN/64 doublewords held at address in a0
+
+ vl4r.v v4, (a0) # Pseudoinstruction equal to vl4re8.v
+
+ vl4re8.v v4, (a0) # Load v4-v7 with 4*VLEN/8 bytes from address in a0
+ vl4re16.v v4, (a0)
+ vl4re32.v v4, (a0)
+ vl4re64.v v4, (a0)
+
+ vl8r.v v8, (a0) # Pseudoinstruction equal to vl8re8.v
+
+ vl8re8.v v8, (a0) # Load v8-v15 with 8*VLEN/8 bytes from address in a0
+ vl8re16.v v8, (a0)
+ vl8re32.v v8, (a0)
+ vl8re64.v v8, (a0)
+
+ vs1r.v v3, (a1) # Store v3 to address in a1
+ vs2r.v v2, (a1) # Store v2-v3 to address in a1
+ vs4r.v v4, (a1) # Store v4-v7 to address in a1
+ vs8r.v v8, (a1) # Store v8-v15 to address in a1
+----
+
+NOTE: Implementations should raise illegal instruction exceptions on
+`vl<nf>r` instructions for EEW values that are not supported.
+
+NOTE: We considered adding a whole register mask load instruction
+(`vl1rm.v`) but decided to omit it from the initial extension. The
+primary purpose would be to inform the microarchitecture that the
+data will be used as a mask. The same effect can be achieved with the
+following code sequence, whose cost is at most four instructions. Of
+these, the first could likely be removed, as `vl` is often already
+in a scalar register, and the last might already be present if the
+following vector instruction needs a new SEW/LMUL. So, in the best
+case, only two instructions (of which only one performs vector
+operations) are needed to synthesize the effect of the dedicated
+instruction:
+----
+ csrr t0, vl # Save current vl (potentially not needed)
+ vsetvli t1, x0, e8, m8, ta, ma # Maximum VLMAX
+ vlm.v v0, (a0) # Load mask register
+ vsetvli x0, t0, <new type> # Restore vl (potentially already present)
+----
+
+=== Vector Memory Alignment Constraints
+
+If an element accessed by a vector memory instruction is not naturally
+aligned to the size of the element, either the element is transferred
+successfully or an address misaligned exception is raised on that
+element.
+
+Support for misaligned vector memory accesses is independent of an
+implementation's support for misaligned scalar memory accesses.
+
+NOTE: An implementation may support some or all misaligned accesses
+in hardware for neither, either, or both of scalar and vector memory
+accesses. A separate PMA should be defined to determine if vector
+misaligned accesses are supported in the associated address range.
+
+Vector misaligned memory accesses follow the same rules for atomicity
+as scalar misaligned memory accesses.
+
+=== Vector Memory Consistency Model
+
+Vector memory instructions appear to execute in program order on the
+local hart.
+
+Vector memory instructions follow RVWMO at the instruction level.
+If the Ztso extension is implemented, vector memory instructions additionally
+follow RVTSO at the instruction level.
+
+Except for vector indexed-ordered loads and stores, element operations
+are unordered within the instruction.
+
+Vector indexed-ordered loads and stores read and write elements
+from/to memory in element order respectively,
+obeying RVWMO at the element level.
+
+NOTE: Ztso only imposes RVTSO at the instruction level; intra-instruction
+ordering follows RVWMO regardless of whether Ztso is implemented.
+
+NOTE: More formal definitions required.
+
+Instructions affected by the vector length register `vl` have a control
+dependency on `vl`, rather than a data dependency.
+Similarly, masked vector instructions have a control dependency on the source
+mask register, rather than a data dependency.
+
+NOTE: Treating the vector length and mask as control rather than data
+typically matches the semantics of the corresponding scalar code, where branch
+instructions ordinarily would have been used.
+Treating the mask as control allows masked vector load instructions to access
+memory before the mask value is known, without the need for
+a misspeculation-recovery mechanism.
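+
+A non-normative sketch (register assignments assumed) of a masked load
+whose memory access has only a control dependency on the mask:
+
+----
+ # The masked load may access memory before the mask in v0 is known,
+ # much as a scalar load may be speculated past a guarding branch.
+ vmslt.vx v0, v8, t0     # Compute mask from data in v8
+ vle32.v v4, (a0), v0.t  # Masked unit-stride load of active elements
+----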
+
+=== Vector Arithmetic Instruction Formats
+
+The vector arithmetic instructions use a new major opcode (OP-V =
+1010111~2~) which neighbors OP-FP. The three-bit `funct3` field is
+used to define sub-categories of vector instructions.
+
+include::images/wavedrom/valu-format.adoc[]
+
+[[sec-arithmetic-encoding]]
+==== Vector Arithmetic Instruction encoding
+
+The `funct3` field encodes the operand type and source locations.
+
+.funct3
+[cols="1,1,1,3,5,5"]
+|===
+3+| funct3[2:0] | Category | Operands | Type of scalar operand
+
+| 0 | 0 | 0 | OPIVV | vector-vector | N/A
+| 0 | 0 | 1 | OPFVV | vector-vector | N/A
+| 0 | 1 | 0 | OPMVV | vector-vector | N/A
+| 0 | 1 | 1 | OPIVI | vector-immediate | `imm[4:0]`
+| 1 | 0 | 0 | OPIVX | vector-scalar | GPR `x` register `rs1`
+| 1 | 0 | 1 | OPFVF | vector-scalar | FP `f` register `rs1`
+| 1 | 1 | 0 | OPMVX | vector-scalar | GPR `x` register `rs1`
+| 1 | 1 | 1 | OPCFG | scalars-imms | GPR `x` register `rs1` & `rs2`/`imm`
+|===
+
+Integer operations are performed using unsigned or two's-complement
+signed integer arithmetic depending on the opcode.
+
+NOTE: In this discussion, fixed-point operations are
+considered to be integer operations.
+
+All standard vector floating-point arithmetic operations follow the
+IEEE-754/2008 standard. All vector floating-point operations use the
+dynamic rounding mode in the `frm` register. Use of the `frm` field
+when it contains an invalid rounding mode by any vector floating-point
+instruction--even those that do not depend on the rounding mode, or
+when `vl`=0, or when `vstart` {ge} `vl`--is reserved.
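+
+As a non-normative illustration, software can establish a valid
+rounding mode before any vector floating-point instruction executes,
+even one that does not round:
+
+----
+ csrwi frm, 0         # Select a valid rounding mode (0 = rne)
+ vfadd.vv v1, v2, v3  # Safe: frm now holds a valid value
+----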
+
+NOTE: All vector floating-point code will rely on a valid value in
+`frm`. Implementations can make all vector FP instructions report
+exceptions when the rounding mode is invalid to simplify control
+logic.
+
+Vector-vector operations take two vectors of operands from vector
+register groups specified by `vs2` and `vs1` respectively.
+
+Vector-scalar operations can have three possible forms. In all three forms,
+the vector register group operand is specified by `vs2`. The second
+scalar source operand comes from one of three alternative sources:
+
+. For integer operations, the scalar can be a 5-bit immediate, `imm[4:0]`, encoded
+in the `rs1` field. The value is sign-extended to SEW bits, unless
+otherwise specified.
+
+. For integer operations, the scalar can be taken from the scalar `x`
+register specified by `rs1`. If XLEN>SEW, the least-significant SEW
+bits of the `x` register are used, unless otherwise specified. If
+XLEN<SEW, the value from the `x` register is sign-extended to SEW
+bits.
+
+. For floating-point operations, the scalar can be taken from a scalar
+`f` register. If FLEN > SEW, the value in the `f` register is
+checked for a valid NaN-boxed value, in which case the
+least-significant SEW bits of the `f` register are used, else the
+canonical NaN value is used. Vector instructions where any
+floating-point vector operand's EEW is not a supported floating-point
+type width (which includes when FLEN < SEW) are reserved.
+
+NOTE: Some instructions _zero_-extend the 5-bit immediate, and denote this
+by naming the immediate `uimm` in the assembly syntax.
+
+NOTE: When adding a vector extension to the Zfinx/Zdinx/Zhinx
+extensions, floating-point scalar arguments are taken from the `x`
+registers. NaN-boxing is not supported in these extensions, and so
+the vector floating-point scalar value is produced using the same
+rules as for an integer scalar operand (i.e., when XLEN > SEW use the
+lowest SEW bits, when XLEN < SEW use the sign-extended value).
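+
+As a brief illustration of the integer scalar rule above (values and
+registers assumed), when XLEN=64 and SEW=8 only the low 8 bits of the
+scalar participate:
+
+----
+ li t0, 0x1234       # Only the low byte (0x34) is used when SEW=8
+ vsetvli x0, a0, e8, m1, ta, ma
+ vadd.vx v1, v2, t0  # vd[i] = v2[i] + 0x34
+----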
+
+Vector arithmetic instructions are masked under control of the `vm`
+field.
+
+----
+# Assembly syntax pattern for vector binary arithmetic instructions
+
+# Operations returning vector results, masked by vm (v0.t, <nothing>)
+vop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vop.vx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+vop.vi vd, vs2, imm, vm # integer vector-immediate vd[i] = vs2[i] op imm
+
+vfop.vv vd, vs2, vs1, vm # FP vector-vector operation vd[i] = vs2[i] fop vs1[i]
+vfop.vf vd, vs2, rs1, vm # FP vector-scalar operation vd[i] = vs2[i] fop f[rs1]
+----
+
+NOTE: In the encoding, `vs2` is the first operand, while `rs1/imm`
+is the second operand. This is the opposite to the standard scalar
+ordering. This arrangement retains the existing encoding conventions
+that instructions that read only one scalar register, read it from
+`rs1`, and that 5-bit immediates are sourced from the `rs1` field.
+
+----
+# Assembly syntax pattern for vector ternary arithmetic instructions (multiply-add)
+
+# Integer operations overwriting sum input
+vop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vs2[i] + vd[i]
+vop.vx vd, rs1, vs2, vm # vd[i] = x[rs1] * vs2[i] + vd[i]
+
+# Integer operations overwriting product input
+vop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vd[i] + vs2[i]
+vop.vx vd, rs1, vs2, vm # vd[i] = x[rs1] * vd[i] + vs2[i]
+
+# Floating-point operations overwriting sum input
+vfop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vs2[i] + vd[i]
+vfop.vf vd, rs1, vs2, vm # vd[i] = f[rs1] * vs2[i] + vd[i]
+
+# Floating-point operations overwriting product input
+vfop.vv vd, vs1, vs2, vm # vd[i] = vs1[i] * vd[i] + vs2[i]
+vfop.vf vd, rs1, vs2, vm # vd[i] = f[rs1] * vd[i] + vs2[i]
+----
+
+NOTE: For ternary multiply-add operations, the assembler syntax always
+places the destination vector register first, followed by either `rs1`
+or `vs1`, then `vs2`. This ordering provides a more natural reading
+of the assembler for these ternary operations, as the multiply
+operands are always next to each other.
+
+[[sec-widening]]
+==== Widening Vector Arithmetic Instructions
+
+A few vector arithmetic instructions are defined to be __widening__
+operations where the destination vector register group has EEW=2*SEW
+and EMUL=2*LMUL. These are generally given a `vw*` prefix on the
+opcode, or `vfw*` for vector floating-point instructions.
+
+The first vector register group operand can be either single or
+double-width.
+
+----
+Assembly syntax pattern for vector widening arithmetic instructions
+
+# Double-width result, two single-width sources: 2*SEW = SEW op SEW
+vwop.vv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vwop.vx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+
+# Double-width result, first source double-width, second source single-width: 2*SEW = 2*SEW op SEW
+vwop.wv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vwop.wx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+----
+
+NOTE: Originally, a `w` suffix was used on the opcode, but this could
+be confused with the use of a `w` suffix to mean word-sized operations
+on doubleword integers, so the `w` was moved to a prefix.
+
+NOTE: The floating-point widening operations were changed to `vfw*`
+from `vwf*` to be more consistent with any scalar widening
+floating-point operations that will be written as `fw*`.
+
+Widening instruction encodings must follow the constraints in Section
+<<sec-vec-operands>>.
+
+[[sec-narrowing]]
+==== Narrowing Vector Arithmetic Instructions
+
+A few instructions are provided to convert double-width source vectors
+into single-width destination vectors. These instructions convert a
+vector register group specified by `vs2` with EEW/EMUL=2*SEW/2*LMUL to a vector register
+group with the current SEW/LMUL setting. Where there is a second
+source vector register group (specified by `vs1`), this has the same
+(narrower) width as the result (i.e., EEW=SEW).
+
+NOTE: An alternative design decision would have been to treat SEW/LMUL
+as defining the size of the source vector register group. The choice
+here is motivated by the belief the chosen approach will require fewer
+`vtype` changes.
+
+NOTE: Compare operations that set a mask register are also
+implicitly a narrowing operation.
+
+A `vn*` prefix on the opcode is used to distinguish these instructions
+in the assembler, or a `vfn*` prefix for narrowing floating-point
+opcodes. The double-width source vector register group is signified
+by a `w` in the source operand suffix (e.g., `vnsra.wv`).
+
+----
+Assembly syntax pattern for vector narrowing arithmetic instructions
+
+# Single-width result vd, double-width source vs2, single-width source vs1/rs1
+# SEW = 2*SEW op SEW
+vnop.wv vd, vs2, vs1, vm # integer vector-vector vd[i] = vs2[i] op vs1[i]
+vnop.wx vd, vs2, rs1, vm # integer vector-scalar vd[i] = vs2[i] op x[rs1]
+----
+
+Narrowing instruction encodings must follow the constraints in Section
+<<sec-vec-operands>>.
+
+[[sec-vector-integer]]
+=== Vector Integer Arithmetic Instructions
+
+A set of vector integer arithmetic instructions is provided. Unless
+otherwise stated, integer operations wrap around on overflow.
+
+==== Vector Single-Width Integer Add and Subtract
+
+Vector integer add and subtract are provided. Reverse-subtract
+instructions are also provided for the vector-scalar forms.
+
+----
+# Integer adds.
+vadd.vv vd, vs2, vs1, vm # Vector-vector
+vadd.vx vd, vs2, rs1, vm # vector-scalar
+vadd.vi vd, vs2, imm, vm # vector-immediate
+
+# Integer subtract
+vsub.vv vd, vs2, vs1, vm # Vector-vector
+vsub.vx vd, vs2, rs1, vm # vector-scalar
+
+# Integer reverse subtract
+vrsub.vx vd, vs2, rs1, vm # vd[i] = x[rs1] - vs2[i]
+vrsub.vi vd, vs2, imm, vm # vd[i] = imm - vs2[i]
+----
+
+NOTE: A vector of integer values can be negated using a
+reverse-subtract instruction with a scalar operand of `x0`. An
+assembly pseudoinstruction `vneg.v vd,vs` = `vrsub.vx vd,vs,x0` is provided.
+
+==== Vector Widening Integer Add/Subtract
+
+The widening add/subtract instructions are provided in both signed and
+unsigned variants, depending on whether the narrower source operands
+are first sign- or zero-extended before forming the double-width sum.
+
+----
+# Widening unsigned integer add/subtract, 2*SEW = SEW +/- SEW
+vwaddu.vv vd, vs2, vs1, vm # vector-vector
+vwaddu.vx vd, vs2, rs1, vm # vector-scalar
+vwsubu.vv vd, vs2, vs1, vm # vector-vector
+vwsubu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed integer add/subtract, 2*SEW = SEW +/- SEW
+vwadd.vv vd, vs2, vs1, vm # vector-vector
+vwadd.vx vd, vs2, rs1, vm # vector-scalar
+vwsub.vv vd, vs2, vs1, vm # vector-vector
+vwsub.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening unsigned integer add/subtract, 2*SEW = 2*SEW +/- SEW
+vwaddu.wv vd, vs2, vs1, vm # vector-vector
+vwaddu.wx vd, vs2, rs1, vm # vector-scalar
+vwsubu.wv vd, vs2, vs1, vm # vector-vector
+vwsubu.wx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed integer add/subtract, 2*SEW = 2*SEW +/- SEW
+vwadd.wv vd, vs2, vs1, vm # vector-vector
+vwadd.wx vd, vs2, rs1, vm # vector-scalar
+vwsub.wv vd, vs2, vs1, vm # vector-vector
+vwsub.wx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: An integer value can be doubled in width using the widening add
+instructions with a scalar operand of `x0`. Assembly
+pseudoinstructions `vwcvt.x.x.v vd,vs,vm` = `vwadd.vx vd,vs,x0,vm` and
+`vwcvtu.x.x.v vd,vs,vm` = `vwaddu.vx vd,vs,x0,vm` are provided.
+
+==== Vector Integer Extension
+
+The vector integer extension instructions zero- or sign-extend a
+source vector integer operand with EEW less than SEW to fill SEW-sized
+elements in the destination. The EEW of the source is 1/2, 1/4, or
+1/8 of SEW, while EMUL of the source is (EEW/SEW)*LMUL. The
+destination has EEW equal to SEW and EMUL equal to LMUL.
+
+----
+vzext.vf2 vd, vs2, vm # Zero-extend SEW/2 source to SEW destination
+vsext.vf2 vd, vs2, vm # Sign-extend SEW/2 source to SEW destination
+vzext.vf4 vd, vs2, vm # Zero-extend SEW/4 source to SEW destination
+vsext.vf4 vd, vs2, vm # Sign-extend SEW/4 source to SEW destination
+vzext.vf8 vd, vs2, vm # Zero-extend SEW/8 source to SEW destination
+vsext.vf8 vd, vs2, vm # Sign-extend SEW/8 source to SEW destination
+----
+
+If the source EEW is not a supported width, or source EMUL would be
+below the minimum legal LMUL, the instruction encoding is reserved.
+
+NOTE: Standard vector load instructions access memory values that are
+the same size as the destination register elements. Some application
+code needs to operate on a range of operand widths in a wider element,
+for example, loading a byte from memory and adding to an eight-byte
+element. To avoid having to provide the cross-product of the number
+of vector load instructions by the number of data types (byte, word,
+halfword, and also signed/unsigned variants), we instead add explicit
+extension instructions that can be used if an appropriate widening
+arithmetic instruction is not available.
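+
+For instance, a sketch (register usage assumed) that loads a byte
+array and widens it for arithmetic at SEW=64:
+
+----
+ vsetvli t0, a1, e64, m8, ta, ma  # SEW=64 for the computation
+ vle8.v v1, (a0)                  # EEW=8 load, so EMUL = LMUL/8 = 1
+ vsext.vf8 v8, v1                 # Sign-extend SEW/8 source to SEW destination
+----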
+
+==== Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+
+To support multi-word integer arithmetic, instructions that operate on
+a carry bit are provided. For each operation (add or subtract), two
+instructions are provided: one to provide the result (SEW width), and
+the second to generate the carry output (single bit encoded as a mask
+boolean).
+
+The carry inputs and outputs are represented using the mask register
+layout as described in Section <<sec-mask-register-layout>>. Due to
+encoding constraints, the carry input must come from the implicit `v0`
+register, but carry outputs can be written to any vector register that
+respects the source/destination overlap restrictions.
+
+`vadc` and `vsbc` add or subtract the source operands and the carry-in or
+borrow-in, and write the result to vector register `vd`.
+These instructions are encoded as masked instructions (`vm=0`), but they operate
+on and write back all body elements.
+Encodings corresponding to the unmasked versions (`vm=1`) are reserved.
+
+`vmadc` and `vmsbc` add or subtract the source operands, optionally
+add the carry-in or subtract the borrow-in if masked (`vm=0`), and
+write the result back to mask register `vd`. If unmasked (`vm=1`),
+there is no carry-in or borrow-in. These instructions operate on and
+write back all body elements, even if masked. Because these
+instructions produce a mask value, they always operate with a
+tail-agnostic policy.
+
+----
+ # Produce sum with carry.
+
+ # vd[i] = vs2[i] + vs1[i] + v0.mask[i]
+ vadc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd[i] = vs2[i] + x[rs1] + v0.mask[i]
+ vadc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd[i] = vs2[i] + imm + v0.mask[i]
+ vadc.vim vd, vs2, imm, v0 # Vector-immediate
+
+ # Produce carry out in mask register format
+
+ # vd.mask[i] = carry_out(vs2[i] + vs1[i] + v0.mask[i])
+ vmadc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd.mask[i] = carry_out(vs2[i] + x[rs1] + v0.mask[i])
+ vmadc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd.mask[i] = carry_out(vs2[i] + imm + v0.mask[i])
+ vmadc.vim vd, vs2, imm, v0 # Vector-immediate
+
+ # vd.mask[i] = carry_out(vs2[i] + vs1[i])
+ vmadc.vv vd, vs2, vs1 # Vector-vector, no carry-in
+
+ # vd.mask[i] = carry_out(vs2[i] + x[rs1])
+ vmadc.vx vd, vs2, rs1 # Vector-scalar, no carry-in
+
+ # vd.mask[i] = carry_out(vs2[i] + imm)
+ vmadc.vi vd, vs2, imm # Vector-immediate, no carry-in
+----
+
+Because implementing a carry propagation requires executing two
+instructions with unchanged inputs, destructive accumulations will
+require an additional move to obtain correct results.
+
+----
+ # Example multi-word arithmetic sequence, accumulating into v4
+ vmadc.vvm v1, v4, v8, v0 # Get carry into temp register v1
+ vadc.vvm v4, v4, v8, v0 # Calc new sum
+ vmmv.m v0, v1 # Move temp carry into v0 for next word
+----
+
+The subtract with borrow instruction `vsbc` performs the equivalent
+function to support long word arithmetic for subtraction. There are
+no subtract with immediate instructions.
+
+----
+ # Produce difference with borrow.
+
+ # vd[i] = vs2[i] - vs1[i] - v0.mask[i]
+ vsbc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd[i] = vs2[i] - x[rs1] - v0.mask[i]
+ vsbc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # Produce borrow out in mask register format
+
+ # vd.mask[i] = borrow_out(vs2[i] - vs1[i] - v0.mask[i])
+ vmsbc.vvm vd, vs2, vs1, v0 # Vector-vector
+
+ # vd.mask[i] = borrow_out(vs2[i] - x[rs1] - v0.mask[i])
+ vmsbc.vxm vd, vs2, rs1, v0 # Vector-scalar
+
+ # vd.mask[i] = borrow_out(vs2[i] - vs1[i])
+ vmsbc.vv vd, vs2, vs1 # Vector-vector, no borrow-in
+
+ # vd.mask[i] = borrow_out(vs2[i] - x[rs1])
+ vmsbc.vx vd, vs2, rs1 # Vector-scalar, no borrow-in
+----
+
+For `vmsbc`, the borrow is defined to be 1 iff the difference, prior to
+truncation, is negative.
+
+For `vadc` and `vsbc`, the instruction encoding is reserved if the
+destination vector register is `v0`.
+
+NOTE: This constraint corresponds to the constraint on masked vector
+operations that overwrite the mask register.
+
+==== Vector Bitwise Logical Instructions
+
+----
+# Bitwise logical operations.
+vand.vv vd, vs2, vs1, vm # Vector-vector
+vand.vx vd, vs2, rs1, vm # vector-scalar
+vand.vi vd, vs2, imm, vm # vector-immediate
+
+vor.vv vd, vs2, vs1, vm # Vector-vector
+vor.vx vd, vs2, rs1, vm # vector-scalar
+vor.vi vd, vs2, imm, vm # vector-immediate
+
+vxor.vv vd, vs2, vs1, vm # Vector-vector
+vxor.vx vd, vs2, rs1, vm # vector-scalar
+vxor.vi vd, vs2, imm, vm # vector-immediate
+----
+
+NOTE: With an immediate of -1, the vector-immediate form of the `vxor`
+instruction provides a bitwise NOT operation. This is provided as
+an assembler pseudoinstruction `vnot.v vd,vs,vm` = `vxor.vi vd,vs,-1,vm`.
+
+==== Vector Single-Width Shift Instructions
+
+A full set of vector shift instructions are provided, including
+logical shift left (`sll`), and logical (zero-extending `srl`) and
+arithmetic (sign-extending `sra`) shift right. The data to be shifted
+is in the vector register group specified by `vs2` and the shift
+amount value can come from a vector register group `vs1`, a scalar
+integer register `rs1`, or a zero-extended 5-bit immediate. Only the low
+lg2(SEW) bits of the shift-amount value are used to control the shift
+amount.
+
+----
+# Bit shift operations
+vsll.vv vd, vs2, vs1, vm # Vector-vector
+vsll.vx vd, vs2, rs1, vm # vector-scalar
+vsll.vi vd, vs2, uimm, vm # vector-immediate
+
+vsrl.vv vd, vs2, vs1, vm # Vector-vector
+vsrl.vx vd, vs2, rs1, vm # vector-scalar
+vsrl.vi vd, vs2, uimm, vm # vector-immediate
+
+vsra.vv vd, vs2, vs1, vm # Vector-vector
+vsra.vx vd, vs2, rs1, vm # vector-scalar
+vsra.vi vd, vs2, uimm, vm # vector-immediate
+----
+
+==== Vector Narrowing Integer Right Shift Instructions
+
+The narrowing right shifts extract a smaller field from a wider
+operand and have both zero-extending (`srl`) and sign-extending
+(`sra`) forms. The shift amount can come from a vector register
+group, or a scalar `x` register, or a zero-extended 5-bit immediate.
+The low lg2(2*SEW) bits of the shift-amount value are
+used (e.g., the low 6 bits for a SEW=64-bit to SEW=32-bit narrowing
+operation).
+
+----
+ # Narrowing shift right logical, SEW = (2*SEW) >> SEW
+ vnsrl.wv vd, vs2, vs1, vm # vector-vector
+ vnsrl.wx vd, vs2, rs1, vm # vector-scalar
+ vnsrl.wi vd, vs2, uimm, vm # vector-immediate
+
+ # Narrowing shift right arithmetic, SEW = (2*SEW) >> SEW
+ vnsra.wv vd, vs2, vs1, vm # vector-vector
+ vnsra.wx vd, vs2, rs1, vm # vector-scalar
+ vnsra.wi vd, vs2, uimm, vm # vector-immediate
+----
+
+NOTE: Future extensions might add support for versions that narrow to
+a destination that is 1/4 the width of the source.
+
+NOTE: An integer value can be halved in width using the narrowing integer
+shift instructions with a scalar operand of `x0`. An assembly
+pseudoinstruction is provided `vncvt.x.x.w vd,vs,vm` = `vnsrl.wx vd,vs,x0,vm`.
+
+==== Vector Integer Compare Instructions
+
+The following integer compare instructions write 1 to the destination
+mask register element if the comparison evaluates to true, and 0
+otherwise. The destination mask vector is always held in a single
+vector register, with a layout of elements as described in Section
+<<sec-mask-register-layout>>. The destination mask vector register
+may be the same as the source vector mask register (`v0`).
+
+----
+# Set if equal
+vmseq.vv vd, vs2, vs1, vm # Vector-vector
+vmseq.vx vd, vs2, rs1, vm # vector-scalar
+vmseq.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if not equal
+vmsne.vv vd, vs2, vs1, vm # Vector-vector
+vmsne.vx vd, vs2, rs1, vm # vector-scalar
+vmsne.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if less than, unsigned
+vmsltu.vv vd, vs2, vs1, vm # Vector-vector
+vmsltu.vx vd, vs2, rs1, vm # Vector-scalar
+
+# Set if less than, signed
+vmslt.vv vd, vs2, vs1, vm # Vector-vector
+vmslt.vx vd, vs2, rs1, vm # vector-scalar
+
+# Set if less than or equal, unsigned
+vmsleu.vv vd, vs2, vs1, vm # Vector-vector
+vmsleu.vx vd, vs2, rs1, vm # vector-scalar
+vmsleu.vi vd, vs2, imm, vm # Vector-immediate
+
+# Set if less than or equal, signed
+vmsle.vv vd, vs2, vs1, vm # Vector-vector
+vmsle.vx vd, vs2, rs1, vm # vector-scalar
+vmsle.vi vd, vs2, imm, vm # vector-immediate
+
+# Set if greater than, unsigned
+vmsgtu.vx vd, vs2, rs1, vm # Vector-scalar
+vmsgtu.vi vd, vs2, imm, vm # Vector-immediate
+
+# Set if greater than, signed
+vmsgt.vx vd, vs2, rs1, vm # Vector-scalar
+vmsgt.vi vd, vs2, imm, vm # Vector-immediate
+
+# Following two instructions are not provided directly
+# Set if greater than or equal, unsigned
+# vmsgeu.vx vd, vs2, rs1, vm # Vector-scalar
+# Set if greater than or equal, signed
+# vmsge.vx vd, vs2, rs1, vm # Vector-scalar
+----
+
+The following table indicates how all comparisons are implemented in
+native machine code.
+
+----
+Comparison Assembler Mapping Assembler Pseudoinstruction
+
+va < vb vmslt{u}.vv vd, va, vb, vm
+va <= vb vmsle{u}.vv vd, va, vb, vm
+va > vb vmslt{u}.vv vd, vb, va, vm vmsgt{u}.vv vd, va, vb, vm
+va >= vb vmsle{u}.vv vd, vb, va, vm vmsge{u}.vv vd, va, vb, vm
+
+va < x vmslt{u}.vx vd, va, x, vm
+va <= x vmsle{u}.vx vd, va, x, vm
+va > x vmsgt{u}.vx vd, va, x, vm
+va >= x see below
+
+va < i vmsle{u}.vi vd, va, i-1, vm vmslt{u}.vi vd, va, i, vm
+va <= i vmsle{u}.vi vd, va, i, vm
+va > i vmsgt{u}.vi vd, va, i, vm
+va >= i vmsgt{u}.vi vd, va, i-1, vm vmsge{u}.vi vd, va, i, vm
+
+va, vb vector register groups
+x scalar integer register
+i immediate
+----
+
+NOTE: The immediate forms of `vmslt{u}.vi` are not provided as the
+immediate value can be decreased by 1 and the `vmsle{u}.vi` variants
+used instead. The `vmsle.vi` range is -16 to 15, resulting in an
+effective `vmslt.vi` range of -15 to 16. The `vmsleu.vi` range is 0
+to 15 giving an effective `vmsltu.vi` range of 1 to 16 (Note,
+`vmsltu.vi` with immediate 0 is not useful as it is always
+false).
+
+NOTE: Because the 5-bit vector immediates are always sign-extended,
+when the high bit of the `simm5` immediate is set, `vmsleu.vi` also
+supports unsigned immediate values in the range `2^SEW^-16` to
+`2^SEW^-1`, allowing corresponding `vmsltu.vi` compares against
+unsigned immediates in the range `2^SEW^-15` to `2^SEW^`. Note that
+`vmsltu.vi` with immediate `2^SEW^` is not useful as it is always
+true.
+
+Similarly, `vmsge{u}.vi` is not provided and the compare is
+implemented using `vmsgt{u}.vi` with the immediate decremented by one.
+The resulting effective `vmsge.vi` range is -15 to 16, and the
+resulting effective `vmsgeu.vi` range is 1 to 16 (Note, `vmsgeu.vi` with
+immediate 0 is not useful as it is always true).
+
+NOTE: The `vmsgt` forms for register scalar and immediates are provided
+to allow a single compare instruction to provide the correct
+polarity of mask value without using additional mask logical
+instructions.
+
+To reduce encoding space, the `vmsge{u}.vx` form is not directly
+provided, and so the `va {ge} x` case requires special treatment.
+
+NOTE: The `vmsge{u}.vx` form could potentially be encoded in a
+non-orthogonal way under the unused OPIVI variant of `vmslt{u}`. These
+would, however, be the only instructions in OPIVI that use a scalar `x`
+register. Alternatively, a further two funct6 encodings could be used,
+but these would have a different operand format (writes to mask
+register) than others in the same group of 8 funct6 encodings. The
+current plan of record (PoR) is to omit these instructions and to
+synthesize them where needed as described below.
+
+The `vmsge{u}.vx` operation can be synthesized by reducing the
+value of `x` by 1 and using the `vmsgt{u}.vx` instruction, when it is
+known that this will not underflow the representation in `x`.
+
+----
+Sequences to synthesize `vmsge{u}.vx` instruction
+
+va >= x, x > minimum
+
+ addi t0, x, -1; vmsgt{u}.vx vd, va, t0, vm
+----
+
+The above sequence will usually be the most efficient implementation,
+but assembler pseudoinstructions can be provided for cases where the
+range of `x` is unknown.
+
+----
+unmasked va >= x
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x
+ expansion: vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd
+
+masked va >= x, vd != v0
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t
+ expansion: vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
+
+masked va >= x, vd == v0
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ expansion: vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt
+
+masked va >= x, any vd
+
+ pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ expansion: vmslt{u}.vx vt, va, x; vmandn.mm vt, v0, vt; vmandn.mm vd, vd, v0; vmor.mm vd, vt, vd
+
+ The vt argument to the pseudoinstruction must name a temporary vector register that is
+ not the same as vd and that will be clobbered by the pseudoinstruction.
+----
+
+Compares effectively AND in the mask under a mask-undisturbed policy if the destination register is `v0`, e.g.,
+
+----
+ # (a < b) && (b < c) in two instructions when mask-undisturbed
+ vmslt.vv v0, va, vb # All body elements written
+ vmslt.vv v0, vb, vc, v0.t # Only update at set mask
+----
+
+Compares write mask registers, and so always operate under a
+tail-agnostic policy.
+
+==== Vector Integer Min/Max Instructions
+
+Signed and unsigned integer minimum and maximum instructions are
+supported.
+
+----
+# Unsigned minimum
+vminu.vv vd, vs2, vs1, vm # Vector-vector
+vminu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed minimum
+vmin.vv vd, vs2, vs1, vm # Vector-vector
+vmin.vx vd, vs2, rs1, vm # vector-scalar
+
+# Unsigned maximum
+vmaxu.vv vd, vs2, vs1, vm # Vector-vector
+vmaxu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed maximum
+vmax.vv vd, vs2, vs1, vm # Vector-vector
+vmax.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Integer Multiply Instructions
+
+The single-width multiply instructions perform a SEW-bit*SEW-bit
+multiply to generate a 2*SEW-bit product, then return one half of the
+product in the SEW-bit-wide destination. The `*mul*` versions write
+the low word of the product to the destination register, while the
+`*mulh*` versions write the high word of the product to the
+destination register.
+
+----
+# Signed multiply, returning low bits of product
+vmul.vv vd, vs2, vs1, vm # Vector-vector
+vmul.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed multiply, returning high bits of product
+vmulh.vv vd, vs2, vs1, vm # Vector-vector
+vmulh.vx vd, vs2, rs1, vm # vector-scalar
+
+# Unsigned multiply, returning high bits of product
+vmulhu.vv vd, vs2, vs1, vm # Vector-vector
+vmulhu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Signed(vs2)-Unsigned multiply, returning high bits of product
+vmulhsu.vv vd, vs2, vs1, vm # Vector-vector
+vmulhsu.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: There is no `vmulhus.vx` opcode to return the high half of an
+unsigned-vector * signed-scalar product. Instead, the scalar can be
+splatted to a vector, then a `vmulhsu.vv` instruction used, as
+sketched below.
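+
+A minimal sketch of that splat sequence (register choices assumed):
+
+----
+ vmv.v.x v8, a0          # Splat the signed scalar into a vector
+ vmulhsu.vv v4, v8, v16  # High half of signed(v8[i]) * unsigned(v16[i])
+----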
+
+NOTE: The current `vmulh*` opcodes perform simple fractional
+multiplies, but with no option to scale, round, and/or saturate the
+result. A possible future extension can consider variants of `vmulh`,
+`vmulhu`, `vmulhsu` that use the `vxrm` rounding mode when discarding
+low half of product. There is no possibility of overflow in these
+cases.
+
+==== Vector Integer Divide Instructions
+
+The vector divide and remainder instructions are equivalent to the
+RISC-V standard scalar integer divide and remainder instructions, with
+the same results for extreme inputs.
+
+----
+ # Unsigned divide.
+ vdivu.vv vd, vs2, vs1, vm # Vector-vector
+ vdivu.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Signed divide
+ vdiv.vv vd, vs2, vs1, vm # Vector-vector
+ vdiv.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Unsigned remainder
+ vremu.vv vd, vs2, vs1, vm # Vector-vector
+ vremu.vx vd, vs2, rs1, vm # vector-scalar
+
+ # Signed remainder
+ vrem.vv vd, vs2, vs1, vm # Vector-vector
+ vrem.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: The decision to include integer divide and remainder was
+contentious. The argument in favor is that without a standard
+instruction, software would have to pick some algorithm to perform the
+operation, which would likely perform poorly on some
+microarchitectures versus others.
+
+NOTE: There is no instruction to perform a "scalar divide by vector"
+operation.
+
+==== Vector Widening Integer Multiply Instructions
+
+The widening integer multiply instructions return the full 2*SEW-bit
+product from an SEW-bit*SEW-bit multiply.
+
+----
+# Widening signed-integer multiply
+vwmul.vv vd, vs2, vs1, vm # vector-vector
+vwmul.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening unsigned-integer multiply
+vwmulu.vv vd, vs2, vs1, vm # vector-vector
+vwmulu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Widening signed(vs2)-unsigned integer multiply
+vwmulsu.vv vd, vs2, vs1, vm # vector-vector
+vwmulsu.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Integer Multiply-Add Instructions
+
+The integer multiply-add instructions are destructive and are provided
+in two forms, one that overwrites the addend or minuend
+(`vmacc`, `vnmsac`) and one that overwrites the first multiplicand
+(`vmadd`, `vnmsub`).
+
+The low half of the product is added to or subtracted from the third operand.
+
+NOTE: `sac` is intended to be read as "subtract from accumulator". The
+opcode is `vnmsac` to match the (unfortunately counterintuitive)
+floating-point `fnmsub` instruction definition. Similarly for the
+`vnmsub` opcode.
+
+----
+# Integer multiply-add, overwrite addend
+vmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vmacc.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Integer multiply-sub, overwrite minuend
+vnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vnmsac.vx vd, rs1, vs2, vm # vd[i] = -(x[rs1] * vs2[i]) + vd[i]
+
+# Integer multiply-add, overwrite multiplicand
+vmadd.vv vd, vs1, vs2, vm # vd[i] = (vs1[i] * vd[i]) + vs2[i]
+vmadd.vx vd, rs1, vs2, vm # vd[i] = (x[rs1] * vd[i]) + vs2[i]
+
+# Integer multiply-sub, overwrite multiplicand
+vnmsub.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) + vs2[i]
+vnmsub.vx vd, rs1, vs2, vm # vd[i] = -(x[rs1] * vd[i]) + vs2[i]
+----
+
+==== Vector Widening Integer Multiply-Add Instructions
+
+The widening integer multiply-add instructions add the full 2*SEW-bit
+product from a SEW-bit*SEW-bit multiply to a 2*SEW-bit value and
+produce a 2*SEW-bit result. All combinations of signed and unsigned
+multiply operands are supported.
+
+----
+# Widening unsigned-integer multiply-add, overwrite addend
+vwmaccu.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vwmaccu.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Widening signed-integer multiply-add, overwrite addend
+vwmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vwmacc.vx vd, rs1, vs2, vm # vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+
+# Widening signed-unsigned-integer multiply-add, overwrite addend
+vwmaccsu.vv vd, vs1, vs2, vm # vd[i] = +(signed(vs1[i]) * unsigned(vs2[i])) + vd[i]
+vwmaccsu.vx vd, rs1, vs2, vm # vd[i] = +(signed(x[rs1]) * unsigned(vs2[i])) + vd[i]
+
+# Widening unsigned-signed-integer multiply-add, overwrite addend
+vwmaccus.vx vd, rs1, vs2, vm # vd[i] = +(unsigned(x[rs1]) * signed(vs2[i])) + vd[i]
+----
+
+==== Vector Integer Merge Instructions
+
+The vector integer merge instructions combine two source operands
+based on a mask. Unlike regular arithmetic instructions, the
+merge operates on all body elements (i.e., the set of elements from
+`vstart` up to the current vector length in `vl`).
+
+The `vmerge` instructions are encoded as masked instructions (`vm=0`).
+The instructions combine two
+sources as follows. At elements where the mask value is zero, the
+first operand is copied to the destination element, otherwise the
+second operand is copied to the destination element. The first
+operand is always a vector register group specified by `vs2`. The
+second operand is a vector register group specified by `vs1` or a
+scalar `x` register specified by `rs1` or a 5-bit sign-extended
+immediate.
+
+----
+vmerge.vvm vd, vs2, vs1, v0 # vd[i] = v0.mask[i] ? vs1[i] : vs2[i]
+vmerge.vxm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? x[rs1] : vs2[i]
+vmerge.vim vd, vs2, imm, v0 # vd[i] = v0.mask[i] ? imm : vs2[i]
+----
+
+==== Vector Integer Move Instructions
+
+The vector integer move instructions copy a source operand to a vector
+register group.
+The `vmv.v.v` variant copies a vector register group, whereas the `vmv.v.x`
+and `vmv.v.i` variants __splat__ a scalar register or immediate to all active
+elements of the destination vector register group.
+These instructions are encoded as unmasked instructions (`vm=1`).
+The first operand specifier (`vs2`) must contain `v0`, and any other vector
+register number in `vs2` is _reserved_.
+
+----
+vmv.v.v vd, vs1 # vd[i] = vs1[i]
+vmv.v.x vd, rs1 # vd[i] = x[rs1]
+vmv.v.i vd, imm # vd[i] = imm
+----
+
+NOTE: Mask values can be widened into SEW-width elements using a
+sequence `vmv.v.i vd, 0; vmerge.vim vd, vd, 1, v0`.
+
+NOTE: The vector integer move instructions share the encoding with the vector
+merge instructions, but with `vm=1` and `vs2=v0`.
+
+The form `vmv.v.v vd, vd`, which leaves body elements unchanged,
+can be used to indicate that the register will next be used
+with an EEW equal to SEW.
+
+NOTE: Implementations that internally reorganize data according to EEW
+can shuffle the internal representation according to SEW.
+Implementations that do not internally reorganize data can dynamically
+elide this instruction, and treat as a NOP.
+
+NOTE: The `vmv.v.v vd, vd` instruction is not a RISC-V HINT, as a
+tail-agnostic setting may cause an architectural state change on some
+implementations.
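+
+A sketch of the hint in use (registers and widths assumed):
+
+----
+ # v8 was last written with EEW=8; indicate it will next be read at SEW
+ vsetvli x0, a0, e32, m4, ta, ma
+ vmv.v.v v8, v8  # Body elements unchanged; may be elided as a NOP
+----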
+
+[[sec-vector-fixed-point]]
+=== Vector Fixed-Point Arithmetic Instructions
+
+The preceding set of integer arithmetic instructions is extended to support
+fixed-point arithmetic.
+
+A fixed-point number is a two's-complement signed or unsigned integer
+interpreted as the numerator in a fraction with an implicit denominator.
+The fixed-point instructions are intended to be applied to the numerators;
+it is the responsibility of software to manage the denominators.
+An N-bit element can hold two's-complement signed integers in the
+range -2^N-1^...+2^N-1^-1, and unsigned integers in the range 0
+... +2^N^-1. The fixed-point instructions help preserve precision in
+narrow operands by supporting scaling and rounding, and can handle
+overflow by saturating results into the destination format range.
+
+NOTE: The widening integer operations described above can also be used
+to avoid overflow.
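+
+As a non-normative sketch (format and registers assumed), software
+tracking a Q0.7 format with SEW=8 needs only the saturating add
+described below to add two numerators of like denominator:
+
+----
+ # Q0.7 fixed-point: integers with an implicit denominator of 2^7
+ vsetvli x0, a0, e8, m1, ta, ma
+ vsadd.vv v1, v2, v3  # v1[i] = sat(v2[i] + v3[i]), still Q0.7
+----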
+
+==== Vector Single-Width Saturating Add and Subtract
+
+Saturating forms of integer add and subtract are provided, for both
+signed and unsigned integers. If the result would overflow the
+destination, the result is replaced with the closest representable
+value, and the `vxsat` bit is set.
+
+----
+# Saturating adds of unsigned integers.
+vsaddu.vv vd, vs2, vs1, vm # Vector-vector
+vsaddu.vx vd, vs2, rs1, vm # vector-scalar
+vsaddu.vi vd, vs2, imm, vm # vector-immediate
+
+# Saturating adds of signed integers.
+vsadd.vv vd, vs2, vs1, vm # Vector-vector
+vsadd.vx vd, vs2, rs1, vm # vector-scalar
+vsadd.vi vd, vs2, imm, vm # vector-immediate
+
+# Saturating subtract of unsigned integers.
+vssubu.vv vd, vs2, vs1, vm # Vector-vector
+vssubu.vx vd, vs2, rs1, vm # vector-scalar
+
+# Saturating subtract of signed integers.
+vssub.vv vd, vs2, vs1, vm # Vector-vector
+vssub.vx vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Averaging Add and Subtract
+
+The averaging add and subtract instructions right shift the result by
+one bit and round off the result according to the setting in `vxrm`.
+Both unsigned and signed versions are provided.
+For `vaaddu` and `vaadd` there can be no overflow in the result.
+For `vasub` and `vasubu`, overflow is ignored and the result wraps around.
+
+NOTE: For `vasub`, overflow occurs only when subtracting the smallest number
+from the largest number under `rnu` or `rne` rounding.
+
+----
+# Averaging add
+
+# Averaging adds of unsigned integers.
+vaaddu.vv vd, vs2, vs1, vm # roundoff_unsigned(vs2[i] + vs1[i], 1)
+vaaddu.vx vd, vs2, rs1, vm # roundoff_unsigned(vs2[i] + x[rs1], 1)
+
+# Averaging adds of signed integers.
+vaadd.vv vd, vs2, vs1, vm # roundoff_signed(vs2[i] + vs1[i], 1)
+vaadd.vx vd, vs2, rs1, vm # roundoff_signed(vs2[i] + x[rs1], 1)
+
+# Averaging subtract
+
+# Averaging subtract of unsigned integers.
+vasubu.vv vd, vs2, vs1, vm # roundoff_unsigned(vs2[i] - vs1[i], 1)
+vasubu.vx vd, vs2, rs1, vm # roundoff_unsigned(vs2[i] - x[rs1], 1)
+
+# Averaging subtract of signed integers.
+vasub.vv vd, vs2, vs1, vm # roundoff_signed(vs2[i] - vs1[i], 1)
+vasub.vx vd, vs2, rs1, vm # roundoff_signed(vs2[i] - x[rs1], 1)
+----
+
+==== Vector Single-Width Fractional Multiply with Rounding and Saturation
+
+The signed fractional multiply instruction produces a 2*SEW product of
+the two SEW inputs, then shifts the result right by SEW-1 bits,
+rounding these bits according to `vxrm`, then saturates the result to
+fit into SEW bits. If the result causes saturation, the `vxsat` bit
+is set.
+
+----
+# Signed saturating and rounding fractional multiply
+# See vxrm description for rounding calculation
+vsmul.vv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1))
+vsmul.vx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1))
+----
+
+NOTE: When multiplying two N-bit signed numbers, the largest magnitude
+is obtained for -2^N-1^ * -2^N-1^ producing a result +2^2N-2^, which
+has a single (zero) sign bit when held in 2N bits. All other products
+have two sign bits in 2N bits. To retain greater precision in N
+result bits, the product is shifted right by one bit less than N,
+saturating the largest magnitude result but increasing result
+precision by one bit for all other products.
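+
+A worked example of the saturating case (SEW=8, input values assumed):
+
+----
+ # With v2[i] = v3[i] = -128: product = +16384 = 0x4000.
+ # 16384 >> 7 = +128, which overflows int8, so the result
+ # saturates to +127 and vxsat is set.
+ vsmul.vv v1, v2, v3
+----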
+
+NOTE: We do not provide an equivalent fractional multiply where one
+input is unsigned, as these would retain all upper SEW bits and would
+not need to saturate. This operation is partly covered by the
+`vmulhu` and `vmulhsu` instructions, for the case where rounding is
+simply truncation (`rdn`).
+
+==== Vector Single-Width Scaling Shift Instructions
+
+These instructions shift the input value right, and round off the
+shifted out bits according to `vxrm`. The scaling right shifts have
+both zero-extending (`vssrl`) and sign-extending (`vssra`) forms. The
+data to be shifted is in the vector register group specified by `vs2`
+and the shift amount value can come from a vector register group
+`vs1`, a scalar integer register `rs1`, or a zero-extended 5-bit
+immediate. Only the low lg2(SEW) bits of the shift-amount value are
+used to control the shift amount.
+
+----
+ # Scaling shift right logical
+ vssrl.vv vd, vs2, vs1, vm # vd[i] = roundoff_unsigned(vs2[i], vs1[i])
+ vssrl.vx vd, vs2, rs1, vm # vd[i] = roundoff_unsigned(vs2[i], x[rs1])
+ vssrl.vi vd, vs2, uimm, vm # vd[i] = roundoff_unsigned(vs2[i], uimm)
+
+ # Scaling shift right arithmetic
+ vssra.vv vd, vs2, vs1, vm # vd[i] = roundoff_signed(vs2[i],vs1[i])
+ vssra.vx vd, vs2, rs1, vm # vd[i] = roundoff_signed(vs2[i], x[rs1])
+ vssra.vi vd, vs2, uimm, vm # vd[i] = roundoff_signed(vs2[i], uimm)
+----
+
+==== Vector Narrowing Fixed-Point Clip Instructions
+
+The `vnclip` instructions are used to pack a fixed-point value into a
+narrower destination. The instructions support rounding, scaling, and
+saturation into the final destination format. The source data is in
+the vector register group specified by `vs2`. The scaling shift amount
+value can come from a vector register group `vs1`, a scalar integer
+register `rs1`, or a zero-extended 5-bit immediate. The low
+lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the
+low 6 bits for a SEW=64-bit to SEW=32-bit narrowing operation) are
+used to control the right shift amount, which provides the scaling.
+----
+# Narrowing unsigned clip
+# SEW 2*SEW SEW
+ vnclipu.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], vs1[i]))
+ vnclipu.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_unsigned(vs2[i], x[rs1]))
+ vnclipu.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_unsigned(vs2[i], uimm))
+
+# Narrowing signed clip
+ vnclip.wv vd, vs2, vs1, vm # vd[i] = clip(roundoff_signed(vs2[i], vs1[i]))
+ vnclip.wx vd, vs2, rs1, vm # vd[i] = clip(roundoff_signed(vs2[i], x[rs1]))
+ vnclip.wi vd, vs2, uimm, vm # vd[i] = clip(roundoff_signed(vs2[i], uimm))
+----
+
+For `vnclipu`/`vnclip`, the rounding mode is specified in the `vxrm`
+CSR. Rounding occurs around the least-significant bit of the
+destination and before saturation.
+
+For `vnclipu`, the shifted rounded source value is treated as an
+unsigned integer and saturates if the result would overflow the
+destination viewed as an unsigned integer.
+
+NOTE: There is no single instruction that can saturate a signed value
+into an unsigned destination. If setting the `vxsat` bit for negative
+numbers is not required, a sequence of two vector instructions can be
+used: first remove negative numbers by performing a max against 0
+using `vmax`, then clip the resulting unsigned value into the
+destination using `vnclipu`, as sketched below. A `vsetvli` is
+required between these two instructions to change SEW.
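+
+A sketch of that sequence (widths and registers assumed):
+
+----
+ # Saturate signed 32-bit values into an unsigned 16-bit destination
+ vsetvli x0, a0, e32, m2, ta, ma  # Source SEW=32
+ vmax.vx v4, v4, x0               # Clamp negative values to 0
+ vsetvli x0, a0, e16, m1, ta, ma  # Destination SEW=16
+ vnclipu.wi v8, v4, 0             # No scaling shift; round and clip to 16 bits
+----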
+
+For `vnclip`, the shifted rounded source value is treated as a signed
+integer and saturates if the result would overflow the destination viewed
+as a signed integer.
+
+If any destination element is saturated, the `vxsat` bit is set in the
+`vxsat` register.
+
+[[sec-vector-float]]
+=== Vector Floating-Point Instructions
+
+The standard vector floating-point instructions treat elements as
+IEEE-754/2008-compatible values. If the EEW of a vector
+floating-point operand does not correspond to a supported IEEE
+floating-point type, the instruction encoding is reserved.
+
+NOTE: Whether floating-point is supported, and for which element
+widths, is determined by the specific vector extension. The current
+set of extensions includes support for 32-bit and 64-bit
+floating-point values. When 16-bit and 128-bit element widths are
+added, they will also be treated as IEEE-754/2008-compatible values.
+Other floating-point formats may be supported in future extensions.
+
+Vector floating-point instructions require the presence of base scalar
+floating-point extensions corresponding to the supported vector
+floating-point element widths.
+
+NOTE: In particular, future vector extensions supporting 16-bit
+half-precision floating-point values will also require some scalar
+half-precision floating-point support.
+
+If the floating-point unit status field `mstatus.FS` is `Off` then any
+attempt to execute a vector floating-point instruction will raise an
+illegal instruction exception. Any vector floating-point instruction
+that modifies any floating-point extension state (i.e., floating-point
+CSRs or `f` registers) must set `mstatus.FS` to `Dirty`.
+
+If the hypervisor extension is implemented and V=1, the `vsstatus.FS` field is
+additionally in effect for vector floating-point instructions. If
+`vsstatus.FS` or `mstatus.FS` is `Off` then any
+attempt to execute a vector floating-point instruction will raise an
+illegal instruction exception. Any vector floating-point instruction
+that modifies any floating-point extension state (i.e., floating-point
+CSRs or `f` registers) must set both `mstatus.FS` and `vsstatus.FS` to `Dirty`.
+
+The vector floating-point instructions have the same behavior as the
+scalar floating-point instructions with regard to NaNs.
+
+Scalar values for floating-point vector-scalar operations are sourced
+as described in Section <<sec-arithmetic-encoding>>.
+
+==== Vector Floating-Point Exception Flags
+
+A vector floating-point exception at any active floating-point element
+sets the standard FP exception flags in the `fflags` register. Inactive
+elements do not set FP exception flags.
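+
+For example, a sketch (registers assumed) that masks off zero divisors
+so they raise no flags:
+
+----
+ fmv.w.x f0, x0              # f0 = +0.0
+ vmfne.vf v0, v4, f0         # Active where divisor v4[i] != 0.0
+ vfdiv.vv v8, v12, v4, v0.t  # Inactive zero divisors set no DZ flag
+----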
+
+==== Vector Single-Width Floating-Point Add/Subtract Instructions
+
+----
+ # Floating-point add
+ vfadd.vv vd, vs2, vs1, vm # Vector-vector
+ vfadd.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point subtract
+ vfsub.vv vd, vs2, vs1, vm # Vector-vector
+ vfsub.vf vd, vs2, rs1, vm # Vector-scalar vd[i] = vs2[i] - f[rs1]
+ vfrsub.vf vd, vs2, rs1, vm # Scalar-vector vd[i] = f[rs1] - vs2[i]
+----
+
+==== Vector Widening Floating-Point Add/Subtract Instructions
+
+----
+# Widening FP add/subtract, 2*SEW = SEW +/- SEW
+vfwadd.vv vd, vs2, vs1, vm # vector-vector
+vfwadd.vf vd, vs2, rs1, vm # vector-scalar
+vfwsub.vv vd, vs2, vs1, vm # vector-vector
+vfwsub.vf vd, vs2, rs1, vm # vector-scalar
+
+# Widening FP add/subtract, 2*SEW = 2*SEW +/- SEW
+vfwadd.wv vd, vs2, vs1, vm # vector-vector
+vfwadd.wf vd, vs2, rs1, vm # vector-scalar
+vfwsub.wv vd, vs2, vs1, vm # vector-vector
+vfwsub.wf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Floating-Point Multiply/Divide Instructions
+
+----
+ # Floating-point multiply
+ vfmul.vv vd, vs2, vs1, vm # Vector-vector
+ vfmul.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point divide
+ vfdiv.vv vd, vs2, vs1, vm # Vector-vector
+ vfdiv.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Reverse floating-point divide vector = scalar / vector
+ vfrdiv.vf vd, vs2, rs1, vm # scalar-vector, vd[i] = f[rs1]/vs2[i]
+----
+
+==== Vector Widening Floating-Point Multiply
+
+----
+# Widening floating-point multiply
+vfwmul.vv vd, vs2, vs1, vm # vector-vector
+vfwmul.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+
+All four varieties of fused multiply-add are provided, and in two
+destructive forms that overwrite one of the operands, either the
+addend or the first multiplicand.
+
+----
+# FP multiply-accumulate, overwrites addend
+vfmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vfmacc.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) + vd[i]
+
+# FP negate-(multiply-accumulate), overwrites subtrahend
+vfnmacc.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) - vd[i]
+vfnmacc.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) - vd[i]
+
+# FP multiply-subtract-accumulator, overwrites subtrahend
+vfmsac.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) - vd[i]
+vfmsac.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) - vd[i]
+
+# FP negate-(multiply-subtract-accumulator), overwrites minuend
+vfnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vfnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i]
+
+# FP multiply-add, overwrites multiplicand
+vfmadd.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vd[i]) + vs2[i]
+vfmadd.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vd[i]) + vs2[i]
+
+# FP negate-(multiply-add), overwrites multiplicand
+vfnmadd.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) - vs2[i]
+vfnmadd.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vd[i]) - vs2[i]
+
+# FP multiply-sub, overwrites multiplicand
+vfmsub.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vd[i]) - vs2[i]
+vfmsub.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vd[i]) - vs2[i]
+
+# FP negate-(multiply-sub), overwrites multiplicand
+vfnmsub.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vd[i]) + vs2[i]
+vfnmsub.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vd[i]) + vs2[i]
+----
+
+NOTE: While we considered using the two unused rounding modes
+in the scalar FP FMA encoding to provide a few non-destructive FMAs,
+these would complicate microarchitectures by being the only maskable
+operation with three inputs and a separate output.
+
+==== Vector Widening Floating-Point Fused Multiply-Add Instructions
+
+The widening floating-point fused multiply-add instructions all
+overwrite the wide addend with the result. The multiplier inputs are
+all SEW wide, while the addend and destination are 2*SEW bits wide.
+
+----
+# FP widening multiply-accumulate, overwrites addend
+vfwmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+vfwmacc.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) + vd[i]
+
+# FP widening negate-(multiply-accumulate), overwrites addend
+vfwnmacc.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) - vd[i]
+vfwnmacc.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) - vd[i]
+
+# FP widening multiply-subtract-accumulator, overwrites addend
+vfwmsac.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) - vd[i]
+vfwmsac.vf vd, rs1, vs2, vm # vd[i] = +(f[rs1] * vs2[i]) - vd[i]
+
+# FP widening negate-(multiply-subtract-accumulator), overwrites addend
+vfwnmsac.vv vd, vs1, vs2, vm # vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+vfwnmsac.vf vd, rs1, vs2, vm # vd[i] = -(f[rs1] * vs2[i]) + vd[i]
+----
+
+==== Vector Floating-Point Square-Root Instruction
+
+This is a unary vector-vector instruction.
+
+----
+ # Floating-point square root
+ vfsqrt.v vd, vs2, vm # Vector-vector square root
+----
+
+==== Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+
+----
+ # Floating-point reciprocal square-root estimate to 7 bits.
+ vfrsqrt7.v vd, vs2, vm
+----
+
+This is a unary vector-vector instruction that returns an estimate of
+1/sqrt(x) accurate to 7 bits.
+
+NOTE: An earlier draft version used the assembler name `vfrsqrte7`,
+but this was deemed to cause confusion with the ``e``__x__ notation for
+element width. The earlier name can be retained as an alias in
+toolchains for backward compatibility.
+
+The following table describes the instruction's behavior for all
+classes of floating-point inputs:
+
+[cols="1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Input | Output | Exceptions raised
+
+| -{inf} {le} _x_ < -0.0 | canonical NaN | NV
+| -0.0 | -{inf} | DZ
+| +0.0 | +{inf} | DZ
+| +0.0 < _x_ < +{inf} | _estimate of 1/sqrt(x)_ |
+| +{inf} | +0.0 |
+| qNaN | canonical NaN |
+| sNaN | canonical NaN | NV
+|===
+
+NOTE: All positive normal and subnormal inputs produce normal outputs.
+
+NOTE: The output value is independent of the dynamic rounding mode.
+
+For the non-exceptional cases, the low bit of the exponent and the six high
+bits of significand (after the leading one) are concatenated and used to
+address the following table.
+The output of the table becomes the seven high bits of the result significand
+(after the leading one); the remainder of the result significand is zero.
+Subnormal inputs are normalized and the exponent adjusted appropriately before
+the lookup.
+The output exponent is chosen to make the result approximate the reciprocal of
+the square root of the argument.
+
+More precisely, the result is computed as follows.
+Let the normalized input exponent be equal to the input exponent if the input
+is normal, or 0 minus the number of leading zeros in the significand
+otherwise.
+If the input is subnormal, the normalized input significand is given by
+shifting the input significand left by 1 minus the normalized input exponent,
+discarding the leading 1 bit.
+The output exponent equals floor((3*B - 1 - the normalized input exponent) / 2),
+where B is the exponent bias. The output sign equals the input sign.
+
+The following table gives the seven MSBs of the output significand as a
+function of the LSB of the normalized input exponent and the six MSBs of the
+normalized input significand; the other bits of the output significand are zero.
+
+include::images/wavedrom/vfrsqrt7.adoc[]
+
+NOTE: For example, when SEW=32, vfrsqrt7(0x00718abc ({approx} 1.043e-38)) = 0x5f080000 ({approx} 9.800e18), and vfrsqrt7(0x7f765432 ({approx} 3.274e38)) = 0x1f820000 ({approx} 5.506e-20).
+
+NOTE: The 7-bit accuracy was chosen because it requires 0, 1, 2, or 3
+Newton-Raphson iterations to converge to near bfloat16, FP16, FP32,
+and FP64 accuracy, respectively. Future instructions can be defined
+with greater estimate accuracy.
+
+==== Vector Floating-Point Reciprocal Estimate Instruction
+
+----
+ # Floating-point reciprocal estimate to 7 bits.
+ vfrec7.v vd, vs2, vm
+----
+
+NOTE: An earlier draft version used the assembler name `vfrece7`,
+but this was deemed to cause confusion with the ``e``__x__ notation for
+element width. The earlier name can be retained as an alias in
+toolchains for backward compatibility.
+
+This is a unary vector-vector instruction that returns an estimate of
+1/x accurate to 7 bits.
+
+The following table describes the instruction's behavior for all
+classes of floating-point inputs, where _B_ is the exponent bias:
+
+[cols="1,1,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Input (_x_) | Rounding Mode | Output (_y_ {approx} _1/x_) | Exceptions raised
+
+| -{inf} | _any_ | -0.0 |
+| -2^B+1^ < _x_ {le} -2^B^ (normal) | _any_ | -2^-(B+1)^ {ge} _y_ > -2^-B^ (subnormal, sig=01...) |
+| -2^B^ < _x_ {le} -2^B-1^ (normal) | _any_ | -2^-B^ {ge} _y_ > -2^-B+1^ (subnormal, sig=1...) |
+| -2^B-1^ < _x_ {le} -2^-B+1^ (normal) | _any_ | -2^-B+1^ {ge} _y_ > -2^B-1^ (normal) |
+| -2^-B+1^ < _x_ {le} -2^-B^ (subnormal, sig=1...) | _any_ | -2^B-1^ {ge} _y_ > -2^B^ (normal) |
+| -2^-B^ < _x_ {le} -2^-(B+1)^ (subnormal, sig=01...) | _any_ | -2^B^ {ge} _y_ > -2^B+1^ (normal) |
+| -2^-(B+1)^ < _x_ < -0.0 (subnormal, sig=00...) | RUP, RTZ | greatest-mag. negative finite value | NX, OF
+| -2^-(B+1)^ < _x_ < -0.0 (subnormal, sig=00...) | RDN, RNE, RMM | -{inf} | NX, OF
+| -0.0 | _any_ | -{inf} | DZ
+| +0.0 | _any_ | +{inf} | DZ
+| +0.0 < _x_ < 2^-(B+1)^ (subnormal, sig=00...) | RUP, RNE, RMM | +{inf} | NX, OF
+| +0.0 < _x_ < 2^-(B+1)^ (subnormal, sig=00...) | RDN, RTZ | greatest finite value | NX, OF
+| 2^-(B+1)^ {le} _x_ < 2^-B^ (subnormal, sig=01...) | _any_ | 2^B+1^ > _y_ {ge} 2^B^ (normal) |
+| 2^-B^ {le} _x_ < 2^-B+1^ (subnormal, sig=1...) | _any_ | 2^B^ > _y_ {ge} 2^B-1^ (normal) |
+| 2^-B+1^ {le} _x_ < 2^B-1^ (normal) | _any_ | 2^B-1^ > _y_ {ge} 2^-B+1^ (normal) |
+| 2^B-1^ {le} _x_ < 2^B^ (normal) | _any_ | 2^-B+1^ > _y_ {ge} 2^-B^ (subnormal, sig=1...) |
+| 2^B^ {le} _x_ < 2^B+1^ (normal) | _any_ | 2^-B^ > _y_ {ge} 2^-(B+1)^ (subnormal, sig=01...) |
+| +{inf} | _any_ | +0.0 |
+| qNaN | _any_ | canonical NaN |
+| sNaN | _any_ | canonical NaN | NV
+|===
+
+NOTE: Subnormal inputs with magnitude at least 2^-(B+1)^ produce normal outputs;
+other subnormal inputs produce infinite outputs.
+Normal inputs with magnitude at least 2^B-1^ produce subnormal outputs;
+other normal inputs produce normal outputs.
+
+NOTE: The output value depends on the dynamic rounding mode when
+the overflow exception is raised.
+
+For the non-exceptional cases, the seven high bits of the significand (after the
+leading one) are used to address the following table.
+The output of the table becomes the seven high bits of the result significand
+(after the leading one); the remainder of the result significand is zero.
+Subnormal inputs are normalized and the exponent adjusted appropriately before
+the lookup.
+The output exponent is chosen to make the result approximate the reciprocal of
+the argument, and subnormal outputs are denormalized accordingly.
+
+More precisely, the result is computed as follows.
+Let the normalized input exponent be equal to the input exponent if the input
+is normal, or 0 minus the number of leading zeros in the significand
+otherwise.
+The normalized output exponent equals (2*B - 1 - the normalized input exponent).
+If the normalized output exponent is outside the range [-1, 2*B], the result
+corresponds to one of the exceptional cases in the table above.
+
+If the input is subnormal, the normalized input significand is given by
+shifting the input significand left by 1 minus the normalized input exponent,
+discarding the leading 1 bit.
+Otherwise, the normalized input significand equals the input significand.
+The following table gives the seven MSBs of the normalized output significand
+as a function of the seven MSBs of the normalized input significand; the other
+bits of the normalized output significand are zero.
+
+include::images/wavedrom/vfrec7.adoc[]
+
+If the normalized output exponent is 0 or -1, the result is subnormal: the
+output exponent is 0, and the output significand is given by concatenating
+a 1 bit to the left of the normalized output significand, then shifting that
+quantity right by 1 minus the normalized output exponent.
+Otherwise, the output exponent equals the normalized output exponent, and the
+output significand equals the normalized output significand.
+The output sign equals the input sign.
+
+NOTE: For example, when SEW=32, vfrec7(0x00718abc ({approx} 1.043e-38)) = 0x7e900000 ({approx} 9.570e37), and vfrec7(0x7f765432 ({approx} 3.274e38)) = 0x00214000 ({approx} 3.053e-39).
+
+NOTE: The 7-bit accuracy was chosen because it requires 0, 1, 2, or 3
+Newton-Raphson iterations to converge to near bfloat16, FP16,
+FP32, and FP64 accuracy, respectively. Future instructions can be defined
+with greater estimate accuracy.
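+
+NOTE: For example, one Newton-Raphson step, `y1 = y0*(2.0 - x*y0)`, can
+refine the estimate toward FP32 accuracy. The following non-normative
+sketch assumes the inputs are in `v4` and that `ft0` holds the constant
+2.0.
+
+----
+    vfrec7.v  v8, v4          # y0 ~= 1/x, 7-bit estimate
+    vfmul.vv  v12, v4, v8     # x*y0
+    vfrsub.vf v12, v12, ft0   # 2.0 - x*y0          (ft0 = 2.0)
+    vfmul.vv  v8, v8, v12     # y1 = y0*(2.0 - x*y0)
+----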
+
+==== Vector Floating-Point MIN/MAX Instructions
+
+The vector floating-point `vfmin` and `vfmax` instructions have the
+same behavior as the corresponding scalar floating-point instructions
+in version 2.2 of the RISC-V F/D/Q extension: they perform the `minimumNumber`
+or `maximumNumber` operation on active elements.
+
+----
+ # Floating-point minimum
+ vfmin.vv vd, vs2, vs1, vm # Vector-vector
+ vfmin.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Floating-point maximum
+ vfmax.vv vd, vs2, vs1, vm # Vector-vector
+ vfmax.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+==== Vector Floating-Point Sign-Injection Instructions
+
+Vector versions of the scalar sign-injection instructions. The result
+takes all bits except the sign bit from the vector `vs2` operand.
+
+----
+ vfsgnj.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnj.vf vd, vs2, rs1, vm # vector-scalar
+
+ vfsgnjn.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnjn.vf vd, vs2, rs1, vm # vector-scalar
+
+ vfsgnjx.vv vd, vs2, vs1, vm # Vector-vector
+ vfsgnjx.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+NOTE: A vector of floating-point values can be negated using a
+sign-injection instruction with both source operands set to the same
+vector operand. An assembly pseudoinstruction is provided: `vfneg.v vd,vs` = `vfsgnjn.vv vd,vs,vs`.
+
+NOTE: The absolute value of a vector of floating-point elements can be
+calculated using a sign-injection instruction with both source
+operands set to the same vector operand. An assembly
+pseudoinstruction is provided: `vfabs.v vd,vs` = `vfsgnjx.vv vd,vs,vs`.
+
+==== Vector Floating-Point Compare Instructions
+
+These vector FP compare instructions compare two source operands and
+write the comparison result to a mask register. The destination mask
+vector is always held in a single vector register, with a layout of
+elements as described in Section <<sec-mask-register-layout>>. The
+destination mask vector register may be the same as the source vector
+mask register (`v0`). Compares write mask registers, and so always
+operate under a tail-agnostic policy.
+
+The compare instructions follow the semantics of the scalar
+floating-point compare instructions. `vmfeq` and `vmfne` raise the invalid
+operation exception only on signaling NaN inputs. `vmflt`, `vmfle`, `vmfgt`,
+and `vmfge` raise the invalid operation exception on both signaling and
+quiet NaN inputs.
+`vmfne` writes 1 to the destination element when either
+operand is NaN, whereas the other compares write 0 when either operand
+is NaN.
+
+----
+ # Compare equal
+ vmfeq.vv vd, vs2, vs1, vm # Vector-vector
+ vmfeq.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare not equal
+ vmfne.vv vd, vs2, vs1, vm # Vector-vector
+ vmfne.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare less than
+ vmflt.vv vd, vs2, vs1, vm # Vector-vector
+ vmflt.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare less than or equal
+ vmfle.vv vd, vs2, vs1, vm # Vector-vector
+ vmfle.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare greater than
+ vmfgt.vf vd, vs2, rs1, vm # vector-scalar
+
+ # Compare greater than or equal
+ vmfge.vf vd, vs2, rs1, vm # vector-scalar
+----
+
+----
+Comparison Assembler Mapping Assembler pseudoinstruction
+
+va < vb vmflt.vv vd, va, vb, vm
+va <= vb vmfle.vv vd, va, vb, vm
+va > vb vmflt.vv vd, vb, va, vm vmfgt.vv vd, va, vb, vm
+va >= vb vmfle.vv vd, vb, va, vm vmfge.vv vd, va, vb, vm
+
+va < f vmflt.vf vd, va, f, vm
+va <= f vmfle.vf vd, va, f, vm
+va > f vmfgt.vf vd, va, f, vm
+va >= f vmfge.vf vd, va, f, vm
+
+va, vb vector register groups
+f scalar floating-point register
+----
+
+NOTE: Providing all forms is necessary to correctly handle unordered
+compares for NaNs.
+
+NOTE: C99 floating-point quiet compares can be implemented by masking
+the signaling compares when either input is NaN, as follows. When
+the comparand is a non-NaN constant, the middle two instructions can be
+omitted.
+
+----
+ # Example of implementing isgreater()
+ vmfeq.vv v0, va, va # Only set where A is not NaN.
+ vmfeq.vv v1, vb, vb # Only set where B is not NaN.
+ vmand.mm v0, v0, v1 # Only set where A and B are ordered,
+ vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
+----
+
+NOTE: In the above sequence, it is tempting to mask the second `vmfeq`
+instruction and remove the `vmand` instruction, but this more efficient
+sequence incorrectly fails to raise the invalid exception when an
+element of `va` contains a quiet NaN and the corresponding element in
+`vb` contains a signaling NaN.
+
+==== Vector Floating-Point Classify Instruction
+
+This is a unary vector-vector instruction that operates in the same
+way as the scalar classify instruction.
+
+----
+ vfclass.v vd, vs2, vm # Vector-vector
+----
+
+The 10-bit mask produced by this instruction is placed in the
+least-significant bits of the result elements. The upper (SEW-10)
+bits of the result are filled with zeros. The instruction is only
+defined for SEW=16b and above, so the result will always fit in the
+destination elements.
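+
+NOTE: For example, the classify mask can be used to locate NaN elements.
+The following non-normative sketch assumes the inputs are in `v4`; bits 8
+and 9 of the classify result indicate signaling and quiet NaNs,
+respectively.
+
+----
+    li        t0, 0x300      # Bits 8 (sNaN) and 9 (qNaN)
+    vfclass.v v8, v4         # Classify each element
+    vand.vx   v8, v8, t0     # Isolate the NaN bits
+    vmsne.vi  v0, v8, 0      # v0.mask[i] = 1 where v4[i] is NaN
+----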
+
+==== Vector Floating-Point Merge Instruction
+
+A vector-scalar floating-point merge instruction is provided, which
+operates on all body elements from `vstart` up to the current vector
+length in `vl` regardless of mask value.
+
+The `vfmerge.vfm` instruction is encoded as a masked instruction (`vm=0`).
+At elements where the mask value is zero, the first vector operand is
+copied to the destination element, otherwise a scalar floating-point
+register value is copied to the destination element.
+
+----
+vfmerge.vfm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
+----
+
+[[sec-vector-float-move]]
+==== Vector Floating-Point Move Instruction
+
+The vector floating-point move instruction __splats__ a floating-point
+scalar operand to a vector register group. The instruction copies a
+scalar `f` register value to all active elements of a vector register
+group. This instruction is encoded as an unmasked instruction (`vm=1`).
+The instruction must have the `vs2` field set to `v0`, with all other
+values for `vs2` reserved.
+
+----
+vfmv.v.f vd, rs1 # vd[i] = f[rs1]
+----
+
+NOTE: The `vfmv.v.f` instruction shares the encoding with the `vfmerge.vfm`
+instruction, but with `vm=1` and `vs2=v0`.
+
+==== Single-Width Floating-Point/Integer Type-Convert Instructions
+
+Conversion operations are provided to convert to and from
+floating-point values and unsigned and signed integers, where both
+source and destination are SEW wide.
+
+----
+vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer.
+vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer.
+
+vfcvt.rtz.xu.f.v vd, vs2, vm # Convert float to unsigned integer, truncating.
+vfcvt.rtz.x.f.v vd, vs2, vm # Convert float to signed integer, truncating.
+
+vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float.
+vfcvt.f.x.v vd, vs2, vm # Convert signed integer to float.
+----
+
+The conversions follow the same rules on exceptional conditions as the
+scalar conversion instructions.
+The conversions use the dynamic rounding mode in `frm`, except for the `rtz`
+variants, which round towards zero.
+
+NOTE: The `rtz` variants are provided to accelerate truncating conversions
+from floating-point to integer, as is common in languages like C and Java.
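+
+NOTE: For example, with inputs in `v4`, the two variants below differ only
+in rounding behavior (a non-normative sketch):
+
+----
+    vfcvt.x.f.v     v8, v4   # Round using the mode in frm
+    vfcvt.rtz.x.f.v v12, v4  # C-style truncation, (int)x
+----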
+
+==== Widening Floating-Point/Integer Type-Convert Instructions
+
+A set of conversion instructions is provided to convert between
+narrower integer and floating-point datatypes to a type of twice the
+width.
+
+----
+vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
+vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer.
+
+vfwcvt.rtz.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer, truncating.
+vfwcvt.rtz.x.f.v vd, vs2, vm # Convert float to double-width signed integer, truncating.
+
+vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
+vfwcvt.f.x.v vd, vs2, vm # Convert signed integer to double-width float.
+
+vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
+----
+
+These instructions have the same constraints on vector register overlap
+as other widening instructions (see <<sec-widening>>).
+
+NOTE: A double-width IEEE floating-point value can always represent a
+single-width integer exactly.
+
+NOTE: A double-width IEEE floating-point value can always represent a
+single-width IEEE floating-point value exactly.
+
+NOTE: A full set of floating-point widening conversions is not
+supported as single instructions, but any widening conversion can be
+implemented as several doubling steps with equivalent results and no
+additional exception flags raised.
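+
+NOTE: For example, FP16 values can be widened to FP64 in two doubling
+steps. The following non-normative sketch assumes `a0` holds the element
+count and the FP16 data are in `v4`; both configurations below have the
+same VLMAX (VLEN/16).
+
+----
+    vsetvli t0, a0, e16, m1, ta, ma
+    vfwcvt.f.f.v v8, v4      # FP16 -> FP32 (EEW=32, EMUL=2)
+    vsetvli t0, a0, e32, m2, ta, ma
+    vfwcvt.f.f.v v12, v8     # FP32 -> FP64 (EEW=64, EMUL=4)
+----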
+
+==== Narrowing Floating-Point/Integer Type-Convert Instructions
+
+A set of conversion instructions is provided to convert wider integer
+and floating-point datatypes to a type of half the width.
+
+----
+vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer.
+vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer.
+
+vfncvt.rtz.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer, truncating.
+vfncvt.rtz.x.f.w vd, vs2, vm # Convert double-width float to signed integer, truncating.
+
+vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
+vfncvt.f.x.w vd, vs2, vm # Convert double-width signed integer to float.
+
+vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float.
+vfncvt.rod.f.f.w vd, vs2, vm # Convert double-width float to single-width float,
+ # rounding towards odd.
+----
+
+These instructions have the same constraints on vector register overlap
+as other narrowing instructions (see <<sec-narrowing>>).
+
+NOTE: A full set of floating-point narrowing conversions is not
+supported as single instructions. Conversions can be implemented in
+a sequence of halving steps. Results are equivalently rounded and
+the same exception flags are raised if all but the last halving step
+use round-towards-odd (`vfncvt.rod.f.f.w`). Only the final step
+should use the desired rounding mode.
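+
+NOTE: For example, FP64 values can be narrowed to FP16 in two halving
+steps, with round-towards-odd on all but the last. The following
+non-normative sketch assumes `a0` holds the element count and the FP64
+data are in `v16` (EEW=64, EMUL=4).
+
+----
+    vsetvli t0, a0, e32, m2, ta, ma
+    vfncvt.rod.f.f.w v8, v16 # FP64 -> FP32, round-towards-odd
+    vsetvli t0, a0, e16, m1, ta, ma
+    vfncvt.f.f.w v4, v8      # FP32 -> FP16, rounding mode from frm
+----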
+
+NOTE: For `vfncvt.rod.f.f.w`, a finite value that exceeds the range of the
+destination format is converted to the destination format's largest finite value with the same sign.
+
+=== Vector Reduction Operations
+
+Vector reduction operations take a vector register group of elements
+and a scalar held in element 0 of a vector register, and perform a
+reduction using some binary operator, to produce a scalar result in
+element 0 of a vector register. The scalar input and output operands
+are held in element 0 of a single vector register, not a vector
+register group, so any vector register can be the scalar source or
+destination of a vector reduction regardless of LMUL setting.
+
+The destination vector register can overlap the source operands,
+including the mask register.
+
+NOTE: Vector reductions read and write the scalar operand and result
+into element 0 of a vector register instead of a scalar register to
+avoid a loss of decoupling with the scalar processor, and to support
+future polymorphic use with future types not supported in the scalar
+unit.
+
+Inactive elements from the source vector register group are excluded
+from the reduction, but the scalar operand is always included
+regardless of the mask values.
+
+The other elements in the destination vector register (0 < index <
+VLEN/SEW) are considered the tail and are managed with the current
+tail agnostic/undisturbed policy.
+
+If `vl`=0, no operation is performed and the destination register is
+not updated.
+
+NOTE: This choice of behavior for `vl`=0 reduces implementation
+complexity as it is consistent with other operations on vector
+register state. For the common case that the source and destination
+scalar operand are the same vector register, this behavior also
+produces the expected result. For the uncommon case that the source
+and destination scalar operand are in different vector registers, this
+instruction will not copy the source into the destination when `vl`=0.
+However, it is expected that in most of these cases it will be
+statically known that `vl` is not zero. In other cases, a check for
+`vl`=0 will have to be added to ensure that the source scalar is
+copied to the destination (e.g., by explicitly setting `vl`=1 and
+performing a register-register copy).
+
+Traps on vector reduction instructions are always reported with a
+`vstart` of 0. Vector reduction operations raise an illegal
+instruction exception if `vstart` is non-zero.
+
+The assembler syntax for a reduction operation is `vredop.vs`, where
+the `.vs` suffix denotes the first operand is a vector register group
+and the second operand is a scalar stored in element 0 of a vector
+register.
+
+[[sec-vector-integer-reduce]]
+==== Vector Single-Width Integer Reduction Instructions
+
+All operands and results of single-width reduction instructions have
+the same SEW width. Overflows wrap around on arithmetic sums.
+
+----
+ # Simple reductions, where [*] denotes all active elements:
+ vredsum.vs vd, vs2, vs1, vm # vd[0] = sum( vs1[0] , vs2[*] )
+ vredmaxu.vs vd, vs2, vs1, vm # vd[0] = maxu( vs1[0] , vs2[*] )
+ vredmax.vs vd, vs2, vs1, vm # vd[0] = max( vs1[0] , vs2[*] )
+ vredminu.vs vd, vs2, vs1, vm # vd[0] = minu( vs1[0] , vs2[*] )
+ vredmin.vs vd, vs2, vs1, vm # vd[0] = min( vs1[0] , vs2[*] )
+ vredand.vs vd, vs2, vs1, vm # vd[0] = and( vs1[0] , vs2[*] )
+ vredor.vs vd, vs2, vs1, vm # vd[0] = or( vs1[0] , vs2[*] )
+ vredxor.vs vd, vs2, vs1, vm # vd[0] = xor( vs1[0] , vs2[*] )
+----
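+
+NOTE: For example, the following non-normative sketch sums an array of
+32-bit integers using a strip-mined `vredsum.vs` loop (`a0` = element
+count, `a1` = data pointer; the sum is returned in `a0`).
+
+----
+vec_sum32:
+    vsetivli zero, 1, e32, m1, ta, ma
+    vmv.s.x v8, zero               # Accumulator element v8[0] = 0
+loop:
+    vsetvli t0, a0, e32, m8, ta, ma
+    vle32.v v16, (a1)              # Load next strip
+    sub a0, a0, t0                 # Decrement count
+    slli t1, t0, 2                 # Strip size in bytes
+    add a1, a1, t1                 # Bump pointer
+    vredsum.vs v8, v16, v8         # v8[0] += sum(v16[*])
+    bnez a0, loop
+    vmv.x.s a0, v8                 # Return the sum
+    ret
+----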
+
+[[sec-vector-integer-reduce-widen]]
+==== Vector Widening Integer Reduction Instructions
+
+The unsigned `vwredsumu.vs` instruction zero-extends the SEW-wide
+vector elements before summing them, then adds the 2*SEW-width scalar
+element, and stores the result in a 2*SEW-width scalar element.
+
+The `vwredsum.vs` instruction sign-extends the SEW-wide vector
+elements before summing them.
+
+For both `vwredsumu.vs` and `vwredsum.vs`, overflows wrap around.
+
+----
+ # Unsigned sum reduction into double-width accumulator
+ vwredsumu.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(zero-extend(SEW))
+
+ # Signed sum reduction into double-width accumulator
+ vwredsum.vs vd, vs2, vs1, vm # 2*SEW = 2*SEW + sum(sign-extend(SEW))
+----
+
+[[sec-vector-float-reduce]]
+==== Vector Single-Width Floating-Point Reduction Instructions
+
+----
+ # Simple reductions.
+ vfredosum.vs vd, vs2, vs1, vm # Ordered sum
+ vfredusum.vs vd, vs2, vs1, vm # Unordered sum
+ vfredmax.vs vd, vs2, vs1, vm # Maximum value
+ vfredmin.vs vd, vs2, vs1, vm # Minimum value
+
+----
+
+NOTE: Older assembler mnemonic `vfredsum` is retained as alias for `vfredusum`.
+
+===== Vector Ordered Single-Width Floating-Point Sum Reduction
+
+The `vfredosum` instruction must sum the floating-point values in
+element order, starting with the scalar in `vs1[0]`--that is, it
+performs the computation:
+
+----
+ vd[0] = (((vs1[0] + vs2[0]) + vs2[1]) + ...) + vs2[vl-1]
+----
+where each addition operates identically to the scalar floating-point
+instructions in terms of raising exception flags and generating or
+propagating special values.
+
+NOTE: The ordered reduction supports compiler autovectorization, while
+the unordered FP sum allows for faster implementations.
+
+When the operation is masked (`vm=0`), the masked-off elements do not
+affect the result or the exception flags.
+
+NOTE: If no elements are active, no additions are performed, so the scalar in
+`vs1[0]` is simply copied to the destination register, without canonicalizing
+NaN values and without setting any exception flags. This behavior preserves
+the handling of NaNs, exceptions, and rounding when autovectorizing a scalar
+summation loop.
+
+===== Vector Unordered Single-Width Floating-Point Sum Reduction
+
+The unordered sum reduction instruction, `vfredusum`, provides an
+implementation with more freedom in performing the reduction.
+
+The implementation must produce a result equivalent to a reduction tree
+composed of binary operator nodes, with the inputs being elements from
+the source vector register group (`vs2`) and the source scalar value
+(`vs1[0]`). Each operator in the tree accepts two inputs and produces
+one result.
+Each operator first computes an exact sum as a RISC-V scalar floating-point
+addition with infinite exponent range and precision, then converts this exact
+sum to a floating-point format with range and precision each at least as great
+as the element floating-point format indicated by SEW, rounding using the
+currently active floating-point dynamic rounding mode and raising exception
+flags as necessary.
+A different floating-point range and precision may be chosen for the result of
+each operator.
+A node where one input is derived only from elements masked-off or beyond the
+active vector length may either treat that input as the additive identity of the
+appropriate EEW or simply copy the other input to its output.
+The rounded result from the root node in the tree is converted (rounded again,
+using the dynamic rounding mode) to the standard floating-point format
+indicated by SEW.
+An implementation is allowed to add an additional additive identity
+to the final result.
+
+The additive identity is +0.0 when rounding down (towards -{inf}) or
+-0.0 for all other rounding modes.
+
+The reduction tree structure must be deterministic for a given value
+in `vtype` and `vl`.
+
+NOTE: As a consequence of this definition, implementations need not propagate
+NaN payloads through the reduction tree when no elements are active. In
+particular, if no elements are active and the scalar input is NaN,
+implementations are permitted to canonicalize the NaN and, if the NaN is
+signaling, set the invalid exception flag. Implementations are alternatively
+permitted to pass through the original NaN and set no exception flags, as with
+`vfredosum`.
+
+NOTE: The `vfredosum` instruction is a valid implementation of the
+`vfredusum` instruction.
+
+===== Vector Single-Width Floating-Point Max and Min Reductions
+
+The `vfredmin` and `vfredmax` instructions reduce the scalar argument in
+`vs1[0]` and active elements in `vs2` using the `minimumNumber` and
+`maximumNumber` operations, respectively.
+
+NOTE: Floating-point max and min reductions should return the same
+final value and raise the same exception flags regardless of operation
+order.
+
+NOTE: If no elements are active, the scalar in `vs1[0]` is simply copied to
+the destination register, without canonicalizing NaN values and without
+setting any exception flags.
+
+[[sec-vector-float-reduce-widen]]
+==== Vector Widening Floating-Point Reduction Instructions
+
+Widening forms of the sum reductions are provided that
+read and write a double-width reduction result.
+
+----
+ # Simple reductions.
+ vfwredosum.vs vd, vs2, vs1, vm # Ordered sum
+ vfwredusum.vs vd, vs2, vs1, vm # Unordered sum
+----
+
+NOTE: Older assembler mnemonic `vfwredsum` is retained as alias for `vfwredusum`.
+
+The reduction of the SEW-width elements is performed as in the
+single-width reduction case, with the elements in `vs2` promoted
+to 2*SEW bits before adding to the 2*SEW-bit accumulator.
+
+NOTE: `vfwredosum.vs` handles inactive elements and NaN payloads analogously
+to `vfredosum.vs`; `vfwredusum.vs` does so analogously to `vfredusum.vs`.
+
+[[sec-vector-mask]]
+=== Vector Mask Instructions
+
+Several instructions are provided to help operate on mask values held in
+a vector register.
+
+[[sec-mask-register-logical]]
+==== Vector Mask-Register Logical Instructions
+
+Vector mask-register logical operations operate on mask registers.
+Each element in a mask register is a single bit, so these instructions
+all operate on single vector registers regardless of the setting of
+the `vlmul` field in `vtype`. They do not change the value of
+`vlmul`. The destination vector register may be the same as either
+source vector register.
+
+As with other vector instructions, the elements with indices less than
+`vstart` are unchanged, and `vstart` is reset to zero after execution.
+Vector mask logical instructions are always unmasked, so there are no
+inactive elements, and the encodings with `vm=0` are reserved.
+Mask elements past `vl`, the tail elements, are
+always updated with a tail-agnostic policy.
+
+----
+ vmand.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && vs1.mask[i]
+ vmnand.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] && vs1.mask[i])
+ vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i]
+ vmxor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] ^^ vs1.mask[i]
+ vmor.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || vs1.mask[i]
+ vmnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] || vs1.mask[i])
+ vmorn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] || !vs1.mask[i]
+ vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^^ vs1.mask[i])
+----
+
+NOTE: The previous assembler mnemonics `vmandnot` and `vmornot` have
+been changed to `vmandn` and `vmorn` to be consistent with the
+equivalent scalar instructions. The old `vmandnot` and `vmornot`
+mnemonics can be retained as assembler aliases for compatibility.
+
+Several assembler pseudoinstructions are defined as shorthand for
+common uses of mask logical operations:
+----
+ vmmv.m vd, vs => vmand.mm vd, vs, vs # Copy mask register
+ vmclr.m vd => vmxor.mm vd, vd, vd # Clear mask register
+ vmset.m vd => vmxnor.mm vd, vd, vd # Set mask register
+ vmnot.m vd, vs => vmnand.mm vd, vs, vs # Invert bits
+----
+
+NOTE: The `vmmv.m` instruction was previously called `vmcpy.m`, but
+with the new layout it is more consistent to name it a "mv", because bits
+are copied without interpretation. The `vmcpy.m` assembler
+pseudoinstruction can be retained for compatibility. For
+implementations that internally rearrange bits according to EEW, a
+`vmmv.m` instruction with the same source and destination can be used as
+an idiom to force an internal reformat into a mask vector.
+
+The set of eight mask logical instructions can generate any of the 16
+possible binary logical functions of the two input masks:
+
+[cols="1,1,1,1,12"]
+|===
+4+| inputs |
+
+| 0 | 0 | 1 | 1 | src1
+| 0 | 1 | 0 | 1 | src2
+|===
+
+[cols="1,1,1,1,6,6"]
+|===
+4+| output | instruction | pseudoinstruction
+
+| 0 | 0 | 0 | 0 | vmxor.mm vd, vd, vd | vmclr.m vd
+| 1 | 0 | 0 | 0 | vmnor.mm vd, src1, src2 |
+| 0 | 1 | 0 | 0 | vmandn.mm vd, src2, src1 |
+| 1 | 1 | 0 | 0 | vmnand.mm vd, src1, src1 | vmnot.m vd, src1
+| 0 | 0 | 1 | 0 | vmandn.mm vd, src1, src2 |
+| 1 | 0 | 1 | 0 | vmnand.mm vd, src2, src2 | vmnot.m vd, src2
+| 0 | 1 | 1 | 0 | vmxor.mm vd, src1, src2 |
+| 1 | 1 | 1 | 0 | vmnand.mm vd, src1, src2 |
+| 0 | 0 | 0 | 1 | vmand.mm vd, src1, src2 |
+| 1 | 0 | 0 | 1 | vmxnor.mm vd, src1, src2 |
+| 0 | 1 | 0 | 1 | vmand.mm vd, src2, src2 | vmmv.m vd, src2
+| 1 | 1 | 0 | 1 | vmorn.mm vd, src2, src1 |
+| 0 | 0 | 1 | 1 | vmand.mm vd, src1, src1 | vmmv.m vd, src1
+| 1 | 0 | 1 | 1 | vmorn.mm vd, src1, src2 |
+| 0 | 1 | 1 | 1 | vmor.mm vd, src1, src2 |
+| 1 | 1 | 1 | 1 | vmxnor.mm vd, vd, vd | vmset.m vd
+|===
+
+NOTE: The vector mask logical instructions are designed to be easily
+fused with a following masked vector operation to effectively expand
+the number of predicate registers by moving values into `v0` before
+use.
+
+
+==== Vector count population in mask `vcpop.m`
+
+----
+ vcpop.m rd, vs2, vm
+----
+
+NOTE: This instruction previously had the assembler mnemonic `vpopc.m`
+but was renamed to be consistent with the scalar instruction. The
+assembler instruction alias `vpopc.m` is being retained for software
+compatibility.
+
+The source operand is a single vector register holding mask register
+values as described in Section <<sec-mask-register-layout>>.
+
+The `vcpop.m` instruction counts the number of active mask elements
+of the vector source mask register that have the value
+1 and writes the result to a scalar `x` register.
+
+The operation can be performed under a mask, in which case only the
+active elements are counted.
+
+----
+ vcpop.m rd, vs2, v0.t # x[rd] = sum_i ( vs2.mask[i] && v0.mask[i] )
+----
+
+The `vcpop.m` instruction writes `x[rd]` even if `vl`=0 (with the
+value 0, since no mask elements are active).
+
+Traps on `vcpop.m` are always reported with a `vstart` of 0. The
+`vcpop.m` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
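+
+NOTE: For example, combining a compare with `vcpop.m` counts the elements
+that satisfy a predicate. A non-normative sketch, assuming data in `v8`
+and a threshold in `t0`:
+
+----
+    vmslt.vx v0, v8, t0      # v0.mask[i] = (v8[i] < t0), signed
+    vcpop.m  a2, v0          # a2 = number of elements below the threshold
+----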
+
+==== `vfirst` find-first-set mask bit
+
+----
+ vfirst.m rd, vs2, vm
+----
+
+The `vfirst` instruction finds the lowest-numbered active element of
+the source mask vector that has the value 1 and writes that element's
+index to a GPR. If no active element has the value 1, -1 is written
+to the GPR.
+
+NOTE: Software can assume that any negative value (highest bit set)
+corresponds to no element found, as vector lengths will never reach
+2^(XLEN-1)^ on any implementation.
+
+The `vfirst.m` instruction writes `x[rd]` even if `vl`=0 (with the
+value -1, since no mask elements are active).
+
+Traps on `vfirst` are always reported with a `vstart` of 0. The
+`vfirst` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
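+
+NOTE: For example, `vfirst.m` converts a compare mask into an element
+index. A non-normative sketch locating the first zero element of `v8`:
+
+----
+    vmseq.vi v0, v8, 0       # Mask of zero elements
+    vfirst.m a2, v0          # a2 = index of first zero, or -1 if none
+----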
+
+==== `vmsbf.m` set-before-first mask bit
+
+----
+ vmsbf.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsbf.m v2, v3
+ 0 0 0 0 0 0 1 1 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsbf.m v2, v3
+ 0 0 0 0 0 0 0 0 v2
+
+ 0 0 0 0 0 0 0 0 v3 contents
+ vmsbf.m v2, v3
+ 1 1 1 1 1 1 1 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsbf.m v2, v3, v0.t
+ 0 1 x x x x 1 1 v2 contents
+----
+
+The `vmsbf.m` instruction takes a mask register as input and writes
+results to a mask register. The instruction writes a 1 to all active
+mask elements before the first active source element that is a 1, then
+writes a 0 to that element and all following active elements. If
+there is no set bit in the active elements of the source vector, then
+all active elements in the destination are written with a 1.
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsbf.m` are always reported with a `vstart` of 0. The
+`vmsbf` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== `vmsif.m` set-including-first mask bit
+
+The vector mask set-including-first instruction is similar to
+set-before-first, except it also includes the element with a set bit.
+
+----
+ vmsif.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsif.m v2, v3
+ 0 0 0 0 0 1 1 1 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsif.m v2, v3
+ 0 0 0 0 0 0 0 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsif.m v2, v3, v0.t
+ 1 1 x x x x 1 1 v2 contents
+----
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsif.m` are always reported with a `vstart` of 0. The
+`vmsif` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== `vmsof.m` set-only-first mask bit
+
+The vector mask set-only-first instruction is similar to
+set-before-first, except it only sets the first element with a bit
+set, if any.
+
+----
+ vmsof.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 1 0 0 v3 contents
+ vmsof.m v2, v3
+ 0 0 0 0 0 1 0 0 v2 contents
+
+ 1 0 0 1 0 1 0 1 v3 contents
+ vmsof.m v2, v3
+ 0 0 0 0 0 0 0 1 v2
+
+ 1 1 0 0 0 0 1 1 v0 contents
+ 1 1 0 1 0 1 0 0 v3 contents
+ vmsof.m v2, v3, v0.t
+ 0 1 x x x x 0 0 v2 contents
+----
+
+The tail elements in the destination mask register are updated under a
+tail-agnostic policy.
+
+Traps on `vmsof.m` are always reported with a `vstart` of 0. The
+`vmsof` instruction will raise an illegal instruction exception if
+`vstart` is non-zero.
+
+The destination register cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+==== Example using vector mask instructions
+
+The following is an example of vectorizing a data-dependent exit loop.
+
+----
+include::example/strcpy.s[lines=4..-1]
+----
+----
+include::example/strncpy.s[lines=4..-1]
+----
+
+==== Vector Iota Instruction
+
+The `viota.m` instruction reads a source vector mask register and
+writes to each element of the destination vector register group the
+sum of all the bits of elements in the mask register
+whose index is less than the element, i.e., a parallel prefix sum of
+the mask values.
+
+This instruction can be masked, in which case only the enabled
+elements contribute to the sum.
+
+----
+ viota.m vd, vs2, vm
+
+ # Example
+
+ 7 6 5 4 3 2 1 0 Element number
+
+ 1 0 0 1 0 0 0 1 v2 contents
+ viota.m v4, v2 # Unmasked
+ 2 2 2 1 1 1 1 0 v4 result
+
+ 1 1 1 0 1 0 1 1 v0 contents
+ 1 0 0 1 0 0 0 1 v2 contents
+ 2 3 4 5 6 7 8 9 v4 contents
+ viota.m v4, v2, v0.t # Masked, vtype.vma=0
+ 1 1 1 5 1 7 1 0 v4 results
+----
+
+The result value is zero-extended to fill the destination element if
+SEW is wider than the result. If the result value would overflow the
+destination SEW, the least-significant SEW bits are retained.
+
+Traps on `viota.m` are always reported with a `vstart` of 0, and
+execution is always restarted from the beginning when resuming after a
+trap handler. An illegal instruction exception is raised if `vstart`
+is non-zero.
+
+The destination register group cannot overlap the source register
+and, if masked, cannot overlap the mask register (`v0`).
+
+The `viota.m` instruction can be combined with memory scatter
+instructions (indexed stores) to perform vector compress functions.
+
+----
+ # Compact non-zero elements from input memory array to output memory array
+ #
+ # size_t compact_non_zero(size_t n, const int* in, int* out)
+ # {
+ # size_t i;
+ # size_t count = 0;
+ # int *p = out;
+ #
+ # for (i=0; i<n; i++)
+ # {
+ # const int v = *in++;
+ # if (v != 0)
+ # *p++ = v;
+ # }
+ #
+ # return (size_t) (p - out);
+ # }
+ #
+ # a0 = n
+ # a1 = &in
+ # a2 = &out
+
+compact_non_zero:
+ li a6, 0 # Clear count of non-zero elements
+loop:
+ vsetvli a5, a0, e32, m8, ta, ma # 32-bit integers
+ vle32.v v8, (a1) # Load input vector
+ sub a0, a0, a5 # Decrement number done
+ slli a5, a5, 2 # Multiply by four bytes
+ vmsne.vi v0, v8, 0 # Locate non-zero values
+ add a1, a1, a5 # Bump input pointer
+ vcpop.m a5, v0 # Count number of elements set in v0
+ viota.m v16, v0 # Get destination offsets of active elements
+ add a6, a6, a5 # Accumulate number of elements
+ vsll.vi v16, v16, 2, v0.t # Multiply offsets by four bytes
+ slli a5, a5, 2 # Multiply number of non-zero elements by four bytes
+ vsuxei32.v v8, (a2), v16, v0.t # Scatter using scaled viota results under mask
+ add a2, a2, a5 # Bump output pointer
+ bnez a0, loop # Any more?
+
+ mv a0, a6 # Return count
+ ret
+----
+
+==== Vector Element Index Instruction
+
+The `vid.v` instruction writes each element's index to the
+destination vector register group, from 0 to `vl`-1.
+
+----
+ vid.v vd, vm # Write element ID to destination.
+----
+
+The instruction can be masked. Masking does not change the
+index value written to active elements.
+
+The `vs2` field of the instruction must be set to `v0`, otherwise the
+encoding is _reserved_.
+
+The result value is zero-extended to fill the destination element if
+SEW is wider than the result. If the result value would overflow the
+destination SEW, the least-significant SEW bits are retained.
+
+NOTE: Microarchitectures can implement the `vid.v` instruction using the
+same datapath as `viota.m` but with an implicit set mask source.
+
+[[sec-vector-permute]]
+=== Vector Permutation Instructions
+
+A range of permutation instructions are provided to move elements
+around within the vector registers.
+
+==== Integer Scalar Move Instructions
+
+The integer scalar read/write instructions transfer a single
+value between a scalar `x` register and element 0 of a vector
+register. The instructions ignore LMUL and vector register groups.
+
+----
+vmv.x.s rd, vs2 # x[rd] = vs2[0] (vs1=0)
+vmv.s.x vd, rs1 # vd[0] = x[rs1] (vs2=0)
+----
+
+The `vmv.x.s` instruction copies a single SEW-wide element from index 0 of the
+source vector register to a destination integer register. If SEW > XLEN, the
+least-significant XLEN bits are transferred and the upper SEW-XLEN bits are
+ignored. If SEW < XLEN, the value is sign-extended to XLEN bits.
+
+NOTE: `vmv.x.s` performs its operation even if `vstart` {ge} `vl` or `vl`=0.
+
+The `vmv.s.x` instruction copies the scalar integer register to element 0 of
+the destination vector register. If SEW < XLEN, the least-significant bits
+are copied and the upper XLEN-SEW bits are ignored. If SEW > XLEN, the value
+is sign-extended to SEW bits. The other elements in the destination vector
+register (0 < index < VLEN/SEW) are treated as tail elements using the
+current tail agnostic/undisturbed policy. If `vstart` {ge} `vl`, no
+operation is performed and the destination register is not updated.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The encodings corresponding to the masked versions (`vm=0`) of `vmv.x.s`
+and `vmv.s.x` are reserved.
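+
+NOTE: For example, an element at a dynamic index can be read by combining
+a slide (see the vector slide instructions below) with `vmv.x.s`. A
+non-normative sketch reading element `t0` of `v8` into `a0`, assuming
+`t0` < `vl`:
+
+----
+    vslidedown.vx v12, v8, t0   # v12[0] = v8[t0]
+    vmv.x.s a0, v12             # a0 = v8[t0]
+----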
+
+==== Floating-Point Scalar Move Instructions
+
+The floating-point scalar read/write instructions transfer a single
+value between a scalar `f` register and element 0 of a vector
+register. The instructions ignore LMUL and vector register groups.
+
+----
+vfmv.f.s rd, vs2 # f[rd] = vs2[0] (rs1=0)
+vfmv.s.f vd, rs1 # vd[0] = f[rs1] (vs2=0)
+----
+
+The `vfmv.f.s` instruction copies a single SEW-wide element from index
+0 of the source vector register to a destination scalar floating-point
+register.
+
+NOTE: `vfmv.f.s` performs its operation even if `vstart` {ge} `vl` or `vl`=0.
+
+The `vfmv.s.f` instruction copies the scalar floating-point register
+to element 0 of the destination vector register. The other elements
+in the destination vector register (0 < index < VLEN/SEW) are treated
+as tail elements using the current tail agnostic/undisturbed policy.
+If `vstart` {ge} `vl`, no operation is performed and the destination
+register is not updated.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The encodings corresponding to the masked versions (`vm=0`) of `vfmv.f.s`
+and `vfmv.s.f` are reserved.
+
+==== Vector Slide Instructions
+
+The slide instructions move elements up and down a vector register
+group.
+
+NOTE: The slide operations can be implemented much more efficiently
+than using the arbitrary register gather instruction. Implementations
+may optimize certain OFFSET values for `vslideup` and `vslidedown`.
+In particular, power-of-2 offsets may operate substantially faster
+than other offsets.
+
+For all of the `vslideup`, `vslidedown`, `v[f]slide1up`, and
+`v[f]slide1down` instructions, if `vstart` {ge} `vl`, the instruction performs no
+operation and leaves the destination vector register unchanged.
+
+NOTE: As a consequence, when `vl`=0, no elements are updated in the
+destination vector register group, regardless of `vstart`.
+
+The tail agnostic/undisturbed policy is followed for tail elements.
+
+The slide instructions may be masked, with mask element _i_
+controlling whether _destination_ element _i_ is written. The mask
+undisturbed/agnostic policy is followed for inactive elements.
+
+===== Vector Slideup Instructions
+
+----
+ vslideup.vx vd, vs2, rs1, vm # vd[i+x[rs1]] = vs2[i]
+ vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i]
+----
+
+For `vslideup`, the value in `vl` specifies the maximum number of destination
+elements that are written. The start index (_OFFSET_) for the
+destination can be either specified using an unsigned integer in the
+`x` register specified by `rs1`, or a 5-bit immediate, zero-extended to XLEN bits.
+If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits.
+Destination elements _OFFSET_ through `vl`-1 are written if unmasked and
+if _OFFSET_ < `vl`.
+
+----
+ vslideup behavior for destination elements (`vstart` < `vl`)
+
+ OFFSET is amount to slideup, either from x register or a 5-bit immediate
+
+ 0 <= i < min(vl, max(vstart, OFFSET)) Unchanged
+ max(vstart, OFFSET) <= i < vl vd[i] = vs2[i-OFFSET] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
+
+The destination vector register group for `vslideup` cannot overlap
+the source vector register group, otherwise the instruction encoding
+is reserved.
+
+NOTE: The non-overlap constraint avoids WAR hazards on the
+input vectors during execution, and enables restart with non-zero
+`vstart`.
+
+===== Vector Slidedown Instructions
+
+----
+ vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+x[rs1]]
+ vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm]
+----
+
+For `vslidedown`, the value in `vl` specifies the maximum number of
+destination elements that are written. The remaining elements past
+`vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+The start index (_OFFSET_) for the source can be either specified
+using an unsigned integer in the `x` register specified by `rs1`, or a
+5-bit immediate, zero-extended to XLEN bits.
+If XLEN > SEW, _OFFSET_ is _not_ truncated to SEW bits.
+
+----
+ vslidedown behavior for source elements for element i in slide (`vstart` < `vl`)
+ 0 <= i+OFFSET < VLMAX src[i] = vs2[i+OFFSET]
+ VLMAX <= i+OFFSET src[i] = 0
+
+ vslidedown behavior for destination element i in slide (`vstart` < `vl`)
+ 0 <= i < vstart Unchanged
+ vstart <= i < vl vd[i] = src[i] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+
+----
+
+===== Vector Slide1up
+
+Variants of slide are provided that only move by one element but which
+also allow a scalar integer value to be inserted at the vacated
+element position.
+
+----
+ vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i]
+----
+
+The `vslide1up` instruction places the `x` register argument at
+location 0 of the destination vector register group, provided that
+element 0 is active, otherwise the destination element update follows the
+current mask agnostic/undisturbed policy. If XLEN < SEW, the value is
+sign-extended to SEW bits. If XLEN > SEW, the least-significant bits
+are copied over and the high XLEN-SEW bits are ignored.
+
+The remaining active `vl`-1 elements are copied over from index _i_ in
+the source vector register group to index _i_+1 in the destination
+vector register group.
+
+The `vl` register specifies the maximum number of destination vector
+register elements updated with source values, and remaining elements
+past `vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+
+----
+ vslide1up behavior when vl > 0
+
+ i < vstart unchanged
+ 0 = i = vstart vd[i] = x[rs1] if v0.mask[i] enabled
+ max(vstart, 1) <= i < vl vd[i] = vs2[i-1] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
+
+The `vslide1up` instruction requires that the destination vector
+register group does not overlap the source vector register group.
+Otherwise, the instruction encoding is reserved.
+
+[[sec-vfslide1up]]
+===== Vector Floating-Point Slide1up Instruction
+
+----
+ vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i]
+----
+
+The `vfslide1up` instruction is defined analogously to `vslide1up`,
+but sources its scalar argument from an `f` register.
+
+===== Vector Slide1down Instruction
+
+The `vslide1down` instruction copies the first `vl`-1 active element
+values from index _i_+1 in the source vector register group to index
+_i_ in the destination vector register group.
+
+The `vl` register specifies the maximum number of destination vector
+register elements written with source values, and remaining elements
+past `vl` are handled according to the current tail policy (Section
+<<sec-agnostic>>).
+
+----
+ vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1]
+----
+
+The `vslide1down` instruction places the `x` register argument at
+location `vl`-1 in the destination vector register, provided that
+element `vl-1` is active, otherwise the destination element update
+follows the current mask agnostic/undisturbed policy.
+If XLEN < SEW, the value is sign-extended to SEW bits. If
+XLEN > SEW, the least-significant bits are copied over and the high
+XLEN-SEW bits are ignored.
+
+----
+ vslide1down behavior
+
+ i < vstart unchanged
+ vstart <= i < vl-1 vd[i] = vs2[i+1] if v0.mask[i] enabled
+ vstart <= i = vl-1 vd[vl-1] = x[rs1] if v0.mask[i] enabled
+ vl <= i < VLMAX Follow tail policy
+----
+
+NOTE: The `vslide1down` instruction can be used to load values into a
+vector register without using memory and without disturbing other
+vector registers. This provides a path for debuggers to modify the
+contents of a vector register, albeit slowly, with multiple repeated
+`vslide1down` invocations.
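+
+NOTE: For example, the following non-normative sketch builds the vector
+`{1,2,3,4}` in `v4` (element 0 first) without touching memory:
+
+----
+    vsetivli zero, 4, e32, m1, ta, ma
+    li t0, 1
+    vslide1down.vx v4, v4, t0   # v4 = {?, ?, ?, 1}
+    li t0, 2
+    vslide1down.vx v4, v4, t0   # v4 = {?, ?, 1, 2}
+    li t0, 3
+    vslide1down.vx v4, v4, t0   # v4 = {?, 1, 2, 3}
+    li t0, 4
+    vslide1down.vx v4, v4, t0   # v4 = {1, 2, 3, 4}
+----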
+
+[[sec-vfslide1down]]
+===== Vector Floating-Point Slide1down Instruction
+
+----
+ vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1]
+----
+
+The `vfslide1down` instruction is defined analogously to `vslide1down`,
+but sources its scalar argument from an `f` register.
+
+==== Vector Register Gather Instructions
+
+The vector register gather instructions read elements from a first
+source vector register group at locations given by a second source
+vector register group. The index values in the second vector are
+treated as unsigned integers. The source vector can be read at any
+index < VLMAX regardless of `vl`. The maximum number of elements to write to
+the destination register is given by `vl`, and the remaining elements
+past `vl` are handled according to the current tail policy
+(Section <<sec-agnostic>>). The operation can be masked, and the mask
+undisturbed/agnostic policy is followed for inactive elements.
+
+----
+vrgather.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+vrgatherei16.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+----
+
+The `vrgather.vv` form uses SEW/LMUL for both the data and
+indices. The `vrgatherei16.vv` form uses SEW/LMUL for the data in
+`vs2` but EEW=16 and EMUL = (16/SEW)*LMUL for the indices in `vs1`.
+
+NOTE: When SEW=8, `vrgather.vv` can only reference vector elements
+0-255. The `vrgatherei16` form can index 64K elements, and can also
+be used to reduce the register capacity needed to hold indices when
+SEW > 16.
+
+If an element index is out of range (`vs1[i]` {ge} VLMAX),
+then zero is returned for the element value.
+
+Vector-scalar and vector-immediate forms of the register gather are
+also provided. These read one element from the source vector at the
+given index, and write this value to the active elements
+of the destination vector register. The index value in the scalar
+register and the immediate, zero-extended to XLEN bits, are treated as
+unsigned integers. If XLEN > SEW, the index value is _not_ truncated
+to SEW bits.
+
+NOTE: These forms allow any vector element to be "splatted" to an entire vector.
+
+----
+vrgather.vx vd, vs2, rs1, vm # vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]]
+vrgather.vi vd, vs2, uimm, vm # vd[i] = (uimm >= VLMAX) ? 0 : vs2[uimm]
+----
+
+For any `vrgather` instruction, the destination vector register group
+cannot overlap with the source vector register groups, otherwise the
+instruction encoding is reserved.
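+
+NOTE: For example, `vid.v` and `vrgather.vv` can reverse the elements of a
+vector. A non-normative sketch, assuming LMUL=1, data in `v4`, and `t0`
+holding `vl`-1:
+
+----
+    vid.v v12                # v12[i] = i
+    vrsub.vx v12, v12, t0    # v12[i] = (vl-1) - i
+    vrgather.vv v8, v4, v12  # v8[i] = v4[vl-1-i]
+----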
+
+==== Vector Compress Instruction
+
+The vector compress instruction allows elements selected by a vector
+mask register from a source vector register group to be packed into
+contiguous elements at the start of the destination vector register
+group.
+
+----
+ vcompress.vm vd, vs2, vs1 # Compress into vd elements of vs2 where vs1 is enabled
+----
+
+The vector mask register specified by `vs1` indicates which of the
+first `vl` elements of vector register group `vs2` should be extracted
+and packed into contiguous elements at the beginning of vector
+register `vd`. The remaining elements of `vd` are treated as tail
+elements according to the current tail policy (Section
+<<sec-agnostic>>).
+
+----
+ Example use of vcompress instruction
+
+ 8 7 6 5 4 3 2 1 0 Element number
+
+ 1 1 0 1 0 0 1 0 1 v0
+ 8 7 6 5 4 3 2 1 0 v1
+ 1 2 3 4 5 6 7 8 9 v2
+ vsetivli t0, 9, e8, m1, tu, ma
+ vcompress.vm v2, v1, v0
+ 1 2 3 4 8 7 5 2 0 v2
+----
+
+`vcompress` is encoded as an unmasked instruction (`vm=1`). The equivalent
+masked instruction (`vm=0`) is reserved.
+
+The destination vector register group cannot overlap the source vector
+register group or the source mask register, otherwise the instruction
+encoding is reserved.
+
+A trap on a `vcompress` instruction is always reported with a
+`vstart` of 0. Executing a `vcompress` instruction with a non-zero
+`vstart` raises an illegal instruction exception.
+
+NOTE: Although possible, `vcompress` is one of the more difficult
+instructions to restart with a non-zero `vstart`, so the assumption is
+that implementations will choose not to do so but will instead restart
+from element 0. This does mean that elements in the destination register
+after `vstart` will already have been updated.
+
+===== Synthesizing `vdecompress`
+
+There is no inverse `vdecompress` provided, as this operation can be
+readily synthesized using iota and a masked vrgather:
+
+----
+ Desired functionality of 'vdecompress'
+ 7 6 5 4 3 2 1 0 # vid
+
+ e d c b a # packed vector of 5 elements
+ 1 0 0 1 1 1 0 1 # mask vector of 8 elements
+ p q r s t u v w # destination register before vdecompress
+
+ e q r d c b v a # result of vdecompress
+----
+
+----
+ # v0 holds mask
+ # v1 holds packed data
+ # v11 holds input expanded vector and result
+ viota.m v10, v0 # Calc iota from mask in v0
+ vrgather.vv v11, v1, v10, v0.t # Expand into destination
+----
+----
+ p q r s t u v w # v11 destination register
+ e d c b a # v1 source vector
+ 1 0 0 1 1 1 0 1 # v0 mask vector
+
+ 4 4 4 3 2 1 1 0 # v10 result of viota.m
+ e q r d c b v a # v11 destination after vrgather using viota.m under mask
+----
+
+==== Whole Vector Register Move
+
+The `vmv<nr>r.v` instructions copy whole vector registers (i.e., all
+VLEN bits) and can copy whole vector register groups. The `nr` value
+in the opcode is the number of individual vector registers, NREG, to
+copy. The instructions operate as if EEW=SEW, EMUL = NREG, and effective
+length `evl` = EMUL * VLEN/SEW.
+
+NOTE: These instructions are intended to aid compilers to shuffle
+vector registers without needing to know or change `vl` or `vtype`.
+
+NOTE: The usual property that no elements are written if `vstart` {ge} `vl`
+does not apply to these instructions.
+Instead, no elements are written if `vstart` {ge} `evl`.
+
+NOTE: If `vd` is equal to `vs2` the instruction is an architectural
+NOP, but is treated as a hint to implementations that rearrange data
+internally that the register group will next be accessed with an EEW
+equal to SEW.
+
+The instruction is encoded as an OPIVI instruction. The number of
+vector registers to copy is encoded in the low three bits of the
+`simm` field (`simm[2:0]`) using the same encoding as the `nf[2:0]` field for memory
+instructions (Figure <<fig-nf>>), i.e., `simm[2:0]` = NREG-1.
+
+The value of NREG must be 1, 2, 4, or 8, and values of `simm[4:0]`
+other than 0, 1, 3, and 7 are reserved.
+
+NOTE: A future extension may support other numbers of registers to be moved.
+
+NOTE: The instruction uses the same funct6 encoding as the `vsmul`
+instruction but with an immediate operand, and only the unmasked
+version (`vm=1`). This encoding is chosen as it is close to the
+related `vmerge` encoding, and it is unlikely the `vsmul` instruction
+would benefit from an immediate form.
+
+----
+ vmv<nr>r.v vd, vs2 # General form
+
+ vmv1r.v v1, v2 # Copy v1=v2
+ vmv2r.v v10, v12 # Copy v10=v12; v11=v13
+ vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11
+ vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15
+----
+
+The source and destination vector register numbers must be aligned
+appropriately for the vector register group size, and encodings with
+other vector register numbers are reserved.
+
+NOTE: A future extension may relax the vector register alignment
+restrictions.
+
+=== Exception Handling
+
+On a trap during a vector instruction (caused by either a synchronous
+exception or an asynchronous interrupt), the existing `*epc` CSR is
+written with a pointer to the trapping vector instruction, while the
+`vstart` CSR contains the element index on which the trap was
+taken.
+
+NOTE: We chose to add a `vstart` CSR to allow resumption of a
+partially executed vector instruction to reduce interrupt latencies
+and to simplify forward-progress guarantees. This is similar to the
+scheme in the IBM 3090 vector facility. To ensure forward progress
+without the `vstart` CSR, implementations would have to guarantee an
+entire vector instruction can always complete atomically without
+generating a trap. This is particularly difficult to ensure in the
+presence of strided or scatter/gather operations and demand-paged
+virtual memory.
+
+==== Precise vector traps
+
+NOTE: We assume most supervisor-mode environments with demand-paging
+will require precise vector traps.
+
+Precise vector traps require that:
+
+. all instructions older than the trapping vector instruction have committed their results
+. no instructions newer than the trapping vector instruction have altered architectural state
+. any operations within the trapping vector instruction affecting result elements preceding the index in the `vstart` CSR have committed their results
+. no operations within the trapping vector instruction affecting elements at or following the `vstart` CSR have altered architectural state except if restarting and completing the affected vector instruction will nevertheless produce the correct final state.
+
+We relax the last requirement to allow elements following `vstart` to
+have been updated at the time the trap is reported, provided that
+re-executing the instruction from the given `vstart` will correctly
+overwrite those elements.
+
+In idempotent memory regions, vector store instructions may have
+updated elements in memory past the element causing a synchronous
+trap. Non-idempotent memory regions must not have been updated for
+indices equal to or greater than the element that caused a synchronous
+trap during a vector store instruction.
+
+Except where noted above, vector instructions are allowed to overwrite
+their inputs, and so in most cases, the vector instruction restart
+must be from the `vstart` element index. However, there are a number of
+cases where this overwrite is prohibited to enable execution of the
+vector instructions to be idempotent and hence restartable from an
+earlier index location.
+
+Implementations must ensure forward progress can be eventually
+guaranteed for the element or segment reported by `vstart`.
+
+==== Imprecise vector traps
+
+Imprecise vector traps are traps that are not precise. In particular,
+instructions newer than `*epc` may have committed results, and
+instructions older than `*epc` may have not completed execution.
+Imprecise traps are primarily intended to be used in situations where
+reporting an error and terminating execution is the appropriate
+response.
+
+NOTE: A profile might specify that interrupts are precise while other
+traps are imprecise. We assume many embedded implementations will
+generate only imprecise traps for vector instructions on fatal errors,
+as they will not require resumable traps.
+
+Imprecise traps shall report the faulting element in `vstart` for
+traps caused by synchronous vector exceptions.
+
+There is no support for imprecise traps in the current standard extensions.
+
+==== Selectable precise/imprecise traps
+
+Some profiles may choose to provide a privileged mode bit to select
+between precise and imprecise vector traps. Imprecise mode would run
+at high performance but could make it difficult to discern error
+causes, while precise mode would run more slowly but would support
+debugging of errors, albeit with the possibility of not experiencing the
+same errors as in imprecise mode.
+
+This mechanism is not defined in the current standard extensions.
+
+==== Swappable traps
+
+Another trap mode can support swappable state in the vector unit,
+where on a trap, special instructions can save and restore the vector
+unit microarchitectural state, to allow execution to continue
+correctly around imprecise traps.
+
+This mechanism is not defined in the current standard extensions.
+
+NOTE: A future extension might define a standard way of saving and
+restoring opaque microarchitectural state from a vector unit
+implementation to support context switching with imprecise traps.
+
+[[sec-vector-extensions]]
+=== Standard Vector Extensions
+
+This section describes the standard vector extensions.
+A set of smaller extensions intended for embedded
+use is named with a "Zve" prefix, while a larger vector extension
+designed for application processors is named as a single-letter V
+extension. A set of vector length extension names with the prefix "Zvl"
+is also provided.
+
+The initial vector extensions are designed to act as a base for
+additional vector extensions in various domains, including
+cryptography and machine learning.
+
+==== Zvl*: Minimum Vector Length Standard Extensions
+
+All standard vector extensions have a minimum required VLEN as
+described below. A set of vector length extensions are provided to
+increase the minimum vector length of a vector extension.
+
+NOTE: The vector length extensions can be used to either specify
+additional software or architecture profile requirements, or to
+advertise hardware capabilities.
+
+.Vector length extensions
+[cols="1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Extension | Minimum VLEN
+
+| Zvl32b | 32
+| Zvl64b | 64
+| Zvl128b | 128
+| Zvl256b | 256
+| Zvl512b | 512
+| Zvl1024b | 1024
+|===
+
+NOTE: Longer vector length extensions should follow the same pattern.
+
+NOTE: Every vector length extension effectively includes all shorter
+vector length extensions.
+
+NOTE: The syntax for extension names is being revised, and these names
+are subject to change. The trailing "b" will be required to
+disambiguate numeric fields from version numbers.
+
+NOTE: Explicit use of the Zvl32b extension string is not required for
+any standard vector extension as they all effectively mandate at least
+this minimum, but the string can be useful when stating hardware
+capabilities.
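+
+As a non-normative illustration, software that must adapt to the
+implemented vector length can also discover it at run time, since the
+`vlenb` CSR holds VLEN/8:
+
+----
+    csrr t0, vlenb      # t0 = VLEN in bytes
+    slli t0, t0, 3      # t0 = VLEN in bits
+----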
+
+==== Zve*: Vector Extensions for Embedded Processors
+
+The following five standard extensions are defined to provide varying
+degrees of vector support and are intended for use with embedded
+processors. Any of these extensions can be added to base ISAs with
+XLEN=32 or XLEN=64. The table lists the minimum VLEN and supported
+EEWs for each extension as well as what floating-point types are
+supported.
+
+.Embedded vector extensions
+[cols="1,1,2,1,1"]
+[%autowidth,float="center",align="center",options="header"]
+|===
+| Extension | Minimum VLEN | Supported EEW | FP32 | FP64
+
+| Zve32x | 32 | 8, 16, 32 | N | N
+| Zve32f | 32 | 8, 16, 32 | Y | N
+| Zve64x | 64 | 8, 16, 32, 64 | N | N
+| Zve64f | 64 | 8, 16, 32, 64 | Y | N
+| Zve64d | 64 | 8, 16, 32, 64 | Y | Y
+|===
+
+The Zve32f and Zve64x extensions depend on the Zve32x extension.
+The Zve64f extension depends on the Zve32f and Zve64x extensions.
+The Zve64d extension depends on the Zve64f extension.
+
+All Zve* extensions have precise traps.
+
+NOTE: There is currently no standard support for handling imprecise
+traps, so standard extensions have to provide precise traps.
+
+All Zve* extensions provide support for EEW of 8, 16, and 32, and
+Zve64* extensions also support EEW of 64.
+
+All Zve* extensions support the vector configuration instructions
+(Section <<sec-vector-config>>).
+
+All Zve* extensions support all vector load and store instructions
+(Section <<sec-vector-memory>>), except Zve64* extensions do not
+support EEW=64 for index values when XLEN=32.
+
+All Zve* extensions support all vector integer instructions (Section
+<<sec-vector-integer>>), except that the `vmulh` integer multiply
+variants that return the high word of the product (`vmulh.vv`,
+`vmulh.vx`, `vmulhu.vv`, `vmulhu.vx`, `vmulhsu.vv`, `vmulhsu.vx`) are
+not included for EEW=64 in Zve64*.
+
+NOTE: Producing the high-word of a product can take substantial
+additional gates for large EEW.
+
+All Zve* extensions support all vector fixed-point arithmetic
+instructions (<<sec-vector-fixed-point>>), except that `vsmul.vv` and
+`vsmul.vx` are not included for EEW=64 in Zve64*.
+
+NOTE: As with `vmulh`, `vsmul` requires a large amount of additional
+logic, and 64-bit fixed-point multiplies are relatively rare.
+
+All Zve* extensions support all vector integer single-width and
+widening reduction operations (Sections <<sec-vector-integer-reduce>>,
+<<sec-vector-integer-reduce-widen>>).
+
+All Zve* extensions support all vector mask instructions (Section
+<<sec-vector-mask>>).
+
+All Zve* extensions support all vector permutation instructions
+(Section <<sec-vector-permute>>), except that Zve32x and Zve64x
+do not include those with floating-point operands, and Zve64f does not include those
+with EEW=64 floating-point operands.
+
+The Zve32x extension depends on the Zicsr extension.
+The Zve32f and Zve64f extensions depend upon the F extension,
+and implement all
+vector floating-point instructions (Section <<sec-vector-float>>) for
+floating-point operands with EEW=32. Vector single-width floating-point reduction
+operations (<<sec-vector-float-reduce>>) for EEW=32 are supported.
+
+The Zve64d extension depends upon the D extension,
+and implements all vector
+floating-point instructions (Section <<sec-vector-float>>) for
+floating-point operands with EEW=32 or EEW=64 (including widening
+instructions and conversions between FP32 and FP64). Vector
+single-width floating-point reductions (<<sec-vector-float-reduce>>)
+for EEW=32 and EEW=64 are supported as well as widening reductions
+from FP32 to FP64.
+
+==== V: Vector Extension for Application Processors
+
+The single-letter V extension is intended for use in application
+processor profiles.
+
+The `misa.v` bit is set for implementations providing `misa` and
+supporting V.
+
+The V vector extension has precise traps.
+
+The V vector extension depends upon the Zvl128b and Zve64d extensions.
+
+NOTE: The value of 128 was chosen as a compromise for application
+processors. Providing a larger VLEN allows stripmining code to be
+elided in some cases for short vectors, but also increases the size of
+the minimum implementation. Note that larger LMUL can be used to
+avoid stripmining for longer known-size application vectors at the
+cost of having fewer available vector register groups. For example, an
+LMUL of 8 allows vectors of up to sixteen 64-bit elements to be
+processed without stripmining using four vector register groups.
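+
+As a non-normative sketch, a single `vsetvli` with LMUL=8 covers
+sixteen 64-bit elements on a VLEN=128 implementation
+(VLMAX = LMUL*VLEN/SEW = 8*128/64 = 16), so a known-size 16-element
+loop needs no stripmining:
+
+----
+    li t0, 16
+    vsetvli x0, t0, e64, m8, ta, ma  # VLMAX=16 when VLEN=128
+    vle64.v v8, (a0)                 # all 16 elements in group v8-v15
+    vadd.vv v8, v8, v8               # operate on the whole vector
+    vse64.v v8, (a0)
+----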
+
+The V extension supports EEW of 8, 16, 32, and 64.
+
+The V extension supports the vector configuration instructions
+(Section <<sec-vector-config>>).
+
+The V extension supports all vector load and store instructions
+(Section <<sec-vector-memory>>), except the V extension does not
+support EEW=64 for index values when XLEN=32.
+
+The V extension supports all vector integer instructions (Section
+<<sec-vector-integer>>).
+
+The V extension supports all vector fixed-point arithmetic
+instructions (<<sec-vector-fixed-point>>).
+
+The V extension supports all vector integer single-width and
+widening reduction operations (Sections <<sec-vector-integer-reduce>>,
+<<sec-vector-integer-reduce-widen>>).
+
+The V extension supports all vector mask instructions (Section
+<<sec-vector-mask>>).
+
+The V extension supports all vector permutation instructions (Section
+<<sec-vector-permute>>).
+
+The V extension depends upon the F and D
+extensions, and implements all vector floating-point instructions
+(Section <<sec-vector-float>>) for floating-point operands with EEW=32
+or EEW=64 (including widening instructions and conversions between
+FP32 and FP64). Vector single-width floating-point reductions
+(<<sec-vector-float-reduce>>) for EEW=32 and EEW=64 are supported as
+well as widening reductions from FP32 to FP64.
+
+[NOTE]
+====
+As is the case with other RISC-V extensions, it is valid to
+include overlapping extensions in the same ISA string. For example,
+RV64GCV and RV64GCV_Zve64f are both valid and equivalent ISA strings,
+as is RV64GCV_Zve64f_Zve32x_Zvl128b.
+====
+
+==== Zvfhmin: Vector Extension for Minimal Half-Precision Floating-Point
+
+The Zvfhmin extension provides minimal support for vectors of IEEE 754-2008
+binary16 values, adding conversions to and from binary32.
+When the Zvfhmin extension is implemented, the `vfwcvt.f.f.v` and
+`vfncvt.f.f.w` instructions become defined when SEW=16.
+The EEW=16 floating-point operands of these instructions use the binary16
+format.
+
+The Zvfhmin extension depends on the Zve32f extension.
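+
+As a non-normative sketch, the following widens a vector of binary16
+values to binary32 using only the Zvfhmin conversions; the source
+pointer in a1, destination pointer in a2, and element count in a0 are
+assumptions of the example:
+
+----
+    vsetvli t0, a0, e16, mf2, ta, ma # SEW=16, LMUL=1/2, so the EEW=32
+    vle16.v v2, (a1)                 #   result fits in one register
+    vfwcvt.f.f.v v4, v2              # widen binary16 to binary32
+    vse32.v v4, (a2)                 # store the binary32 results
+----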
+
+==== Zvfh: Vector Extension for Half-Precision Floating-Point
+
+The Zvfh extension provides support for vectors of IEEE 754-2008
+binary16 values.
+When the Zvfh extension is implemented, all instructions in Sections
+<<sec-vector-float>>, <<sec-vector-float-reduce>>,
+<<sec-vector-float-reduce-widen>>, <<sec-vector-float-move>>,
+<<sec-vfslide1up>>, and <<sec-vfslide1down>>
+become defined when SEW=16.
+The EEW=16 floating-point operands of these instructions use the binary16
+format.
+
+Additionally, conversions between 8-bit integers and binary16 values are
+provided. The floating-point-to-integer narrowing conversions
+(`vfncvt[.rtz].x[u].f.w`) and integer-to-floating-point
+widening conversions (`vfwcvt.f.x[u].v`) become defined when SEW=8.
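+
+As a non-normative sketch, the following converts binary16 values to
+signed 8-bit integers with the narrowing conversion; the pointers in
+a1/a2 and element count in a0 are assumptions of the example:
+
+----
+    vsetvli t0, a0, e8, mf2, ta, ma  # SEW=8
+    vle16.v v2, (a1)                 # load binary16 source (EEW=16)
+    vfncvt.rtz.x.f.w v1, v2          # binary16 to int8, truncating
+    vse8.v v1, (a2)                  # store the int8 results
+----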
+
+The Zvfh extension depends on the Zve32f and Zfhmin extensions.
+
+NOTE: Requiring basic scalar half-precision support makes Zvfh's
+vector-scalar instructions substantially more useful.
+We considered requiring more complete scalar half-precision support, but we
+reasoned that, for many half-precision vector workloads, performing the scalar
+computation in single-precision will suffice.
+
+=== Vector Instruction Listing
+
+include::images/wavedrom/v-inst-table.adoc[]
+
diff --git a/src/vector-examples.adoc b/src/vector-examples.adoc
new file mode 100644
index 0000000..9e54acd
--- /dev/null
+++ b/src/vector-examples.adoc
@@ -0,0 +1,125 @@
+[appendix]
+== Vector Assembly Code Examples
+
+The following are provided as non-normative text to help explain the vector ISA.
+
+=== Vector-vector add example
+
+----
+include::example/vvaddint32.s[lines=4..-1]
+----
+
+=== Example with mixed-width mask and compute
+
+----
+# Code using one width for the predicate and a different width for the
+# masked compute.
+# int8_t a[]; int32_t b[], c[];
+# for (i=0; i<n; i++) { b[i] = (a[i] < 5) ? c[i] : 1; }
+#
+# Mixed-width code that keeps SEW/LMUL=8
+ loop:
+ vsetvli a4, a0, e8, m1, ta, ma # Byte vector for predicate calc
+ vle8.v v1, (a1) # Load a[i]
+ add a1, a1, a4 # Bump pointer.
+ vmslt.vi v0, v1, 5 # a[i] < 5?
+
+ vsetvli x0, a0, e32, m4, ta, mu # Vector of 32-bit values.
+ sub a0, a0, a4 # Decrement count
+ vmv.v.i v4, 1 # Splat immediate to destination
+ vle32.v v4, (a3), v0.t # Load requested elements of C, others undisturbed
+ sll t1, a4, 2
+ add a3, a3, t1 # Bump pointer.
+ vse32.v v4, (a2) # Store b[i].
+ add a2, a2, t1 # Bump pointer.
+ bnez a0, loop # Any more?
+----
+
+=== Memcpy example
+
+----
+include::example/memcpy.s[lines=4..-1]
+----
+
+=== Conditional example
+
+----
+# (int16) z[i] = ((int8) x[i] < 5) ? (int16) a[i] : (int16) b[i];
+#
+
+loop:
+ vsetvli t0, a0, e8, m1, ta, ma # Use 8b elements.
+ vle8.v v0, (a1) # Get x[i]
+ sub a0, a0, t0 # Decrement element count
+ add a1, a1, t0 # x[i] Bump pointer
+ vmslt.vi v0, v0, 5 # Set mask in v0
+ vsetvli x0, x0, e16, m2, ta, mu # Use 16b elements.
+ slli t0, t0, 1 # Multiply by 2 bytes
+ vle16.v v2, (a2), v0.t # z[i] = a[i] case
+ vmnot.m v0, v0 # Invert v0
+ add a2, a2, t0 # a[i] bump pointer
+ vle16.v v2, (a3), v0.t # z[i] = b[i] case
+ add a3, a3, t0 # b[i] bump pointer
+ vse16.v v2, (a4) # Store z
+ add a4, a4, t0 # z[i] bump pointer
+ bnez a0, loop
+----
+
+=== SAXPY example
+
+----
+include::example/saxpy.s[lines=4..-1]
+----
+
+=== SGEMM example
+
+----
+include::example/sgemm.S[lines=4..-1]
+----
+
+=== Division approximation example
+
+----
+# v1 = v1 / v2 to almost 23 bits of precision.
+
+vfrec7.v v3, v2 # Estimate 1/v2
+ li t0, 0x40000000
+vmv.v.x v4, t0 # Splat 2.0
+vfnmsac.vv v4, v2, v3 # 2.0 - v2 * est(1/v2)
+vfmul.vv v3, v3, v4 # Better estimate of 1/v2
+vmv.v.x v4, t0 # Splat 2.0
+vfnmsac.vv v4, v2, v3 # 2.0 - v2 * est(1/v2)
+vfmul.vv v3, v3, v4 # Better estimate of 1/v2
+vfmul.vv v1, v1, v3 # Estimate of v1/v2
+----
+
+=== Square root approximation example
+
+----
+# v1 = sqrt(v1) to almost 23 bits of precision.
+
+ fmv.w.x ft0, x0 # Mask off zero inputs
+vmfne.vf v0, v1, ft0 # to avoid div by zero
+vfrsqrt7.v v2, v1, v0.t # Estimate 1/sqrt(x)
+vmfne.vf v0, v2, ft0, v0.t # Additionally mask off +inf inputs
+ li t0, 0x40400000
+vmv.v.x v4, t0 # Splat 3.0
+vfmul.vv v3, v1, v2, v0.t # x * est
+vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
+vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3)
+ li t0, 0x3f000000
+ fmv.w.x ft0, t0 # 0.5
+vfmul.vf v2, v3, ft0, v0.t # Estimate to 14 bits
+vfmul.vv v3, v1, v2, v0.t # x * est
+vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
+vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3)
+vfmul.vf v2, v3, ft0, v0.t # Estimate to 23 bits
+vfmul.vv v1, v2, v1, v0.t # x * 1/sqrt(x)
+----
+
+=== C standard library strcmp example
+
+----
+include::example/strcmp.s[lines=4..-1]
+----
+
+include::fraclmul.adoc[]
diff --git a/src/zawrs.adoc b/src/zawrs.adoc
new file mode 100644
index 0000000..eb94036
--- /dev/null
+++ b/src/zawrs.adoc
@@ -0,0 +1,105 @@
+== "Zawrs" Standard extension for Wait-on-Reservation-Set instructions, Version 1.01
+
+The Zawrs extension defines a pair of instructions to be used in polling loops
+that allow a core to enter a low-power state and wait on a store to a memory
+location. Waiting for a memory location to be updated is a common pattern in
+many use cases such as:
+
+. Contenders for a lock waiting for the lock variable to be updated.
+
+. Consumers waiting on the tail of an empty queue for the producer to queue
+ work/data. The producer may be code executing on a RISC-V hart, an accelerator
+ device, or an external I/O agent.
+
+. Code waiting on a flag to be set in memory indicative of an event occurring.
+ For example, software on a RISC-V hart may wait on a "done" flag to be set in
+ memory by an accelerator device indicating completion of a job previously
+ submitted to the device.
+
+Such use cases involve polling on memory locations, and such busy loops can be a
+wasteful expenditure of energy. To mitigate the wasteful looping in such usages,
+a `WRS.NTO` (WRS-with-no-timeout) instruction is provided. Instead of polling
+for a store to a specific memory location, software registers a reservation set
+that includes all the bytes of the memory location using the `LR` instruction.
+Then a subsequent `WRS.NTO` instruction would cause the hart to temporarily
+stall execution in a low-power state until a store occurs to the reservation set
+or an interrupt is observed.
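+
+A non-normative sketch of this pattern, waiting for the word at (a0)
+to become nonzero:
+
+----
+wait_flag:
+    lr.w   t0, (a0)      # register a reservation set covering the flag
+    bnez   t0, done      # flag already set?
+    wrs.nto              # stall until a store to the set or an interrupt
+    j      wait_flag     # the stall may end spuriously, so recheck
+done:
+----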
+
+Sometimes the program waiting on a memory update may also need to carry out a
+task at a future time or otherwise place an upper bound on the wait. To support
+such use cases a second instruction `WRS.STO` (WRS-with-short-timeout) is
+provided that works like `WRS.NTO` but bounds the stall duration to an
+implementation-defined short timeout such that the stall is terminated on the
+timeout if no other conditions have occurred to terminate the stall. The
+program using this instruction may then determine if its deadline has been
+reached.
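+
+A non-normative sketch of such a bounded wait, assuming the `time` CSR
+is readable at the current privilege level and the caller has placed
+the deadline in a1:
+
+----
+wait_sto:
+    lr.w   t0, (a0)          # reservation set covers the flag word
+    bnez   t0, have_data     # flag set: proceed
+    rdtime t1
+    bgeu   t1, a1, deadline  # caller's deadline reached
+    wrs.sto                  # stall, bounded by the short timeout
+    j      wait_sto
+have_data:
+    ...                      # consume the data
+deadline:
+    ...                      # handle the timeout
+----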
+
+[NOTE]
+====
+The instructions in the Zawrs extension are only useful in conjunction with the
+LR instruction, which is provided by the A extension, and which we also expect
+to be provided by a narrower Zalrsc extension in the future.
+====
+[[Zawrs]]
+=== Wait-on-Reservation-Set Instructions
+
+The `WRS.NTO` and `WRS.STO` instructions cause the hart to temporarily stall
+execution in a low-power state as long as the reservation set is valid and no
+pending interrupts, even if disabled, are observed. For `WRS.STO` the stall
+duration is bounded by an implementation-defined short timeout. These
+instructions are available in all privilege modes. These instructions are not
+supported in a constrained `LR`/`SC` loop.
+
+[wavedrom, ,svg]
+....
+{reg: [
+ {bits: 7, name: 'opcode', attr: ['SYSTEM(0x73)'] },
+ {bits: 5, name: 'rd', attr: ['0'] },
+ {bits: 3, name: 'funct3', attr: ['0'] },
+ {bits: 5, name: 'rs1', attr: ['0'] },
+ {bits: 12, name: 'funct12', attr:['WRS.NTO(0x0d)', 'WRS.STO(0x1d)'] },
+], config:{lanes: 1, hspace:1024}}
+....
+
+<<<
+
+Hart execution may be stalled while the following conditions are all satisfied:
+[loweralpha]
+ . The reservation set is valid
+ . If `WRS.STO`, a "short" duration since start of stall has not elapsed
+ . No pending interrupt is observed (see the rules below)
+
+While stalled, an implementation is permitted to occasionally terminate the
+stall and complete execution for any reason.
+
+`WRS.NTO` and `WRS.STO` instructions follow the rules of the `WFI` instruction
+for resuming execution on a pending interrupt.
+
+When the `TW` (Timeout Wait) bit in `mstatus` is set and `WRS.NTO` is executed
+in any privilege mode other than M mode, and it does not complete within an
+implementation-specific bounded time limit, the `WRS.NTO` instruction will cause
+an illegal instruction exception.
+
+When executing in VS or VU mode, if the `VTW` bit is set in `hstatus`, the
+`TW` bit in `mstatus` is clear, and the `WRS.NTO` does not complete within an
+implementation-specific bounded time limit, the `WRS.NTO` instruction will cause
+a virtual instruction exception.
+
+[NOTE]
+====
+Since the `WRS.STO` and `WRS.NTO` instructions can complete execution for
+reasons other than stores to the reservation set, software will likely need
+a means of looping until the required stores have occurred.
+
+The duration of a `WRS.STO` instruction's timeout may vary significantly within
+and among implementations. In typical implementations this duration should be
+roughly in the range of 10 to 100 times an on-chip cache miss latency or a
+cacheless access to main memory.
+
+`WRS.NTO`, unlike `WFI`, is not specified to cause an illegal instruction
+exception if executed in U-mode when the governing `TW` bit is 0. `WFI` is
+typically not expected to be used in U-mode and on many systems may promptly
+cause an illegal instruction exception if used in U-mode. Unlike `WFI`,
+`WRS.NTO` is expected to be used by software in U-mode when waiting on
+memory but without a deadline for that wait.
+====
\ No newline at end of file
diff --git a/src/zc.adoc b/src/zc.adoc
new file mode 100644
index 0000000..2f2ef37
--- /dev/null
+++ b/src/zc.adoc
@@ -0,0 +1,2611 @@
+[#Zc]
+== "Zc*" Standard Extension for Code Size Reduction
+
+=== Zc* Overview
+
+Zc* is a group of extensions that define subsets of the existing C extension (Zca, Zcd, Zcf) and new extensions which only contain 16-bit encodings.
+
+Zcm* all reuse the encodings for _c.fld_, _c.fsd_, _c.fldsp_, _c.fsdsp_.
+
+.Zc* extension overview
+[width="100%",options=header,cols="3,1,1,1,1,1,1"]
+|====================================================================================
+|Instruction |Zca |Zcf |Zcd |Zcb |Zcmp |Zcmt
+7+|*The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores*
+|C excl. c.f* |yes | | | | |
+7+|*The Zcf extension is added as a way to refer to compressed single-precision floating-point load/stores*
+|c.flw | |rv32 | | | |
+|c.flwsp | |rv32 | | | |
+|c.fsw | |rv32 | | | |
+|c.fswsp | |rv32 | | | |
+7+|*The Zcd extension is added as a way to refer to compressed double-precision floating-point load/stores*
+|c.fld | | |yes | | |
+|c.fldsp | | |yes | | |
+|c.fsd | | |yes | | |
+|c.fsdsp | | |yes | | |
+7+|*Simple operations for use on all architectures*
+|c.lbu | | | |yes | |
+|c.lh | | | |yes | |
+|c.lhu | | | |yes | |
+|c.sb | | | |yes | |
+|c.sh | | | |yes | |
+|c.zext.b | | | |yes | |
+|c.sext.b | | | |yes | |
+|c.zext.h | | | |yes | |
+|c.sext.h | | | |yes | |
+|c.zext.w | | | |yes | |
+|c.mul | | | |yes | |
+|c.not | | | |yes | |
+7+|*PUSH/POP and double move which overlap with _c.fsdsp_. Complex operations intended for embedded CPUs*
+|cm.push | | | | |yes |
+|cm.pop | | | | |yes |
+|cm.popret | | | | |yes |
+|cm.popretz | | | | |yes |
+|cm.mva01s | | | | |yes |
+|cm.mvsa01 | | | | |yes |
+7+|*Table jump which overlaps with _c.fsdsp_. Complex operations intended for embedded CPUs*
+|cm.jt | | | | | |yes
+|cm.jalt | | | | | |yes
+|====================================================================================
+
+[#C]
+=== C
+
+The C extension is the superset of the following extensions:
+
+* Zca
+* Zcf if F is specified (RV32 only)
+* Zcd if D is specified
+
+As C defines the same instructions as Zca, Zcf and Zcd, the rule is that:
+
+* C always implies Zca
+* C+F implies Zcf (RV32 only)
+* C+D implies Zcd
+
+[reftext="Zce"]
+=== Zce
+
+The Zce extension is intended to be used for microcontrollers, and includes all relevant Zc extensions.
+
+* Specifying Zce on RV32 without F includes Zca, Zcb, Zcmp, Zcmt
+* Specifying Zce on RV32 with F includes Zca, Zcb, Zcmp, Zcmt _and_ Zcf
+* Specifying Zce on RV64 always includes Zca, Zcb, Zcmp, Zcmt
+** Zcf doesn't exist for RV64
+
+Therefore common ISA strings can be updated as follows to include the relevant Zc extensions, for example:
+
+* RV32IMC becomes RV32IM_Zce
+* RV32IMCF becomes RV32IMF_Zce
+
+[#misaC]
+=== MISA.C
+
+MISA.C is set if the following extensions are selected:
+
+* Zca and not F
+* Zca, Zcf and F is specified (RV32 only)
+* Zca, Zcf and Zcd if D is specified (RV32 only)
+** this configuration excludes Zcmp, Zcmt
+* Zca, Zcd if D is specified (RV64 only)
+** this configuration excludes Zcmp, Zcmt
+
+[reftext="Zca"]
+=== Zca
+
+The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores.
+
+Therefore it _excludes_ all 16-bit floating-point loads and stores: _c.flw_, _c.flwsp_, _c.fsw_, _c.fswsp_, _c.fld_, _c.fldsp_, _c.fsd_, _c.fsdsp_.
+
+[NOTE]
+====
+The C extension only includes F/D instructions when D and F are also specified.
+====
+
+[reftext="Zcf"]
+=== Zcf (RV32 only)
+
+Zcf is the existing set of compressed single-precision floating-point loads and stores: _c.flw_, _c.flwsp_, _c.fsw_, _c.fswsp_.
+
+Zcf is only relevant to RV32, it cannot be specified for RV64.
+
+The Zcf extension depends on the <<Zca>> and F extensions.
+
+[reftext="Zcd"]
+=== Zcd
+
+Zcd is the existing set of compressed double-precision floating-point loads and stores: _c.fld_, _c.fldsp_, _c.fsd_, _c.fsdsp_.
+
+The Zcd extension depends on the <<Zca>> and D extensions.
+
+[reftext="Zcb"]
+=== Zcb
+
+Zcb has simple code-size saving instructions which are easy to implement on all CPUs.
+
+All encodings are currently reserved for all architectures, and have no conflicts with any existing extensions.
+
+NOTE: Zcb can be implemented on _any_ CPU as the instructions are 16-bit versions of existing 32-bit instructions from the application class profile.
+
+The Zcb extension depends on the <<Zca>> extension.
+
+As shown on the individual instruction pages, many of the instructions in Zcb depend upon another extension being implemented. For example, _c.mul_ is only implemented if M or Zmmul is implemented, and _c.sext.b_ is only implemented if Zbb is implemented.
+
+The _c.mul_ encoding uses the CA register format along with other instructions such as _c.sub_, _c.xor_ etc.
+
+[NOTE]
+====
+_c.sext.w_ is a pseudo-instruction for _c.addiw rd, 0_ (RV64)
+====
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|yes
+|yes
+|c.lbu _rd'_, uimm(_rs1'_)
+|<<#insns-c_lbu>>
+
+|yes
+|yes
+|c.lhu _rd'_, uimm(_rs1'_)
+|<<#insns-c_lhu>>
+
+|yes
+|yes
+|c.lh _rd'_, uimm(_rs1'_)
+|<<#insns-c_lh>>
+
+|yes
+|yes
+|c.sb _rs2'_, uimm(_rs1'_)
+|<<#insns-c_sb>>
+
+|yes
+|yes
+|c.sh _rs2'_, uimm(_rs1'_)
+|<<#insns-c_sh>>
+
+|yes
+|yes
+|c.zext.b _rsd'_
+|<<#insns-c_zext_b>>
+
+|yes
+|yes
+|c.sext.b _rsd'_
+|<<#insns-c_sext_b>>
+
+|yes
+|yes
+|c.zext.h _rsd'_
+|<<#insns-c_zext_h>>
+
+|yes
+|yes
+|c.sext.h _rsd'_
+|<<#insns-c_sext_h>>
+
+|
+|yes
+|c.zext.w _rsd'_
+|<<#insns-c_zext_w>>
+
+|yes
+|yes
+|c.not _rsd'_
+|<<#insns-c_not>>
+
+|yes
+|yes
+|c.mul _rsd'_, _rs2'_
+|<<#insns-c_mul>>
+
+|===
+
+<<<
+
+[#Zcmp]
+=== Zcmp
+
+The Zcmp extension is a set of instructions which may be executed as a series of existing 32-bit RISC-V instructions.
+
+This extension reuses some encodings from _c.fsdsp_. Therefore it is _incompatible_ with <<Zcd>>,
+ which is included when C and D extensions are both present.
+
+NOTE: Zcmp is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with architecture class profiles.
+
+The Zcmp extension depends on the <<Zca>> extension.
+
+The PUSH/POP assembly syntax uses several variables, whose meanings are:
+
+* _reg_list_ is a list containing 1 to 13 registers (ra and 0 to 12 s registers)
+** valid values: {ra}, {ra, s0}, {ra, s0-s1}, {ra, s0-s2}, ..., {ra, s0-s8}, {ra, s0-s9}, {ra, s0-s11}
+** note that {ra, s0-s10} is _not_ valid, giving 12 lists rather than 13 for a more efficient encoding
+* _stack_adj_ is the total size of the stack frame.
+** valid values vary with register list length and the specific encoding, see the instruction pages for details.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|yes
+|yes
+|cm.push _{reg_list}, -stack_adj_
+|<<#insns-cm_push>>
+
+|yes
+|yes
+|cm.pop _{reg_list}, stack_adj_
+|<<#insns-cm_pop>>
+
+|yes
+|yes
+|cm.popret _{reg_list}, stack_adj_
+|<<#insns-cm_popret>>
+
+|yes
+|yes
+|cm.popretz _{reg_list}, stack_adj_
+|<<#insns-cm_popretz>>
+
+|yes
+|yes
+|cm.mva01s _rs1', rs2'_
+|<<#insns-cm_mva01s>>
+
+|yes
+|yes
+|cm.mvsa01 _r1s', r2s'_
+|<<#insns-cm_mvsa01>>
+
+|===
+
+<<<
+
+[#Zcmt]
+=== Zcmt
+
+Zcmt adds the table jump instructions and also adds the JVT CSR. The JVT CSR requires a
+state enable if Smstateen is implemented. See <<csrs-jvt>> for details.
+
+This extension reuses some encodings from _c.fsdsp_. Therefore it is _incompatible_ with <<Zcd>>,
+ which is included when C and D extensions are both present.
+
+NOTE: Zcmt is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with RVA profiles.
+
+The Zcmt extension depends on the <<Zca>> and Zicsr extensions.
+
+[%header,cols="^1,^1,4,8"]
+|===
+|RV32
+|RV64
+|Mnemonic
+|Instruction
+
+|yes
+|yes
+|cm.jt _index_
+|<<#insns-cm_jt>>
+
+|yes
+|yes
+|cm.jalt _index_
+|<<#insns-cm_jalt>>
+
+|===
+
+[#Zc_formats]
+=== Zc instruction formats
+
+Several instructions in this specification use the following new instruction formats.
+
+[%header,cols="2,3,2,1,1,1,1,1,1,1,1,1,1"]
+|=====================================================================
+| Format | instructions | 15:10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
+| CLB | c.lbu | funct6 3+| rs1' 2+| uimm 3+| rd' 2+| op
+| CSB | c.sb | funct6 3+| rs1' 2+| uimm 3+| rs2' 2+| op
+| CLH | c.lhu, c.lh | funct6 3+| rs1' | funct1 | uimm 3+| rd' 2+| op
+| CSH | c.sh | funct6 3+| rs1' | funct1 | uimm 3+| rs2' 2+| op
+| CU | c.[sz]ext.*, c.not | funct6 3+| rd'/rs1' 5+| funct5 2+| op
+| CMMV | cm.mvsa01 cm.mva01s| funct6 3+| r1s' 2+| funct2 3+| r2s' 2+| op
+| CMJT | cm.jt cm.jalt | funct6 8+| index 2+| op
+| CMPP | cm.push*, cm.pop* | funct6 2+| funct2 4+| urlist 2+| spimm 2+| op
+|=====================================================================
+
+[NOTE]
+====
+c.mul uses the existing CA format.
+====
+
+<<<
+
+[#Zcb_instructions]
+=== Zcb instructions
+
+[#insns-c_lbu,reftext="Load unsigned byte, 16-bit encoding"]
+==== c.lbu
+
+Synopsis:
+
+Load unsigned byte, 16-bit encoding
+
+Mnemonic:
+
+c.lbu _rd'_, _uimm_(_rs1'_)
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x0, attr: ['C0'] },
+ { bits: 3, name: 'rd\'' },
+ { bits: 2, name: 'uimm[0|1]' },
+ { bits: 3, name: 'rs1\'' },
+ { bits: 3, name: 0x0 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+The immediate offset is formed as follows:
+
+[source,sail]
+--
+ uimm[31:2] = 0;
+ uimm[1] = encoding[5];
+ uimm[0] = encoding[6];
+--
+
+Description:
+
+This instruction loads a byte from the memory address formed by adding _rs1'_ to the zero extended immediate _uimm_. The resulting byte is zero extended to XLEN bits and is written to _rd'_.
+
+[NOTE]
+====
+_rd'_ and _rs1'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+//32-bit equivalent:
+//<<insns-lbu>>
+
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+X(rdc) = EXTZ(mem[X(rs1c)+EXTZ(uimm)][7..0]);
+----
+
+<<<
+[#insns-c_lhu,reftext="Load unsigned halfword, 16-bit encoding"]
+==== c.lhu
+
+Synopsis:
+
+Load unsigned halfword, 16-bit encoding
+
+Mnemonic:
+
+c.lhu _rd'_, _uimm_(_rs1'_)
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x0, attr: ['C0'] },
+ { bits: 3, name: 'rd\'' },
+ { bits: 1, name: 'uimm[1]' },
+ { bits: 1, name: 0x0 },
+ { bits: 3, name: 'rs1\'' },
+ { bits: 3, name: 0x1 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+The immediate offset is formed as follows:
+
+[source,sail]
+----
+ uimm[31:2] = 0;
+ uimm[1] = encoding[5];
+ uimm[0] = 0;
+----
+
+Description:
+
+This instruction loads a halfword from the memory address formed by adding _rs1'_ to the zero extended immediate _uimm_. The resulting halfword is zero extended to XLEN bits and is written to _rd'_.
+
+[NOTE]
+====
+_rd'_ and _rs1'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+//32-bit equivalent:
+//
+//<<insns-lhu>>
+
+Operation:
+
+[source,sail]
+--
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+X(rdc) = EXTZ(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
+--
+
+<<<
+[#insns-c_lh,reftext="Load signed halfword, 16-bit encoding"]
+==== c.lh
+
+Synopsis:
+
+Load signed halfword, 16-bit encoding
+
+Mnemonic:
+
+c.lh _rd'_, _uimm_(_rs1'_)
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x0, attr: ['C0'] },
+ { bits: 3, name: 'rd\'' },
+ { bits: 1, name: 'uimm[1]' },
+ { bits: 1, name: 0x1 },
+ { bits: 3, name: 'rs1\'' },
+ { bits: 3, name: 0x1 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+The immediate offset is formed as follows:
+
+[source,sail]
+----
+ uimm[31:2] = 0;
+ uimm[1] = encoding[5];
+ uimm[0] = 0;
+----
+
+Description:
+
+This instruction loads a halfword from the memory address formed by adding _rs1'_ to the zero extended immediate _uimm_. The resulting halfword is sign extended to XLEN bits and is written to _rd'_.
+
+[NOTE]
+====
+_rd'_ and _rs1'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+//32-bit equivalent:
+//
+//<<insns-lh>>
+
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+X(rdc) = EXTS(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
+----
+
+<<<
+[#insns-c_sb,reftext="Store byte, 16-bit encoding"]
+==== c.sb
+
+Synopsis:
+
+Store byte, 16-bit encoding
+
+Mnemonic:
+
+c.sb _rs2'_, _uimm_(_rs1'_)
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x0, attr: ['C0'] },
+ { bits: 3, name: 'rs2\'' },
+ { bits: 2, name: 'uimm[0|1]' },
+ { bits: 3, name: 'rs1\'' },
+ { bits: 3, name: 0x2 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+The immediate offset is formed as follows:
+
+[source,sail]
+----
+ uimm[31:2] = 0;
+ uimm[1] = encoding[5];
+ uimm[0] = encoding[6];
+----
+
+Description:
+
+This instruction stores the least significant byte of _rs2'_ to the memory address formed by adding _rs1'_ to the zero extended immediate _uimm_.
+
+[NOTE]
+====
+_rs1'_ and _rs2'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+//
+//32-bit equivalent:
+//
+//<<insns-sb>>
+
+Operation:
+
+[source,sail]
+--
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+mem[X(rs1c)+EXTZ(uimm)][7..0] = X(rs2c)
+--
+
+<<<
+[#insns-c_sh,reftext="Store halfword, 16-bit encoding"]
+==== c.sh
+
+Synopsis:
+
+Store halfword, 16-bit encoding
+
+Mnemonic:
+
+c.sh _rs2'_, _uimm_(_rs1'_)
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x0, attr: ['C0'] },
+ { bits: 3, name: 'rs2\'' },
+ { bits: 1, name: 'uimm[1]' },
+ { bits: 1, name: '0' },
+ { bits: 3, name: 'rs1\'' },
+ { bits: 3, name: 0x3 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+The immediate offset is formed as follows:
+
+[source,sail]
+----
+ uimm[31:2] = 0;
+ uimm[1] = encoding[5];
+ uimm[0] = 0;
+----
+
+Description:
+
+This instruction stores the least significant halfword of _rs2'_ to the memory address formed by adding _rs1'_ to the zero extended immediate _uimm_.
+
+[NOTE]
+====
+_rs1'_ and _rs2'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+//
+//32-bit equivalent:
+//
+//<<insns-sh>>
+
+Operation:
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+mem[X(rs1c)+EXTZ(uimm)][15..0] = X(rs2c)
+----
+
+<<<
+[#insns-c_zext_b,reftext="Zero extend byte, 16-bit encoding"]
+==== c.zext.b
+
+Synopsis:
+
+Zero extend byte, 16-bit encoding
+
+Mnemonic:
+
+c.zext.b _rd'/rs1'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x0, attr: ['C.ZEXT.B'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes a single source/destination operand.
+It zero-extends the least-significant byte of the operand to XLEN bits by inserting zeros into all of
+the bits more significant than 7.
+
+[NOTE]
+====
+_rd'/rs1'_ is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+[source,sail]
+----
+andi rd'/rs1', rd'/rs1', 0xff
+----
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = EXTZ(X(rsdc)[7..0]);
+----
+
+<<<
+[#insns-c_sext_b,reftext="Sign extend byte, 16-bit encoding"]
+==== c.sext.b
+
+Synopsis:
+
+Sign extend byte, 16-bit encoding
+
+Mnemonic:
+
+c.sext.b _rd'/rs1'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x1, attr: ['C.SEXT.B'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes a single source/destination operand.
+It sign-extends the least-significant byte in the operand to XLEN bits by copying the most-significant bit
+in the byte (i.e., bit 7) to all of the more-significant bits.
+
+[NOTE]
+====
+_rd'/rs1'_ is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+Zbb is also required.
+//
+//32-bit equivalent:
+//
+//<<insns-sext_b>> from Zbb
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = EXTS(X(rsdc)[7..0]);
+----
+
+<<<
+[#insns-c_zext_h,reftext="Zero extend halfword, 16-bit encoding"]
+==== c.zext.h
+
+Synopsis:
+
+Zero extend halfword, 16-bit encoding
+
+Mnemonic:
+
+c.zext.h _rd'/rs1'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x2, attr: ['C.ZEXT.H'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes a single source/destination operand.
+It zero-extends the least-significant halfword of the operand to XLEN bits by inserting zeros into all of
+the bits more significant than 15.
+
+[NOTE]
+====
+_rd'/rs1'_ is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+Zbb is also required.
+//
+//32-bit equivalent:
+//
+//<<insns-zext_h>> from Zbb
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = EXTZ(X(rsdc)[15..0]);
+----
+
+<<<
+[#insns-c_sext_h,reftext="Sign extend halfword, 16-bit encoding"]
+==== c.sext.h
+
+Synopsis:
+
+Sign extend halfword, 16-bit encoding
+
+Mnemonic:
+
+c.sext.h _rd'/rs1'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x3, attr: ['C.SEXT.H'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes a single source/destination operand.
+It sign-extends the least-significant halfword in the operand to XLEN bits by copying the most-significant bit
+in the halfword (i.e., bit 15) to all of the more-significant bits.
+
+[NOTE]
+====
+_rd'/rs1'_ is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+Zbb is also required.
+//
+//32-bit equivalent:
+//
+//<<insns-sext_h>> from Zbb
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = EXTS(X(rsdc)[15..0]);
+----
+
+<<<
+[#insns-c_zext_w,reftext="Zero extend word, 16-bit encoding"]
+==== c.zext.w
+
+Synopsis:
+
+Zero extend word, 16-bit encoding
+
+Mnemonic:
+
+c.zext.w _rd'/rs1'_
+
+Encoding (RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x4, attr: ['C.ZEXT.W'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes a single source/destination operand.
+It zero-extends the least-significant word of the operand to XLEN bits by inserting zeros into all of
+the bits more significant than 31.
+
+[NOTE]
+====
+_rd'/rs1'_ is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+Zba is also required.
+
+32-bit equivalent:
+
+[source,sail]
+----
+add.uw rd'/rs1', rd'/rs1', zero
+----
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = EXTZ(X(rsdc)[31..0]);
+----
+
+<<<
+[#insns-c_not,reftext="Bitwise not, 16-bit encoding"]
+==== c.not
+
+Synopsis:
+
+Bitwise not, 16-bit encoding
+
+Mnemonic:
+
+c.not _rd'/rs1'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 0x5, attr: ['C.NOT'] },
+ { bits: 2, name: 0x3, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction takes the one's complement of _rd'/rs1'_ and writes the result to the same register.
+
+[NOTE]
+====
+rd'/rs1' is from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+[source,sail]
+----
+xori rd'/rs1', rd'/rs1', -1
+----
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_.
+====
+
+Operation:
+
+[source,sail]
+----
+X(rsdc) = X(rsdc) XOR -1;
+----
+
+<<<
+[#insns-c_mul,reftext="Multiply, 16-bit encoding"]
+==== c.mul
+
+Synopsis:
+
+Multiply, 16-bit encoding
+
+Mnemonic:
+
+c.mul _rsd'_, _rs2'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x1, attr: ['C1'] },
+ { bits: 3, name: 'rs2\'', attr: ['SRC2'] },
+ { bits: 2, name: 0x2, attr: ['FUNCT2'] },
+ { bits: 3, name: 'rd\'/rs1\'', attr: ['SRCDST'] },
+ { bits: 3, name: 0x7 },
+ { bits: 3, name: 0x4, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Description:
+
+This instruction multiplies XLEN bits of the source operands from _rsd'_ and _rs2'_ and writes the lowest XLEN bits of the result to _rsd'_.
+
+[NOTE]
+====
+_rd'/rs1'_ and _rs2'_ are from the standard 8-register set x8-x15.
+====
+
+Prerequisites:
+
+M or Zmmul must be configured.
+//
+//32-bit equivalent:
+//
+//<<insns-mul>>
+
+[NOTE]
+====
+The SAIL module variable for _rd'/rs1'_ is called _rsdc_, and for _rs2'_ is called _rs2c_.
+====
+
+Operation:
+
+[source,sail]
+----
+let result_wide = to_bits(2 * sizeof(xlen), signed(X(rsdc)) * signed(X(rs2c)));
+X(rsdc) = result_wide[(sizeof(xlen) - 1) .. 0];
+----
+
+<<<
+
+[#insns-pushpop,reftext="PUSH/POP Register Instructions"]
+=== PUSH/POP register instructions
+
+These instructions are collectively referred to as PUSH/POP:
+
+* <<#insns-cm_push>>
+* <<#insns-cm_pop>>
+* <<#insns-cm_popret>>
+* <<#insns-cm_popretz>>
+
+The term PUSH refers to _cm.push_.
+
+The term POP refers to _cm.pop_.
+
+The term POPRET refers to _cm.popret and cm.popretz_.
+
+Common details for these instructions are in this section.
+
+==== PUSH/POP functional overview
+
+PUSH, POP, POPRET are used to reduce the size of function prologues and epilogues.
+
+. The PUSH instruction
+** adjusts the stack pointer to create the stack frame
+** pushes (stores) the registers specified in the register list to the stack frame
+
+. The POP instruction
+** pops (loads) the registers in the register list from the stack frame
+** adjusts the stack pointer to destroy the stack frame
+
+. The POPRET instructions
+** pop (load) the registers in the register list from the stack frame
+** _cm.popretz_ also moves zero into _a0_ as the return value
+** adjust the stack pointer to destroy the stack frame
+** execute a _ret_ instruction to return from the function
+
+<<<
+==== Example usage
+
+This example gives an illustration of the use of PUSH and POPRET.
+
+The function _processMarkers_ in the EMBench benchmark picojpeg in the following file on github: https://github.com/embench/embench-iot/blob/master/src/picojpeg/libpicojpeg.c[libpicojpeg.c]
+
+The prologue and epilogue compile with GCC10 to:
+
+[source,SAIL]
+----
+
+ 0001098a <processMarkers>:
+ 1098a: 711d addi sp,sp,-96 ;#cm.push(1)
+ 1098c: c8ca sw s2,80(sp) ;#cm.push(2)
+ 1098e: c6ce sw s3,76(sp) ;#cm.push(3)
+ 10990: c4d2 sw s4,72(sp) ;#cm.push(4)
+ 10992: ce86 sw ra,92(sp) ;#cm.push(5)
+ 10994: cca2 sw s0,88(sp) ;#cm.push(6)
+ 10996: caa6 sw s1,84(sp) ;#cm.push(7)
+ 10998: c2d6 sw s5,68(sp) ;#cm.push(8)
+ 1099a: c0da sw s6,64(sp) ;#cm.push(9)
+ 1099c: de5e sw s7,60(sp) ;#cm.push(10)
+ 1099e: dc62 sw s8,56(sp) ;#cm.push(11)
+ 109a0: da66 sw s9,52(sp) ;#cm.push(12)
+ 109a2: d86a sw s10,48(sp);#cm.push(13)
+ 109a4: d66e sw s11,44(sp);#cm.push(14)
+...
+ 109f4: 4501 li a0,0 ;#cm.popretz(1)
+ 109f6: 40f6 lw ra,92(sp) ;#cm.popretz(2)
+ 109f8: 4466 lw s0,88(sp) ;#cm.popretz(3)
+ 109fa: 44d6 lw s1,84(sp) ;#cm.popretz(4)
+ 109fc: 4946 lw s2,80(sp) ;#cm.popretz(5)
+ 109fe: 49b6 lw s3,76(sp) ;#cm.popretz(6)
+ 10a00: 4a26 lw s4,72(sp) ;#cm.popretz(7)
+ 10a02: 4a96 lw s5,68(sp) ;#cm.popretz(8)
+ 10a04: 4b06 lw s6,64(sp) ;#cm.popretz(9)
+ 10a06: 5bf2 lw s7,60(sp) ;#cm.popretz(10)
+ 10a08: 5c62 lw s8,56(sp) ;#cm.popretz(11)
+ 10a0a: 5cd2 lw s9,52(sp) ;#cm.popretz(12)
+ 10a0c: 5d42 lw s10,48(sp);#cm.popretz(13)
+ 10a0e: 5db2 lw s11,44(sp);#cm.popretz(14)
+ 10a10: 6125 addi sp,sp,96 ;#cm.popretz(15)
+ 10a12: 8082 ret ;#cm.popretz(16)
+----
+
+<<<
+
+with the GCC option _-msave-restore_ the output is the following:
+
+[source,SAIL]
+----
+0001080e <processMarkers>:
+ 1080e: 73a012ef jal t0,11f48 <__riscv_save_12>
+ 10812: 1101 addi sp,sp,-32
+...
+ 10862: 4501 li a0,0
+ 10864: 6105 addi sp,sp,32
+ 10866: 71e0106f j 11f84 <__riscv_restore_12>
+----
+
+with PUSH/POPRET this reduces to
+
+[source,SAIL]
+----
+0001080e <processMarkers>:
+ 1080e: b8fa cm.push {ra,s0-s11},-96
+...
+ 10866: bcfa cm.popretz {ra,s0-s11}, 96
+----
+
+The prologue and epilogue reduce from 60 bytes in the original code, to 14 bytes with _-msave-restore_,
+and to 4 bytes with PUSH and POPRET.
+As well as reducing the code size, PUSH and POPRET eliminate the branches from
+calling the millicode _save/restore_ routines and so may also perform better.
+
+[NOTE]
+====
+The calls to _<riscv_save_0>/<riscv_restore_0>_ become 64-bit when the target functions are out of the ±1MB range, increasing the prologue/epilogue size to 22 bytes.
+====
+
+[NOTE]
+====
+POP is typically used in tail-calling sequences where _ret_ is not used to return to _ra_ after destroying the stack frame.
+====
+
+[#pushpop-areg-list]
+
+===== Stack pointer adjustment handling
+
+The instructions all automatically adjust the stack pointer by enough to cover the memory required for the registers being saved or restored.
+Additionally, the _spimm_ field in the encoding allows the stack pointer to be adjusted in further increments of 16 bytes. Only a small
+range is available in the encoding; if it is insufficient then a separate _c.addi16sp_ can be used to increase the range, as shown below.
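+
+For example (non-normative), a frame larger than the available range
+can be created by pairing the push with a separate adjustment:
+
+[source,sail]
+----
+cm.push {ra, s0-s1}, -64  # largest stack_adj available for this list on RV32
+addi sp, sp, -160         # c.addi16sp extends the frame by another 160 bytes
+----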
+
+===== Register list handling
+
+There is no support for the _{ra, s0-s10}_ register list without also adding _s11_. Therefore the _{ra, s0-s11}_ register list must be used in this case.
+
+[#pushpop-idempotent-memory]
+==== PUSH/POP Fault handling
+
+Correct execution requires that _sp_ refers to idempotent memory (also see <<pushpop_non-idem-mem>>), because the core must be able to
+handle traps detected during the sequence.
+The entire PUSH/POP sequence is re-executed after returning from the trap handler, and multiple traps are possible during the sequence.
+
+If a trap occurs during the sequence then _xEPC_ is updated with the PC of the instruction, _xTVAL_ (if not read-only-zero) is updated with the bad address if it was an access fault, and _xCAUSE_ is updated with the type of trap.
+
+NOTE: It is implementation defined whether interrupts can also be taken during the sequence execution.
+
+[#pushpop-software-view]
+==== Software view of execution
+
+===== Software view of the PUSH sequence
+
+From a software perspective the PUSH sequence appears as:
+
+* A sequence of stores writing the bytes required by the pseudo-code
+** The bytes may be written in any order.
+** The bytes may be grouped into larger accesses.
+** Any of the bytes may be written multiple times.
+* A stack pointer adjustment
+
+[NOTE]
+====
+If an implementation allows interrupts during the sequence, and the interrupt handler uses _sp_ to allocate stack memory, then any stores which were executed before the interrupt may be overwritten by the handler. This is safe because the memory is idempotent and the stores will be re-executed when execution resumes.
+====
+
+The stack pointer adjustment must be committed only when it is certain that the entire PUSH instruction will commit.
+
+Stores may also return imprecise faults from the bus.
+It is platform-defined whether the core implementation waits for the bus responses before continuing to the final stage of the sequence,
+or handles error responses after completing the PUSH instruction.
+
+<<<
+
+For example:
+
+[source,sail]
+----
+cm.push {ra, s0-s5}, -64
+----
+
+Appears to software as:
+
+[source,sail]
+----
+# any bytes from sp-1 to sp-28 may be written multiple times before
+# the instruction completes, therefore these updates may be visible in
+# the interrupt/exception handler below the stack pointer
+sw s5, -4(sp)
+sw s4, -8(sp)
+sw s3,-12(sp)
+sw s2,-16(sp)
+sw s1,-20(sp)
+sw s0,-24(sp)
+sw ra,-28(sp)
+
+# this must only execute once, and will only execute after all stores
+# completed without any precise faults, therefore this update is only
+# visible in the interrupt/exception handler if cm.push has completed
+addi sp, sp, -64
+----
+
+===== Software view of the POP/POPRET sequence
+
+From a software perspective the POP/POPRET sequence appears as:
+
+* A sequence of loads reading the bytes required by the pseudo-code.
+** The bytes may be loaded in any order.
+** The bytes may be grouped into larger accesses.
+** Any of the bytes may be loaded multiple times.
+* A stack pointer adjustment
+* An optional `li a0, 0`
+* An optional `ret`
+
+If a trap occurs during the sequence, then any loads which were executed before the trap may update architectural state.
+The loads will be re-executed once the trap handler completes, so the values will be overwritten.
+Therefore it is permitted for an implementation to update some of the destination registers before taking a fault.
+
+The optional `li a0, 0`, stack pointer adjustment and optional `ret` must be committed only when it is certain that the entire POP/POPRET instruction will commit.
+
+For POPRET once the stack pointer adjustment has been committed the `ret` must execute.
+
+<<<
+For example:
+
+[source,sail]
+----
+cm.popretz {ra, s0-s3}, 32;
+----
+
+Appears to software as:
+
+[source,sail]
+----
+# any or all of these load instructions may execute multiple times
+# therefore these updates may be visible in the interrupt/exception handler
+lw s3, 28(sp)
+lw s2, 24(sp)
+lw s1, 20(sp)
+lw s0, 16(sp)
+lw ra, 12(sp)
+
+# these must only execute once, and will only execute after all loads
+# complete successfully; all instructions must execute atomically,
+# therefore these updates are not visible in the interrupt/exception handler
+li a0, 0
+addi sp, sp, 32
+ret
+----
+
+[[pushpop_non-idem-mem,Non-idempotent memory handling]]
+==== Non-idempotent memory handling
+
+An implementation may have a requirement to issue a PUSH/POP instruction to non-idempotent memory.
+
+If the core implementation does not support PUSH/POP to non-idempotent memories, the core may use an idempotency PMA to detect it and take a
+load (POP/POPRET) or store (PUSH) access fault exception in order to avoid unpredictable results.
+
+Software should only use these instructions on non-idempotent memory regions when software can tolerate the required memory accesses
+being issued repeatedly in the case that they cause exceptions.
+
+<<<
+
+==== Example RV32I PUSH/POP sequences
+
+The examples included here show the load/store series expansion and the stack adjustment.
+Examples of _cm.popret_ and _cm.popretz_ are not included, as the difference in the expanded sequence from _cm.pop_ is trivial in all cases.
+
+===== cm.push {ra, s0-s2}, -64
+
+Encoding: _rlist_=7, _spimm_=3
+
+expands to:
+
+[source,sail]
+----
+sw s2, -4(sp);
+sw s1, -8(sp);
+sw s0, -12(sp);
+sw ra, -16(sp);
+addi sp, sp, -64;
+----
+
+===== cm.push {ra, s0-s11}, -112
+
+Encoding: _rlist_=15, _spimm_=3
+
+expands to:
+
+[source,sail]
+----
+sw s11, -4(sp);
+sw s10, -8(sp);
+sw s9, -12(sp);
+sw s8, -16(sp);
+sw s7, -20(sp);
+sw s6, -24(sp);
+sw s5, -28(sp);
+sw s4, -32(sp);
+sw s3, -36(sp);
+sw s2, -40(sp);
+sw s1, -44(sp);
+sw s0, -48(sp);
+sw ra, -52(sp);
+addi sp, sp, -112;
+----
+
+<<<
+
+===== cm.pop {ra}, 16
+
+Encoding: _rlist_=4, _spimm_=0
+
+expands to:
+
+[source,sail]
+----
+lw ra, 12(sp);
+addi sp, sp, 16;
+----
+
+===== cm.pop {ra, s0-s3}, 48
+
+Encoding: _rlist_=8, _spimm_=1
+
+expands to:
+
+[source,sail]
+----
+lw s3, 44(sp);
+lw s2, 40(sp);
+lw s1, 36(sp);
+lw s0, 32(sp);
+lw ra, 28(sp);
+addi sp, sp, 48;
+----
+
+===== cm.pop {ra, s0-s4}, 64
+
+Encoding: _rlist_=9, _spimm_=2
+
+expands to:
+
+[source,sail]
+----
+lw s4, 60(sp);
+lw s3, 56(sp);
+lw s2, 52(sp);
+lw s1, 48(sp);
+lw s0, 44(sp);
+lw ra, 40(sp);
+addi sp, sp, 64;
+----
+
+
+<<<
+[#insns-cm_push,reftext="cm.push"]
+==== cm.push
+
+Synopsis:
+
+Create stack frame: store ra and 0 to 12 saved registers to the stack frame, optionally allocate additional stack space.
+
+Mnemonic:
+
+cm.push _{reg_list}, -stack_adj_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 2, name: 'spimm\[5:4\]', attr: [] },
+ { bits: 4, name: 'rlist', attr: [] },
+ { bits: 5, name: 0x18, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+_rlist_ values 0 to 3 are reserved for a future EABI variant called _cm.push.e_
+====
+
+Assembly Syntax:
+
+[source,sail]
+--
+cm.push {reg_list}, -stack_adj
+cm.push {xreg_list}, -stack_adj
+--
+
+The variables used in the assembly syntax are defined below.
+
+[source,sail]
+----
+RV32E:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32I, RV64:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ case 7: {reg_list="ra, s0-s2"; xreg_list="x1, x8-x9, x18";}
+ case 8: {reg_list="ra, s0-s3"; xreg_list="x1, x8-x9, x18-x19";}
+ case 9: {reg_list="ra, s0-s4"; xreg_list="x1, x8-x9, x18-x20";}
+ case 10: {reg_list="ra, s0-s5"; xreg_list="x1, x8-x9, x18-x21";}
+ case 11: {reg_list="ra, s0-s6"; xreg_list="x1, x8-x9, x18-x22";}
+ case 12: {reg_list="ra, s0-s7"; xreg_list="x1, x8-x9, x18-x23";}
+ case 13: {reg_list="ra, s0-s8"; xreg_list="x1, x8-x9, x18-x24";}
+ case 14: {reg_list="ra, s0-s9"; xreg_list="x1, x8-x9, x18-x25";}
+ //note - to include s10, s11 must also be included
+ case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32E:
+
+stack_adj_base = 16;
+Valid values:
+stack_adj = [16|32|48|64];
+----
+
+[source,sail]
+----
+RV32I:
+
+switch (rlist) {
+ case 4.. 7: stack_adj_base = 16;
+ case 8..11: stack_adj_base = 32;
+ case 12..14: stack_adj_base = 48;
+ case 15: stack_adj_base = 64;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 7: stack_adj = [16|32|48| 64];
+ case 8..11: stack_adj = [32|48|64| 80];
+ case 12..14: stack_adj = [48|64|80| 96];
+ case 15: stack_adj = [64|80|96|112];
+}
+----
+
+[source,sail]
+----
+RV64:
+
+switch (rlist) {
+ case 4.. 5: stack_adj_base = 16;
+ case 6.. 7: stack_adj_base = 32;
+ case 8.. 9: stack_adj_base = 48;
+ case 10..11: stack_adj_base = 64;
+ case 12..13: stack_adj_base = 80;
+ case 14: stack_adj_base = 96;
+ case 15: stack_adj_base = 112;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 5: stack_adj = [ 16| 32| 48| 64];
+ case 6.. 7: stack_adj = [ 32| 48| 64| 80];
+ case 8.. 9: stack_adj = [ 48| 64| 80| 96];
+ case 10..11: stack_adj = [ 64| 80| 96|112];
+ case 12..13: stack_adj = [ 80| 96|112|128];
+ case 14: stack_adj = [ 96|112|128|144];
+ case 15: stack_adj = [112|128|144|160];
+}
+----
+
+<<<
+Description:
+
+This instruction pushes (stores) the registers in _reg_list_ to the memory below the stack pointer,
+and then creates the stack frame by decrementing the stack pointer by _stack_adj_,
+including any additional stack space requested by the value of _spimm_.
+
+
+[NOTE]
+====
+All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.
+====
+
+For further information see <<insns-pushpop>>.
+
+Stack Adjustment Calculation:
+
+_stack_adj_base_ is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.
+
+_spimm_ is the number of additional 16-byte address increments allocated for the stack frame.
+
+The total stack adjustment represents the total size of the stack frame, which is _stack_adj_base_ added to _spimm_ scaled by 16,
+as defined above.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists
+
+Operation:
+
+The first section of pseudo-code may be executed multiple times before the instruction successfully completes.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+if (XLEN==32) bytes=4; else bytes=8;
+
+addr=sp-bytes;
+for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1) {
+ //if register i is in xreg_list
+ if (xreg_list[i]) {
+ switch(bytes) {
+ 4: asm("sw x[i], 0(addr)");
+ 8: asm("sd x[i], 0(addr)");
+ }
+ addr-=bytes;
+ }
+}
+----
+
+The final section of pseudo-code executes atomically, and only executes if the section above completes without any exceptions or interrupts.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+sp-=stack_adj;
+----
+
+<<<
+[#insns-cm_pop,reftext="cm.pop"]
+==== cm.pop
+
+Synopsis:
+
+Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame.
+
+Mnemonic:
+
+cm.pop _{reg_list}, stack_adj_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 2, name: 'spimm\[5:4\]', attr: [] },
+ { bits: 4, name: 'rlist', attr: [] },
+ { bits: 5, name: 0x1a, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+_rlist_ values 0 to 3 are reserved for a future EABI variant called _cm.pop.e_
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.pop {reg_list}, stack_adj
+cm.pop {xreg_list}, stack_adj
+----
+
+The variables used in the assembly syntax are defined below.
+
+[source,sail]
+----
+RV32E:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32I, RV64:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ case 7: {reg_list="ra, s0-s2"; xreg_list="x1, x8-x9, x18";}
+ case 8: {reg_list="ra, s0-s3"; xreg_list="x1, x8-x9, x18-x19";}
+ case 9: {reg_list="ra, s0-s4"; xreg_list="x1, x8-x9, x18-x20";}
+ case 10: {reg_list="ra, s0-s5"; xreg_list="x1, x8-x9, x18-x21";}
+ case 11: {reg_list="ra, s0-s6"; xreg_list="x1, x8-x9, x18-x22";}
+ case 12: {reg_list="ra, s0-s7"; xreg_list="x1, x8-x9, x18-x23";}
+ case 13: {reg_list="ra, s0-s8"; xreg_list="x1, x8-x9, x18-x24";}
+ case 14: {reg_list="ra, s0-s9"; xreg_list="x1, x8-x9, x18-x25";}
+ //note - to include s10, s11 must also be included
+ case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32E:
+
+stack_adj_base = 16;
+Valid values:
+stack_adj = [16|32|48|64];
+----
+
+[source,sail]
+----
+RV32I:
+
+switch (rlist) {
+ case 4.. 7: stack_adj_base = 16;
+ case 8..11: stack_adj_base = 32;
+ case 12..14: stack_adj_base = 48;
+ case 15: stack_adj_base = 64;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 7: stack_adj = [16|32|48| 64];
+ case 8..11: stack_adj = [32|48|64| 80];
+ case 12..14: stack_adj = [48|64|80| 96];
+ case 15: stack_adj = [64|80|96|112];
+}
+----
+
+[source,sail]
+----
+RV64:
+
+switch (rlist) {
+ case 4.. 5: stack_adj_base = 16;
+ case 6.. 7: stack_adj_base = 32;
+ case 8.. 9: stack_adj_base = 48;
+ case 10..11: stack_adj_base = 64;
+ case 12..13: stack_adj_base = 80;
+ case 14: stack_adj_base = 96;
+ case 15: stack_adj_base = 112;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 5: stack_adj = [ 16| 32| 48| 64];
+ case 6.. 7: stack_adj = [ 32| 48| 64| 80];
+ case 8.. 9: stack_adj = [ 48| 64| 80| 96];
+ case 10..11: stack_adj = [ 64| 80| 96|112];
+ case 12..13: stack_adj = [ 80| 96|112|128];
+ case 14: stack_adj = [ 96|112|128|144];
+ case 15: stack_adj = [112|128|144|160];
+}
+----
+
+<<<
+
+Description:
+
+This instruction pops (loads) the registers in _reg_list_ from stack memory,
+and then adjusts the stack pointer by _stack_adj_.
+
+[NOTE]
+====
+All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.
+====
+
+For further information see <<insns-pushpop>>.
+
+Stack Adjustment Calculation:
+
+_stack_adj_base_ is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.
+
+_spimm_ is the number of additional 16-byte address increments allocated for the stack frame.
+
+The total stack adjustment represents the total size of the stack frame, which is _stack_adj_base_ added to _spimm_ scaled by 16,
+as defined above.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+Operation:
+
+The first section of pseudo-code may be executed multiple times before the instruction successfully completes.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+if (XLEN==32) bytes=4; else bytes=8;
+
+addr=sp+stack_adj-bytes;
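+//registers are loaded from the top of the stack frame downwards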
+for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1) {
+ //if register i is in xreg_list
+ if (xreg_list[i]) {
+ switch(bytes) {
+ 4: asm("lw x[i], 0(addr)");
+ 8: asm("ld x[i], 0(addr)");
+ }
+ addr-=bytes;
+ }
+}
+----
+
+The final section of pseudo-code executes atomically, and only executes if the section above completes without any exceptions or interrupts.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+sp+=stack_adj;
+----
+
+<<<
+[#insns-cm_popretz,reftext="cm.popretz"]
+==== cm.popretz
+
+Synopsis:
+
+Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, move zero into a0, return to ra.
+
+Mnemonic:
+
+cm.popretz _{reg_list}, stack_adj_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 2, name: 'spimm\[5:4\]', attr: [] },
+ { bits: 4, name: 'rlist', attr: [] },
+ { bits: 5, name: 0x1c, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+_rlist_ values 0 to 3 are reserved for a future EABI variant called _cm.popretz.e_.
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.popretz {reg_list}, stack_adj
+cm.popretz {xreg_list}, stack_adj
+----
+
+The variables used in the assembly syntax are defined below.
+
+[source,sail]
+----
+RV32E:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32I, RV64:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ case 7: {reg_list="ra, s0-s2"; xreg_list="x1, x8-x9, x18";}
+ case 8: {reg_list="ra, s0-s3"; xreg_list="x1, x8-x9, x18-x19";}
+ case 9: {reg_list="ra, s0-s4"; xreg_list="x1, x8-x9, x18-x20";}
+ case 10: {reg_list="ra, s0-s5"; xreg_list="x1, x8-x9, x18-x21";}
+ case 11: {reg_list="ra, s0-s6"; xreg_list="x1, x8-x9, x18-x22";}
+ case 12: {reg_list="ra, s0-s7"; xreg_list="x1, x8-x9, x18-x23";}
+ case 13: {reg_list="ra, s0-s8"; xreg_list="x1, x8-x9, x18-x24";}
+ case 14: {reg_list="ra, s0-s9"; xreg_list="x1, x8-x9, x18-x25";}
+ //note - to include s10, s11 must also be included
+ case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32E:
+
+stack_adj_base = 16;
+Valid values:
+stack_adj = [16|32|48|64];
+----
+
+[source,sail]
+----
+RV32I:
+
+switch (rlist) {
+ case 4.. 7: stack_adj_base = 16;
+ case 8..11: stack_adj_base = 32;
+ case 12..14: stack_adj_base = 48;
+ case 15: stack_adj_base = 64;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 7: stack_adj = [16|32|48| 64];
+ case 8..11: stack_adj = [32|48|64| 80];
+ case 12..14: stack_adj = [48|64|80| 96];
+ case 15: stack_adj = [64|80|96|112];
+}
+----
+
+[source,sail]
+----
+RV64:
+
+switch (rlist) {
+ case 4.. 5: stack_adj_base = 16;
+ case 6.. 7: stack_adj_base = 32;
+ case 8.. 9: stack_adj_base = 48;
+ case 10..11: stack_adj_base = 64;
+ case 12..13: stack_adj_base = 80;
+ case 14: stack_adj_base = 96;
+ case 15: stack_adj_base = 112;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 5: stack_adj = [ 16| 32| 48| 64];
+ case 6.. 7: stack_adj = [ 32| 48| 64| 80];
+ case 8.. 9: stack_adj = [ 48| 64| 80| 96];
+ case 10..11: stack_adj = [ 64| 80| 96|112];
+ case 12..13: stack_adj = [ 80| 96|112|128];
+ case 14: stack_adj = [ 96|112|128|144];
+ case 15: stack_adj = [112|128|144|160];
+}
+----
+
+<<<
+
+Description:
+
+This instruction pops (loads) the registers in _reg_list_ from stack memory, adjusts the stack pointer by _stack_adj_, moves zero into _a0_, and then returns to _ra_.
+
+[NOTE]
+====
+All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.
+====
+
+For further information see <<insns-pushpop>>.
+
+Stack Adjustment Calculation:
+
+_stack_adj_base_ is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.
+
+_spimm_ is the number of additional 16-byte address increments allocated for the stack frame.
+
+The total stack adjustment represents the total size of the stack frame, which is _stack_adj_base_ added to _spimm_ scaled by 16, as defined above.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+
+Operation:
+
+The first section of pseudo-code may be executed multiple times before the instruction successfully completes.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+if (XLEN==32) bytes=4; else bytes=8;
+
+addr=sp+stack_adj-bytes;
+for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1) {
+ //if register i is in xreg_list
+ if (xreg_list[i]) {
+ switch(bytes) {
+ 4: asm("lw x[i], 0(addr)");
+ 8: asm("ld x[i], 0(addr)");
+ }
+ addr-=bytes;
+ }
+}
+----
+
+The final section of pseudo-code executes atomically, and only executes if the section above completes without any exceptions or interrupts.
+
+[NOTE]
+====
+The _li a0, 0_ *could* be executed more than once, but is included in the atomic section for convenience.
+====
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+asm("li a0, 0");
+sp+=stack_adj;
+asm("ret");
+----
+
+<<<
+[#insns-cm_popret,reftext="cm.popret"]
+==== cm.popret
+
+Synopsis:
+
+Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, return to ra.
+
+Mnemonic:
+
+cm.popret _{reg_list}, stack_adj_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 2, name: 'spimm\[5:4\]', attr: [] },
+ { bits: 4, name: 'rlist', attr: [] },
+ { bits: 5, name: 0x1e, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+_rlist_ values 0 to 3 are reserved for a future EABI variant called _cm.popret.e_.
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.popret {reg_list}, stack_adj
+cm.popret {xreg_list}, stack_adj
+----
+
+The variables used in the assembly syntax are defined below.
+
+[source,sail]
+----
+RV32E:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32I, RV64:
+
+switch (rlist){
+ case 4: {reg_list="ra"; xreg_list="x1";}
+ case 5: {reg_list="ra, s0"; xreg_list="x1, x8";}
+ case 6: {reg_list="ra, s0-s1"; xreg_list="x1, x8-x9";}
+ case 7: {reg_list="ra, s0-s2"; xreg_list="x1, x8-x9, x18";}
+ case 8: {reg_list="ra, s0-s3"; xreg_list="x1, x8-x9, x18-x19";}
+ case 9: {reg_list="ra, s0-s4"; xreg_list="x1, x8-x9, x18-x20";}
+ case 10: {reg_list="ra, s0-s5"; xreg_list="x1, x8-x9, x18-x21";}
+ case 11: {reg_list="ra, s0-s6"; xreg_list="x1, x8-x9, x18-x22";}
+ case 12: {reg_list="ra, s0-s7"; xreg_list="x1, x8-x9, x18-x23";}
+ case 13: {reg_list="ra, s0-s8"; xreg_list="x1, x8-x9, x18-x24";}
+ case 14: {reg_list="ra, s0-s9"; xreg_list="x1, x8-x9, x18-x25";}
+ //note - to include s10, s11 must also be included
+ case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
+ default: reserved();
+}
+stack_adj = stack_adj_base + spimm[5:4] * 16;
+----
+
+[source,sail]
+----
+RV32E:
+
+stack_adj_base = 16;
+Valid values:
+stack_adj = [16|32|48|64];
+----
+
+[source,sail]
+----
+RV32I:
+
+switch (rlist) {
+ case 4.. 7: stack_adj_base = 16;
+ case 8..11: stack_adj_base = 32;
+ case 12..14: stack_adj_base = 48;
+ case 15: stack_adj_base = 64;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 7: stack_adj = [16|32|48| 64];
+ case 8..11: stack_adj = [32|48|64| 80];
+ case 12..14: stack_adj = [48|64|80| 96];
+ case 15: stack_adj = [64|80|96|112];
+}
+----
+
+[source,sail]
+----
+RV64:
+
+switch (rlist) {
+ case 4.. 5: stack_adj_base = 16;
+ case 6.. 7: stack_adj_base = 32;
+ case 8.. 9: stack_adj_base = 48;
+ case 10..11: stack_adj_base = 64;
+ case 12..13: stack_adj_base = 80;
+ case 14: stack_adj_base = 96;
+ case 15: stack_adj_base = 112;
+}
+
+Valid values:
+switch (rlist) {
+ case 4.. 5: stack_adj = [ 16| 32| 48| 64];
+ case 6.. 7: stack_adj = [ 32| 48| 64| 80];
+ case 8.. 9: stack_adj = [ 48| 64| 80| 96];
+ case 10..11: stack_adj = [ 64| 80| 96|112];
+ case 12..13: stack_adj = [ 80| 96|112|128];
+ case 14: stack_adj = [ 96|112|128|144];
+ case 15: stack_adj = [112|128|144|160];
+}
+----
+
+<<<
+
+Description:
+
+This instruction pops (loads) the registers in _reg_list_ from stack memory, adjusts the stack pointer by _stack_adj_ and then returns to _ra_.
+
+[NOTE]
+====
+All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.
+====
+
+For further information see <<insns-pushpop>>.
+
+Stack Adjustment Calculation:
+
+_stack_adj_base_ is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.
+
+_spimm_ is the number of additional 16-byte address increments allocated for the stack frame.
+
+The total stack adjustment represents the total size of the stack frame, which is _stack_adj_base_ added to _spimm_ scaled by 16, as defined above.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+Operation:
+
+The first section of pseudo-code may be executed multiple times before the instruction successfully completes.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+if (XLEN==32) bytes=4; else bytes=8;
+
+addr=sp+stack_adj-bytes;
+for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1) {
+ //if register i is in xreg_list
+ if (xreg_list[i]) {
+ switch(bytes) {
+ 4: asm("lw x[i], 0(addr)");
+ 8: asm("ld x[i], 0(addr)");
+ }
+ addr-=bytes;
+ }
+}
+----
+
+The final section of pseudo-code executes atomically, and only executes if the section above completes without any exceptions or interrupts.
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+sp+=stack_adj;
+asm("ret");
+----
+
+<<<
+
+[#insns-cm_mvsa01,reftext="Move a0-a1 into two different s0-s7 registers"]
+==== cm.mvsa01
+
+Synopsis:
+
+Move a0-a1 into two registers of s0-s7
+
+Mnemonic:
+
+cm.mvsa01 _r1s'_, _r2s'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 3, name: 'r2s\'', attr: [] },
+ { bits: 2, name: 0x1, attr: [] },
+ { bits: 3, name: 'r1s\'', attr: [] },
+ { bits: 3, name: 0x3, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+For the encoding to be legal, _r1s'_ != _r2s'_.
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.mvsa01 r1s', r2s'
+----
+
+Description:
+
+This instruction moves _a0_ into _r1s'_ and _a1_ into _r2s'_. _r1s'_ and _r2s'_ must be different.
+The execution is atomic, so it is not possible to observe state where only one of _r1s'_ or _r2s'_ has been updated.
+
+The encoding uses _sreg_ number specifiers instead of _xreg_ number specifiers to save encoding space.
+The mapping between them is specified in the pseudo-code below.
+
+[NOTE]
+====
+The _s_ register mapping is taken from the UABI, and may not match the currently unratified EABI. _cm.mvsa01.e_ may be included in the future.
+====
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+if (RV32E && (r1sc>1 || r2sc>1)) {
+ reserved();
+}
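+//map sreg specifiers to xreg numbers: sreg 0-1 -> x8-x9, sreg 2-7 -> x18-x23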
+xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
+xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
+X[xreg1] = X[10];
+X[xreg2] = X[11];
+----
+
+<<<
+
+[#insns-cm_mva01s,reftext="Move two s0-s7 registers into a0-a1"]
+==== cm.mva01s
+
+Synopsis:
+
+Move two s0-s7 registers into a0-a1
+
+Mnemonic:
+
+cm.mva01s _r1s'_, _r2s'_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 3, name: 'r2s\'', attr: [] },
+ { bits: 2, name: 0x3, attr: [] },
+ { bits: 3, name: 'r1s\'', attr: [] },
+ { bits: 3, name: 0x3, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.mva01s r1s', r2s'
+----
+
+Description:
+
+This instruction moves _r1s'_ into _a0_ and _r2s'_ into _a1_.
+The execution is atomic, so it is not possible to observe state where only one of _a0_ or _a1_ has been updated.
+
+The encoding uses _sreg_ number specifiers instead of _xreg_ number specifiers to save encoding space.
+The mapping between them is specified in the pseudo-code below.
+
+[NOTE]
+====
+The _s_ register mapping is taken from the UABI, and may not match the currently unratified EABI. _cm.mva01s.e_ may be included in the future.
+====
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+if (RV32E && (r1sc>1 || r2sc>1)) {
+ reserved();
+}
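+//map sreg specifiers to xreg numbers: sreg 0-1 -> x8-x9, sreg 2-7 -> x18-x23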
+xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
+xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
+X[10] = X[xreg1];
+X[11] = X[xreg2];
+----
+
+<<<
+
+[#insns-tablejump,reftext="Table Jump Overview"]
+=== Table Jump Overview
+
+_cm.jt_ (<<#insns-cm_jt>>) and _cm.jalt_ (<<#insns-cm_jalt>>) are referred to as table jump.
+
+Table jump uses a 256-entry, XLEN-wide table in instruction memory to hold function addresses.
+The table must be at least 64-byte aligned.
+
+Table entries follow the current data endianness. This is different from normal instruction fetch, which is always little-endian.
+
+_cm.jt_ and _cm.jalt_ encodings index the table, giving access to functions within the full XLEN wide address space.
+
+This is used as a form of dictionary compression to reduce the code size of _jal_ / _auipc+jalr_ / _jr_ / _auipc+jr_ instructions.
+
+Table jump allows the linker to replace the following instruction sequences with a _cm.jt_ or _cm.jalt_ encoding, and an entry in the table:
+
+* 32-bit _j_ calls
+* 32-bit _jal_ ra calls
+* 64-bit _auipc+jr_ calls to fixed locations
+* 64-bit _auipc+jalr ra_ calls to fixed locations
+** The _auipc+jr/jalr_ sequence is used because the offset from the PC is out of the ±1MB range.
+
+If a return address stack is implemented, then as _cm.jalt_ is equivalent to _jal ra_, it pushes to the stack.
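+
+As an illustrative sketch (the function name and table index are
+hypothetical), the linker might replace an out-of-range RV64 call as follows:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code.
+
+//before: 8 bytes at the call site
+// auipc t1, offset_hi_to_func
+// jalr ra, offset_lo_to_func(t1)
+
+//after: 2 bytes at the call site plus one XLEN-wide table entry;
+//entry 32 is chosen because cm.jalt requires index >= 32
+//InstMemory[JVT.base + (32<<3)] = address_of_func;
+// cm.jalt 32
+----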
+
+==== JVT
+
+The base of the table is in the JVT CSR (see <<csrs-jvt>>); each table entry is XLEN bits.
+
+If the same function is called both with and without linking, then it must have two entries in the table.
+This is typically caused by the same function being called with and without tail calling.
+
+[#tablejump-fault-handling]
+==== Table Jump Fault handling
+
+For a table jump instruction, the table entry that the instruction selects is considered an extension of the instruction itself.
+Hence, the execution of a table jump instruction involves two instruction fetches, the first to read the instruction (_cm.jt_/_cm.jalt_)
+and the second to read from the jump vector table (JVT). Both instruction fetches are _implicit_ reads, and both require
+execute permission; read permission is irrelevant. It is recommended that the second fetch be ignored for hardware triggers and breakpoints.
+
+Memory writes to the jump vector table require an instruction barrier (_fence.i_) to guarantee that they are visible to the instruction fetch.
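+
+For example (an illustrative sketch; the table index is hypothetical),
+patching one RV64 table entry and making it visible to instruction fetch:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code.
+InstMemory[JVT.base + (40<<3)] = new_target; //ordinary data store
+asm("fence.i"); //required before cm.jt/cm.jalt may use the updated entry
+----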
+
+Multiple contexts may have different jump vector tables. JVT may be switched between them without an instruction barrier
+if the tables have not been updated in memory since the last _fence.i_.
+
+If an exception occurs on either instruction fetch, xEPC is set to the PC of the table jump instruction, xCAUSE is set as expected for the type of fault, and xTVAL (if not set to zero) contains the fetch address that caused the fault.
+
+<<<
+[#csrs-jvt,reftext="JVT CSR, table jump base vector and control register"]
+==== JVT CSR
+
+Synopsis:
+
+Table jump base vector and control register
+
+Address:
+
+0x0017
+
+Permissions:
+
+URW
+
+Format (RV32):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 6, name: 'mode', attr: ['6'] },
+ { bits: 26, name: 'base[XLEN-1:6] (WARL)', attr: ['XLEN-6'] },
+],config:{bits:32}}
+....
+
+Format (RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 6, name: 'mode', attr: ['6'] },
+ { bits: 58, name: 'base[XLEN-1:6] (WARL)', attr: ['XLEN-6'] },
+],config:{bits:64}}
+....
+
+Description:
+
+The _JVT_ register is an XLEN-bit *WARL* read/write register that holds the jump table configuration, consisting of the jump table base address (BASE) and the jump table mode (MODE).
+
+If <<Zcmt>> is implemented, then _JVT_ must also be implemented, but it can contain a read-only value. If _JVT_ is writable, the set of values the register may hold can vary by implementation. The value in the BASE field must always be aligned on a 64-byte boundary.
+
+_JVT.base_ is a virtual address whenever virtual memory is enabled.
+
+The memory pointed to by _JVT.base_ is treated as instruction memory for the purpose of executing table jump instructions, implying execute access permission.
+
+[#JVT-config-table]
+._JVT.mode_ definition
+[width="60%",options=header]
+|=============================================================================================
+| JVT.mode | Comment
+| 000000 | Jump table mode
+| others | *reserved for future standard use*
+|=============================================================================================
+
+_JVT.mode_ is a *WARL* field, so it can only be programmed to modes that are implemented. Therefore, the discovery mechanism is to
+attempt to program different modes and read back the values to see which are available. Jump table mode _must_ be implemented.
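+
+A minimal probe might look like the following (illustrative pseudo-code; the
+csr_read/csr_write helpers are not defined by this specification):
+
+[source,sail]
+----
+//JVT.mode occupies bits [5:0]; as a WARL field, writes of unimplemented
+//modes do not stick
+csr_write(JVT, (csr_read(JVT) & ~0x3F) | candidate_mode);
+available = ((csr_read(JVT) & 0x3F) == candidate_mode);
+----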
+
+[NOTE]
+====
+In the future, the RISC-V Unified Discovery method will report the available modes.
+====
+
+Architectural State:
+
+_JVT_ adds architectural state to the system software context (such as an OS process), and therefore must be saved/restored on context switches.
+
+State Enable:
+
+If the Smstateen extension is implemented, then bit 2 in _mstateen0_, _sstateen0_, and _hstateen0_ is implemented. If bit 2 of a controlling _stateen0_ CSR is zero, then access to the _JVT_ CSR and execution of a _cm.jalt_ or _cm.jt_ instruction by a lower privilege level results in an Illegal Instruction trap (or, if appropriate, a Virtual Instruction trap).
+
+<<<
+[#insns-cm_jt,reftext="Jump via table"]
+==== cm.jt
+
+Synopsis:
+
+Jump via table
+
+Mnemonic:
+
+cm.jt _index_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 8, name: 'index', attr: [] },
+ { bits: 3, name: 0x0, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+For this encoding to decode as _cm.jt_, _index_ must be less than 32; otherwise it decodes as _cm.jalt_, see <<insns-cm_jalt>>.
+====
+
+[NOTE]
+====
+If JVT.mode = 0 (Jump Table Mode) then _cm.jt_ behaves as specified here. If JVT.mode is a reserved value, then _cm.jt_ is also reserved. In the future other defined values of JVT.mode may change the behaviour of _cm.jt_.
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.jt index
+----
+
+Description:
+
+_cm.jt_ reads an entry from the jump vector table in memory and jumps to the address that was read.
+
+For further information see <<insns-tablejump>>.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+<<<
+
+[#insns-cm_jt-SAIL,reftext="cm.jt SAIL code"]
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+// target_address is temporary internal state; it doesn't represent a real register
+// InstMemory is byte indexed
+
+switch(XLEN) {
+ 32: table_address[XLEN-1:0] = JVT.base + (index<<2);
+ 64: table_address[XLEN-1:0] = JVT.base + (index<<3);
+}
+
+//fetch from the jump table
+target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
+
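+//bit 0 of the target address is cleared, as for jalr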
+j target_address[XLEN-1:0]&~0x1;
+
+----
+
+<<<
+[#insns-cm_jalt,reftext="Jump and link via table"]
+==== cm.jalt
+
+Synopsis:
+
+Jump via table with optional link
+
+Mnemonic:
+
+cm.jalt _index_
+
+Encoding (RV32, RV64):
+
+[wavedrom, , svg]
+....
+{reg:[
+ { bits: 2, name: 0x2, attr: ['C2'] },
+ { bits: 8, name: 'index', attr: [] },
+ { bits: 3, name: 0x0, attr: [] },
+ { bits: 3, name: 0x5, attr: ['FUNCT3'] },
+],config:{bits:16}}
+....
+
+[NOTE]
+====
+For this encoding to decode as _cm.jalt_, _index_ must be 32 or greater; otherwise it decodes as _cm.jt_, see <<insns-cm_jt>>.
+====
+
+[NOTE]
+====
+If JVT.mode = 0 (Jump Table Mode) then _cm.jalt_ behaves as specified here. If JVT.mode is a reserved value, then _cm.jalt_ is also reserved. In the future other defined values of JVT.mode may change the behaviour of _cm.jalt_.
+====
+
+Assembly Syntax:
+
+[source,sail]
+----
+cm.jalt index
+----
+
+Description:
+
+_cm.jalt_ reads an entry from the jump vector table in memory and jumps to the address that was read, linking to _ra_.
+
+For further information see <<insns-tablejump>>.
+
+Prerequisites:
+
+None
+
+32-bit equivalent:
+
+No direct equivalent encoding exists.
+
+<<<
+
+[#insns-cm_jalt-SAIL,reftext="cm.jalt SAIL code"]
+Operation:
+
+[source,sail]
+----
+//This is not SAIL, it's pseudo-code. The SAIL hasn't been written yet.
+
+// target_address is temporary internal state; it doesn't represent a real register
+// InstMemory is byte indexed
+
+switch(XLEN) {
+ 32: table_address[XLEN-1:0] = JVT.base + (index<<2);
+ 64: table_address[XLEN-1:0] = JVT.base + (index<<3);
+}
+
+//fetch from the jump table
+target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
+
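+//bit 0 of the target address is cleared, as for jalr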
+jal ra, target_address[XLEN-1:0]&~0x1;
+
+----
+
+
+
diff --git a/src/zfh.adoc b/src/zfh.adoc
index f16514c..9e8710e 100644
--- a/src/zfh.adoc
+++ b/src/zfh.adoc
@@ -91,7 +91,7 @@ floating-point number to a quad-precision floating-point number, or
vice-versa, respectively.
include::images/wavedrom/half-prec-flpt-to-flpt-conv.adoc[]
-[half-prec-flpt-to-flpt-conv]
+[[half-prec-flpt-to-flpt-conv]]
Floating-point to floating-point sign-injection instructions, FSGNJ.H,
FSGNJN.H, and FSGNJX.H are defined analogously to the single-precision
diff --git a/src/zicsr.adoc b/src/zicsr.adoc
index 9648bb7..50183a8 100644
--- a/src/zicsr.adoc
+++ b/src/zicsr.adoc
@@ -38,27 +38,27 @@ of the CSR, zero-extends the value to XLEN bits, and writes it to
integer register _rd_. The initial value in integer register _rs1_ is
treated as a bit mask that specifies bit positions to be set in the CSR.
Any bit that is high in _rs1_ will cause the corresponding bit to be set
-in the CSR, if that CSR bit is writable. Other bits in the CSR are not
-explicitly written.
+in the CSR, if that CSR bit is writable.
The CSRRC (Atomic Read and Clear Bits in CSR) instruction reads the
value of the CSR, zero-extends the value to XLEN bits, and writes it to
integer register _rd_. The initial value in integer register _rs1_ is
treated as a bit mask that specifies bit positions to be cleared in the
CSR. Any bit that is high in _rs1_ will cause the corresponding bit to
-be cleared in the CSR, if that CSR bit is writable. Other bits in the
-CSR are not explicitly written.
+be cleared in the CSR, if that CSR bit is writable.
For both CSRRS and CSRRC, if _rs1_=`x0`, then the instruction will not
write to the CSR at all, and so shall not cause any of the side effects
that might otherwise occur on a CSR write, nor raise illegal-instruction
exceptions on accesses to read-only CSRs. Both CSRRS and CSRRC always
read the addressed CSR and cause any read side effects regardless of
-_rs1_ and _rd_ fields. Note that if _rs1_ specifies a register holding a
-zero value other than `x0`, the instruction will still attempt to write
-the unmodified value back to the CSR and will cause any attendant side
-effects. A CSRRW with _rs1_=`x0` will attempt to write zero to the
-destination CSR.
+_rs1_ and _rd_ fields.
+Note that if _rs1_ specifies a register other than `x0`, and that register
+holds a zero value, the instruction will not action any attendant per-field
+side effects, but will action any side effects caused by writing to the entire
+CSR.
+
+A CSRRW with _rs1_=`x0` will attempt to write zero to the destination CSR.
The CSRRWI, CSRRSI, and CSRRCI variants are similar to CSRRW, CSRRS, and
CSRRC respectively, except they update the CSR using an XLEN-bit value
@@ -105,6 +105,21 @@ CSR
<<csrsideeffects>> summarizes the behavior of the CSR
instructions with respect to whether they read and/or write the CSR.
+In addition to side effects that occur as a consequence of reading or
+writing a CSR, individual fields within a CSR might have side effects
+when written. The CSRRW[I] instructions action side effects for all
+such fields within the written CSR. The CSRRS[I] and CSRRC[I] instructions
+only action side effects for fields for which the _rs1_ or _uimm_ argument
+has at least one bit set corresponding to that field.
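+
+As a hedged illustration (the CSR name is hypothetical, and per the note
+below no standard CSR currently has per-field write side effects): suppose
+CSR _examplecsr_ had a field F in bit 2 with a side effect on write.
+
+[source,sail]
+----
+//Illustrative pseudo-code only, not part of the specification.
+asm("csrrs t0, examplecsr, t1"); //t1 holds 0x4: F's bit is set in the mask,
+                                 //so F's per-field side effect is actioned
+asm("csrrs t0, examplecsr, t2"); //t2 holds 0x0: no per-field side effects,
+                                 //but whole-CSR write side effects still occur
+----
+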
+[NOTE]
+====
+As of this writing, no standard CSRs have side effects on field writes.
+Hence, whether a standard CSR access has any side effects can be determined
+solely from the opcode.
+
+Defining CSRs with side effects on field writes is not recommended.
+====
+
For any event or consequence that occurs due to a CSR having a
particular value, if a write to the CSR gives it that value, the
resulting event or consequence is said to be an _indirect effect_ of the