Diffstat (limited to 'docs')
-rw-r--r--  docs/COLO-FT.txt | 4
-rw-r--r--  docs/about/build-platforms.rst | 37
-rw-r--r--  docs/about/deprecated.rst | 340
-rw-r--r--  docs/about/emulation.rst | 105
-rw-r--r--  docs/about/removed-features.rst | 180
-rw-r--r--  docs/conf.py | 71
-rw-r--r--  docs/devel/atomics.rst | 6
-rw-r--r--  docs/devel/blkdebug.txt | 162
-rw-r--r--  docs/devel/build-environment.rst | 118
-rw-r--r--  docs/devel/build-system.rst | 27
-rw-r--r--  docs/devel/ci-definitions.rst.inc | 121
-rw-r--r--  docs/devel/ci.rst | 14
-rw-r--r--  docs/devel/clocks.rst | 6
-rw-r--r--  docs/devel/code-provenance.rst | 338
-rw-r--r--  docs/devel/codebase.rst | 215
-rw-r--r--  docs/devel/control-flow-integrity.rst | 2
-rw-r--r--  docs/devel/decodetree.rst | 2
-rw-r--r--  docs/devel/ebpf_rss.rst | 2
-rw-r--r--  docs/devel/index-api.rst | 1
-rw-r--r--  docs/devel/index-build.rst | 16
-rw-r--r--  docs/devel/index-internals.rst | 5
-rw-r--r--  docs/devel/index-process.rst | 2
-rw-r--r--  docs/devel/index.rst | 2
-rw-r--r--  docs/devel/kconfig.rst | 16
-rw-r--r--  docs/devel/loads-stores.rst | 2
-rw-r--r--  docs/devel/lockcnt.rst (renamed from docs/devel/lockcnt.txt) | 89
-rw-r--r--  docs/devel/maintainers.rst | 4
-rw-r--r--  docs/devel/memory.rst | 2
-rw-r--r--  docs/devel/migration/CPR.rst | 184
-rw-r--r--  docs/devel/migration/compatibility.rst | 5
-rw-r--r--  docs/devel/migration/features.rst | 1
-rw-r--r--  docs/devel/migration/main.rst | 8
-rw-r--r--  docs/devel/migration/mapped-ram.rst | 4
-rw-r--r--  docs/devel/migration/qatzip-compression.rst | 165
-rw-r--r--  docs/devel/migration/uadk-compression.rst | 4
-rw-r--r--  docs/devel/migration/vfio.rst | 45
-rw-r--r--  docs/devel/multi-thread-tcg.rst | 9
-rw-r--r--  docs/devel/multiple-iothreads.rst | 139
-rw-r--r--  docs/devel/multiple-iothreads.txt | 130
-rw-r--r--  docs/devel/nested-papr.txt | 119
-rw-r--r--  docs/devel/qapi-code-gen.rst | 61
-rw-r--r--  docs/devel/qapi-domain.rst | 716
-rw-r--r--  docs/devel/qom.rst | 8
-rw-r--r--  docs/devel/rcu.rst (renamed from docs/devel/rcu.txt) | 172
-rw-r--r--  docs/devel/replay.rst | 3
-rw-r--r--  docs/devel/reset.rst | 31
-rw-r--r--  docs/devel/rust.rst | 478
-rw-r--r--  docs/devel/style.rst | 20
-rw-r--r--  docs/devel/submitting-a-patch.rst | 66
-rw-r--r--  docs/devel/tcg-ops.rst | 247
-rw-r--r--  docs/devel/tcg-plugins.rst | 13
-rw-r--r--  docs/devel/testing/acpi-bits.rst (renamed from docs/devel/acpi-bits.rst) | 86
-rw-r--r--  docs/devel/testing/blkdebug.rst | 177
-rw-r--r--  docs/devel/testing/blkverify.rst (renamed from docs/devel/blkverify.txt) | 30
-rw-r--r--  docs/devel/testing/ci-jobs.rst.inc (renamed from docs/devel/ci-jobs.rst.inc) | 19
-rw-r--r--  docs/devel/testing/ci-runners.rst.inc (renamed from docs/devel/ci-runners.rst.inc) | 0
-rw-r--r--  docs/devel/testing/ci.rst | 34
-rw-r--r--  docs/devel/testing/functional.rst | 380
-rw-r--r--  docs/devel/testing/fuzzing.rst (renamed from docs/devel/fuzzing.rst) | 9
-rw-r--r--  docs/devel/testing/index.rst | 17
-rw-r--r--  docs/devel/testing/main.rst (renamed from docs/devel/testing.rst) | 682
-rw-r--r--  docs/devel/testing/qgraph.rst (renamed from docs/devel/qgraph.rst) | 0
-rw-r--r--  docs/devel/testing/qtest.rst (renamed from docs/devel/qtest.rst) | 2
-rw-r--r--  docs/devel/uefi-vars.rst | 68
-rw-r--r--  docs/devel/virtio-backends.rst | 5
-rw-r--r--  docs/glossary.rst | 280
-rw-r--r--  docs/igd-assign.txt | 272
-rw-r--r--  docs/index.rst | 3
-rw-r--r--  docs/interop/bitmaps.rst | 2
-rw-r--r--  docs/interop/firmware.json | 12
-rw-r--r--  docs/interop/index.rst | 6
-rw-r--r--  docs/interop/live-block-operations.rst | 4
-rw-r--r--  docs/interop/nbd.rst | 89
-rw-r--r--  docs/interop/nbd.txt | 72
-rw-r--r--  docs/interop/parallels.rst (renamed from docs/interop/parallels.txt) | 108
-rw-r--r--  docs/interop/prl-xml.rst | 192
-rw-r--r--  docs/interop/prl-xml.txt | 158
-rw-r--r--  docs/interop/qcow2.rst (renamed from docs/interop/qcow2.txt) | 187
-rw-r--r--  docs/interop/qed_spec.rst | 219
-rw-r--r--  docs/interop/qed_spec.txt | 138
-rw-r--r--  docs/interop/qemu-ga-ref.rst | 5
-rw-r--r--  docs/interop/qemu-ga.rst | 4
-rw-r--r--  docs/interop/qemu-qmp-ref.rst | 4
-rw-r--r--  docs/interop/qemu-storage-daemon-qmp-ref.rst | 4
-rw-r--r--  docs/interop/vfio-user.rst | 1520
-rw-r--r--  docs/interop/vhost-user.rst | 24
-rw-r--r--  docs/meson.build | 1
-rw-r--r--  docs/pcie_sriov.txt | 8
-rw-r--r--  docs/qcow2-cache.txt | 2
-rw-r--r--  docs/specs/acpi_hest_ghes.rst | 6
-rw-r--r--  docs/specs/aspeed-intc.rst | 136
-rw-r--r--  docs/specs/fw_cfg.rst | 4
-rw-r--r--  docs/specs/index.rst | 4
-rw-r--r--  docs/specs/pci-ids.rst | 10
-rw-r--r--  docs/specs/rapl-msr.rst | 25
-rw-r--r--  docs/specs/riscv-aia.rst | 83
-rw-r--r--  docs/specs/riscv-iommu.rst | 116
-rw-r--r--  docs/specs/rocker.rst (renamed from docs/specs/rocker.txt) | 181
-rw-r--r--  docs/specs/tpm.rst | 8
-rw-r--r--  docs/sphinx-static/theme_overrides.css | 98
-rw-r--r--  docs/sphinx/compat.py | 230
-rw-r--r--  docs/sphinx/depfile.py | 3
-rw-r--r--  docs/sphinx/qapi_domain.py | 1055
-rw-r--r--  docs/sphinx/qapidoc.py | 936
-rw-r--r--  docs/sphinx/qapidoc_legacy.py | 440
-rw-r--r--  docs/sphinx/qmp_lexer.py | 2
-rw-r--r--  docs/spin/tcg-exclusive.promela | 4
-rw-r--r--  docs/system/arm/aspeed.rst | 315
-rw-r--r--  docs/system/arm/bananapi_m2u.rst | 11
-rw-r--r--  docs/system/arm/cpu-features.rst | 7
-rw-r--r--  docs/system/arm/cubieboard.rst | 1
-rw-r--r--  docs/system/arm/emulation.rst | 16
-rw-r--r--  docs/system/arm/exynos.rst | 9
-rw-r--r--  docs/system/arm/fby35.rst | 52
-rw-r--r--  docs/system/arm/gumstix.rst | 21
-rw-r--r--  docs/system/arm/imx8mp-evk.rst | 62
-rw-r--r--  docs/system/arm/mainstone.rst | 25
-rw-r--r--  docs/system/arm/mcimx6ul-evk.rst | 5
-rw-r--r--  docs/system/arm/mcimx7d-sabre.rst | 5
-rw-r--r--  docs/system/arm/nseries.rst | 33
-rw-r--r--  docs/system/arm/nuvoton.rst | 27
-rw-r--r--  docs/system/arm/orangepi.rst | 16
-rw-r--r--  docs/system/arm/palm.rst | 23
-rw-r--r--  docs/system/arm/stm32.rst | 7
-rw-r--r--  docs/system/arm/virt.rst | 30
-rw-r--r--  docs/system/arm/vmapple.rst | 65
-rw-r--r--  docs/system/arm/xlnx-versal-virt.rst | 3
-rw-r--r--  docs/system/arm/xlnx-zcu102.rst | 19
-rw-r--r--  docs/system/arm/xscale.rst | 35
-rw-r--r--  docs/system/bootindex.rst | 7
-rw-r--r--  docs/system/confidential-guest-support.rst | 1
-rw-r--r--  docs/system/cpu-hotplug.rst | 54
-rw-r--r--  docs/system/cpu-models-x86.rst.inc | 50
-rw-r--r--  docs/system/device-emulation.rst | 2
-rw-r--r--  docs/system/devices/cxl.rst | 18
-rw-r--r--  docs/system/devices/igb.rst | 5
-rw-r--r--  docs/system/devices/ivshmem-flat.rst | 33
-rw-r--r--  docs/system/devices/net.rst | 100
-rw-r--r--  docs/system/devices/nvme.rst | 7
-rw-r--r--  docs/system/devices/vfio-user.rst | 26
-rw-r--r--  docs/system/devices/virtio-gpu.rst | 11
-rw-r--r--  docs/system/gdb.rst | 2
-rw-r--r--  docs/system/i386/hyperv.rst | 43
-rw-r--r--  docs/system/i386/nitro-enclave.rst | 78
-rw-r--r--  docs/system/i386/tdx.rst | 161
-rw-r--r--  docs/system/i386/xenpvh.rst | 49
-rw-r--r--  docs/system/images.rst | 2
-rw-r--r--  docs/system/index.rst | 1
-rw-r--r--  docs/system/introduction.rst | 2
-rw-r--r--  docs/system/linuxboot.rst | 6
-rw-r--r--  docs/system/loongarch/virt.rst | 33
-rw-r--r--  docs/system/ppc/amigang.rst | 17
-rw-r--r--  docs/system/ppc/embedded.rst | 1
-rw-r--r--  docs/system/ppc/powermac.rst | 4
-rw-r--r--  docs/system/ppc/powernv.rst | 9
-rw-r--r--  docs/system/ppc/pseries.rst | 17
-rw-r--r--  docs/system/riscv/microblaze-v-generic.rst | 42
-rw-r--r--  docs/system/riscv/microchip-icicle-kit.rst | 124
-rw-r--r--  docs/system/riscv/virt.rst | 30
-rw-r--r--  docs/system/s390x/bootdevices.rst | 53
-rw-r--r--  docs/system/sriov.rst | 37
-rw-r--r--  docs/system/target-arm.rst | 16
-rw-r--r--  docs/system/target-i386.rst | 5
-rw-r--r--  docs/system/target-loongarch.rst | 19
-rw-r--r--  docs/system/target-mips.rst | 2
-rw-r--r--  docs/system/target-riscv.rst | 1
-rw-r--r--  docs/system/targets.rst | 1
-rw-r--r--  docs/tools/index.rst | 1
-rw-r--r--  docs/tools/qemu-nbd.rst | 7
-rw-r--r--  docs/tools/qemu-storage-daemon.rst | 2
-rw-r--r--  docs/tools/qemu-vmsr-helper.rst | 4
-rw-r--r--  docs/tools/virtfs-proxy-helper.rst | 75
-rw-r--r--  docs/user/main.rst | 26
173 files changed, 11248 insertions, 3404 deletions
diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
index 2e760a4..2283a09 100644
--- a/docs/COLO-FT.txt
+++ b/docs/COLO-FT.txt
@@ -193,8 +193,8 @@ any IP's here, except for the $primary_ip variable.
-device piix3-usb-uhci -device usb-tablet -name secondary \
-netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \
-device rtl8139,id=e0,netdev=hn0 \
- -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect=1 \
- -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect=1 \
+ -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect-ms=1000 \
+ -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect-ms=1000 \
-object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \
-object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \
-object filter-rewriter,id=rew0,netdev=hn0,queue=all \
diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst
index 8fd7da1..8ecbd6b 100644
--- a/docs/about/build-platforms.rst
+++ b/docs/about/build-platforms.rst
@@ -29,6 +29,9 @@ The `Repology`_ site is a useful resource to identify
currently shipped versions of software in various operating systems,
though it does not cover all distros listed below.
+You can find how to install build dependencies for different systems on the
+:ref:`setup-build-env` page.
+
Supported host architectures
----------------------------
@@ -40,8 +43,8 @@ Those hosts are officially supported, with various accelerators:
* - CPU Architecture
- Accelerators
* - Arm
- - kvm (64 bit only), tcg, xen
- * - MIPS (little endian only)
+ - hvf (64 bit only), kvm (64 bit only), tcg, xen
+ * - MIPS (64 bit little endian only)
- kvm, tcg
* - PPC
- kvm, tcg
@@ -98,7 +101,7 @@ Python runtime
option of the ``configure`` script to point QEMU to a supported
version of the Python runtime.
- As of QEMU |version|, the minimum supported version of Python is 3.7.
+ As of QEMU |version|, the minimum supported version of Python is 3.9.
Python build dependencies
Some of QEMU's build dependencies are written in Python. Usually these
@@ -107,18 +110,34 @@ Python build dependencies
required, it may be necessary to fetch python modules from the Python
Package Index (PyPI) via ``pip``, in order to build QEMU.
+Rust build dependencies
+ QEMU is generally conservative in adding new Rust dependencies, and all
+ of them are included in the distributed tarballs. One exception is the
+ bindgen tool, which is too big to package and distribute. The minimum
+ supported version of bindgen is 0.60.x. For distributions that do not
+ include bindgen or have an older version, it is recommended to install
+ a newer version using ``cargo install bindgen-cli``.
+
+ QEMU requires Rust 1.77.0. This is available on all supported platforms
+ with one exception, namely the ``mips64el`` architecture on Debian bookworm.
+ For all other architectures, Debian bookworm provides a new-enough Rust
+ compiler in the ``rustc-web`` package.
+
+ Also, on Ubuntu 22.04 or 24.04 this requires the ``rustc-1.77``
+ (or newer) package. The path to ``rustc`` and ``rustdoc`` must be
+ provided manually to the configure script.
+
Optional build dependencies
- Build components whose absence does not affect the ability to build
- QEMU may not be available in distros, or may be too old for QEMU's
- requirements. Many of these, such as the Avocado testing framework
- or various linters, are written in Python and therefore can also
- be installed using ``pip``. Cross compilers are another example
+ Build components whose absence does not affect the ability to build QEMU
+ may not be available in distros, or may be too old for our requirements.
+ Many of these, such as additional modules for the functional testing
+ framework or various linters, are written in Python and therefore can
+ also be installed using ``pip``. Cross compilers are another example
of optional build-time dependency; in this case it is possible to
download them from repositories such as EPEL, to use container-based
cross compilation using ``docker`` or ``podman``, or to use pre-built
binaries distributed with QEMU.
-
Windows
-------
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 88f0f03..4203713 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -24,12 +24,6 @@ should exclusively use a non-deprecated machine type, with use of the most
recent version highly recommended. Non-versioned machine types follow the
general feature deprecation policy.
-Prior to the 2.10.0 release there was no official policy on how
-long features would be deprecated prior to their removal, nor
-any documented list of which features were deprecated. Thus
-any features deprecated prior to 2.10.0 will be treated as if
-they were first deprecated in the 2.10.0 release.
-
What follows is a list of all features currently marked as
deprecated.
@@ -74,11 +68,18 @@ configurations (e.g. -smp drawers=1,books=1,clusters=1 for x86 PC machine) is
marked deprecated since 9.0, users have to ensure that all the topology members
described with -smp are supported by the target machine.
-``-runas`` (since 9.1)
-----------------------
-
-Use ``-run-with user=..`` instead.
+``-old-param`` option for booting Arm kernels via param_struct (since 10.0)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+The ``-old-param`` command line option is specific to Arm targets:
+it is used when directly booting a guest kernel to pass it the
+command line and other information via the old ``param_struct`` ABI,
+rather than the newer ATAGS or DTB mechanisms. This option was only
+ever needed to support ancient kernels on some old board types
+like the ``akita`` or ``terrier``; it has been deprecated in the
+kernel since 2001. None of the board types QEMU supports need
+``param_struct`` support, so this option has been deprecated and will
+be removed in a future QEMU version.
User-mode emulator command line arguments
-----------------------------------------
@@ -147,32 +148,66 @@ options are removed in favor of using explicit ``blockdev-create`` and
``blockdev-add`` calls. See :doc:`/interop/live-block-operations` for
details.
-Incorrectly typed ``device_add`` arguments (since 6.2)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''
+``query-migrationthreads`` (since 9.2)
+''''''''''''''''''''''''''''''''''''''
+
+To be removed with no replacement, as it reports only a limited set of
+threads (for example, it only reports the source side of multifd threads,
+without reporting any destination threads or non-multifd source threads).
+For debugging purposes, please use ``-name $VM,debug-threads=on`` instead.
+
+``block-job-pause`` (since 10.1)
+''''''''''''''''''''''''''''''''
+
+Use ``job-pause`` instead. The only difference is that ``job-pause``
+always reports GenericError on failure, whereas ``block-job-pause`` reports
+DeviceNotActive when the block job is not found.
+
+``block-job-resume`` (since 10.1)
+'''''''''''''''''''''''''''''''''
+
+Use ``job-resume`` instead. The only difference is that ``job-resume``
+always reports GenericError on failure, whereas ``block-job-resume`` reports
+DeviceNotActive when the block job is not found.
+
+``block-job-complete`` (since 10.1)
+'''''''''''''''''''''''''''''''''''
-Due to shortcomings in the internal implementation of ``device_add``, QEMU
-incorrectly accepts certain invalid arguments: Any object or list arguments are
-silently ignored. Other argument types are not checked, but an implicit
-conversion happens, so that e.g. string values can be assigned to integer
-device properties or vice versa.
+Use ``job-complete`` instead. The only difference is that ``job-complete``
+always reports GenericError on failure, whereas ``block-job-complete`` reports
+DeviceNotActive when the block job is not found.
-This is a bug in QEMU that will be fixed in the future so that previously
-accepted incorrect commands will return an error. Users should make sure that
-all arguments passed to ``device_add`` are consistent with the documented
-property types.
+``block-job-dismiss`` (since 10.1)
+''''''''''''''''''''''''''''''''''
+
+Use ``job-dismiss`` instead.
+
+``block-job-finalize`` (since 10.1)
+'''''''''''''''''''''''''''''''''''
+
+Use ``job-finalize`` instead.
+
+``migrate`` argument ``detach`` (since 10.1)
+''''''''''''''''''''''''''''''''''''''''''''
+
+This argument has always been ignored.
Host Architectures
------------------
-BE MIPS (since 7.2)
-'''''''''''''''''''
+Big endian MIPS since 7.2; 32-bit little endian MIPS since 9.2
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
As Debian 10 ("Buster") moved into LTS the big endian 32 bit version of
MIPS moved out of support, making it hard to maintain our
cross-compilation CI tests of the architecture. As we no longer have
CI coverage, support may bitrot away before the deprecation process
-completes. The little endian variants of MIPS (both 32 and 64 bit) are
-still a supported host architecture.
+completes.
+
+Likewise, the little endian variant of 32 bit MIPS is not supported by
+Debian 13 ("Trixie") and newer.
+
+64 bit little endian MIPS is still a supported host architecture.
System emulation on 32-bit x86 hosts (since 8.0)
''''''''''''''''''''''''''''''''''''''''''''''''
@@ -184,6 +219,53 @@ be an effective use of its limited resources, and thus intends to discontinue
it. Since all recent x86 hardware from the past >10 years is capable of the
64-bit x86 extensions, a corresponding 64-bit OS should be used instead.
+TCG Plugin support not enabled by default on 32-bit hosts (since 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+While it is still possible to enable TCG plugin support for 32-bit
+hosts, there are a number of potential pitfalls when instrumenting
+64-bit guests. The plugin APIs typically pass most addresses as
+uint64_t, but practices like encoding that address in a host pointer
+for passing as user-data will lose data. As most software analysis
+benefits from having plenty of host memory, it seems reasonable to
+encourage users to use 64-bit builds of QEMU for analysis work,
+whatever targets they are instrumenting.
+
+TCG Plugin support not enabled by default with TCI (since 9.2)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+While the TCG interpreter can interpret the TCG ops used by plugins, it
+is going to be so much slower that it wouldn't make sense for any serious
+instrumentation. Due to implementation differences there will also be
+anomalies in things like memory instrumentation.
+
+32-bit host operating systems (since 10.0)
+''''''''''''''''''''''''''''''''''''''''''
+
+Keeping 32-bit host support alive is a substantial burden for the
+QEMU project. Thus QEMU will in future drop support for all
+32-bit host systems.
+
+linux-user mode CPUs
+--------------------
+
+iwMMXt emulation and the ``pxa`` CPUs (since 10.0)
+''''''''''''''''''''''''''''''''''''''''''''''''''
+
+The ``pxa`` CPU family (``pxa250``, ``pxa255``, ``pxa260``,
+``pxa261``, ``pxa262``, ``pxa270-a0``, ``pxa270-a1``, ``pxa270``,
+``pxa270-b0``, ``pxa270-b1``, ``pxa270-c0``, ``pxa270-c5``) are no
+longer used in system emulation, because all the machine types which
+used these CPUs were removed in the QEMU 9.2 release. These CPUs can
+now only be used in linux-user mode, and to do that you would have to
+explicitly select one of these CPUs with the ``-cpu`` command line
+option or the ``QEMU_CPU`` environment variable.
+
+We don't believe that anybody is using the iwMMXt emulation, and we do
+not have any tests to validate it or any real hardware or similar
+known-good implementation to test against. GCC is in the process of
+dropping their support for iwMMXt codegen. These CPU types are
+therefore deprecated in QEMU, and will be removed in a future release.
System emulator CPUs
--------------------
@@ -206,17 +288,25 @@ in the QEMU object model anymore. ``Sun-UltraSparc-IIIi+`` and
but for consistency these will get removed in a future release, too.
Use ``Sun-UltraSparc-IIIi-plus`` and ``Sun-UltraSparc-IV-plus`` instead.
-CRIS CPU architecture (since 9.0)
-'''''''''''''''''''''''''''''''''
+PPC 405 CPUs (since 10.0)
+'''''''''''''''''''''''''
-The CRIS architecture was pulled from Linux in 4.17 and the compiler
-is no longer packaged in any distro making it harder to run the
-``check-tcg`` tests. Unless we can improve the testing situation there
-is a chance the code will bitrot without anyone noticing.
+The PPC 405 CPU has no known users and the ``ref405ep`` machine was
+removed in QEMU 10.0. Since the IBM POWER [8-11] processors use an
+embedded 405 for power management (OCC) and other internal tasks, it
+is theoretically possible to use QEMU to model them. Let's keep the
+CPU implementation for a while before removing all support.
System emulator machines
------------------------
+Versioned machine types (aarch64, arm, i386, m68k, ppc64, s390x, x86_64)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+In accordance with our versioned machine type deprecation policy, all machine
+types with version |VER_MACHINE_DEPRECATION_VERSION|, or older, have been
+deprecated.
+
Arm ``virt`` machine ``dtb-kaslr-seed`` property (since 7.1)
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
@@ -225,57 +315,48 @@ deprecated; use the new name ``dtb-randomness`` instead. The new name
better reflects the way this property affects all random data within
the device tree blob, not just the ``kaslr-seed`` node.
-``pc-i440fx-2.4`` up to ``pc-i440fx-2.12`` (since 9.1)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''
+Mips ``mipssim`` machine (since 10.0)
+'''''''''''''''''''''''''''''''''''''
-These old machine types are quite neglected nowadays and thus might have
-various pitfalls with regards to live migration. Use a newer machine type
+Linux dropped support for this virtual machine type in kernel v3.7, and
+there does not seem to be anybody around who is still using this board
+in QEMU: most former MIPS-related people are working on other architectures
+in their everyday jobs nowadays, and we are also not aware of anybody still
+using old binaries with this board (there is also no binary available
+online with which to check that this board has not bitrotted). It is
+recommended to use another MIPS machine for future MIPS code development
instead.
-``shix`` (since 9.0)
-''''''''''''''''''''
+RISC-V default machine option (since 10.0)
+''''''''''''''''''''''''''''''''''''''''''
-The machine is no longer in existence and has been long unmaintained
-in QEMU. This also holds for the TC51828 16MiB flash that it uses.
+RISC-V defines ``spike`` as the default machine if no machine option is
+given in the command line. This happens because ``spike`` is the first
+RISC-V machine implemented in QEMU and setting it as default was
+convenient at that time. Now we have 7 riscv64 and 6 riscv32 machines
+and having ``spike`` as a default is no longer justified. This default
+will also promote situations where users think they're running ``virt``
+(the most used RISC-V machine type in 10.0) when in fact they're
+running ``spike``.
-``pseries-2.1`` up to ``pseries-2.12`` (since 9.0)
-''''''''''''''''''''''''''''''''''''''''''''''''''
+Removing the default machine option forces users to always set the machine
+they want to use and avoids confusion. Existing users of the ``spike``
+machine must ensure that they're setting the ``spike`` machine in the
+command line (``-M spike``).
+
+
+System emulator binaries
+------------------------
+
+``qemu-system-microblazeel`` (since 10.1)
+'''''''''''''''''''''''''''''''''''''''''
+
+The ``qemu-system-microblaze`` binary can emulate little-endian machines
+now, too, so the separate binary ``qemu-system-microblazeel`` (with the
+``el`` suffix) for little-endian targets is not required anymore. The
+``petalogix-s3adsp1800`` machine can now be switched to little endian by
+setting its ``endianness`` property to ``little``.
-Older pseries machines before version 3.0 have undergone many changes
-to correct issues, mostly regarding migration compatibility. These are
-no longer maintained and removing them will make the code easier to
-read and maintain. Use versions 3.0 and above as a replacement.
-
-Arm machines ``akita``, ``borzoi``, ``cheetah``, ``connex``, ``mainstone``, ``n800``, ``n810``, ``spitz``, ``terrier``, ``tosa``, ``verdex``, ``z2`` (since 9.0)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
-
-QEMU includes models of some machine types where the QEMU code that
-emulates their SoCs is very old and unmaintained. This code is now
-blocking our ability to move forward with various changes across
-the codebase, and over many years nobody has been interested in
-trying to modernise it. We don't expect any of these machines to have
-a large number of users, because they're all modelling hardware that
-has now passed away into history. We are therefore dropping support
-for all machine types using the PXA2xx and OMAP2 SoCs. We are also
-dropping the ``cheetah`` OMAP1 board, because we don't have any
-test images for it and don't know of anybody who does; the ``sx1``
-and ``sx1-v1`` OMAP1 machines remain supported for now.
-
-PPC 405 ``ref405ep`` machine (since 9.1)
-''''''''''''''''''''''''''''''''''''''''
-
-The ``ref405ep`` machine and PPC 405 CPU have no known users, firmware
-images are not available, OpenWRT dropped support in 2019, U-Boot in
-2017, Linux also is dropping support in 2024. It is time to let go of
-this ancient hardware and focus on newer CPUs and platforms.
-
-Arm ``tacoma-bmc`` machine (since 9.1)
-''''''''''''''''''''''''''''''''''''''''
-
-The ``tacoma-bmc`` machine was a board including an AST2600 SoC based
-BMC and a witherspoon like OpenPOWER system. It was used for bring up
-of the AST2600 SoC in labs. It can be easily replaced by the
-``rainier-bmc`` machine which is a real product.
Backend options
---------------
@@ -324,41 +405,6 @@ the addition of volatile memory support, it is now necessary to distinguish
between persistent and volatile memory backends. As such, memdev is deprecated
in favor of persistent-memdev.
-``-fsdev proxy`` and ``-virtfs proxy`` (since 8.1)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The 9p ``proxy`` filesystem backend driver has been deprecated and will be
-removed (along with its proxy helper daemon) in a future version of QEMU. Please
-use ``-fsdev local`` or ``-virtfs local`` for using the 9p ``local`` filesystem
-backend, or alternatively consider deploying virtiofsd instead.
-
-The 9p ``proxy`` backend was originally developed as an alternative to the 9p
-``local`` backend. The idea was to enhance security by dispatching actual low
-level filesystem operations from 9p server (QEMU process) over to a separate
-process (the virtfs-proxy-helper binary). However this alternative never gained
-momentum. The proxy backend is much slower than the local backend, hasn't seen
-any development in years, and showed to be less secure, especially due to the
-fact that its helper daemon must be run as root, whereas with the local backend
-QEMU is typically run as unprivileged user and allows to tighten behaviour by
-mapping permissions et al by using its 'mapped' security model option.
-
-Nowadays it would make sense to reimplement the ``proxy`` backend by using
-QEMU's ``vhost`` feature, which would eliminate the high latency costs under
-which the 9p ``proxy`` backend currently suffers. However as of to date nobody
-has indicated plans for such kind of reimplementation unfortunately.
-
-RISC-V 'any' CPU type ``-cpu any`` (since 8.2)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The 'any' CPU type was introduced back in 2018 and has been around since the
-initial RISC-V QEMU port. Its usage has always been unclear: users don't know
-what to expect from a CPU called 'any', and in fact the CPU does not do anything
-special that isn't already done by the default CPUs rv32/rv64.
-
-After the introduction of the 'max' CPU type, RISC-V now has a good coverage
-of generic CPUs: rv32 and rv64 as default CPUs and 'max' as a feature complete
-CPU for both 32 and 64 bit builds. Users are then discouraged to use the 'any'
-CPU type starting in 8.2.
RISC-V CPU properties which start with capital 'Z' (since 8.2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -422,6 +468,15 @@ Specifying the iSCSI password in plain text on the command line using the
used instead, to refer to a ``--object secret...`` instance that provides
a password via a file, or encrypted.
+``gluster`` backend (since 9.2)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+According to https://marc.info/?l=fedora-devel-list&m=171934833215726,
+GlusterFS development has effectively ended. Unless the development
+gains momentum again, the QEMU project will remove the gluster backend
+in a future release.
+
+
Character device options
''''''''''''''''''''''''
@@ -430,16 +485,49 @@ Backend ``memory`` (since 9.0)
``memory`` is a deprecated synonym for ``ringbuf``.
-CPU device properties
-'''''''''''''''''''''
+``reconnect`` (since 9.2)
+^^^^^^^^^^^^^^^^^^^^^^^^^
-``pcommit`` on x86 (since 9.1)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The ``reconnect`` option only allows specifying second-granularity timeouts,
+which is not enough for all types of use cases; use ``reconnect-ms`` instead.
-The PCOMMIT instruction was never included in any physical processor.
-It was implemented as a no-op instruction in TCG up to QEMU 9.0, but
-only with ``-cpu max`` (which does not guarantee migration compatibility
-across versions).
+
+Net device options
+''''''''''''''''''
+
+Stream ``reconnect`` (since 9.2)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``reconnect`` option only allows specifying second-granularity timeouts,
+which is not enough for all types of use cases; use ``reconnect-ms`` instead.
+
+VFIO device options
+'''''''''''''''''''
+
+``-device vfio-calxeda-xgmac`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The vfio-calxeda-xgmac device allows assigning a host Calxeda Highbank
+10Gb XGMAC Ethernet controller device ("calxeda,hb-xgmac" compatibility
+string) to a guest. Calxeda hardware has long since gone to e-waste and
+there is no point in keeping that device.
+
+``-device vfio-amd-xgbe`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The vfio-amd-xgbe device allows assigning a host AMD 10GbE controller
+to a guest ("amd,xgbe-seattle-v1a" compatibility string). AMD "Seattle"
+is not supported anymore and there is no point in keeping that device.
+
+``-device vfio-platform`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The vfio-platform device allows assigning a host platform device
+to a guest in a generic manner. Integrating a new device into
+the vfio-platform infrastructure requires some adaptation at
+both the kernel and QEMU level. No such attempt has been made for years,
+and the conclusion is that vfio-platform has not gained any traction.
+PCIe passthrough should be the mainline solution.
+
+CPU device properties
+'''''''''''''''''''''
``pmu-num=n`` on RISC-V CPUs (since 8.2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -450,6 +538,14 @@ be calculated with ``((2 ^ n) - 1) << 3``. The least significant three bits
must be left clear.
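As a quick check of the formula quoted above, the replacement ``pmu-mask`` value
can be derived from the old ``pmu-num`` value ``n`` with a couple of lines of
Python (a minimal sketch; the helper name is illustrative and not part of QEMU)::

    # pmu-mask is ((2 ^ n) - 1) << 3: n counter bits shifted past the three
    # reserved low bits, which must stay clear.
    def pmu_num_to_mask(n):
        return ((2 ** n) - 1) << 3

    print(hex(pmu_num_to_mask(4)))   # 0x78    -> counters 3..6 enabled
    print(hex(pmu_num_to_mask(16)))  # 0x7fff8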
+``pcommit`` on x86 (since 9.1)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PCOMMIT instruction was never included in any physical processor.
+It was implemented as a no-op instruction in TCG up to QEMU 9.0, but
+only with ``-cpu max`` (which does not guarantee migration compatibility
+across versions).
+
Backwards compatibility
-----------------------
@@ -503,3 +599,9 @@ usage of providing a file descriptor to a plain file has been
deprecated in favor of explicitly using the ``file:`` URI with the
file descriptor being passed as an ``fdset``. Refer to the ``add-fd``
command documentation for details on the ``fdset`` usage.
+
+``zero-blocks`` capability (since 9.2)
+''''''''''''''''''''''''''''''''''''''
+
+The ``zero-blocks`` capability was part of block migration, which
+no longer exists since it was removed in QEMU v9.1.
diff --git a/docs/about/emulation.rst b/docs/about/emulation.rst
index c03033e..456d01d 100644
--- a/docs/about/emulation.rst
+++ b/docs/about/emulation.rst
@@ -26,10 +26,6 @@ depending on the guest architecture.
- :ref:`Yes<AVR-System-emulator>`
- No
- 8 bit micro controller, often used in maker projects
- * - Cris
- - Yes
- - Yes
- - Embedded RISC chip developed by AXIS
* - Hexagon
- No
- Yes
@@ -175,11 +171,13 @@ for that architecture.
- Unified Hosting Interface (MD01069)
* - RISC-V
- System and User-mode
- - https://github.com/riscv/riscv-semihosting-spec/blob/main/riscv-semihosting-spec.adoc
+ - https://github.com/riscv-non-isa/riscv-semihosting/blob/main/riscv-semihosting.adoc
* - Xtensa
- System
- Tensilica ISS SIMCALL
+.. _tcg-plugins:
+
TCG Plugins
-----------
@@ -207,8 +205,8 @@ Once built a program can be run with multiple plugins loaded each with
their own arguments::
$QEMU $OTHER_QEMU_ARGS \
- -plugin contrib/plugin/libhowvec.so,inline=on,count=hint \
- -plugin contrib/plugin/libhotblocks.so
+ -plugin contrib/plugins/libhowvec.so,inline=on,count=hint \
+ -plugin contrib/plugins/libhotblocks.so
Arguments are plugin specific and can be used to modify their
behaviour. In this case the howvec plugin is being asked to use inline
@@ -219,6 +217,14 @@ Linux user-mode emulation also evaluates the environment variable
QEMU_PLUGIN="file=contrib/plugins/libhowvec.so,inline=on,count=hint" $QEMU
+QEMU plugins avoid writing directly to stdout/stderr, and use the log provided
+by the API (see the function ``qemu_plugin_outs``).
+To show output, you may use this additional parameter::
+
+ $QEMU $OTHER_QEMU_ARGS \
+ -d plugin \
+ -plugin contrib/plugins/libhowvec.so,inline=on,count=hint
+
Example Plugins
~~~~~~~~~~~~~~~
@@ -260,11 +266,40 @@ Behaviour can be tweaked with the following arguments:
* - Option
- Description
* - inline=true|false
- - Use faster inline addition of a single counter. Not per-cpu and not
- thread safe.
+ - Use faster inline addition of a single counter.
* - idle=true|false
- Dump the current execution stats whenever the guest vCPU idles
+Basic Block Vectors
+...................
+
+``contrib/plugins/bbv.c``
+
+The bbv plugin allows you to generate basic block vectors for use with the
+`SimPoint <https://cseweb.ucsd.edu/~calder/simpoint/>`__ analysis tool.
+
+.. list-table:: Basic block vectors arguments
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - interval=N
+ - The interval to generate a basic block vector specified by the number of
+ instructions (Default: N = 100000000)
+ * - outfile=PATH
+ - The path to output files.
+ It will be suffixed with ``.N.bb`` where ``N`` is a vCPU index.
+
+Example::
+
+ $ qemu-aarch64 \
+ -plugin contrib/plugins/libbbv.so,interval=100,outfile=sha1 \
+ tests/tcg/aarch64-linux-user/sha1
+ SHA1=15dd99a1991e0b3826fede3deffc1feba42278e6
+ $ du sha1.0.bb
+ 23128 sha1.0.bb
+
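To sanity-check the generated vectors, the ``.bb`` file can be summarised with a
few lines of Python. This is only a sketch and assumes the conventional SimPoint
frequency-vector format, where each interval is a line starting with ``T``
followed by ``:block-id:count`` pairs::

    # Summarise sha1.0.bb: blocks seen and instructions counted per interval.
    with open("sha1.0.bb") as f:
        for lineno, line in enumerate(f):
            if not line.startswith("T"):
                continue
            pairs = [p for p in line[1:].split() if p.startswith(":")]
            counts = [int(p.split(":")[2]) for p in pairs]
            print(f"interval {lineno}: {len(pairs)} blocks, {sum(counts)} insns")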
Instruction
...........
@@ -381,6 +416,28 @@ run::
160 1 0
135 1 0
+Behaviour can be tweaked with the following arguments:
+
+.. list-table:: Syscall plugin arguments
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - print=true|false
+ - Print the number of times each syscall is called
+ * - log_writes=true|false
+ - Log the buffer of each write syscall in hexdump format
+
+Test inline operations
+......................
+
+``tests/plugins/inline.c``
+
+This plugin is used for testing all inline operations, conditional callbacks and
+scoreboard. It prints a per-cpu summary of all events.
+
+
Hot Blocks
..........
@@ -394,9 +451,6 @@ with linux-user execution as system emulation tends to generate
re-translations as blocks from different programs get swapped in and
out of system memory.
-If your program is single-threaded you can use the ``inline`` option for
-slightly faster (but not thread safe) counters.
-
Example::
$ qemu-aarch64 \
@@ -736,10 +790,35 @@ The plugin will log the reason of exit, for example::
0xd4 reached, exiting
+Limit instructions per second
+.............................
+
+This plugin can limit the number of Instructions Per Second that are executed::
+
+ # get number of instructions
+ $ num_insn=$(./build/qemu-x86_64 -plugin ./build/tests/plugin/libinsn.so -d plugin /bin/true |& grep total | sed -e 's/.*: //')
+ # limit speed to execute in 10 seconds
+ $ time ./build/qemu-x86_64 -plugin ./build/contrib/plugins/libips.so,ips=$(($num_insn/10)) /bin/true
+ real 10.000s
+
+
+.. list-table:: IPS arguments
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - ips=N
+ - Maximum number of instructions per cpu that can be executed in one second.
+ The plugin will sleep when the given number of instructions is reached.
+ * - ipq=N
+ - Instructions per quantum. How many instructions before we re-calculate time.
+ The lower the number the more accurate time will be, but the less efficient the plugin.
+ Defaults to ips/10
+
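The interplay of ``ips`` and ``ipq`` described above can be pictured with a
small standalone sketch (plain Python, not the plugin's actual code): after
every quantum of ``ipq`` executed instructions, compare the time that should
have elapsed at the requested rate with the wall clock, and sleep off any
difference::

    import time

    def throttle(executed_insns, start_time, ips):
        """Sleep so executed_insns never runs ahead of ips * elapsed seconds."""
        expected_elapsed = executed_insns / ips
        actual_elapsed = time.monotonic() - start_time
        if expected_elapsed > actual_elapsed:
            time.sleep(expected_elapsed - actual_elapsed)

    ips = 1_000_000              # requested instructions per second
    ipq = ips // 10              # the default quantum from the table above
    executed = 0
    start = time.monotonic()
    while executed < 5_000_000:  # stand-in for the emulation loop
        executed += ipq          # pretend a quantum of work was executed
        throttle(executed, start, ips)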
Other emulation features
------------------------
When running system emulation you can also enable deterministic
execution which allows for repeatable record/replay debugging. See
:ref:`Record/Replay<replay>` for more details.
-
diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index fc7b28e..d7c2113 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -162,6 +162,12 @@ specified with ``-mem-path`` can actually provide the guest RAM configured with
The ``name`` parameter of the ``-net`` option was a synonym
for the ``id`` parameter, which should now be used instead.
+RISC-V firmware not booted by default (removed in 5.1)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+QEMU 5.1 changes the default behaviour from ``-bios none`` to ``-bios default``
+for the RISC-V ``virt`` machine and ``sifive_u`` machine.
+
``-numa node,mem=...`` (removed in 5.1)
'''''''''''''''''''''''''''''''''''''''
@@ -324,12 +330,6 @@ devices. Drives the board doesn't pick up can no longer be used with
This option was undocumented and not used in the field.
Use ``-device usb-ccid`` instead.
-RISC-V firmware not booted by default (removed in 5.1)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''
-
-QEMU 5.1 changes the default behaviour from ``-bios none`` to ``-bios default``
-for the RISC-V ``virt`` machine and ``sifive_u`` machine.
-
``-no-quit`` (removed in 7.0)
'''''''''''''''''''''''''''''
@@ -355,13 +355,13 @@ The ``-writeconfig`` option was not able to serialize the entire contents
of the QEMU command line. It is thus considered a failed experiment
and removed without a replacement.
-``loaded`` property of ``secret`` and ``secret_keyring`` objects (removed in 7.1)
-'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+``loaded`` property of secret and TLS credential objects (removed in 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
The ``loaded=on`` option in the command line or QMP ``object-add`` either had
no effect (if ``loaded`` was the last option) or caused options to be
effectively ignored as if they were not given. The property is therefore
-useless and should simply be removed.
+useless and has been removed.
``opened`` property of ``rng-*`` objects (removed in 7.1)
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
@@ -403,13 +403,13 @@ Sound card devices should be created using ``-device`` or ``-audio``.
The exception is ``pcspk`` which can be activated using ``-machine
pcspk-audiodev=<name>``.
-``-watchdog`` (since 7.2)
-'''''''''''''''''''''''''
+``-watchdog`` (removed in 7.2)
+''''''''''''''''''''''''''''''
Use ``-device`` instead.
-Hexadecimal sizes with scaling multipliers (since 8.0)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''
+Hexadecimal sizes with scaling multipliers (removed in 8.0)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Input parameters that take a size value should only use a size suffix
(such as 'k' or 'M') when the base is written in decimal, and not when
@@ -510,13 +510,56 @@ than zero.
Removed along with the ``compression`` migration capability.
-``-device virtio-blk,scsi=on|off`` (since 9.1)
-''''''''''''''''''''''''''''''''''''''''''''''
+``-device virtio-blk,scsi=on|off`` (removed in 9.1)
+'''''''''''''''''''''''''''''''''''''''''''''''''''
The virtio-blk SCSI passthrough feature is a legacy VIRTIO feature. VIRTIO 1.0
and later do not support it because the virtio-scsi device was introduced for
full SCSI support. Use virtio-scsi instead when SCSI passthrough is required.
+``-fsdev proxy`` and ``-virtfs proxy`` (removed in 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+The 9p ``proxy`` filesystem backend driver was originally developed to
+enhance security by dispatching low level filesystem operations from 9p
+server (QEMU process) over to a separate process (the virtfs-proxy-helper
+binary). However, the proxy backend was much slower than the local backend,
+had not seen any development in years, and proved to be less secure,
+especially because its helper daemon had to be run as root.
+
+Use ``local``, possibly mapping permissions and ownership by using its 'mapped'
+security model option, or switch to ``virtiofs``. The virtiofs daemon
+``virtiofsd`` uses vhost to eliminate the high latency costs of the 9p
+``proxy`` backend.
+
+``-portrait`` and ``-rotate`` (removed in 9.2)
+''''''''''''''''''''''''''''''''''''''''''''''
+
+The ``-portrait`` and ``-rotate`` options were documented as only
+working with the PXA LCD device, and all the machine types using
+that display device were removed in 9.2, so these options also
+have been dropped.
+
+These options were intended to simulate a mobile device being
+rotated by the user, and had three effects:
+
+* the display output was rotated by 90, 180 or 270 degrees
+* the mouse/trackpad input was rotated the opposite way
+* the machine model would signal to the guest about its
+ orientation
+
+Of these three things, the input-rotation was coded without being
+restricted to boards which supported the full set of device-rotation
+handling, so in theory the options were usable on other machine models
+to produce an odd effect (rotating input but not display output). But
+this was never intended or documented behaviour, so we have dropped
+the options along with the machine models they were intended for.
+
+``-runas`` (removed in 10.0)
+''''''''''''''''''''''''''''
+
+Use ``-run-with user=..`` instead.
+
User-mode emulator command line arguments
-----------------------------------------
@@ -679,6 +722,15 @@ Use ``multifd-channels`` instead.
Use ``multifd-compression`` instead.
+Incorrectly typed ``device_add`` arguments (removed in 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+Due to shortcomings in the internal implementation of ``device_add``,
+QEMU used to incorrectly accept certain invalid arguments. Any object
+or list arguments were silently ignored. Other argument types were not
+checked, but an implicit conversion happened, so that e.g. string
+values could be assigned to integer device properties or vice versa.
+
QEMU Machine Protocol (QMP) events
----------------------------------
@@ -815,6 +867,15 @@ QEMU. Since all recent x86 hardware from the past >10 years is
capable of the 64-bit x86 extensions, a corresponding 64-bit OS should
be used instead.
+32-bit hosts for 64-bit guests (removed in 10.0)
+''''''''''''''''''''''''''''''''''''''''''''''''
+
+In general, 32-bit hosts cannot support the memory space or atomicity
+requirements of 64-bit guests. Prior to 10.0, QEMU attempted to
+work around the atomicity issues in system mode by running all vCPUs
+in a single thread context; in user mode atomicity was simply broken.
+From 10.0, QEMU has disabled configuration of 64-bit guests on 32-bit hosts.
+
Guest Emulator ISAs
-------------------
@@ -889,6 +950,21 @@ Nios II CPU (removed in 9.1)
QEMU Nios II architecture was orphan; Intel has EOL'ed the Nios II
processor IP (see `Intel discontinuance notification`_).
+CRIS CPU architecture (removed in 9.2)
+''''''''''''''''''''''''''''''''''''''
+
+The CRIS architecture was pulled from Linux in 4.17 and the compiler
+was no longer packaged in any distro making it harder to run the
+``check-tcg`` tests.
+
+RISC-V 'any' CPU type ``-cpu any`` (removed in 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''
+
+The 'any' CPU type was introduced back in 2018 and had been around since the
+initial RISC-V QEMU port. Its usage was always unclear: users didn't know
+what to expect from a CPU called 'any', and in fact the CPU did not do anything
+special that wasn't already done by the default CPUs rv32/rv64.
+
System accelerators
-------------------
@@ -899,21 +975,28 @@ Userspace local APIC with KVM (x86, removed in 8.0)
a local APIC. The ``split`` setting is supported, as is using ``-M
kernel-irqchip=off`` when the CPU does not have a local APIC.
-HAXM (``-accel hax``) (removed in 8.2)
-''''''''''''''''''''''''''''''''''''''
-
-The HAXM project has been retired (see https://github.com/intel/haxm#status).
-Use "whpx" (on Windows) or "hvf" (on macOS) instead.
-
MIPS "Trap-and-Emulate" KVM support (removed in 8.0)
''''''''''''''''''''''''''''''''''''''''''''''''''''
The MIPS "Trap-and-Emulate" KVM host and guest support was removed
from Linux in 2021, and is not supported anymore by QEMU either.
+HAXM (``-accel hax``) (removed in 8.2)
+''''''''''''''''''''''''''''''''''''''
+
+The HAXM project has been retired (see https://github.com/intel/haxm#status).
+Use "whpx" (on Windows) or "hvf" (on macOS) instead.
+
System emulator machines
------------------------
+Versioned machine types (aarch64, arm, i386, m68k, ppc64, s390x, x86_64)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+In accordance with our versioned machine type deprecation policy, all machine
+types with version |VER_MACHINE_DELETION_VERSION|, or older, have been
+removed.
+
``s390-virtio`` (removed in 2.6)
''''''''''''''''''''''''''''''''
@@ -948,12 +1031,6 @@ mips ``fulong2e`` machine alias (removed in 6.0)
This machine has been renamed ``fuloong2e``.
-``pc-0.10`` up to ``pc-i440fx-2.3`` (removed in 4.0 up to 9.0)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
-
-These machine types were very old and likely could not be used for live
-migration from old QEMU versions anymore. Use a newer machine type instead.
-
Raspberry Pi ``raspi2`` and ``raspi3`` machines (removed in 6.2)
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
@@ -978,6 +1055,51 @@ Nios II ``10m50-ghrd`` and ``nios2-generic-nommu`` machines (removed in 9.1)
The Nios II architecture was orphan.
+``shix`` (removed in 9.2)
+'''''''''''''''''''''''''
+
+The machine was unmaintained.
+
+Arm machines ``akita``, ``borzoi``, ``cheetah``, ``connex``, ``mainstone``, ``n800``, ``n810``, ``spitz``, ``terrier``, ``tosa``, ``verdex``, ``z2`` (removed in 9.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+QEMU included models of some machine types where the QEMU code that
+emulates their SoCs was very old and unmaintained. This code was
+blocking our ability to move forward with various changes across
+the codebase, and over many years nobody has been interested in
+trying to modernise it. We don't expect any of these machines to have
+a large number of users, because they're all modelling hardware that
+has now passed away into history. We are therefore dropping support
+for all machine types using the PXA2xx and OMAP2 SoCs. We are also
+dropping the ``cheetah`` OMAP1 board, because we don't have any
+test images for it and don't know of anybody who does.
+
+Aspeed ``tacoma-bmc`` machine (removed in 10.0)
+'''''''''''''''''''''''''''''''''''''''''''''''
+
+The ``tacoma-bmc`` machine was removed because it didn't bring much
+compared to the ``rainier-bmc`` machine. Also, the ``tacoma-bmc`` was
+a board used for bring up of the AST2600 SoC that never left the
+labs. It can be easily replaced by the ``rainier-bmc`` machine, which
+was the actual final product, or by the ``ast2600-evb`` with some
+tweaks.
+
+ppc ``ref405ep`` machine (removed in 10.0)
+''''''''''''''''''''''''''''''''''''''''''
+
+This machine was removed because the PPC 405 CPU has no known users,
+firmware images are not available, OpenWRT dropped support in 2019,
+U-Boot in 2017, and Linux in 2024.
+
+Big-Endian variants of ``petalogix-ml605`` and ``xlnx-zynqmp-pmu`` machines (removed in 10.1)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+Both the MicroBlaze ``petalogix-ml605`` and ``xlnx-zynqmp-pmu`` machines
+were added for little endian CPUs. Big endian support was never tested
+and likely never worked. Starting with QEMU v10.1, the machines are now
+only available as little-endian machines.
+
+
linux-user mode CPUs
--------------------
@@ -1006,8 +1128,8 @@ processor IP (see `Intel discontinuance notification`_).
TCG introspection features
--------------------------
-TCG trace-events (since 6.2)
-''''''''''''''''''''''''''''
+TCG trace-events (removed in 7.0)
+'''''''''''''''''''''''''''''''''
The ability to add new TCG trace points had bit rotted and as the
feature can be replicated with TCG plugins it was removed. If
diff --git a/docs/conf.py b/docs/conf.py
index 876f676..f892a6e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -60,7 +60,14 @@ needs_sphinx = '3.4.3'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
-extensions = ['kerneldoc', 'qmp_lexer', 'hxtool', 'depfile', 'qapidoc']
+extensions = [
+ 'depfile',
+ 'hxtool',
+ 'kerneldoc',
+ 'qapi_domain',
+ 'qapidoc',
+ 'qmp_lexer',
+]
if sphinx.version_info[:3] > (4, 0, 0):
tags.add('sphinx4')
@@ -87,7 +94,7 @@ default_role = 'any'
# General information about the project.
project = u'QEMU'
-copyright = u'2024, The QEMU Project Developers'
+copyright = u'2025, The QEMU Project Developers'
author = u'The QEMU Project Developers'
# The version info for the project you're documenting, acts as replacement for
@@ -110,6 +117,32 @@ finally:
else:
version = release = "unknown version"
+bits = version.split(".")
+
+major = int(bits[0])
+minor = int(bits[1])
+micro = int(bits[2])
+
+# Check for a dev snapshot, so we can adjust to next
+# predicted release version.
+#
+# This assumes we do 3 releases per year, so must bump
+# major if minor == 2
+if micro >= 50:
+ micro = 0
+ if minor == 2:
+ major += 1
+ minor = 0
+ else:
+ minor += 1
+
+# These thresholds must match the constants
+# MACHINE_VER_DELETION_MAJOR & MACHINE_VER_DEPRECATION_MAJOR
+# defined in include/hw/boards.h and the introductory text in
+# docs/about/deprecated.rst
+ver_machine_deprecation_version = "%d.%d.0" % (major - 3, minor)
+ver_machine_deletion_version = "%d.%d.0" % (major - 6, minor)
+
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
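To make the snapshot adjustment above concrete, here is the same arithmetic
applied to a hypothetical development version string (the values are
illustrative only); the two results end up as the
``|VER_MACHINE_DEPRECATION_VERSION|`` and ``|VER_MACHINE_DELETION_VERSION|``
substitutions used in docs/about/deprecated.rst and removed-features.rst::

    # A tree identifying itself as "10.0.50" is a snapshot heading for 10.1.0:
    major, minor, micro = 10, 0, 50
    if micro >= 50:
        micro = 0
        if minor == 2:
            major += 1
            minor = 0
        else:
            minor += 1
    print("%d.%d.0" % (major - 3, minor))   # 7.1.0 -> deprecation threshold
    print("%d.%d.0" % (major - 6, minor))   # 4.1.0 -> deletion threshold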
@@ -138,7 +171,18 @@ suppress_warnings = ["ref.option"]
# environment variable is not set is for the benefit of readthedocs
# style document building; our Makefile always sets the variable.
confdir = os.getenv('CONFDIR', "/etc/qemu")
-rst_epilog = ".. |CONFDIR| replace:: ``" + confdir + "``\n"
+
+vars = {
+ "CONFDIR": confdir,
+ "VER_MACHINE_DEPRECATION_VERSION": ver_machine_deprecation_version,
+ "VER_MACHINE_DELETION_VERSION": ver_machine_deletion_version,
+}
+
+rst_epilog = "".join([
+ ".. |" + key + "| replace:: ``" + vars[key] + "``\n"
+ for key in vars.keys()
+])
+
# We slurp in the defs.rst.inc and literally include it into rst_epilog,
# because Sphinx's include:: directive doesn't work with absolute paths
# and there isn't any one single relative path that will work for all
@@ -146,6 +190,22 @@ rst_epilog = ".. |CONFDIR| replace:: ``" + confdir + "``\n"
with open(os.path.join(qemu_docdir, 'defs.rst.inc')) as f:
rst_epilog += f.read()
+
+# Normally, the QAPI domain is picky about what field lists you use to
+# describe a QAPI entity. If you'd like to use arbitrary additional
+# fields in source documentation, add them here.
+qapi_allowed_fields = {
+ "see also",
+}
+
+# Due to a limitation in Sphinx, we need to know which indices to
+# generate in advance. Adding a namespace here allows that generation.
+qapi_namespaces = {
+ "QGA",
+ "QMP",
+ "QSD",
+}
+
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
@@ -186,7 +246,7 @@ html_js_files = [
]
html_context = {
- "display_gitlab": True,
+ "source_url_prefix": "https://gitlab.com/qemu-project/qemu/-/blob/master/docs/",
"gitlab_user": "qemu-project",
"gitlab_repo": "qemu",
"gitlab_version": "master",
@@ -275,9 +335,6 @@ man_pages = [
('tools/qemu-trace-stap', 'qemu-trace-stap',
'QEMU SystemTap trace tool',
[], 1),
- ('tools/virtfs-proxy-helper', 'virtfs-proxy-helper',
- 'QEMU 9p virtfs proxy filesystem helper',
- ['M. Mohan Kumar'], 1),
]
man_make_section_directory = False
diff --git a/docs/devel/atomics.rst b/docs/devel/atomics.rst
index b77c6e1..95c7b77 100644
--- a/docs/devel/atomics.rst
+++ b/docs/devel/atomics.rst
@@ -204,7 +204,7 @@ They come in six kinds:
before the second with respect to the other components of the system.
Therefore, unlike ``smp_rmb()`` or ``qatomic_load_acquire()``,
``smp_read_barrier_depends()`` can be just a compiler barrier on
- weakly-ordered architectures such as Arm or PPC[#]_.
+ weakly-ordered architectures such as Arm or PPC\ [#alpha]_.
Note that the first load really has to have a _data_ dependency and not
a control dependency. If the address for the second load is dependent
@@ -212,7 +212,7 @@ They come in six kinds:
than actually loading the address itself, then it's a _control_
dependency and a full read barrier or better is required.
-.. [#] The DEC Alpha is an exception, because ``smp_read_barrier_depends()``
+.. [#alpha] The DEC Alpha is an exception, because ``smp_read_barrier_depends()``
needs a processor barrier. On strongly-ordered architectures such
as x86 or s390, ``smp_rmb()`` and ``qatomic_load_acquire()`` can
also be compiler barriers only.
@@ -295,7 +295,7 @@ Acquire/release pairing and the *synchronizes-with* relation
------------------------------------------------------------
Atomic operations other than ``qatomic_set()`` and ``qatomic_read()`` have
-either *acquire* or *release* semantics [#rmw]_. This has two effects:
+either *acquire* or *release* semantics\ [#rmw]_. This has two effects:
.. [#rmw] Read-modify-write operations can have both---acquire applies to the
read part, and release to the write.
diff --git a/docs/devel/blkdebug.txt b/docs/devel/blkdebug.txt
deleted file mode 100644
index 0b0c128..0000000
--- a/docs/devel/blkdebug.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-Block I/O error injection using blkdebug
-----------------------------------------
-Copyright (C) 2014-2015 Red Hat Inc
-
-This work is licensed under the terms of the GNU GPL, version 2 or later. See
-the COPYING file in the top-level directory.
-
-The blkdebug block driver is a rule-based error injection engine. It can be
-used to exercise error code paths in block drivers including ENOSPC (out of
-space) and EIO.
-
-This document gives an overview of the features available in blkdebug.
-
-Background
-----------
-Block drivers have many error code paths that handle I/O errors. Image formats
-are especially complex since metadata I/O errors during cluster allocation or
-while updating tables happen halfway through request processing and require
-discipline to keep image files consistent.
-
-Error injection allows test cases to trigger I/O errors at specific points.
-This way, all error paths can be tested to make sure they are correct.
-
-Rules
------
-The blkdebug block driver takes a list of "rules" that tell the error injection
-engine when to fail an I/O request.
-
-Each I/O request is evaluated against the rules. If a rule matches the request
-then its "action" is executed.
-
-Rules can be placed in a configuration file; the configuration file
-follows the same .ini-like format used by QEMU's -readconfig option, and
-each section of the file represents a rule.
-
-The following configuration file defines a single rule:
-
- $ cat blkdebug.conf
- [inject-error]
- event = "read_aio"
- errno = "28"
-
-This rule fails all aio read requests with ENOSPC (28). Note that the errno
-value depends on the host. On Linux, see
-/usr/include/asm-generic/errno-base.h for errno values.
-
-Invoke QEMU as follows:
-
- $ qemu-system-x86_64
- -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \
- -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0
-
-Rules support the following attributes:
-
- event - which type of operation to match (e.g. read_aio, write_aio,
- flush_to_os, flush_to_disk). See the "Events" section for
- information on events.
-
- state - (optional) the engine must be in this state number in order for this
- rule to match. See the "State transitions" section for information
- on states.
-
- errno - the numeric errno value to return when a request matches this rule.
- The errno values depend on the host since the numeric values are not
- standardized in the POSIX specification.
-
- sector - (optional) a sector number that the request must overlap in order to
- match this rule
-
- once - (optional, default "off") only execute this action on the first
- matching request
-
- immediately - (optional, default "off") return a NULL BlockAIOCB
- pointer and fail without an errno instead. This
- exercises the code path where BlockAIOCB fails and the
- caller's BlockCompletionFunc is not invoked.
-
-Events
-------
-Block drivers provide information about the type of I/O request they are about
-to make so rules can match specific types of requests. For example, the qcow2
-block driver tells blkdebug when it accesses the L1 table so rules can match
-only L1 table accesses and not other metadata or guest data requests.
-
-The core events are:
-
- read_aio - guest data read
-
- write_aio - guest data write
-
- flush_to_os - write out unwritten block driver state (e.g. cached metadata)
-
- flush_to_disk - flush the host block device's disk cache
-
-See qapi/block-core.json:BlkdebugEvent for the full list of events.
-You may need to grep block driver source code to understand the
-meaning of specific events.
-
-State transitions
------------------
-There are cases where more power is needed to match a particular I/O request in
-a longer sequence of requests. For example:
-
- write_aio
- flush_to_disk
- write_aio
-
-How do we match the 2nd write_aio but not the first? This is where state
-transitions come in.
-
-The error injection engine has an integer called the "state" that always starts
-initialized to 1. The state integer is internal to blkdebug and cannot be
-observed from outside but rules can interact with it for powerful matching
-behavior.
-
-Rules can be conditional on the current state and they can transition to a new
-state.
-
-When a rule's "state" attribute is non-zero then the current state must equal
-the attribute in order for the rule to match.
-
-For example, to match the 2nd write_aio:
-
- [set-state]
- event = "write_aio"
- state = "1"
- new_state = "2"
-
- [inject-error]
- event = "write_aio"
- state = "2"
- errno = "5"
-
-The first write_aio request matches the set-state rule and transitions from
-state 1 to state 2. Once state 2 has been entered, the set-state rule no
-longer matches since it requires state 1. But the inject-error rule now
-matches the next write_aio request and injects EIO (5).
-
-State transition rules support the following attributes:
-
- event - which type of operation to match (e.g. read_aio, write_aio,
- flush_to_os, flush_to_disk). See the "Events" section for
- information on events.
-
- state - (optional) the engine must be in this state number in order for this
- rule to match
-
- new_state - transition to this state number
-
-Suspend and resume
-------------------
-Exercising code paths in block drivers may require specific ordering amongst
-concurrent requests. The "breakpoint" feature allows requests to be halted on
-a blkdebug event and resumed later. This makes it possible to achieve
-deterministic ordering when multiple requests are in flight.
-
-Breakpoints on blkdebug events are associated with a user-defined "tag" string.
-This tag serves as an identifier by which the request can be resumed at a later
-point.
-
-See the qemu-io(1) break, resume, remove_break, and wait_break commands for
-details.
diff --git a/docs/devel/build-environment.rst b/docs/devel/build-environment.rst
new file mode 100644
index 0000000..661f6ea
--- /dev/null
+++ b/docs/devel/build-environment.rst
@@ -0,0 +1,118 @@
+
+.. _setup-build-env:
+
+Setup build environment
+=======================
+
+QEMU relies on many dependencies from the host system. glib2 is used
+throughout the code base, while most of the other dependencies are optional.
+
+We present here simple instructions to enable native builds on the most
+popular systems.
+
+You can find additional instructions on the `QEMU wiki <https://wiki.qemu.org/>`_:
+
+- `Linux <https://wiki.qemu.org/Hosts/Linux>`_
+- `MacOS <https://wiki.qemu.org/Hosts/Mac>`_
+- `Windows <https://wiki.qemu.org/Hosts/W32>`_
+- `BSD <https://wiki.qemu.org/Hosts/BSD>`_
+
+Note: installing build dependencies through your distribution's package
+manager may miss dependencies that were newly introduced in qemu.git, as well
+as dependencies that the distribution has decided to exclude.
+
+Linux
+-----
+
+Fedora
+++++++
+
+::
+
+ sudo dnf update && sudo dnf builddep qemu
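+
+If the ``builddep`` subcommand is not available on your installation, it is
+provided by a dnf plugin package; depending on the Fedora release this is
+``dnf-plugins-core`` or ``dnf5-plugins``, for example::
+
+ sudo dnf install dnf-plugins-core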
+
+Debian/Ubuntu
++++++++++++++
+
+You first need to enable `Sources List <https://wiki.debian.org/SourcesList>`_.
+Then, use apt to install dependencies:
+
+::
+
+ sudo apt update && sudo apt build-dep qemu
+
+MacOS
+-----
+
+You first need to install `Homebrew <https://brew.sh/>`_. Then, use it to
+install dependencies:
+
+::
+
+ brew update && brew install $(brew deps --include-build qemu)
+
+Windows
+-------
+
+You first need to install `MSYS2 <https://www.msys2.org/>`_.
+MSYS2 offers `different environments <https://www.msys2.org/docs/environments/>`_.
+The x86_64 environments are based on GCC, while the aarch64 environment is
+based on Clang.
+
+We recommend using MINGW64 for windows-x86_64 and CLANGARM64 for
+windows-aarch64 (the latter is only available on windows-aarch64 hosts).
+
+Then, you can open a Windows shell and enter the MSYS2 environment using:
+
+::
+
+ c:/msys64/msys2_shell.cmd -defterm -here -no-start -mingw64
+ # Replace -mingw64 with -clangarm64 or -ucrt64 for other environments.
+
+The MSYS2 package manager does not offer a built-in way to install QEMU's
+build dependencies. You can start with the list of packages below, installed
+using pacman.
+
+Note: dependencies need to be installed again if you switch to a different
+MSYS2 environment.
+
+::
+
+ # update MSYS2 itself, you need to reopen your shell at the end.
+ pacman -Syu
+ pacman -S \
+ base-devel binutils bison diffutils flex git grep make sed \
+ ${MINGW_PACKAGE_PREFIX}-toolchain \
+ ${MINGW_PACKAGE_PREFIX}-glib2 \
+ ${MINGW_PACKAGE_PREFIX}-gtk3 \
+ ${MINGW_PACKAGE_PREFIX}-libnfs \
+ ${MINGW_PACKAGE_PREFIX}-libssh \
+ ${MINGW_PACKAGE_PREFIX}-ninja \
+ ${MINGW_PACKAGE_PREFIX}-pixman \
+ ${MINGW_PACKAGE_PREFIX}-pkgconf \
+ ${MINGW_PACKAGE_PREFIX}-python \
+ ${MINGW_PACKAGE_PREFIX}-SDL2 \
+ ${MINGW_PACKAGE_PREFIX}-zstd
+
+If you want to install all the dependencies, it is possible to reuse the
+recipe used to build QEMU in MSYS2 itself.
+
+::
+
+ pacman -S wget base-devel git
+ wget https://raw.githubusercontent.com/msys2/MINGW-packages/refs/heads/master/mingw-w64-qemu/PKGBUILD
+ # Some packages may be missing for your environment; the remaining ones
+ # will still be installed.
+ makepkg --syncdeps --nobuild PKGBUILD || true
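+
+Once the dependencies are installed, you can check that the environment is
+able to build QEMU. The commands below are only a sketch: the build directory
+and the target list are arbitrary examples, adjust them to your needs.
+
+::
+
+ mkdir build && cd build
+ ../configure --target-list=x86_64-softmmu
+ make -j$(nproc)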
+
+Build on windows-aarch64
+++++++++++++++++++++++++
+
+When trying to build for x86_64 using the UCRT64 or MINGW64 environment,
+configure will run into an error because the detected cpu is not correct:
+Meson sees an x86_64 process running under emulation. You therefore need to
+set the cpu manually and force a cross compilation (with an empty cross
+prefix).
+
+::
+
+ ./configure --cpu=x86_64 --cross-prefix=
+
diff --git a/docs/devel/build-system.rst b/docs/devel/build-system.rst
index 79eceb1..2c88419 100644
--- a/docs/devel/build-system.rst
+++ b/docs/devel/build-system.rst
@@ -134,7 +134,7 @@ in how the build process runs Python code.
At this stage, ``configure`` also queries the chosen Python interpreter
about QEMU's build dependencies. Note that the build process does *not*
-look for ``meson``, ``sphinx-build`` or ``avocado`` binaries in the PATH;
+look for ``meson`` or ``sphinx-build`` binaries in the PATH;
likewise, there are no options such as ``--meson`` or ``--sphinx-build``.
This avoids a potential mismatch, where Meson and Sphinx binaries on the
PATH might operate in a different Python environment than the one chosen
@@ -145,13 +145,13 @@ was installed in the ``site-packages`` directory of another interpreter,
or with the wrong ``pip`` program.
If a package is available for the chosen interpreter, ``configure``
-prepares a small script that invokes it from the venv itself[#distlib]_.
+prepares a small script that invokes it from the venv itself\ [#distlib]_.
If not, ``configure`` can also optionally install dependencies in the
virtual environment with ``pip``, either from wheels in ``python/wheels``
or by downloading the package with PyPI. Downloading can be disabled with
``--disable-download``; and anyway, it only happens when a ``configure``
option (currently, only ``--enable-docs``) is explicitly enabled but
-the dependencies are not present[#pip]_.
+the dependencies are not present.
.. [#distlib] The scripts are created based on the package's metadata,
specifically the ``console_script`` entry points. This is the
@@ -164,15 +164,11 @@ the dependencies are not present[#pip]_.
because the Python Packaging Authority provides a package
``distlib.scripts`` to perform this task.
-.. [#pip] ``pip`` might also be used when running ``make check-avocado``
- if downloading is enabled, to ensure that Avocado is
- available.
-
The required versions of the packages are stored in a configuration file
``pythondeps.toml``. The format is custom to QEMU, but it is documented
at the top of the file itself and it should be easy to understand. The
requirements should make it possible to use the version that is packaged
-that is provided by supported distros.
+by QEMU's supported distros.
When dependencies are downloaded, instead, ``configure`` uses a "known
good" version that is also listed in ``pythondeps.toml``. In this
@@ -260,7 +256,7 @@ Target-dependent emulator sourcesets:
Each emulator also includes sources for files in the ``hw/`` and ``target/``
subdirectories. The subdirectory used for each emulator comes
from the target's definition of ``TARGET_BASE_ARCH`` or (if missing)
- ``TARGET_ARCH``, as found in ``default-configs/targets/*.mak``.
+ ``TARGET_ARCH``, as found in ``configs/targets/*.mak``.
Each subdirectory in ``hw/`` adds one sourceset to the ``hw_arch`` dictionary,
for example::
@@ -317,8 +313,8 @@ Utility sourcesets:
The following files concur in the definition of which files are linked
into each emulator:
-``default-configs/devices/*.mak``
- The files under ``default-configs/devices/`` control the boards and devices
+``configs/devices/*.mak``
+ The files under ``configs/devices/`` control the boards and devices
that are built into each QEMU system emulation targets. They merely contain
a list of config variable definitions such as::
@@ -327,13 +323,13 @@ into each emulator:
CONFIG_XLNX_VERSAL=y
``*/Kconfig``
- These files are processed together with ``default-configs/devices/*.mak`` and
+ These files are processed together with ``configs/devices/*.mak`` and
describe the dependencies between various features, subsystems and
device models. They are described in :ref:`kconfig`
-``default-configs/targets/*.mak``
+``configs/targets/*.mak``
These files mostly define symbols that appear in the ``*-config-target.h``
- file for each emulator [#cfgtarget]_. However, the ``TARGET_ARCH``
+ file for each emulator\ [#cfgtarget]_. However, the ``TARGET_ARCH``
and ``TARGET_BASE_ARCH`` will also be used to select the ``hw/`` and
``target/`` subdirectories that are compiled into each target.
@@ -497,8 +493,7 @@ number of dynamically created files listed later.
``pyvenv/bin``, and calling ``pip`` to install dependencies.
``tests/Makefile.include``
- Rules for external test harnesses. These include the TCG tests
- and the Avocado-based integration tests.
+ Rules for external test harnesses like the TCG tests.
``tests/docker/Makefile.include``
Rules for Docker tests. Like ``tests/Makefile.include``, this file is
diff --git a/docs/devel/ci-definitions.rst.inc b/docs/devel/ci-definitions.rst.inc
deleted file mode 100644
index 6d5c6fd..0000000
--- a/docs/devel/ci-definitions.rst.inc
+++ /dev/null
@@ -1,121 +0,0 @@
-Definition of terms
-===================
-
-This section defines the terms used in this document and correlates them with
-what is currently used on QEMU.
-
-Automated tests
----------------
-
-An automated test is written on a test framework using its generic test
-functions/classes. The test framework can run the tests and report their
-success or failure [1]_.
-
-An automated test has essentially three parts:
-
-1. The test initialization of the parameters, where the expected parameters,
- like inputs and expected results, are set up;
-2. The call to the code that should be tested;
-3. An assertion, comparing the result from the previous call with the expected
- result set during the initialization of the parameters. If the result
- matches the expected result, the test has been successful; otherwise, it has
- failed.
-
-Unit testing
-------------
-
-A unit test is responsible for exercising individual software components as a
-unit, like interfaces, data structures, and functionality, uncovering errors
-within the boundaries of a component. The verification effort is in the
-smallest software unit and focuses on the internal processing logic and data
-structures. A test case of unit tests should be designed to uncover errors due
-to erroneous computations, incorrect comparisons, or improper control flow [2]_.
-
-On QEMU, unit testing is represented by the 'check-unit' target from 'make'.
-
-Functional testing
-------------------
-
-A functional test focuses on the functional requirement of the software.
-Deriving sets of input conditions, the functional tests should fully exercise
-all the functional requirements for a program. Functional testing is
-complementary to other testing techniques, attempting to find errors like
-incorrect or missing functions, interface errors, behavior errors, and
-initialization and termination errors [3]_.
-
-On QEMU, functional testing is represented by the 'check-qtest' target from
-'make'.
-
-System testing
---------------
-
-System tests ensure all application elements mesh properly while the overall
-functionality and performance are achieved [4]_. Some or all system components
-are integrated to create a complete system to be tested as a whole. System
-testing ensures that components are compatible, interact correctly, and
-transfer the right data at the right time across their interfaces. As system
-testing focuses on interactions, use case-based testing is a practical approach
-to system testing [5]_. Note that, in some cases, system testing may require
-interaction with third-party software, like operating system images, databases,
-networks, and so on.
-
-On QEMU, system testing is represented by the 'check-avocado' target from
-'make'.
-
-Flaky tests
------------
-
-A flaky test is defined as a test that exhibits both a passing and a failing
-result with the same code on different runs. Some usual reasons for an
-intermittent/flaky test are async wait, concurrency, and test order dependency
-[6]_.
-
-Gating
-------
-
-A gate restricts the move of code from one stage to another on a
-test/deployment pipeline. The step move is granted with approval. The approval
-can be a manual intervention or a set of tests succeeding [7]_.
-
-On QEMU, the gating process happens during the pull request. The approval is
-done by the project leader running its own set of tests. The pull request gets
-merged when the tests succeed.
-
-Continuous Integration (CI)
----------------------------
-
-Continuous integration (CI) requires the builds of the entire application and
-the execution of a comprehensive set of automated tests every time there is a
-need to commit any set of changes [8]_. The automated tests can be composed of
-the unit, functional, system, and other tests.
-
-Keynotes about continuous integration (CI) [9]_:
-
-1. System tests may depend on external software (operating system images,
- firmware, database, network).
-2. It may take a long time to build and test. It may be impractical to build
- the system being developed several times per day.
-3. If the development platform is different from the target platform, it may
- not be possible to run system tests in the developer’s private workspace.
- There may be differences in hardware, operating system, or installed
- software. Therefore, more time is required for testing the system.
-
-References
-----------
-
-.. [1] Sommerville, Ian (2016). Software Engineering. p. 233.
-.. [2] Pressman, Roger S. & Maxim, Bruce R. (2020). Software Engineering,
- A Practitioner’s Approach. p. 48, 376, 378, 381.
-.. [3] Pressman, Roger S. & Maxim, Bruce R. (2020). Software Engineering,
- A Practitioner’s Approach. p. 388.
-.. [4] Pressman, Roger S. & Maxim, Bruce R. (2020). Software Engineering,
- A Practitioner’s Approach. Software Engineering, p. 377.
-.. [5] Sommerville, Ian (2016). Software Engineering. p. 59, 232, 240.
-.. [6] Luo, Qingzhou, et al. An empirical analysis of flaky tests.
- Proceedings of the 22nd ACM SIGSOFT International Symposium on
- Foundations of Software Engineering. 2014.
-.. [7] Humble, Jez & Farley, David (2010). Continuous Delivery:
- Reliable Software Releases Through Build, Test, and Deployment, p. 122.
-.. [8] Humble, Jez & Farley, David (2010). Continuous Delivery:
- Reliable Software Releases Through Build, Test, and Deployment, p. 55.
-.. [9] Sommerville, Ian (2016). Software Engineering. p. 743.
diff --git a/docs/devel/ci.rst b/docs/devel/ci.rst
deleted file mode 100644
index ed88a20..0000000
--- a/docs/devel/ci.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-.. _ci:
-
-==
-CI
-==
-
-Most of QEMU's CI is run on GitLab's infrastructure although a number
-of other CI services are used for specialised purposes. The most up to
-date information about them and their status can be found on the
-`project wiki testing page <https://wiki.qemu.org/Testing/CI>`_.
-
-.. include:: ci-definitions.rst.inc
-.. include:: ci-jobs.rst.inc
-.. include:: ci-runners.rst.inc
diff --git a/docs/devel/clocks.rst b/docs/devel/clocks.rst
index 177ee1c..3f744f2 100644
--- a/docs/devel/clocks.rst
+++ b/docs/devel/clocks.rst
@@ -358,6 +358,12 @@ humans (for instance in debugging), use ``clock_display_freq()``,
which returns a prettified string-representation, e.g. "33.3 MHz".
The caller must free the string with g_free() after use.
+It's also possible to retrieve the clock period from a QTest by
+accessing the QOM property ``qtest-clock-period`` using a QMP command.
+This property is only present when the device is being run under
+the ``qtest`` accelerator; it is not available when QEMU is
+being run normally.
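+
+For example, a test could issue a ``qom-get`` command; the QOM path below is
+purely illustrative and depends on how the device and its clock are named in
+the machine::
+
+ { "execute": "qom-get",
+   "arguments": { "path": "/machine/peripheral/mydev/clk-in",
+                  "property": "qtest-clock-period" } }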
+
Calculating expiry deadlines
----------------------------
diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
new file mode 100644
index 0000000..b5aae2e
--- /dev/null
+++ b/docs/devel/code-provenance.rst
@@ -0,0 +1,338 @@
+.. _code-provenance:
+
+Code provenance
+===============
+
+Certifying patch submissions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The QEMU community **mandates** that all contributors certify the provenance of
+patch submissions they make to the project. To put it another way,
+contributors must indicate that they are legally permitted to contribute to
+the project.
+
+Certification is achieved with a low overhead by adding a single line to the
+bottom of every git commit::
+
+ Signed-off-by: YOUR NAME <YOUR@EMAIL>
+
+The addition of this line asserts that the author of the patch is contributing
+in accordance with the clauses specified in the
+`Developer's Certificate of Origin <https://developercertificate.org>`__:
+
+.. _dco:
+
+ Developer's Certificate of Origin 1.1
+
+ By making a contribution to this project, I certify that:
+
+ (a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+ (b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+ (c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+ (d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+
+The name used with "Signed-off-by" does not need to be your legal name, nor
+birth name, nor appear on any government ID. It is the identity you choose to
+be known by in the community, but should not be anonymous, nor misrepresent
+who you are.
+
+It is generally expected that the name and email address used in one of the
+``Signed-off-by`` lines match those of the git commit ``Author`` field.
+It's okay if you subscribe or contribute to the list via more than one
+address, but using multiple addresses in one commit just confuses
+things.
+
+If the person sending the mail is not one of the patch authors, they are
+nonetheless expected to add their own ``Signed-off-by`` to comply with the
+DCO clause (c).
+
+Multiple authorship
+~~~~~~~~~~~~~~~~~~~
+
+It is not uncommon for a patch to have contributions from multiple authors. In
+this scenario, git commits will usually be expected to have a ``Signed-off-by``
+line for each contributor involved in the creation of the patch. Some edge cases:
+
+ * The non-primary author's contributions were so trivial that they can be
+ considered not subject to copyright. In this case the secondary authors
+ need not include a ``Signed-off-by``.
+
+ This case most commonly applies where QEMU reviewers give short snippets
+ of code as suggested fixes to a patch. The reviewers don't need to have
+ their own ``Signed-off-by`` added unless their code suggestion was
+ unusually large, but it is common to add ``Suggested-by`` as a credit
+ for non-trivial code.
+
+ * Both contributors work for the same employer and the employer requires
+ copyright assignment.
+
+ It can be said that in this case a ``Signed-off-by`` is indicating that
+ the person has permission to contribute from their employer who is the
+ copyright holder. It is nonetheless still preferable to include a
+ ``Signed-off-by`` for each contributor, as in some countries employees are
+ not able to assign copyright to their employer, and it also covers any
+ time invested outside working hours.
+
+When multiple ``Signed-off-by`` tags are present, they should be strictly kept
+in order of authorship, from oldest to newest.
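+
+For example (with purely illustrative names), a patch written by one author
+and later extended by a second one would end with::
+
+ Signed-off-by: First Author <first.author@example.com>
+ Signed-off-by: Second Author <second.author@example.com>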
+
+Other commit tags
+~~~~~~~~~~~~~~~~~
+
+While the ``Signed-off-by`` tag is mandatory, there are a number of other tags
+that are commonly used during QEMU development:
+
+ * ``Reviewed-by``: when a QEMU community member reviews a patch on the
+ mailing list, if they consider the patch acceptable, they should send an
+ email reply containing a ``Reviewed-by`` tag. Subsystem maintainers who
+ review a patch should add this even if they are also adding their
+ ``Signed-off-by`` to the same commit.
+
+ * ``Acked-by``: when a QEMU subsystem maintainer approves a patch that
+ touches their subsystem, but intends to allow a different maintainer to
+ queue it and send a pull request, they would send a mail containing a
+ queue it and send a pull request, they would send a mail containing an
+ only implies review of the maintainers' own areas of responsibility. If a
+ maintainer wants to indicate they have done a full review they should use
+ a ``Reviewed-by`` tag.
+
+ * ``Tested-by``: when a QEMU community member has functionally tested the
+ behaviour of the patch in some manner, they should send an email reply
+ containing a ``Tested-by`` tag.
+
+ * ``Reported-by``: when a QEMU community member reports a problem via the
+ mailing list, or some other informal channel that is not the issue tracker,
+ it is good practice to credit them by including a ``Reported-by`` tag on
+ any patch fixing the issue. When the problem is reported via the GitLab
+ issue tracker, however, it is sufficient to just include a link to the
+ issue.
+
+ * ``Suggested-by``: when a reviewer or other 3rd party makes non-trivial
+ suggestions for how to change a patch, it is good practice to credit them
+ by including a ``Suggested-by`` tag.
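+
+Taken together, the trailer of a patch that has been reviewed and tested by
+other community members might look like this (the names are illustrative)::
+
+ Signed-off-by: Cory Contributor <cory.contributor@example.com>
+ Reviewed-by: Rene Reviewer <rene.reviewer@example.com>
+ Tested-by: Tessa Tester <tessa.tester@example.com>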
+
+Subsystem maintainer requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a subsystem maintainer accepts a patch from a contributor, in addition to
+the normal code review points, they are expected to validate the presence of
+suitable ``Signed-off-by`` tags.
+
+At the time they queue the patch in their subsystem tree, the maintainer
+**must** also then add their own ``Signed-off-by`` to indicate that they have
+done the aforementioned validation. This is in addition to any of their own
+``Reviewed-by`` tags the subsystem maintainer may wish to include.
+
+When the maintainer modifies the patch after pulling it into their tree, they
+should record their contribution. This is typically done via a note in the
+commit message, just prior to the maintainer's ``Signed-off-by``::
+
+ Signed-off-by: Cory Contributor <cory.contributor@example.com>
+ [Comment rephrased for clarity]
+ Signed-off-by: Mary Maintainer <mary.maintainer@mycorp.test>
+
+
+Tools for adding ``Signed-off-by``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are a variety of ways tools can support adding ``Signed-off-by`` tags
+for patches, avoiding the need for contributors to manually type in this
+repetitive text each time.
+
+git commands
+^^^^^^^^^^^^
+
+When creating, or amending, a commit the ``-s`` flag to ``git commit`` will
+append a suitable line matching the configured git author details.
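+
+If your author details are not configured yet, this can be done once with::
+
+ git config --global user.name "YOUR NAME"
+ git config --global user.email your@email.addr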
+
+If preparing patches using the ``git format-patch`` tool, the ``-s`` flag can
+be used to append a suitable line in the emails it creates, without modifying
+the local commits. Alternatively to modify all the local commits on a branch::
+
+ git rebase master -x 'git commit --amend --no-edit -s'
+
+emacs
+^^^^^
+
+In the file ``$HOME/.emacs.d/abbrev_defs`` add:
+
+.. code:: elisp
+
+ (define-abbrev-table 'global-abbrev-table
+ '(
+ ("8rev" "Reviewed-by: YOUR NAME <your@email.addr>" nil 1)
+ ("8ack" "Acked-by: YOUR NAME <your@email.addr>" nil 1)
+ ("8test" "Tested-by: YOUR NAME <your@email.addr>" nil 1)
+ ("8sob" "Signed-off-by: YOUR NAME <your@email.addr>" nil 1)
+ ))
+
+with this change, if you type (for example) ``8rev`` followed by ``<space>``
+or ``<enter>`` it will expand to the whole phrase.
+
+vim
+^^^
+
+In the file ``$HOME/.vimrc`` add::
+
+ iabbrev 8rev Reviewed-by: YOUR NAME <your@email.addr>
+ iabbrev 8ack Acked-by: YOUR NAME <your@email.addr>
+ iabbrev 8test Tested-by: YOUR NAME <your@email.addr>
+ iabbrev 8sob Signed-off-by: YOUR NAME <your@email.addr>
+
+with this change, if you type (for example) ``8rev`` followed by ``<space>``
+or ``<enter>`` it will expand to the whole phrase.
+
+Re-starting abandoned work
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For a variety of reasons there are some patches that get submitted to QEMU but
+never merged. An unrelated contributor may decide (months or years later) to
+continue working from the abandoned patch and re-submit it with extra changes.
+
+The general principles when picking up abandoned work are:
+
+ * Continue to credit the original author for their work, by maintaining their
+ original ``Signed-off-by``
+ * Indicate where the original patch was obtained from (mailing list, bug
+ tracker, author's git repo, etc) when sending it for review
+ * Acknowledge the extra work of the new contributor by including their
+ ``Signed-off-by`` in the patch in addition to the original author's
+ * Indicate who is responsible for what parts of the patch. This is typically
+ done via a note in the commit message, just prior to the new contributor's
+ ``Signed-off-by``::
+
+ Signed-off-by: Some Person <some.person@example.com>
+ [Rebased and added support for 'foo']
+ Signed-off-by: New Person <new.person@mycorp.test>
+
+In complicated cases, or if otherwise unsure, ask for advice on the project
+mailing list.
+
+It is also recommended to attempt to contact the original author to let them
+know you are interested in taking over their work, in case they still intended
+to return to the work, or had any suggestions about the best way to continue.
+
+Inclusion of generated files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Files in patches contributed to QEMU are generally expected to be provided
+only in the preferred format for making modifications. The implication of
+this is that the output of code generators or compilers is usually not
+appropriate to contribute to QEMU.
+
+For reasons of practicality there are some exceptions to this rule, where
+generated code is permitted, provided it is also accompanied by the
+corresponding preferred source format. This is done where it is impractical
+to expect those building QEMU to run the code generation or compilation
+process. A non-exhaustive list of examples is:
+
+ * Images: where a bitmap image is created from a vector file, it is common
+ to include the rendered bitmaps at desired resolution(s), since subtle
+ changes in the rasterization process / tools may affect quality. The
+ original vector file is expected to accompany any generated bitmaps.
+
+ * Firmware: QEMU includes pre-compiled binary ROMs for a variety of guest
+ firmwares. When such binary ROMs are contributed, the corresponding source
+ must also be provided, either directly, or through a git submodule link.
+
+ * Dockerfiles: the majority of the dockerfiles are automatically generated
+ from a canonical list of build dependencies maintained in tree, together
+ with the libvirt-ci git submodule link. The generated dockerfiles are
+ included in tree because it is desirable to be able to directly build
+ container images from a clean git checkout.
+
+ * eBPF: QEMU includes some generated eBPF machine code, since the required
+ eBPF compilation tools are not broadly available on all targeted OS
+ distributions. The corresponding eBPF C code for the binary is also
+ provided. This is a time-limited exception until the eBPF toolchain is
+ sufficiently broadly available in distros.
+
+In all cases above, the existence of generated files must be acknowledged
+and justified in the commit that introduces them.
+
+Tools which perform changes to existing code with deterministic algorithmic
+manipulation, driven by user specified inputs, are not generally considered
+to be "generators".
+
+For instance, using Coccinelle to convert code from one pattern to another
+pattern, or fixing documentation typos with a spell checker, or transforming
+code using sed / awk / etc, are not considered to be acts of code
+generation. Where an automated manipulation is performed on code, however,
+this should be declared in the commit message.
+
+At times contributors may use or create scripts/tools to generate an initial
+boilerplate code template which is then filled in to produce the final patch.
+The output of such a tool would still be considered the "preferred format",
+since it is intended to be a foundation for further human authored changes.
+Such tools are acceptable to use, provided there is clearly defined copyright
+and licensing for their output. Note in particular the caveats applying to AI
+content generators below.
+
+Use of AI content generators
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+TL;DR:
+
+ **Current QEMU project policy is to DECLINE any contributions which are
+ believed to include or derive from AI generated content. This includes
+ ChatGPT, Claude, Copilot, Llama and similar tools.**
+
+The increasing prevalence of AI-assisted software development results in a
+number of difficult legal questions and risks for software projects, including
+QEMU. Of particular concern is content generated by `Large Language Models
+<https://en.wikipedia.org/wiki/Large_language_model>`__ (LLMs).
+
+The QEMU community requires that contributors certify their patch submissions
+are made in accordance with the rules of the `Developer's Certificate of
+Origin (DCO) <dco>`.
+
+To satisfy the DCO, the patch contributor has to fully understand the
+copyright and license status of content they are contributing to QEMU. With AI
+content generators, the copyright and license status of the output is
+ill-defined with no generally accepted, settled legal foundation.
+
+Where the training material is known, it is common for it to include large
+volumes of material under restrictive licensing/copyright terms. Even where
+the training material is all known to be under open source licenses, it is
+likely to be under a variety of terms, not all of which will be compatible
+with QEMU's licensing requirements.
+
+How contributors could comply with DCO terms (b) or (c) for the output of AI
+content generators commonly available today is unclear. The QEMU project is
+not willing or able to accept the legal risks of non-compliance.
+
+The QEMU project thus requires that contributors refrain from using AI content
+generators on patches intended to be submitted to the project, and will
+decline any contribution if use of AI is either known or suspected.
+
+This policy does not apply to other uses of AI, such as researching APIs or
+algorithms, static analysis, or debugging, provided their output is not to be
+included in contributions.
+
+Examples of tools impacted by this policy include GitHub's Copilot, OpenAI's
+ChatGPT, Anthropic's Claude, and Meta's Code Llama, as well as code/content
+generation agents which are built on top of such tools.
+
+This policy may evolve as AI tools mature and the legal situation is
+clarified. In the meantime, requests for exceptions to this policy will be
+evaluated by the QEMU project on a case by case basis. To be granted an
+exception, a contributor will need to demonstrate clarity of the license and
+copyright status for the tool's output in relation to its training model and
+code, to the satisfaction of the project maintainers.
diff --git a/docs/devel/codebase.rst b/docs/devel/codebase.rst
new file mode 100644
index 0000000..2a31437
--- /dev/null
+++ b/docs/devel/codebase.rst
@@ -0,0 +1,215 @@
+========
+Codebase
+========
+
+This section presents the various parts of QEMU and how the codebase is
+organized.
+
+Beyond giving succinct descriptions, the goal is to offer links to various
+parts of the documentation/codebase.
+
+Subsystems
+----------
+
+An exhaustive list of subsystems and associated files can be found in the
+`MAINTAINERS <https://gitlab.com/qemu-project/qemu/-/blob/master/MAINTAINERS>`_
+file.
+
+Some of the main QEMU subsystems are:
+
+- `Accelerators<Accelerators>`
+- Block devices and `disk images<disk images>` support
+- `CI<ci>` and `Tests<testing>`
+- `Devices<device-emulation>` & Board models
+- `Documentation <documentation-root>`
+- `GDB support<GDB usage>`
+- :ref:`Migration<migration>`
+- `Monitor<QEMU monitor>`
+- :ref:`QOM (QEMU Object Model)<qom>`
+- `System mode<System emulation>`
+- :ref:`TCG (Tiny Code Generator)<tcg>`
+- `User mode<user-mode>` (`Linux<linux-user-mode>` & `BSD<bsd-user-mode>`)
+- User Interfaces
+
+More documentation on QEMU subsystems can be found on :ref:`internal-subsystem`
+page.
+
+The Grand tour
+--------------
+
+Here we briefly describe what every folder in the top directory of the codebase
+contains. Hop on!
+
+The folder name links here will take you to that folder in our GitLab
+repository. Other links will take you to more detailed documentation for that
+subsystem, where we have it. Unfortunately, not every subsystem has
+documentation yet, so sometimes the source code is all you have.
+
+* `accel <https://gitlab.com/qemu-project/qemu/-/tree/master/accel>`_:
+ Infrastructure and architecture agnostic code related to the various
+ `accelerators <Accelerators>` supported by QEMU
+ (TCG, KVM, hvf, whpx, xen, nvmm).
+ Contains interfaces for operations that will be implemented per
+ `target <https://gitlab.com/qemu-project/qemu/-/tree/master/target>`_.
+* `audio <https://gitlab.com/qemu-project/qemu/-/tree/master/audio>`_:
+ Audio (host) support.
+* `authz <https://gitlab.com/qemu-project/qemu/-/tree/master/authz>`_:
+ `QEMU Authorization framework<client authorization>`.
+* `backends <https://gitlab.com/qemu-project/qemu/-/tree/master/backends>`_:
+ Various backends that are used to access resources on the host (e.g. for
+ random number generation, memory backing or cryptographic functions).
+* `block <https://gitlab.com/qemu-project/qemu/-/tree/master/block>`_:
+ Block devices and `image formats<disk images>` implementation.
+* `bsd-user <https://gitlab.com/qemu-project/qemu/-/tree/master/bsd-user>`_:
+ `BSD User mode<bsd-user-mode>`.
+* build: Where the built code goes by default. You can tell the QEMU build
+ system to put the built code anywhere else you like.
+* `chardev <https://gitlab.com/qemu-project/qemu/-/tree/master/chardev>`_:
+ Various backends used by char devices.
+* `common-user <https://gitlab.com/qemu-project/qemu/-/tree/master/common-user>`_:
+ User-mode assembly code for dealing with signals occurring during syscalls.
+* `configs <https://gitlab.com/qemu-project/qemu/-/tree/master/configs>`_:
+ Makefiles defining configurations to build QEMU.
+* `contrib <https://gitlab.com/qemu-project/qemu/-/tree/master/contrib>`_:
+ Community contributed devices/plugins/tools.
+* `crypto <https://gitlab.com/qemu-project/qemu/-/tree/master/crypto>`_:
+ Cryptographic algorithms used in QEMU.
+* `disas <https://gitlab.com/qemu-project/qemu/-/tree/master/disas>`_:
+ Disassembly functions used by QEMU target code.
+* `docs <https://gitlab.com/qemu-project/qemu/-/tree/master/docs>`_:
+ QEMU Documentation.
+* `dump <https://gitlab.com/qemu-project/qemu/-/tree/master/dump>`_:
+ Code to dump memory of a running VM.
+* `ebpf <https://gitlab.com/qemu-project/qemu/-/tree/master/ebpf>`_:
+ eBPF program support in QEMU. `virtio-net RSS<ebpf-rss>` uses it.
+* `fpu <https://gitlab.com/qemu-project/qemu/-/tree/master/fpu>`_:
+ Floating-point software emulation.
+* `fsdev <https://gitlab.com/qemu-project/qemu/-/tree/master/fsdev>`_:
+ `VirtFS <https://www.linux-kvm.org/page/VirtFS>`_ support.
+* `gdbstub <https://gitlab.com/qemu-project/qemu/-/tree/master/gdbstub>`_:
+ `GDB <GDB usage>` support.
+* `gdb-xml <https://gitlab.com/qemu-project/qemu/-/tree/master/gdb-xml>`_:
+ Set of XML files describing architectures and used by `gdbstub <GDB usage>`.
+* `host <https://gitlab.com/qemu-project/qemu/-/tree/master/host>`_:
+ Various architecture specific header files (crypto, atomic, memory
+ operations).
+* `linux-headers <https://gitlab.com/qemu-project/qemu/-/tree/master/linux-headers>`_:
+ A subset of headers imported from the Linux kernel and used for implementing
+ KVM support and user-mode.
+* `linux-user <https://gitlab.com/qemu-project/qemu/-/tree/master/linux-user>`_:
+ `User mode <user-mode>` implementation. Contains one folder per target
+ architecture.
+* `.gitlab-ci.d <https://gitlab.com/qemu-project/qemu/-/tree/master/.gitlab-ci.d>`_:
+ `CI <ci>` yaml and scripts.
+* `include <https://gitlab.com/qemu-project/qemu/-/tree/master/include>`_:
+ All headers associated with the different subsystems in QEMU. The hierarchy
+ used mirrors the source code organization and naming.
+* `hw <https://gitlab.com/qemu-project/qemu/-/tree/master/hw>`_:
+ `Devices <device-emulation>` and board emulation. Devices are categorized by
+ type/protocol/architecture and located in the associated subfolder.
+* `io <https://gitlab.com/qemu-project/qemu/-/tree/master/io>`_:
+ QEMU `I/O channels <https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg04208.html>`_.
+* `libdecnumber <https://gitlab.com/qemu-project/qemu/-/tree/master/libdecnumber>`_:
+ Import of gcc library, used to implement decimal number arithmetic.
+* `migration <https://gitlab.com/qemu-project/qemu/-/tree/master/migration>`__:
+ :ref:`Migration framework <migration>`.
+* `monitor <https://gitlab.com/qemu-project/qemu/-/tree/master/monitor>`_:
+ `Monitor <QEMU monitor>` implementation (HMP & QMP).
+* `nbd <https://gitlab.com/qemu-project/qemu/-/tree/master/nbd>`_:
+ QEMU NBD (Network Block Device) server.
+* `net <https://gitlab.com/qemu-project/qemu/-/tree/master/net>`_:
+ Network (host) support.
+* `pc-bios <https://gitlab.com/qemu-project/qemu/-/tree/master/pc-bios>`_:
+ Contains pre-built firmware binaries and boot images, ready to use in
+ QEMU without compilation.
+* `plugins <https://gitlab.com/qemu-project/qemu/-/tree/master/plugins>`_:
+ :ref:`TCG plugins <tcg-plugins>` core implementation. Plugins can be found in
+ `tests <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/tcg/plugins>`__
+ and `contrib <https://gitlab.com/qemu-project/qemu/-/tree/master/contrib/plugins>`__
+ folders.
+* `po <https://gitlab.com/qemu-project/qemu/-/tree/master/po>`_:
+ Translation files.
+* `python <https://gitlab.com/qemu-project/qemu/-/tree/master/python>`_:
+ Python part of our build/test system.
+* `qapi <https://gitlab.com/qemu-project/qemu/-/tree/master/qapi>`_:
+ `QAPI <qapi>` implementation.
+* `qobject <https://gitlab.com/qemu-project/qemu/-/tree/master/qobject>`_:
+ QEMU Object implementation.
+* `qga <https://gitlab.com/qemu-project/qemu/-/tree/master/qga>`_:
+ QEMU `Guest agent <qemu-ga>` implementation.
+* `qom <https://gitlab.com/qemu-project/qemu/-/tree/master/qom>`_:
+ QEMU :ref:`Object model <qom>` implementation, with monitor associated commands.
+* `replay <https://gitlab.com/qemu-project/qemu/-/tree/master/replay>`_:
+ QEMU :ref:`Record/replay <replay>` implementation.
+* `roms <https://gitlab.com/qemu-project/qemu/-/tree/master/roms>`_:
+ Contains source code for various firmware and ROMs, which can be compiled if
+ custom or updated versions are needed.
+* `rust <https://gitlab.com/qemu-project/qemu/-/tree/master/rust>`_:
+ Rust integration in QEMU. It contains the newly defined Rust interfaces and
+ the devices that use them.
+* `scripts <https://gitlab.com/qemu-project/qemu/-/tree/master/scripts>`_:
+ Collection of scripts used in build and test systems, and various
+ tools for QEMU codebase and execution traces.
+* `scsi <https://gitlab.com/qemu-project/qemu/-/tree/master/scsi>`_:
+ Code related to SCSI support, used by SCSI devices.
+* `semihosting <https://gitlab.com/qemu-project/qemu/-/tree/master/semihosting>`_:
+ QEMU `Semihosting <Semihosting>` implementation.
+* `stats <https://gitlab.com/qemu-project/qemu/-/tree/master/stats>`_:
+ `Monitor <QEMU monitor>` stats commands implementation.
+* `storage-daemon <https://gitlab.com/qemu-project/qemu/-/tree/master/storage-daemon>`_:
+ QEMU `Storage daemon <storage-daemon>` implementation.
+* `stubs <https://gitlab.com/qemu-project/qemu/-/tree/master/stubs>`_:
+ Various stubs (empty functions) used to compile QEMU with specific
+ configurations.
+* `subprojects <https://gitlab.com/qemu-project/qemu/-/tree/master/subprojects>`_:
+ QEMU submodules used by the QEMU build system.
+* `system <https://gitlab.com/qemu-project/qemu/-/tree/master/system>`_:
+ QEMU `system mode <System emulation>` implementation (cpu, mmu, boot support).
+* `target <https://gitlab.com/qemu-project/qemu/-/tree/master/target>`_:
+ Contains code for all target architectures supported (one subfolder
+ per arch). For every architecture, you can find accelerator specific
+ implementations.
+* `tcg <https://gitlab.com/qemu-project/qemu/-/tree/master/tcg>`_:
+ :ref:`TCG <tcg>` related code.
+ Contains one subfolder per host supported architecture.
+* `tests <https://gitlab.com/qemu-project/qemu/-/tree/master/tests>`_:
+ QEMU `test <testing>` suite
+
+ - `data <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/data>`_:
+ Data for various tests.
+ - `decode <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/decode>`_:
+ Testsuite for :ref:`decodetree <decodetree>` implementation.
+ - `docker <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/docker>`_:
+ Code and scripts to create `containers <container-ref>` used in `CI <ci>`.
+ - `fp <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/fp>`_:
+ QEMU testsuite for soft float implementation.
+ - `functional <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/functional>`_:
+ `Functional tests <checkfunctional-ref>` (full VM boot).
+ - `lcitool <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/lcitool>`_:
+ Generate dockerfiles for CI containers.
+ - `migration <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/migration>`_:
+ Test scripts and data for :ref:`Migration framework <migration>`.
+ - `multiboot <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/multiboot>`_:
+ Test multiboot functionality for x86_64/i386.
+ - `qapi-schema <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/qapi-schema>`_:
+ Test scripts and data for `QAPI <qapi-tests>`.
+ - `qemu-iotests <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/qemu-iotests>`_:
+ `Disk image and block tests <qemu-iotests>`.
+ - `qtest <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/qtest>`_:
+ `Device emulation testing <qtest>`.
+ - `tcg <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/tcg>`__:
+ `TCG related tests <checktcg-ref>`. Contains code per architecture
+ (subfolder) and multiarch tests as well.
+ - `tsan <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/tsan>`_:
+ `Suppressions <tsan-suppressions>` for thread sanitizer.
+ - `uefi-test-tools <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/uefi-test-tools>`_:
+ Test tool for UEFI support.
+ - `unit <https://gitlab.com/qemu-project/qemu/-/tree/master/tests/unit>`_:
+ QEMU `Unit tests <unit-tests>`.
+* `trace <https://gitlab.com/qemu-project/qemu/-/tree/master/trace>`_:
+ :ref:`Tracing framework <tracing>`. Used to print information associated with various
+ events during execution.
+* `ui <https://gitlab.com/qemu-project/qemu/-/tree/master/ui>`_:
+ QEMU User interfaces.
+* `util <https://gitlab.com/qemu-project/qemu/-/tree/master/util>`_:
+ Utility code used by other parts of QEMU.
diff --git a/docs/devel/control-flow-integrity.rst b/docs/devel/control-flow-integrity.rst
index e6b73a4..3d5702f 100644
--- a/docs/devel/control-flow-integrity.rst
+++ b/docs/devel/control-flow-integrity.rst
@@ -1,3 +1,5 @@
+.. _cfi:
+
============================
Control-Flow Integrity (CFI)
============================
diff --git a/docs/devel/decodetree.rst b/docs/devel/decodetree.rst
index e3392aa..98ad33a 100644
--- a/docs/devel/decodetree.rst
+++ b/docs/devel/decodetree.rst
@@ -1,3 +1,5 @@
+.. _decodetree:
+
========================
Decodetree Specification
========================
diff --git a/docs/devel/ebpf_rss.rst b/docs/devel/ebpf_rss.rst
index 4a68682..ed5d337 100644
--- a/docs/devel/ebpf_rss.rst
+++ b/docs/devel/ebpf_rss.rst
@@ -1,3 +1,5 @@
+.. _ebpf-rss:
+
===========================
eBPF RSS virtio-net support
===========================
diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst
index fe01b2b..1c487c1 100644
--- a/docs/devel/index-api.rst
+++ b/docs/devel/index-api.rst
@@ -9,6 +9,7 @@ generated from in-code annotations to function prototypes.
bitops
loads-stores
+ lockcnt
memory
modules
pci
diff --git a/docs/devel/index-build.rst b/docs/devel/index-build.rst
index 90b406c..3f3cb21 100644
--- a/docs/devel/index-build.rst
+++ b/docs/devel/index-build.rst
@@ -1,20 +1,16 @@
-QEMU Build and Test System
---------------------------
+QEMU Build System
+-----------------
-Details about how QEMU's build system works and how it is integrated
-into our testing infrastructure. You will need to understand some of
-the basics if you are adding new files and targets to the build.
+Details about how QEMU's build system works. You will need to understand
+some of the basics if you are adding new files and targets to the build.
.. toctree::
:maxdepth: 3
build-system
+ build-environment
kconfig
docs
- testing
- acpi-bits
- qtest
- ci
qapi-code-gen
- fuzzing
+ qapi-domain
control-flow-integrity
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index 4ac7725..7a0678c 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -1,3 +1,5 @@
+.. _internal-subsystem:
+
Internal Subsystem Information
------------------------------
@@ -8,6 +10,7 @@ Details about QEMU's various subsystems including how to add features to them.
qom
atomics
+ rcu
block-coroutine-wrapper
clocks
ebpf_rss
@@ -17,7 +20,9 @@ Details about QEMU's various subsystems including how to add features to them.
s390-cpu-topology
s390-dasd-ipl
tracing
+ uefi-vars
vfio-iommufd
writing-monitor-commands
virtio-backends
crypto
+ multiple-iothreads
diff --git a/docs/devel/index-process.rst b/docs/devel/index-process.rst
index 362f97e..5807752 100644
--- a/docs/devel/index-process.rst
+++ b/docs/devel/index-process.rst
@@ -13,7 +13,9 @@ Notes about how to interact with the community and how and where to submit patch
maintainers
style
submitting-a-patch
+ code-provenance
trivial-patches
stable-process
submitting-a-pull-request
secure-coding-practices
+ rust
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index abf6045..29f032d 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -31,6 +31,8 @@ the :ref:`tcg_internals`.
index-process
index-build
+ testing/index
index-api
index-internals
index-tcg
+ codebase
diff --git a/docs/devel/kconfig.rst b/docs/devel/kconfig.rst
index 52d4b90..493b76c 100644
--- a/docs/devel/kconfig.rst
+++ b/docs/devel/kconfig.rst
@@ -38,7 +38,7 @@ originated in the Linux kernel, though it was heavily simplified and
the handling of dependencies is stricter in QEMU.
Unlike Linux, there is no user interface to edit the configuration, which
-is instead specified in per-target files under the ``default-configs/``
+is instead specified in per-target files under the ``configs/``
directory of the QEMU source tree. This is because, unlike Linux,
configuration and dependencies can be treated as a black box when building
QEMU; the default configuration that QEMU ships with should be okay in
@@ -103,7 +103,7 @@ directives can be included:
**default value**: ``default <value> [if <expr>]``
Default values are assigned to the config symbol if no other value was
- set by the user via ``default-configs/*.mak`` files, and only if
+ set by the user via ``configs/*.mak`` files, and only if
``select`` or ``depends on`` directives do not force the value to true
or false respectively. ``<value>`` can be ``y`` or ``n``; it cannot
be an arbitrary Boolean expression. However, a condition for applying
@@ -119,7 +119,7 @@ directives can be included:
This is similar to ``select`` as it applies a lower limit of ``y``
to another symbol. However, the lower limit is only a default
and the "implied" symbol's value may still be set to ``n`` from a
- ``default-configs/*.mak`` files. The following two examples are
+ ``configs/*.mak`` files. The following two examples are
equivalent::
config FOO
@@ -146,7 +146,7 @@ declares its dependencies in different ways:
bool
Subsystems always default to false (they have no ``default`` directive)
- and are never visible in ``default-configs/*.mak`` files. It's
+ and are never visible in ``configs/*.mak`` files. It's
up to other symbols to ``select`` whatever subsystems they require.
They sometimes have ``select`` directives to bring in other required
@@ -238,7 +238,7 @@ declares its dependencies in different ways:
include libraries (such as ``FDT``) or ``TARGET_BIG_ENDIAN``
(possibly negated).
- Boards are listed for convenience in the ``default-configs/*.mak``
+ Boards are listed for convenience in the ``configs/*.mak``
for the target they apply to.
**internal elements**
@@ -251,18 +251,18 @@ declares its dependencies in different ways:
Internal elements group code that is useful in several boards or
devices. They are usually enabled with ``select`` and in turn select
- other elements; they are never visible in ``default-configs/*.mak``
+ other elements; they are never visible in ``configs/*.mak``
files, and often not even in the Makefile.
Writing and modifying default configurations
--------------------------------------------
In addition to the Kconfig files under hw/, each target also includes
-a file called ``default-configs/TARGETNAME-softmmu.mak``. These files
+a file called ``configs/TARGETNAME-softmmu.mak``. These files
initialize some Kconfig variables to non-default values and provide the
starting point to turn on devices and subsystems.
-A file in ``default-configs/`` looks like the following example::
+A file in ``configs/`` looks like the following example::
# Default configuration for alpha-softmmu
diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
index ec627aa..9471bac 100644
--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -95,7 +95,7 @@ guest CPU state in case of a guest CPU exception. This is passed
to ``cpu_restore_state()``. Therefore the value should either be 0,
to indicate that the guest CPU state is already synchronized, or
the result of ``GETPC()`` from the top level ``HELPER(foo)``
-function, which is a return address into the generated code [#gpc]_.
+function, which is a return address into the generated code\ [#gpc]_.
.. [#gpc] Note that ``GETPC()`` should be used with great care: calling
it in other functions that are *not* the top level
diff --git a/docs/devel/lockcnt.txt b/docs/devel/lockcnt.rst
index a3fb3bc..8b43578 100644
--- a/docs/devel/lockcnt.txt
+++ b/docs/devel/lockcnt.rst
@@ -1,9 +1,9 @@
-DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt)
-===================================================
+Locked Counters (aka ``QemuLockCnt``)
+=====================================
QEMU often uses reference counts to track data structures that are being
accessed and should not be freed. For example, a loop that invoke
-callbacks like this is not safe:
+callbacks like this is not safe::
QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
if (ioh->revents & G_IO_OUT) {
@@ -11,11 +11,11 @@ callbacks like this is not safe:
}
}
-QLIST_FOREACH_SAFE protects against deletion of the current node (ioh)
-by stashing away its "next" pointer. However, ioh->fd_write could
+``QLIST_FOREACH_SAFE`` protects against deletion of the current node (``ioh``)
+by stashing away its ``next`` pointer. However, ``ioh->fd_write`` could
actually delete the next node from the list. The simplest way to
avoid this is to mark the node as deleted, and remove it from the
-list in the above loop:
+list in the above loop::
QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
if (ioh->deleted) {
@@ -29,7 +29,7 @@ list in the above loop:
}
If however this loop must also be reentrant, i.e. it is possible that
-ioh->fd_write invokes the loop again, some kind of counting is needed:
+``ioh->fd_write`` invokes the loop again, some kind of counting is needed::
walking_handlers++;
QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
@@ -46,8 +46,8 @@ ioh->fd_write invokes the loop again, some kind of counting is needed:
}
walking_handlers--;
-One may think of using the RCU primitives, rcu_read_lock() and
-rcu_read_unlock(); effectively, the RCU nesting count would take
+One may think of using the RCU primitives, ``rcu_read_lock()`` and
+``rcu_read_unlock()``; effectively, the RCU nesting count would take
the place of the walking_handlers global variable. Indeed,
reference counting and RCU have similar purposes, but their usage in
general is complementary:
@@ -70,14 +70,14 @@ general is complementary:
this can improve performance, but also delay reclamation undesirably.
With reference counting, reclamation is deterministic.
-This file documents QemuLockCnt, an abstraction for using reference
+This file documents ``QemuLockCnt``, an abstraction for using reference
counting in code that has to be both thread-safe and reentrant.
-QemuLockCnt concepts
---------------------
+``QemuLockCnt`` concepts
+------------------------
-A QemuLockCnt comprises both a counter and a mutex; it has primitives
+A ``QemuLockCnt`` comprises both a counter and a mutex; it has primitives
to increment and decrement the counter, and to take and release the
mutex. The counter notes how many visits to the data structures are
taking place (the visits could be from different threads, or there could
@@ -95,13 +95,14 @@ not just frees, though there could be cases where this is not necessary.
Reads, instead, can be done without taking the mutex, as long as the
readers and writers use the same macros that are used for RCU, for
-example qatomic_rcu_read, qatomic_rcu_set, QLIST_FOREACH_RCU, etc. This is
-because the reads are done outside a lock and a set or QLIST_INSERT_HEAD
+example ``qatomic_rcu_read``, ``qatomic_rcu_set``, ``QLIST_FOREACH_RCU``,
+etc. This is because the reads are done outside a lock and a set
+or ``QLIST_INSERT_HEAD``
can happen concurrently with the read. The RCU API ensures that the
processor and the compiler see all required memory barriers.
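+
+For instance, with a hypothetical global ``XYZ *xyz`` pointer, the read
+and update sides might look like this (a minimal sketch; ``XYZ``,
+``use()`` and ``new_xyz`` are made-up names)::
+
+    /* Reader: runs outside the mutex. */
+    XYZ *p = qatomic_rcu_read(&xyz);
+    if (p) {
+        use(p);
+    }
+
+    /* Writer: runs while holding the lock that serializes updates. */
+    qatomic_rcu_set(&xyz, new_xyz);
+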
This could be implemented simply by protecting the counter with the
-mutex, for example:
+mutex, for example::
// (1)
qemu_mutex_lock(&walking_handlers_mutex);
@@ -125,33 +126,33 @@ mutex, for example:
Here, no frees can happen in the code represented by the ellipsis.
If another thread is executing critical section (2), that part of
the code cannot be entered, because the thread will not be able
-to increment the walking_handlers variable. And of course
+to increment the ``walking_handlers`` variable. And of course
during the visit any other thread will see a nonzero value for
-walking_handlers, as in the single-threaded code.
+``walking_handlers``, as in the single-threaded code.
Note that it is possible for multiple concurrent accesses to delay
-the cleanup arbitrarily; in other words, for the walking_handlers
+the cleanup arbitrarily; in other words, for the ``walking_handlers``
counter to never become zero. For this reason, this technique is
more easily applicable if concurrent access to the structure is rare.
However, critical sections are easy to forget since you have to do
-them for each modification of the counter. QemuLockCnt ensures that
+them for each modification of the counter. ``QemuLockCnt`` ensures that
all modifications of the counter take the lock appropriately, and it
can also be more efficient in two ways:
- it avoids taking the lock for many operations (for example
incrementing the counter while it is non-zero);
-- on some platforms, one can implement QemuLockCnt to hold the lock
+- on some platforms, one can implement ``QemuLockCnt`` to hold the counter
and the mutex in a single word, making the fast path no more expensive
than simply managing a counter using atomic operations (see
- docs/devel/atomics.rst). This can be very helpful if concurrent access to
+ :doc:`atomics`). This can be very helpful if concurrent access to
the data structure is expected to be rare.
Using the same mutex for frees and writes can still incur some small
inefficiencies; for example, a visit can never start if the counter is
-zero and the mutex is taken---even if the mutex is taken by a write,
+zero and the mutex is taken -- even if the mutex is taken by a write,
which in principle need not block a visit of the data structure.
However, these are usually not a problem if any of the following
assumptions are valid:
@@ -163,27 +164,27 @@ assumptions are valid:
- writes are frequent, but this kind of write (e.g. appending to a
list) has a very small critical section.
-For example, QEMU uses QemuLockCnt to manage an AioContext's list of
+For example, QEMU uses ``QemuLockCnt`` to manage an ``AioContext``'s list of
bottom halves and file descriptor handlers. Modifications to the list
of file descriptor handlers are rare. Creation of a new bottom half is
frequent and can happen on a fast path; however: 1) it is almost never
concurrent with a visit to the list of bottom halves; 2) it only has
-three instructions in the critical path, two assignments and a smp_wmb().
+three instructions in the critical path, two assignments and a ``smp_wmb()``.
-QemuLockCnt API
----------------
+``QemuLockCnt`` API
+-------------------
-The QemuLockCnt API is described in include/qemu/thread.h.
+.. kernel-doc:: include/qemu/lockcnt.h
-QemuLockCnt usage
------------------
+``QemuLockCnt`` usage
+---------------------
-This section explains the typical usage patterns for QemuLockCnt functions.
+This section explains the typical usage patterns for ``QemuLockCnt`` functions.
Setting a variable to a non-NULL value can be done between
-qemu_lockcnt_lock and qemu_lockcnt_unlock:
+``qemu_lockcnt_lock`` and ``qemu_lockcnt_unlock``::
qemu_lockcnt_lock(&xyz_lockcnt);
if (!xyz) {
@@ -193,8 +194,8 @@ qemu_lockcnt_lock and qemu_lockcnt_unlock:
}
qemu_lockcnt_unlock(&xyz_lockcnt);
-Accessing the value can be done between qemu_lockcnt_inc and
-qemu_lockcnt_dec:
+Accessing the value can be done between ``qemu_lockcnt_inc`` and
+``qemu_lockcnt_dec``::
qemu_lockcnt_inc(&xyz_lockcnt);
if (xyz) {
@@ -204,11 +205,11 @@ qemu_lockcnt_dec:
}
qemu_lockcnt_dec(&xyz_lockcnt);
-Freeing the object can similarly use qemu_lockcnt_lock and
-qemu_lockcnt_unlock, but you also need to ensure that the count
-is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc
-takes the QemuLockCnt's lock, the count cannot become non-zero while
-the object is being freed. Freeing an object looks like this:
+Freeing the object can similarly use ``qemu_lockcnt_lock`` and
+``qemu_lockcnt_unlock``, but you also need to ensure that the count
+is zero (i.e. there is no concurrent visit). Because ``qemu_lockcnt_inc``
+takes the ``QemuLockCnt``'s lock, the count cannot become non-zero while
+the object is being freed. Freeing an object looks like this::
qemu_lockcnt_lock(&xyz_lockcnt);
if (!qemu_lockcnt_count(&xyz_lockcnt)) {
@@ -218,7 +219,7 @@ the object is being freed. Freeing an object looks like this:
qemu_lockcnt_unlock(&xyz_lockcnt);
If an object has to be freed right after a visit, you can combine
-the decrement, the locking and the check on count as follows:
+the decrement, the locking and the check on count as follows::
qemu_lockcnt_inc(&xyz_lockcnt);
if (xyz) {
@@ -232,7 +233,7 @@ the decrement, the locking and the check on count as follows:
qemu_lockcnt_unlock(&xyz_lockcnt);
}
-QemuLockCnt can also be used to access a list as follows:
+``QemuLockCnt`` can also be used to access a list as follows::
qemu_lockcnt_inc(&io_handlers_lockcnt);
QLIST_FOREACH_RCU(ioh, &io_handlers, pioh) {
@@ -252,10 +253,10 @@ QemuLockCnt can also be used to access a list as follows:
}
Again, the RCU primitives are used because new items can be added to the
-list during the walk. QLIST_FOREACH_RCU ensures that the processor and
+list during the walk. ``QLIST_FOREACH_RCU`` ensures that the processor and
the compiler see the appropriate memory barriers.
-An alternative pattern uses qemu_lockcnt_dec_if_lock:
+An alternative pattern uses ``qemu_lockcnt_dec_if_lock``::
qemu_lockcnt_inc(&io_handlers_lockcnt);
QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) {
@@ -273,5 +274,5 @@ An alternative pattern uses qemu_lockcnt_dec_if_lock:
}
qemu_lockcnt_dec(&io_handlers_lockcnt);
-Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock,
+Here you can use ``qemu_lockcnt_dec`` instead of ``qemu_lockcnt_dec_and_lock``,
because there is no special task to do if the count goes from 1 to 0.
diff --git a/docs/devel/maintainers.rst b/docs/devel/maintainers.rst
index 5c907d9..88a613e 100644
--- a/docs/devel/maintainers.rst
+++ b/docs/devel/maintainers.rst
@@ -99,9 +99,9 @@ members of the QEMU community, you should make arrangements to attend
a `KeySigningParty <https://wiki.qemu.org/KeySigningParty>`__ (for
example at KVM Forum) or make alternative arrangements to have your
key signed by an attendee. Key signing requires meeting another
-community member **in person** [#]_ so please make appropriate
+community member **in person**\ [#2020]_ so please make appropriate
arrangements.
-.. [#] In recent pandemic times we have had to exercise some
+.. [#2020] In recent pandemic times we have had to exercise some
flexibility here. Maintainers still need to sign their pull
requests though.
diff --git a/docs/devel/memory.rst b/docs/devel/memory.rst
index 69c5e3f..57fb2ae 100644
--- a/docs/devel/memory.rst
+++ b/docs/devel/memory.rst
@@ -369,4 +369,4 @@ callbacks are called:
API Reference
-------------
-.. kernel-doc:: include/exec/memory.h
+.. kernel-doc:: include/system/memory.h
diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst
index 63c3647..7897873 100644
--- a/docs/devel/migration/CPR.rst
+++ b/docs/devel/migration/CPR.rst
@@ -5,7 +5,7 @@ CPR is the umbrella name for a set of migration modes in which the
VM is migrated to a new QEMU instance on the same host. It is
intended for use when the goal is to update host software components
that run the VM, such as QEMU or even the host kernel. At this time,
-cpr-reboot is the only available mode.
+the cpr-reboot and cpr-transfer modes are available.
Because QEMU is restarted on the same host, with access to the same
local devices, CPR is allowed in certain cases where normal migration
@@ -53,7 +53,7 @@ RAM is copied to the migration URI.
Outgoing:
* Set the migration mode parameter to ``cpr-reboot``.
* Set the ``x-ignore-shared`` capability if desired.
- * Issue the ``migrate`` command. It is recommended the the URI be a
+ * Issue the ``migrate`` command. It is recommended the URI be a
``file`` type, but one can use other types such as ``exec``,
provided the command captures all the data from the outgoing side,
and provides all the data to the incoming side.
@@ -145,3 +145,183 @@ Caveats
cpr-reboot mode may not be used with postcopy, background-snapshot,
or COLO.
+
+cpr-transfer mode
+-----------------
+
+This mode allows the user to transfer a guest to a new QEMU instance
+on the same host with minimal guest pause time, by preserving guest
+RAM in place, albeit with new virtual addresses in new QEMU. Devices
+and their pinned memory pages will also be preserved in a future QEMU
+release.
+
+The user starts new QEMU on the same host as old QEMU, with
+command-line arguments to create the same machine, plus the ``-incoming``
+option for the main migration channel, like normal live migration.
+In addition, the user adds a second ``-incoming`` option with channel
+type ``cpr``. This CPR channel must support file descriptor transfer
+with SCM_RIGHTS, i.e. it must be a UNIX domain socket.
+
+To initiate CPR, the user issues a migrate command to old QEMU,
+adding a second migration channel of type ``cpr`` in the channels
+argument. Old QEMU stops the VM, saves state to the migration
+channels, and enters the postmigrate state. Execution resumes in
+new QEMU.
+
+New QEMU reads the CPR channel before opening a monitor, hence
+the CPR channel cannot be specified in the list of channels for a
+migrate-incoming command. It may only be specified on the command
+line.
+
+Usage
+^^^^^
+
+Memory backend objects must have the ``share=on`` attribute.
+
+The VM must be started with the ``-machine aux-ram-share=on``
+option. This causes implicit RAM blocks (those not described by
+a memory-backend object) to be allocated by mmap'ing a memfd.
+Examples include VGA and ROM.
+
+Outgoing:
+ * Set the migration mode parameter to ``cpr-transfer``.
+ * Issue the ``migrate`` command, containing a main channel and
+ a cpr channel.
+
+Incoming:
+ * Start new QEMU with two ``-incoming`` options.
+ * If the VM was running when the outgoing ``migrate`` command was
+ issued, then QEMU automatically resumes VM execution.
+
+Caveats
+^^^^^^^
+
+cpr-transfer mode may not be used with postcopy, background-snapshot,
+or COLO.
+
+memory-backend-epc is not supported.
+
+The main incoming migration channel address cannot be a file type.
+
+If the main incoming channel address is an inet socket, then the port
+cannot be 0 (meaning dynamically choose a port).
+
+When using ``-incoming defer``, you must issue the migrate command to
+old QEMU before issuing any monitor commands to new QEMU, because new
+QEMU blocks waiting to read from the cpr channel before starting its
+monitor, and old QEMU does not write to the channel until the migrate
+command is issued. However, new QEMU does not open and read the
+main migration channel until you issue the migrate incoming command.
+
+Example 1: incoming channel
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In these examples, we simply restart the same version of QEMU, but
+in a real scenario one would start new QEMU on the incoming side.
+Note that new QEMU does not print the monitor prompt until old QEMU
+has issued the migrate command. The outgoing side uses QMP because
+HMP cannot specify a CPR channel. Some QMP responses are omitted for
+brevity.
+
+::
+
+ Outgoing: Incoming:
+
+ # qemu-kvm -qmp stdio
+ -object memory-backend-file,id=ram0,size=4G,
+ mem-path=/dev/shm/ram0,share=on -m 4G
+ -machine memory-backend=ram0
+ -machine aux-ram-share=on
+ ...
+ # qemu-kvm -monitor stdio
+ -incoming tcp:0:44444
+ -incoming '{"channel-type": "cpr",
+ "addr": { "transport": "socket",
+ "type": "unix", "path": "cpr.sock"}}'
+ ...
+ {"execute":"qmp_capabilities"}
+
+ {"execute": "query-status"}
+ {"return": {"status": "running",
+ "running": true}}
+
+ {"execute":"migrate-set-parameters",
+ "arguments":{"mode":"cpr-transfer"}}
+
+ {"execute": "migrate", "arguments": { "channels": [
+ {"channel-type": "main",
+ "addr": { "transport": "socket", "type": "inet",
+ "host": "0", "port": "44444" }},
+ {"channel-type": "cpr",
+ "addr": { "transport": "socket", "type": "unix",
+ "path": "cpr.sock" }}]}}
+
+ QEMU 10.0.50 monitor
+ (qemu) info status
+ VM status: running
+
+ {"execute": "query-status"}
+ {"return": {"status": "postmigrate",
+ "running": false}}
+
+Example 2: incoming defer
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This example uses ``-incoming defer`` to hot plug a device before
+accepting the main migration channel. Again note you must issue the
+migrate command to old QEMU before you can issue any monitor
+commands to new QEMU.
+
+
+::
+
+ Outgoing: Incoming:
+
+   # qemu-kvm -qmp stdio
+ -object memory-backend-file,id=ram0,size=4G,
+ mem-path=/dev/shm/ram0,share=on -m 4G
+ -machine memory-backend=ram0
+ -machine aux-ram-share=on
+ ...
+ # qemu-kvm -monitor stdio
+ -incoming defer
+ -incoming '{"channel-type": "cpr",
+ "addr": { "transport": "socket",
+ "type": "unix", "path": "cpr.sock"}}'
+ ...
+ {"execute":"qmp_capabilities"}
+
+ {"execute": "device_add",
+ "arguments": {"driver": "pcie-root-port"}}
+
+ {"execute":"migrate-set-parameters",
+ "arguments":{"mode":"cpr-transfer"}}
+
+ {"execute": "migrate", "arguments": { "channels": [
+ {"channel-type": "main",
+ "addr": { "transport": "socket", "type": "inet",
+ "host": "0", "port": "44444" }},
+ {"channel-type": "cpr",
+ "addr": { "transport": "socket", "type": "unix",
+ "path": "cpr.sock" }}]}}
+
+ QEMU 10.0.50 monitor
+ (qemu) info status
+ VM status: paused (inmigrate)
+ (qemu) device_add pcie-root-port
+ (qemu) migrate_incoming tcp:0:44444
+ (qemu) info status
+ VM status: running
+
+ {"execute": "query-status"}
+ {"return": {"status": "postmigrate",
+ "running": false}}
+
+Futures
+^^^^^^^
+
+cpr-transfer mode is based on a capability to transfer open file
+descriptors from old to new QEMU. In the future, descriptors for
+vfio, iommufd, vhost, and char devices could be transferred,
+preserving those devices and their kernel state without interruption,
+even if they do not explicitly support live migration.
diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst
index 5a5417e..ecb887e 100644
--- a/docs/devel/migration/compatibility.rst
+++ b/docs/devel/migration/compatibility.rst
@@ -198,7 +198,7 @@ was done::
The relevant parts for migration are::
- @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = {
+ @@ -1281,7 +1284,8 @@ static const Property virtio_blk_properties[] = {
#endif
DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
true),
@@ -395,13 +395,12 @@ the old behaviour or the new behaviour::
index 8a87ccc8b0..5153ad63d6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
- @@ -79,6 +79,8 @@ static Property pci_props[] = {
+ @@ -79,6 +79,8 @@ static const Property pci_props[] = {
DEFINE_PROP_STRING("failover_pair_id", PCIDevice,
failover_pair_id),
DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0),
+ DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
+ QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
- DEFINE_PROP_END_OF_LIST()
};
Notice that we enable the feature for new machine types.
diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
index 58f8fd9..8f431d5 100644
--- a/docs/devel/migration/features.rst
+++ b/docs/devel/migration/features.rst
@@ -14,3 +14,4 @@ Migration has plenty of features to support different use cases.
CPR
qpl-compression
uadk-compression
+ qatzip-compression
diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
index 784c899..cdd4f4a 100644
--- a/docs/devel/migration/main.rst
+++ b/docs/devel/migration/main.rst
@@ -1,3 +1,5 @@
+.. _migration:
+
===================
Migration framework
===================
@@ -465,6 +467,12 @@ Examples of such API functions are:
- portio_list_set_address()
- portio_list_set_enabled()
+Since the order of device save/restore is not defined, you must
+avoid accessing or changing any other device's state in one of these
+callbacks. (For instance, don't do anything that calls ``update_irq()``
+in a ``post_load`` hook.) Otherwise, restore will not be deterministic,
+and this will break execution record/replay.
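+
+For illustration, a ``post_load`` hook that only touches the device's
+own state stays deterministic (a minimal sketch; ``MyDevState``,
+``compute_derived()`` and the field names are made up)::
+
+    static int mydev_post_load(void *opaque, int version_id)
+    {
+        MyDevState *s = opaque;
+
+        /* Derive cached state from this device's own registers only;
+         * do not touch other devices or raise interrupts here. */
+        s->cache = compute_derived(s->reg);
+        return 0;
+    }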
+
Iterative device migration
--------------------------
diff --git a/docs/devel/migration/mapped-ram.rst b/docs/devel/migration/mapped-ram.rst
index d352b54..b08c2b4 100644
--- a/docs/devel/migration/mapped-ram.rst
+++ b/docs/devel/migration/mapped-ram.rst
@@ -44,7 +44,7 @@ Use-cases
The mapped-ram feature was designed for use cases where the migration
stream will be directed to a file in the filesystem and not
-immediately restored on the destination VM [#]_. These could be
+immediately restored on the destination VM\ [#alternatives]_. These could be
thought of as snapshots. We can further categorize them into live and
non-live.
@@ -70,7 +70,7 @@ mapped-ram in this scenario is portability since background-snapshot
depends on async dirty tracking (KVM_GET_DIRTY_LOG) which is not
supported outside of Linux.
-.. [#] While this same effect could be obtained with the usage of
+.. [#alternatives] While this same effect could be obtained with the usage of
snapshots or the ``file:`` migration alone, mapped-ram provides
a performance increase for VMs with larger RAM sizes (10s to
100s of GiBs), specially if the VM has been stopped beforehand.
diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst
new file mode 100644
index 0000000..862b383
--- /dev/null
+++ b/docs/devel/migration/qatzip-compression.rst
@@ -0,0 +1,165 @@
+==================
+QATzip Compression
+==================
+In scenarios with limited network bandwidth, the ``QATzip`` solution can help
+users save a lot of host CPU resources by accelerating compression and
+decompression through the Intel QuickAssist Technology (``QAT``) hardware.
+
+
+The following test was conducted using 8 multifd channels and 10Gbps network
+bandwidth. The results show that, compared to zstd, ``QATzip`` significantly
+saves CPU resources on the sender and reduces migration time. Compared to the
+uncompressed solution, ``QATzip`` greatly improves the dirty page processing
+capability, indicated by the Pages per Second metric, and also reduces the
+total migration time.
+
+::
+
+ VM Configuration: 16 vCPU and 64G memory
+ VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data.
+ QAT Devices: 4
+ |-----------|--------|---------|----------|----------|------|------|
+ |8 Channels |Total |down |throughput|pages per | send | recv |
+ | |time(ms)|time(ms) |(mbps) |second | cpu %| cpu% |
+ |-----------|--------|---------|----------|----------|------|------|
+ |qatzip | 16630| 28| 10467| 2940235| 160| 360|
+ |-----------|--------|---------|----------|----------|------|------|
+ |zstd | 20165| 24| 8579| 2391465| 810| 340|
+ |-----------|--------|---------|----------|----------|------|------|
+ |none | 46063| 40| 10848| 330240| 45| 85|
+ |-----------|--------|---------|----------|----------|------|------|
+
+
+QATzip Compression Framework
+============================
+
+``QATzip`` is a user space library which builds on top of the Intel QuickAssist
+Technology to provide extended accelerated compression and decompression
+services.
+
+For an introduction to ``QATzip``, please refer to `QATzip Introduction
+<https://github.com/intel/QATzip?tab=readme-ov-file#introductionl>`_
+
+::
+
+ +----------------+
+ | MultiFd Thread |
+ +-------+--------+
+ |
+ | compress/decompress
+ +-------+--------+
+ | QATzip library |
+ +-------+--------+
+ |
+ +-------+--------+
+ | QAT library |
+ +-------+--------+
+ | user space
+ --------+---------------------
+ | kernel space
+ +------+-------+
+ | QAT Driver |
+ +------+-------+
+ |
+ +------+-------+
+ | QAT Devices |
+ +--------------+
+
+
+QATzip Installation
+-------------------
+
+The ``QATzip`` installation package has been integrated into some Linux
+distributions and can be installed directly. For example, on Ubuntu Server
+24.04 LTS it can be installed using the commands below:
+
+.. code-block:: shell
+
+ #apt search qatzip
+ libqatzip-dev/noble 1.2.0-0ubuntu3 amd64
+ Intel QuickAssist user space library development files
+
+ libqatzip3/noble 1.2.0-0ubuntu3 amd64
+ Intel QuickAssist user space library
+
+ qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed]
+ Compression user-space tool for Intel QuickAssist Technology
+
+ #sudo apt install libqatzip-dev libqatzip3 qatzip
+
+If your distribution does not provide a ``QATzip`` package, you can build and
+install it from source; please refer to `QATzip source code installation
+<https://github.com/intel/QATzip?tab=readme-ov-file#build-intel-quickassist-technology-driver>`_
+
+QAT Hardware Deployment
+-----------------------
+
+``QAT`` supports physical functions (PFs) and virtual functions (VFs) for
+deployment, and users can configure ``QAT`` resources for migration according
+to actual needs. For more details about ``QAT`` deployment, please refer to
+`Intel QuickAssist Technology Documentation
+<https://intel.github.io/quickassist/index.html>`_
+
+For a general introduction to ``QAT`` hardware, please refer to `intel-quick-assist-technology-overview
+<https://www.intel.com/content/www/us/en/architecture-and-technology/intel-quick-assist-technology-overview.html>`_
+
+How To Use QATzip Compression
+=============================
+
+1 - Install the ``QATzip`` library
+
+2 - Build ``QEMU`` with the ``--enable-qatzip`` parameter
+
+ E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip``
+
+3 - Set ``migrate_set_parameter multifd-compression qatzip``
+
+4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``; the default
+``comp_level`` value is 1, and levels from 1 to 9 are supported
+
+QAT Memory Requirements
+=======================
+
+The user needs to reserve system memory for the QAT memory management to
+allocate DMA memory. The size of the reserved system memory depends on the
+number of devices used for migration and the number of multifd channels.
+
+Because memory usage depends on QAT configuration, please refer to `QAT Memory
+Driver Queries
+<https://intel.github.io/quickassist/PG/infrastructure_debugability.html?highlight=memory>`_
+for memory usage calculation.
+
+.. list-table:: An example of a PF used for migration
+ :header-rows: 1
+
+ * - Number of channels
+ - Sender memory usage
+ - Receiver memory usage
+ * - 2
+ - 10M
+ - 10M
+ * - 4
+ - 12M
+ - 14M
+ * - 8
+ - 16M
+ - 20M
+
+How To Choose Between QATzip and QPL
+====================================
+Starting with 4th Gen Intel Xeon Scalable processors, codenamed Sapphire
+Rapids (``SPR``), multiple built-in accelerators are supported, including
+``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is
+used to accelerate ``QPL``.
+
+Here are some suggestions:
+
+1 - If the live migration scenario is limited by network bandwidth and ``QAT``
+hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a
+lot of host CPU resources for compression.
+
+2 - If the system cannot support shared virtual memory (SVM) technology, use
+the ``QATzip`` method because ``QPL`` performance is not good without SVM
+support.
+
+3 - For other scenarios, use the ``QPL`` method first.
diff --git a/docs/devel/migration/uadk-compression.rst b/docs/devel/migration/uadk-compression.rst
index 3f73345..64cadeb 100644
--- a/docs/devel/migration/uadk-compression.rst
+++ b/docs/devel/migration/uadk-compression.rst
@@ -114,7 +114,7 @@ Make sure all these above kernel configurations are selected.
Accelerator dev node permissions
--------------------------------
-Harware accelerators(eg: HiSilicon Kunpeng Zip accelerator) gets registered to
+Hardware accelerators (e.g. the HiSilicon Kunpeng Zip accelerator) get registered to
UADK and char devices are created in dev directory. In order to access resources
on hardware accelerator devices, write permission should be provided to user.
@@ -134,7 +134,7 @@ How To Use UADK Compression In QEMU Migration
Set ``migrate_set_parameter multifd-compression uadk``
Since UADK uses Shared Virtual Addressing(SVA) and device access virtual memory
-directly it is possible that SMMUv3 may enounter page faults while walking the
+directly, it is possible that SMMUv3 may encounter page faults while walking the
IO page tables. This may impact the performance. In order to mitigate this,
please make sure to specify ``-mem-prealloc`` parameter to the destination VM
boot parameters.
diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst
index c49482e..673e354 100644
--- a/docs/devel/migration/vfio.rst
+++ b/docs/devel/migration/vfio.rst
@@ -67,15 +67,35 @@ VFIO implements the device hooks for the iterative approach as follows:
* A ``switchover_ack_needed`` function that checks if the VFIO device uses
"switchover-ack" migration capability when this capability is enabled.
-* A ``save_state`` function to save the device config space if it is present.
+* A ``switchover_start`` function that in the multifd mode starts a thread that
+ reassembles the multifd received data and loads it in-order into the device.
+ In the non-multifd mode this function is a NOP.
+
+* A ``save_state`` function to save the device config space if it is present
+ in the non-multifd mode.
+  In the multifd mode it just emits a dummy EOS marker.
* A ``save_live_complete_precopy`` function that sets the VFIO device in
_STOP_COPY state and iteratively copies the data for the VFIO device until
the vendor driver indicates that no data remains.
+ In the multifd mode it just emits a dummy EOS marker.
+
+* A ``save_live_complete_precopy_thread`` function that in the multifd mode
+  provides a thread handler performing the multifd device state transfer.
+ It sets the VFIO device to _STOP_COPY state, iteratively reads the data
+ from the VFIO device and queues it for multifd transmission until the vendor
+ driver indicates that no data remains.
+ After that, it saves the device config space and queues it for multifd
+ transfer too.
+ In the non-multifd mode this thread is a NOP.
* A ``load_state`` function that loads the config section and the data
sections that are generated by the save functions above.
+* A ``load_state_buffer`` function that loads the device state and the device
+ config that arrived via multifd channels.
+ It's used only in the multifd mode.
+
* ``cleanup`` functions for both save and load that perform any migration
related cleanup.
@@ -176,8 +196,11 @@ Live migration save path
Then the VFIO device is put in _STOP_COPY state
(FINISH_MIGRATE, _ACTIVE, _STOP_COPY)
.save_live_complete_precopy() is called for each active device
- For the VFIO device, iterate in .save_live_complete_precopy() until
+ For the VFIO device: in the non-multifd mode iterate in
+ .save_live_complete_precopy() until
pending data is 0
+ In the multifd mode this iteration is done in
+ .save_live_complete_precopy_thread() instead.
|
(POSTMIGRATE, _COMPLETED, _STOP_COPY)
Migraton thread schedules cleanup bottom half and exits
@@ -194,6 +217,9 @@ Live migration resume path
(RESTORE_VM, _ACTIVE, _STOP)
|
For each device, .load_state() is called for that device section data
+ transmitted via the main migration channel.
+ For data transmitted via multifd channels .load_state_buffer() is called
+ instead.
(RESTORE_VM, _ACTIVE, _RESUMING)
|
At the end, .load_cleanup() is called for each device and vCPUs are started
@@ -206,3 +232,18 @@ Postcopy
========
Postcopy migration is currently not supported for VFIO devices.
+
+Multifd
+=======
+
+Starting from QEMU version 10.0 it is possible to transfer VFIO device
+_STOP_COPY state via multifd channels. This helps reduce downtime - especially
+with multiple VFIO devices or with devices having a large migration state.
+As an additional benefit, setting the VFIO device to _STOP_COPY state and
+saving its config space is also parallelized (run in a separate thread) in
+such migration mode.
+
+The multifd VFIO device state transfer is controlled by the
+"x-migration-multifd-transfer" VFIO device property. This property defaults to
+AUTO, which means that VFIO device state transfer via multifd channels is
+attempted in configurations that otherwise support it.
diff --git a/docs/devel/multi-thread-tcg.rst b/docs/devel/multi-thread-tcg.rst
index d706c27..da9a153 100644
--- a/docs/devel/multi-thread-tcg.rst
+++ b/docs/devel/multi-thread-tcg.rst
@@ -4,6 +4,8 @@
This work is licensed under the terms of the GNU GPL, version 2 or
later. See the COPYING file in the top-level directory.
+.. _mttcg:
+
==================
Multi-threaded TCG
==================
@@ -26,16 +28,15 @@ vCPU Scheduling
We introduce a new running mode where each vCPU will run on its own
user-space thread. This is enabled by default for all FE/BE
combinations where the host memory model is able to accommodate the
-guest (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO is zero) and the
-guest has had the required work done to support this safely
-(TARGET_SUPPORTS_MTTCG).
+guest (TCGCPUOps::guest_default_memory_order & ~TCG_TARGET_DEFAULT_MO is zero)
+and the guest has had the required work done to support this safely
+(TCGCPUOps::mttcg_supported).
System emulation will fall back to the original round robin approach
if:
* forced by --accel tcg,thread=single
* enabling --icount mode
-* 64 bit guests on 32 bit hosts (TCG_OVERSIZED_GUEST)
In the general case of running translated code there should be no
inter-vCPU dependencies and all vCPUs should be able to run at full
diff --git a/docs/devel/multiple-iothreads.rst b/docs/devel/multiple-iothreads.rst
new file mode 100644
index 0000000..d1f3fc4
--- /dev/null
+++ b/docs/devel/multiple-iothreads.rst
@@ -0,0 +1,139 @@
+Using Multiple ``IOThread``\ s
+==============================
+
+..
+ Copyright (c) 2014-2017 Red Hat Inc.
+
+ This work is licensed under the terms of the GNU GPL, version 2 or later. See
+ the COPYING file in the top-level directory.
+
+
+This document explains the ``IOThread`` feature and how to write code that runs
+outside the BQL.
+
+The main loop and ``IOThread``\ s
+---------------------------------
+QEMU is an event-driven program that can do several things at once using an
+event loop. The VNC server and the QMP monitor are both processed from the
+same event loop, which monitors their file descriptors until they become
+readable and then invokes a callback.
+
+The default event loop is called the main loop (see ``main-loop.c``). It is
+possible to create additional event loop threads using
+``-object iothread,id=my-iothread``.
+
+Side note: The main loop and ``IOThread`` are both event loops but their code is
+not shared completely. Sometimes it is useful to remember that although they
+are conceptually similar they are currently not interchangeable.
+
+Why ``IOThread``\ s are useful
+------------------------------
+``IOThread``\ s allow the user to control the placement of work. The main loop is a
+scalability bottleneck on hosts with many CPUs. Work can be spread across
+several ``IOThread``\ s instead of just one main loop. When set up correctly this
+can improve I/O latency and reduce jitter seen by the guest.
+
+The main loop is also deeply associated with the BQL, which is a
+scalability bottleneck in itself. vCPU threads and the main loop use the BQL
+to serialize execution of QEMU code. This mutex is necessary because a lot of
+QEMU's code historically was not thread-safe.
+
+The fact that all I/O processing is done in a single main loop and that the
+BQL is contended by all vCPU threads and the main loop explain
+why it is desirable to place work into ``IOThread``\ s.
+
+The experimental ``virtio-blk`` data-plane implementation has been benchmarked and
+shows these effects:
+ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf
+
+.. _how-to-program:
+
+How to program for ``IOThread``\ s
+----------------------------------
+The main difference between legacy code and new code that can run in an
+``IOThread`` is dealing explicitly with the event loop object, ``AioContext``
+(see ``include/block/aio.h``). Code that only works in the main loop
+implicitly uses the main loop's ``AioContext``. Code that supports running
+in ``IOThread``\ s must be aware of its ``AioContext``.
+
+``AioContext`` supports the following services:
+ * File descriptor monitoring (read/write/error on POSIX hosts)
+ * Event notifiers (inter-thread signalling)
+ * Timers
+ * Bottom Halves (BH) deferred callbacks
+
+There are several old APIs that use the main loop ``AioContext``:
+ * LEGACY ``qemu_aio_set_fd_handler()`` - monitor a file descriptor
+ * LEGACY ``qemu_aio_set_event_notifier()`` - monitor an event notifier
+ * LEGACY ``timer_new_ms()`` - create a timer
+ * LEGACY ``qemu_bh_new()`` - create a BH
+ * LEGACY ``qemu_bh_new_guarded()`` - create a BH with a device re-entrancy guard
+ * LEGACY ``qemu_aio_wait()`` - run an event loop iteration
+
+Since they implicitly work on the main loop they cannot be used in code that
+runs in an ``IOThread``. They might cause a crash or deadlock if called from an
+``IOThread`` since the BQL is not held.
+
+Instead, use the ``AioContext`` functions directly (see ``include/block/aio.h``):
+ * ``aio_set_fd_handler()`` - monitor a file descriptor
+ * ``aio_set_event_notifier()`` - monitor an event notifier
+ * ``aio_timer_new()`` - create a timer
+ * ``aio_bh_new()`` - create a BH
+ * ``aio_bh_new_guarded()`` - create a BH with a device re-entrancy guard
+ * ``aio_poll()`` - run an event loop iteration
+
+The ``qemu_bh_new_guarded``/``aio_bh_new_guarded`` APIs accept a
+``MemReentrancyGuard``
+argument, which is used to check for and prevent re-entrancy problems. For
+BHs associated with devices, the reentrancy-guard is contained in the
+corresponding ``DeviceState`` and named ``mem_reentrancy_guard``.
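+
+A minimal sketch of creating such a guarded BH, assuming ``ctx`` is the
+target ``AioContext`` and ``my_dev_bh_cb``/``dev`` are made-up names for
+the callback and the ``DeviceState`` pointer::
+
+    QEMUBH *bh;
+
+    /* The guard defers the BH while code of this device is already
+     * running, preventing re-entrancy problems. */
+    bh = aio_bh_new_guarded(ctx, my_dev_bh_cb, dev,
+                            &dev->mem_reentrancy_guard);
+    qemu_bh_schedule(bh);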
+
+The ``AioContext`` can be obtained from the ``IOThread`` using
+``iothread_get_aio_context()`` or for the main loop using
+``qemu_get_aio_context()``. Code that takes an ``AioContext`` argument
+works both in ``IOThread``\ s and the main loop, depending on which ``AioContext``
+instance the caller passes in.
+
+How to synchronize with an ``IOThread``
+---------------------------------------
+Variables that can be accessed by multiple threads require some form of
+synchronization such as ``qemu_mutex_lock()``, ``rcu_read_lock()``, etc.
+
+``AioContext`` functions like ``aio_set_fd_handler()``,
+``aio_set_event_notifier()``, ``aio_bh_new()``, and ``aio_timer_new()``
+are thread-safe. They can be used to trigger activity in an ``IOThread``.
+
+Side note: the best way to schedule a function call across threads is to call
+``aio_bh_schedule_oneshot()``.
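+
+For example (a sketch; ``target_ctx``, ``my_fn`` and ``my_data`` are
+made-up names)::
+
+    /* Run my_fn(my_data) once, in the thread that owns target_ctx. */
+    aio_bh_schedule_oneshot(target_ctx, my_fn, my_data);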
+
+The main loop thread can wait synchronously for a condition using
+``AIO_WAIT_WHILE()``.
+
+``AioContext`` and the block layer
+----------------------------------
+The ``AioContext`` originates from the QEMU block layer, even though nowadays
+``AioContext`` is a generic event loop that can be used by any QEMU subsystem.
+
+The block layer has support for ``AioContext`` integrated. Each
+``BlockDriverState`` is associated with an ``AioContext`` using
+``bdrv_try_change_aio_context()`` and ``bdrv_get_aio_context()``.
+This allows block layer code to process I/O inside the
+right ``AioContext``. Other subsystems may wish to follow a similar approach.
+
+Block layer code must therefore expect to run in an ``IOThread`` and avoid using
+old APIs that implicitly use the main loop. See
+`How to program for IOThreads`_ for information on how to do that.
+
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed. When a block device is running
+in an ``IOThread``, the ``IOThread`` can also process requests from the guest
+(via ioeventfd). To achieve both objectives, wrap the code between
+``bdrv_drained_begin()`` and ``bdrv_drained_end()``, thus creating a "drained
+section".
+
+Long-running jobs (usually in the form of coroutines) are often scheduled in
+the ``BlockDriverState``'s ``AioContext``. The functions
+``bdrv_add``/``remove_aio_context_notifier``, or alternatively
+``blk_add``/``remove_aio_context_notifier`` if you use ``BlockBackends``,
+can be used to get a notification whenever ``bdrv_try_change_aio_context()``
+moves a ``BlockDriverState`` to a different ``AioContext``.
diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt
deleted file mode 100644
index de85767..0000000
--- a/docs/devel/multiple-iothreads.txt
+++ /dev/null
@@ -1,130 +0,0 @@
-Copyright (c) 2014-2017 Red Hat Inc.
-
-This work is licensed under the terms of the GNU GPL, version 2 or later. See
-the COPYING file in the top-level directory.
-
-
-This document explains the IOThread feature and how to write code that runs
-outside the BQL.
-
-The main loop and IOThreads
----------------------------
-QEMU is an event-driven program that can do several things at once using an
-event loop. The VNC server and the QMP monitor are both processed from the
-same event loop, which monitors their file descriptors until they become
-readable and then invokes a callback.
-
-The default event loop is called the main loop (see main-loop.c). It is
-possible to create additional event loop threads using -object
-iothread,id=my-iothread.
-
-Side note: The main loop and IOThread are both event loops but their code is
-not shared completely. Sometimes it is useful to remember that although they
-are conceptually similar they are currently not interchangeable.
-
-Why IOThreads are useful
-------------------------
-IOThreads allow the user to control the placement of work. The main loop is a
-scalability bottleneck on hosts with many CPUs. Work can be spread across
-several IOThreads instead of just one main loop. When set up correctly this
-can improve I/O latency and reduce jitter seen by the guest.
-
-The main loop is also deeply associated with the BQL, which is a
-scalability bottleneck in itself. vCPU threads and the main loop use the BQL
-to serialize execution of QEMU code. This mutex is necessary because a lot of
-QEMU's code historically was not thread-safe.
-
-The fact that all I/O processing is done in a single main loop and that the
-BQL is contended by all vCPU threads and the main loop explain
-why it is desirable to place work into IOThreads.
-
-The experimental virtio-blk data-plane implementation has been benchmarked and
-shows these effects:
-ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf
-
-How to program for IOThreads
-----------------------------
-The main difference between legacy code and new code that can run in an
-IOThread is dealing explicitly with the event loop object, AioContext
-(see include/block/aio.h). Code that only works in the main loop
-implicitly uses the main loop's AioContext. Code that supports running
-in IOThreads must be aware of its AioContext.
-
-AioContext supports the following services:
- * File descriptor monitoring (read/write/error on POSIX hosts)
- * Event notifiers (inter-thread signalling)
- * Timers
- * Bottom Halves (BH) deferred callbacks
-
-There are several old APIs that use the main loop AioContext:
- * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor
- * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
- * LEGACY timer_new_ms() - create a timer
- * LEGACY qemu_bh_new() - create a BH
- * LEGACY qemu_bh_new_guarded() - create a BH with a device re-entrancy guard
- * LEGACY qemu_aio_wait() - run an event loop iteration
-
-Since they implicitly work on the main loop they cannot be used in code that
-runs in an IOThread. They might cause a crash or deadlock if called from an
-IOThread since the BQL is not held.
-
-Instead, use the AioContext functions directly (see include/block/aio.h):
- * aio_set_fd_handler() - monitor a file descriptor
- * aio_set_event_notifier() - monitor an event notifier
- * aio_timer_new() - create a timer
- * aio_bh_new() - create a BH
- * aio_bh_new_guarded() - create a BH with a device re-entrancy guard
- * aio_poll() - run an event loop iteration
-
-The qemu_bh_new_guarded/aio_bh_new_guarded APIs accept a "MemReentrancyGuard"
-argument, which is used to check for and prevent re-entrancy problems. For
-BHs associated with devices, the reentrancy-guard is contained in the
-corresponding DeviceState and named "mem_reentrancy_guard".
-
-The AioContext can be obtained from the IOThread using
-iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
-Code that takes an AioContext argument works both in IOThreads or the main
-loop, depending on which AioContext instance the caller passes in.
-
-How to synchronize with an IOThread
------------------------------------
-Variables that can be accessed by multiple threads require some form of
-synchronization such as qemu_mutex_lock(), rcu_read_lock(), etc.
-
-AioContext functions like aio_set_fd_handler(), aio_set_event_notifier(),
-aio_bh_new(), and aio_timer_new() are thread-safe. They can be used to trigger
-activity in an IOThread.
-
-Side note: the best way to schedule a function call across threads is to call
-aio_bh_schedule_oneshot().
-
-The main loop thread can wait synchronously for a condition using
-AIO_WAIT_WHILE().
-
-AioContext and the block layer
-------------------------------
-The AioContext originates from the QEMU block layer, even though nowadays
-AioContext is a generic event loop that can be used by any QEMU subsystem.
-
-The block layer has support for AioContext integrated. Each BlockDriverState
-is associated with an AioContext using bdrv_try_change_aio_context() and
-bdrv_get_aio_context(). This allows block layer code to process I/O inside the
-right AioContext. Other subsystems may wish to follow a similar approach.
-
-Block layer code must therefore expect to run in an IOThread and avoid using
-old APIs that implicitly use the main loop. See the "How to program for
-IOThreads" above for information on how to do that.
-
-Code running in the monitor typically needs to ensure that past
-requests from the guest are completed. When a block device is running
-in an IOThread, the IOThread can also process requests from the guest
-(via ioeventfd). To achieve both objects, wrap the code between
-bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
-section".
-
-Long-running jobs (usually in the form of coroutines) are often scheduled in
-the BlockDriverState's AioContext. The functions
-bdrv_add/remove_aio_context_notifier, or alternatively
-blk_add/remove_aio_context_notifier if you use BlockBackends, can be used to
-get a notification whenever bdrv_try_change_aio_context() moves a
-BlockDriverState to a different AioContext.
diff --git a/docs/devel/nested-papr.txt b/docs/devel/nested-papr.txt
deleted file mode 100644
index 9094365..0000000
--- a/docs/devel/nested-papr.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-Nested PAPR API (aka KVM on PowerVM)
-====================================
-
-This API aims at providing support to enable nested virtualization with
-KVM on PowerVM. While the existing support for nested KVM on PowerNV was
-introduced with cap-nested-hv option, however, with a slight design change,
-to enable this on papr/pseries, a new cap-nested-papr option is added. eg:
-
- qemu-system-ppc64 -cpu POWER10 -machine pseries,cap-nested-papr=true ...
-
-Work by:
- Michael Neuling <mikey@neuling.org>
- Vaibhav Jain <vaibhav@linux.ibm.com>
- Jordan Niethe <jniethe5@gmail.com>
- Harsh Prateek Bora <harshpb@linux.ibm.com>
- Shivaprasad G Bhat <sbhat@linux.ibm.com>
- Kautuk Consul <kconsul@linux.vnet.ibm.com>
-
-Below taken from the kernel documentation:
-
-Introduction
-============
-
-This document explains how a guest operating system can act as a
-hypervisor and run nested guests through the use of hypercalls, if the
-hypervisor has implemented them. The terms L0, L1, and L2 are used to
-refer to different software entities. L0 is the hypervisor mode entity
-that would normally be called the "host" or "hypervisor". L1 is a
-guest virtual machine that is directly run under L0 and is initiated
-and controlled by L0. L2 is a guest virtual machine that is initiated
-and controlled by L1 acting as a hypervisor. A significant design change
-wrt existing API is that now the entire L2 state is maintained within L0.
-
-Existing Nested-HV API
-======================
-
-Linux/KVM has had support for Nesting as an L0 or L1 since 2018
-
-The L0 code was added::
-
- commit 8e3f5fc1045dc49fd175b978c5457f5f51e7a2ce
- Author: Paul Mackerras <paulus@ozlabs.org>
- Date: Mon Oct 8 16:31:03 2018 +1100
- KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
-
-The L1 code was added::
-
- commit 360cae313702cdd0b90f82c261a8302fecef030a
- Author: Paul Mackerras <paulus@ozlabs.org>
- Date: Mon Oct 8 16:31:04 2018 +1100
- KVM: PPC: Book3S HV: Nested guest entry via hypercall
-
-This API works primarily using a signal hcall h_enter_nested(). This
-call made by the L1 to tell the L0 to start an L2 vCPU with the given
-state. The L0 then starts this L2 and runs until an L2 exit condition
-is reached. Once the L2 exits, the state of the L2 is given back to
-the L1 by the L0. The full L2 vCPU state is always transferred from
-and to L1 when the L2 is run. The L0 doesn't keep any state on the L2
-vCPU (except in the short sequence in the L0 on L1 -> L2 entry and L2
--> L1 exit).
-
-The only state kept by the L0 is the partition table. The L1 registers
-it's partition table using the h_set_partition_table() hcall. All
-other state held by the L0 about the L2s is cached state (such as
-shadow page tables).
-
-The L1 may run any L2 or vCPU without first informing the L0. It
-simply starts the vCPU using h_enter_nested(). The creation of L2s and
-vCPUs is done implicitly whenever h_enter_nested() is called.
-
-In this document, we call this existing API the v1 API.
-
-New PAPR API
-===============
-
-The new PAPR API changes from the v1 API such that the creating L2 and
-associated vCPUs is explicit. In this document, we call this the v2
-API.
-
-h_enter_nested() is replaced with H_GUEST_VCPU_RUN(). Before this can
-be called the L1 must explicitly create the L2 using h_guest_create()
-and any associated vCPUs() created with h_guest_create_vCPU(). Getting
-and setting vCPU state can also be performed using h_guest_{g|s}et
-hcall.
-
-The basic execution flow is for an L1 to create an L2, run it, and
-delete it is:
-
-- L1 and L0 negotiate capabilities with H_GUEST_{G,S}ET_CAPABILITIES()
- (normally at L1 boot time).
-
-- L1 requests the L0 to create an L2 with H_GUEST_CREATE() and receives a token
-
-- L1 requests the L0 to create an L2 vCPU with H_GUEST_CREATE_VCPU()
-
-- L1 and L0 communicate the vCPU state using the H_GUEST_{G,S}ET() hcall
-
-- L1 requests the L0 to run the vCPU using H_GUEST_RUN_VCPU() hcall
-
-- L1 deletes L2 with H_GUEST_DELETE()
-
-For more details, please refer:
-
-[1] Linux Kernel documentation (upstream documentation commit):
-
-commit 476652297f94a2e5e5ef29e734b0da37ade94110
-Author: Michael Neuling <mikey@neuling.org>
-Date: Thu Sep 14 13:06:00 2023 +1000
-
- docs: powerpc: Document nested KVM on POWER
-
- Document support for nested KVM on POWER using the existing API as well
- as the new PAPR API. This includes the new HCALL interface and how it
- used by KVM.
-
- Signed-off-by: Michael Neuling <mikey@neuling.org>
- Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
- Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
- Link: https://msgid.link/20230914030600.16993-12-jniethe5@gmail.com
diff --git a/docs/devel/qapi-code-gen.rst b/docs/devel/qapi-code-gen.rst
index 583207a..231cc0f 100644
--- a/docs/devel/qapi-code-gen.rst
+++ b/docs/devel/qapi-code-gen.rst
@@ -9,6 +9,7 @@ How to use the QAPI code generator
This work is licensed under the terms of the GNU GPL, version 2 or
later. See the COPYING file in the top-level directory.
+.. _qapi:
Introduction
============
@@ -228,7 +229,8 @@ These are of the form PREFIX_NAME, where PREFIX is derived from the
enumeration type's name, and NAME from the value's name. For the
example above, the generator maps 'MyEnum' to MY_ENUM and 'value1' to
VALUE1, resulting in the enumeration constant MY_ENUM_VALUE1. The
-optional 'prefix' member overrides PREFIX.
+optional 'prefix' member overrides PREFIX. This is rarely necessary,
+and should be used with restraint.
The generated C enumeration constants have values 0, 1, ..., N-1 (in
QAPI schema order), where N is the number of values. There is an
@@ -761,8 +763,8 @@ Names beginning with ``x-`` used to signify "experimental". This
convention has been replaced by special feature "unstable".
Pragmas ``command-name-exceptions`` and ``member-name-exceptions`` let
-you violate naming rules. Use for new code is strongly discouraged. See
-`Pragma directives`_ for details.
+you violate naming rules. Use for new code is strongly discouraged.
+See `Pragma directives`_ for details.
Downstream extensions
@@ -1011,7 +1013,7 @@ like this::
document the success and the error response, respectively.
"Errors" sections should be formatted as an rST list, each entry
-detailing a relevant error condition. For example::
+detailing a relevant error condition. For example::
# Errors:
# - If @device does not exist, DeviceNotFound
@@ -1024,31 +1026,28 @@ definition.
QMP). In other sections, the text is formatted, and rST markup can be
used.
-QMP Examples can be added by using the ``.. qmp-example::``
-directive. In its simplest form, this can be used to contain a single
-QMP code block which accepts standard JSON syntax with additional server
-directionality indicators (``->`` and ``<-``), and elisions (``...``).
+QMP Examples can be added by using the ``.. qmp-example::`` directive.
+In its simplest form, this can be used to contain a single QMP code
+block which accepts standard JSON syntax with additional server
+directionality indicators (``->`` and ``<-``), and elisions. An
+elision is commonly ``...``, but it can also be a pair of ``...``
+with text in between.
Optionally, a plaintext title may be provided by using the ``:title:``
-directive option. If the title is omitted, the example title will
+directive option. If the title is omitted, the example title will
default to "Example:".
A simple QMP example::
# .. qmp-example::
- # :title: Using query-block
#
- # -> { "execute": "query-block" }
- # <- { ... }
+ # -> { "execute": "query-name" }
+ # <- { "return": { "name": "Fred" } }
-More complex or multi-step examples where exposition is needed before or
-between QMP code blocks can be created by using the ``:annotated:``
-directive option. When using this option, nested QMP code blocks must be
-entered explicitly with rST's ``::`` syntax.
-
-Highlighting in non-QMP languages can be accomplished by using the
-``.. code-block:: lang`` directive, and non-highlighted text can be
-achieved by omitting the language argument.
+More complex or multi-step examples where exposition is needed before
+or between QMP code blocks can be created by using the ``:annotated:``
+directive option. When using this option, nested QMP code blocks must
+be entered explicitly with rST's ``::`` syntax.
For example::
@@ -1059,11 +1058,21 @@ For example::
# This is a more complex example that can use
# ``arbitrary rST syntax`` in its exposition::
#
- # -> { "execute": "query-block" }
- # <- { ... }
+ # -> { "execute": "query-block" }
+ # <- { "return": [
+ # {
+ # "device": "ide0-hd0",
+ # ...
+ # }
+ # ... more ...
+ # ] }
#
# Above, lengthy output has been omitted for brevity.
+Highlighting in non-QMP languages can be accomplished by using the
+``.. code-block:: lang`` directive, and non-highlighted text can be
+achieved by omitting the language argument.
+
Examples of complete definition documentation::
@@ -1464,7 +1473,9 @@ As an example, we'll use the following schema, which describes a
single complex user-defined type, along with command which takes a
list of that type as a parameter, and returns a single element of that
type. The user is responsible for writing the implementation of
-qmp_my_command(); everything else is produced by the generator. ::
+qmp_my_command(); everything else is produced by the generator.
+
+::
$ cat example-schema.json
{ 'struct': 'UserDefOne',
@@ -1854,7 +1865,7 @@ Example::
#ifndef EXAMPLE_QAPI_INIT_COMMANDS_H
#define EXAMPLE_QAPI_INIT_COMMANDS_H
- #include "qapi/qmp/dispatch.h"
+ #include "qapi/qmp-registry.h"
void example_qmp_init_marshal(QmpCommandList *cmds);
@@ -1985,7 +1996,7 @@ Example::
#ifndef EXAMPLE_QAPI_INTROSPECT_H
#define EXAMPLE_QAPI_INTROSPECT_H
- #include "qapi/qmp/qlit.h"
+ #include "qobject/qlit.h"
extern const QLitObject example_qmp_schema_qlit;
diff --git a/docs/devel/qapi-domain.rst b/docs/devel/qapi-domain.rst
new file mode 100644
index 0000000..1123872
--- /dev/null
+++ b/docs/devel/qapi-domain.rst
@@ -0,0 +1,716 @@
+======================
+The Sphinx QAPI Domain
+======================
+
+An extension to the `rST syntax
+<https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html>`_
+in Sphinx is provided by the QAPI Domain, located in
+``docs/sphinx/qapi_domain.py``. This extension is analogous to the
+`Python Domain
+<https://www.sphinx-doc.org/en/master/usage/domains/python.html>`_
+included with Sphinx, but provides special directives and roles
+specifically for annotating and documenting QAPI definitions.
+
+A `Domain
+<https://www.sphinx-doc.org/en/master/usage/domains/index.html>`_
+provides a set of special rST directives and cross-referencing roles to
+Sphinx for understanding rST markup written to document a specific
+language. By itself, this QAPI extension is only sufficient to parse rST
+markup written by hand; the `autodoc
+<https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html>`_
+functionality is provided elsewhere, in ``docs/sphinx/qapidoc.py``, by
+the "Transmogrifier".
+
+It is not expected that any developer or documentation writer would
+ever need to write *or* read these special rST forms. However, in the
+event that something needs to be debugged, knowing the syntax of the
+domain is quite handy. This reference may also be useful as a guide for
+understanding the QAPI Domain extension code itself. Although most of
+these forms will not be needed for documentation writing purposes,
+understanding the cross-referencing syntax *will* be helpful when
+writing rST documentation elsewhere, or for enriching the body of
+QAPIDoc blocks themselves.
+
+
+Concepts
+========
+
+The QAPI Domain itself provides no mechanisms for reading the QAPI
+Schema or generating documentation from code that exists. It is merely
+the rST syntax used to describe things. For instance, the Sphinx Python
+domain adds syntax like ``:py:func:`` for describing Python functions in
+documentation, but it's the autodoc module that is responsible for
+reading Python code and generating such syntax. QAPI is analogous here:
+qapidoc.py is responsible for reading the QAPI Schema and generating rST
+syntax, and qapi_domain.py is responsible for translating that special
+syntax and providing APIs for Sphinx internals.
+
+In other words:
+
+qapi_domain.py adds syntax like ``.. qapi:command::`` to Sphinx, and
+qapidoc.py transforms the documentation in ``qapi/*.json`` into rST
+using directives defined by the domain.
+
+Or even shorter:
+
+``:py:`` is to ``:qapi:`` as *autodoc* is to *qapidoc*.
+
+
+Info Field Lists
+================
+
+`Field lists
+<https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#field-lists>`_
+are a standard syntax in reStructuredText. Sphinx `extends that syntax
+<https://www.sphinx-doc.org/en/master/usage/domains/python.html#info-field-lists>`_
+to give certain field list entries special meaning and parsing in order to,
+for example, add cross-references. The QAPI Domain takes advantage of this
+field list extension to document things like Arguments, Members, Values,
+and so on.
+
+The special parsing and handling of info field lists in Sphinx is provided by
+three main classes: Field, GroupedField, and TypedField. The behavior
+and formatting for each configured field list entry in the domain
+changes depending on which class is used.
+
+Field:
+ * Creates an ungrouped field: i.e., each entry will create its own
+ section and they will not be combined.
+ * May *optionally* support an argument.
+ * May apply cross-reference roles to *either* the argument *or* the
+ content body, both, or neither.
+
+This is used primarily for entries which are not expected to be
+repeated, i.e., items that may show up at most once. The QAPI
+domain uses this class for the "Errors" section.
+
+GroupedField:
+ * Creates a grouped field: i.e. multiple adjacent entries will be
+ merged into one section, and the content will form a bulleted list.
+ * *Must* take an argument.
+ * May optionally apply a cross-reference role to the argument, but not
+ the body.
+ * Can be configured to remove the bulleted list if there is only a
+ single entry.
+ * All items will be generated with the form: "argument -- body"
+
+This is used for entries which are expected to be repeated, but aren't
+expected to have two arguments, i.e. types without names, or names
+without types. The QAPI domain uses this class for features, returns,
+and enum values.
+
+TypedField:
+  * Creates a grouped, typed field. Multiple adjacent entries will be
+ merged into one section, and the content will form a bulleted list.
+ * *Must* take at least one argument, but supports up to two -
+ nominally, a name and a type.
+ * May optionally apply a cross-reference role to the type or the name
+ argument, but not the body.
+ * Can be configured to remove the bulleted list if there is only a
+ single entry.
+ * All items will be generated with the form "name (type) -- body"
+
+This is used for entries that are expected to be repeated and will have
+a name, a type, and a description. The QAPI domain uses this class for
+arguments, alternatives, and members. Wherever type names are referenced
+below, they must be a valid, documented type that will be
+cross-referenced in the HTML output; or one of the built-in JSON types
+(string, number, int, boolean, null, value, q_empty).
+
+
+``:feat:``
+----------
+
+Document a feature attached to a QAPI definition.
+
+:availability: This field list is available in the body of Command,
+ Event, Enum, Object and Alternate directives.
+:syntax: ``:feat name: Lorem ipsum, dolor sit amet...``
+:type: `sphinx.util.docfields.GroupedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.GroupedField.html?private=1>`_
+
+Example::
+
+ .. qapi:object:: BlockdevOptionsVirtioBlkVhostVdpa
+ :since: 7.2
+ :ifcond: CONFIG_BLKIO
+
+ Driver specific block device options for the virtio-blk-vhost-vdpa
+ backend.
+
+ :memb string path: path to the vhost-vdpa character device.
+ :feat fdset: Member ``path`` supports the special "/dev/fdset/N" path
+ (since 8.1)
+
+
+``:arg:``
+---------
+
+Document an argument to a QAPI command.
+
+:availability: This field list is only available in the body of the
+ Command directive.
+:syntax: ``:arg type name: description``
+:type: `sphinx.util.docfields.TypedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.TypedField.html?private=1>`_
+
+
+Example::
+
+ .. qapi:command:: job-pause
+ :since: 3.0
+
+ Pause an active job.
+
+ This command returns immediately after marking the active job for
+ pausing. Pausing an already paused job is an error.
+
+ The job will pause as soon as possible, which means transitioning
+ into the PAUSED state if it was RUNNING, or into STANDBY if it was
+ READY. The corresponding JOB_STATUS_CHANGE event will be emitted.
+
+ Cancelling a paused job automatically resumes it.
+
+ :arg string id: The job identifier.
+
+
+``:error:``
+-----------
+
+Document the error condition(s) of a QAPI command.
+
+:availability: This field list is only available in the body of the
+ Command directive.
+:syntax: ``:error: Lorem ipsum dolor sit amet ...``
+:type: `sphinx.util.docfields.Field
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.Field.html?private=1>`_
+
+The format of the ``:error:`` field list description is free-form rST. The
+alternative spelling ``:errors:`` is also permitted and is strictly
+analogous.
+
+Example::
+
+ .. qapi:command:: block-job-set-speed
+ :since: 1.1
+
+ Set maximum speed for a background block operation.
+
+ This command can only be issued when there is an active block job.
+
+ Throttling can be disabled by setting the speed to 0.
+
+ :arg string device: The job identifier. This used to be a device
+ name (hence the name of the parameter), but since QEMU 2.7 it
+ can have other values.
+ :arg int speed: the maximum speed, in bytes per second, or 0 for
+ unlimited. Defaults to 0.
+ :error:
+ - If no background operation is active on this device,
+ DeviceNotActive
+
+
+``:return:``
+-------------
+
+Document the return type(s) and value(s) of a QAPI command.
+
+:availability: This field list is only available in the body of the
+ Command directive.
+:syntax: ``:return type: Lorem ipsum dolor sit amet ...``
+:type: `sphinx.util.docfields.GroupedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.GroupedField.html?private=1>`_
+
+
+Example::
+
+ .. qapi:command:: query-replay
+ :since: 5.2
+
+ Retrieve the record/replay information. It includes current
+ instruction count which may be used for ``replay-break`` and
+ ``replay-seek`` commands.
+
+ :return ReplayInfo: record/replay information.
+
+ .. qmp-example::
+
+ -> { "execute": "query-replay" }
+ <- { "return": {
+ "mode": "play", "filename": "log.rr", "icount": 220414 }
+ }
+
+
+``:value:``
+-----------
+
+Document a possible value for a QAPI enum.
+
+:availability: This field list is only available in the body of the Enum
+ directive.
+:syntax: ``:value name: Lorem ipsum, dolor sit amet ...``
+:type: `sphinx.util.docfields.GroupedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.GroupedField.html?private=1>`_
+
+Example::
+
+ .. qapi:enum:: QapiErrorClass
+ :since: 1.2
+
+ QEMU error classes
+
+ :value GenericError: this is used for errors that don't require a specific
+ error class. This should be the default case for most errors
+ :value CommandNotFound: the requested command has not been found
+      :value DeviceNotActive: a device has failed to become active
+ :value DeviceNotFound: the requested device has not been found
+ :value KVMMissingCap: the requested operation can't be fulfilled because a
+ required KVM capability is missing
+
+
+``:alt:``
+------------
+
+Document a possible branch for a QAPI alternate.
+
+:availability: This field list is only available in the body of the
+ Alternate directive.
+:syntax: ``:alt type name: Lorem ipsum, dolor sit amet ...``
+:type: `sphinx.util.docfields.TypedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.TypedField.html?private=1>`_
+
+As a limitation of Sphinx, we must document the "name" of the branch in
+addition to the type, even though this information is not visible on the
+wire in the QMP protocol format. This limitation *may* be lifted at a
+future date.
+
+Example::
+
+ .. qapi:alternate:: StrOrNull
+ :since: 2.10
+
+ This is a string value or the explicit lack of a string (null
+ pointer in C). Intended for cases when 'optional absent' already
+ has a different meaning.
+
+ :alt string s: the string value
+ :alt null n: no string value
+
+
+``:memb:``
+----------
+
+Document a member of an Event or Object.
+
+:availability: This field list is available in the body of Event or
+ Object directives.
+:syntax: ``:memb type name: Lorem ipsum, dolor sit amet ...``
+:type: `sphinx.util.docfields.TypedField
+ <https://pydoc.dev/sphinx/latest/sphinx.util.docfields.TypedField.html?private=1>`_
+
+This is fundamentally the same as ``:arg:`` and ``:alt:``, but uses the
+"Members" phrasing for Events and Objects (Structs and Unions).
+
+Example::
+
+ .. qapi:event:: JOB_STATUS_CHANGE
+ :since: 3.0
+
+ Emitted when a job transitions to a different status.
+
+ :memb string id: The job identifier
+ :memb JobStatus status: The new job status
+
+
+Arbitrary field lists
+---------------------
+
+Other field list names, while valid rST syntax, are prohibited inside of
+QAPI directives to help prevent accidental misspellings of info field
+list names. If you want to add a new arbitrary "non-value-added" field
+list to QAPI documentation, you must add the field name to the allow
+list in ``docs/conf.py``.
+
+For example::
+
+ qapi_allowed_fields = {
+ "see also",
+ }
+
+Will allow you to add arbitrary field lists in QAPI directives::
+
+ .. qapi:command:: x-fake-command
+
+ :see also: Lorem ipsum, dolor sit amet ...
+
+
+Cross-references
+================
+
+Cross-reference `roles
+<https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html>`_
+in the QAPI domain are modeled closely after the `Python
+cross-referencing syntax
+<https://www.sphinx-doc.org/en/master/usage/domains/python.html#cross-referencing-python-objects>`_.
+
+QAPI definitions can be referenced using the standard `any
+<https://www.sphinx-doc.org/en/master/usage/referencing.html#role-any>`_
+role cross-reference syntax, such as with ```query-blockstats```. In
+the event that disambiguation is needed, cross-references can also be
+written using a number of explicit cross-reference roles:
+
+* ``:qapi:mod:`block-core``` -- Reference a QAPI module. The link will
+ take you to the beginning of that section in the documentation.
+* ``:qapi:cmd:`query-block``` -- Reference a QAPI command.
+* ``:qapi:event:`JOB_STATUS_CHANGE``` -- Reference a QAPI event.
+* ``:qapi:enum:`QapiErrorClass``` -- Reference a QAPI enum.
+* ``:qapi:obj:`BlockdevOptionsVirtioBlkVhostVdpa``` -- Reference a QAPI
+  object (struct or union).
+* ``:qapi:alt:`StrOrNull``` -- Reference a QAPI alternate.
+* ``:qapi:type:`BlockDirtyInfo``` -- Reference *any* QAPI type; this
+ excludes modules, commands, and events.
+* ``:qapi:any:`block-job-set-speed``` -- Reference absolutely any QAPI entity.
+
+Type arguments in info field lists are converted into references as if
+you had used the ``:qapi:type:`` role. All of the special syntax below
+applies to both info field lists and standalone explicit
+cross-references.
+
+
+Type decorations
+----------------
+
+Type names in references can be surrounded by brackets, like
+``[typename]``, to indicate an array of that type. The cross-reference
+will apply only to the type name between the brackets. For example,
+``:qapi:type:`[Qcow2BitmapInfoFlags]``` renders to:
+:qapi:type:`[QMP:Qcow2BitmapInfoFlags]`
+
+To indicate an optional argument/member in a field list, the type name
+can be suffixed with ``?``. The cross-reference will be transformed to
+"type, Optional" with the link applying only to the type name. For
+example, ``:qapi:type:`BitmapSyncMode?``` renders to:
+:qapi:type:`QMP:BitmapSyncMode?`
+
+
+Namespaces
+----------
+
+Mimicking the `Python domain target specification syntax
+<https://www.sphinx-doc.org/en/master/usage/domains/python.html#target-specification>`_,
+QAPI allows you to specify the fully qualified path for a data
+type.
+
+* A namespace can be explicitly provided;
+  e.g. ``:qapi:type:`QMP:BitmapSyncMode```
+* A module can be explicitly provided;
+ ``:qapi:type:`QMP:block-core.BitmapSyncMode``` will render to:
+ :qapi:type:`QMP:block-core.BitmapSyncMode`
+* If you don't want to display the "fully qualified" name, it can be
+ prefixed with a tilde; ``:qapi:type:`~QMP:block-core.BitmapSyncMode```
+ will render to: :qapi:type:`~QMP:block-core.BitmapSyncMode`
+
+
+Target resolution
+-----------------
+
+Any cross-reference to a QAPI type, whether using the ```any``` style of
+reference or the more explicit ```:qapi:any:`target``` syntax, allows
+for the presence or absence of either the namespace or module
+information.
+
+When absent, their value will be inferred from context by the presence
+of any ``qapi:namespace`` or ``qapi:module`` directives preceding the
+cross-reference.
+
+If no results are found when using the inferred values, other
+namespaces/modules will be searched as a last resort; but any explicitly
+provided values must always match in order to succeed.
+
+This allows for efficient cross-referencing with a minimum of syntax in
+the large majority of cases, but additional context or namespace markup
+may be required outside of the QAPI reference documents when linking to
+items that share a name across multiple documented QAPI schema.
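+
+For example, a hypothetical document (a sketch; the definitions
+themselves are not shown) could establish a resolution context and then
+use bare references::
+
+    .. qapi:namespace:: QMP
+
+    .. qapi:module:: block-core
+
+    A reference such as :qapi:type:`BitmapSyncMode` written after the
+    two directives above is first resolved as if it had been written
+    ``QMP:block-core.BitmapSyncMode``; only if that fails are other
+    namespaces and modules searched.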
+
+
+Custom link text
+----------------
+
+The name of a cross-reference link can be explicitly overridden like
+`most stock Sphinx references
+<https://www.sphinx-doc.org/en/master/usage/referencing.html#syntax>`_
+using the ``custom text <target>`` syntax.
+
+For example, ``:qapi:cmd:`Merge dirty bitmaps
+<block-dirty-bitmap-merge>``` will render as: :qapi:cmd:`Merge dirty
+bitmaps <QMP:block-dirty-bitmap-merge>`
+
+
+Directives
+==========
+
+The QAPI domain adds a number of custom directives for documenting
+various QAPI/QMP entities. The syntax is plain rST, and follows this
+general format::
+
+ .. qapi:directive:: argument
+ :option:
+ :another-option: with an argument
+
+ Content body, arbitrary rST is allowed here.
+
+
+Sphinx standard options
+-----------------------
+
+All QAPI directives inherit a number of `standard options
+<https://www.sphinx-doc.org/en/master/usage/domains/index.html#basic-markup>`_
+from Sphinx's ObjectDescription class.
+
+The dashed spellings of the options below were added in Sphinx 7.2; the
+undashed spellings are currently retained as aliases, but will be
+removed in a future version.
+
+* ``:no-index:`` and ``:noindex:`` -- Do not add this item into the
+ Index, and do not make it available for cross-referencing.
+* ``:no-index-entry:`` and ``:noindexentry:`` -- Do not add this item
+ into the Index, but allow it to be cross-referenced.
+* ``:no-contents-entry:`` and ``:nocontentsentry:`` -- Exclude this item
+ from the Table of Contents.
+* ``:no-typesetting:`` -- Create TOC, Index and cross-referencing
+ entities, but don't actually display the content.
+
+
+QAPI standard options
+---------------------
+
+All QAPI directives -- *except* for namespace and module -- support
+these common options.
+
+* ``:namespace: name`` -- This option allows you to override the
+ namespace association of a given definition.
+* ``:module: modname`` -- Borrowed from the Python domain, this option allows
+ you to override the module association of a given definition.
+* ``:since: x.y`` -- Allows the documenting of "Since" information, which is
+ displayed in the signature bar.
+* ``:ifcond: CONDITION`` -- Allows the documenting of conditional availability
+ information, which is displayed in an eyecatch just below the
+ signature bar.
+* ``:deprecated:`` -- Adds an eyecatch just below the signature bar that
+ advertises that this definition is deprecated and should be avoided.
+* ``:unstable:`` -- Adds an eyecatch just below the signature bar that
+ advertises that this definition is unstable and should not be used in
+ production code.
+
+
+qapi:namespace
+--------------
+
+The ``qapi:namespace`` directive marks the start of a QAPI namespace. It
+does not take a content body, nor any options. All subsequent QAPI
+directives are associated with the most recent namespace. This affects
+the definition's "fully qualified name", allowing two different
+namespaces to create an otherwise identically named definition.
+
+This directive also influences how reference resolution works for any
+references that do not explicitly specify a namespace, so this directive
+can be used to nudge references into preferring targets from within that
+namespace.
+
+Example::
+
+ .. qapi:namespace:: QMP
+
+
+This directive has no visible effect.
+
+
+qapi:module
+-----------
+
+The ``qapi:module`` directive marks the start of a QAPI module. It may have
+a content body, but it can be omitted. All subsequent QAPI directives
+are associated with the most recent module; this affects their "fully
+qualified" name, but has no other effect.
+
+Example::
+
+ .. qapi:module:: block-core
+
+ Welcome to the block-core module!
+
+Will be rendered as:
+
+.. qapi:module:: block-core
+ :noindex:
+
+ Welcome to the block-core module!
+
+
+qapi:command
+------------
+
+This directive documents a QMP command. It may use any of the standard
+Sphinx or QAPI options, and the documentation body may contain
+``:arg:``, ``:feat:``, ``:error:``, or ``:return:`` info field list
+entries.
+
+Example::
+
+ .. qapi:command:: x-fake-command
+ :since: 42.0
+ :unstable:
+
+ This command is fake, so it can't hurt you!
+
+ :arg int foo: Your favorite number.
+ :arg string? bar: Your favorite season.
+ :return [string]: A lovely computer-written poem for you.
+
+
+Will be rendered as:
+
+ .. qapi:command:: x-fake-command
+ :noindex:
+ :since: 42.0
+ :unstable:
+
+ This command is fake, so it can't hurt you!
+
+ :arg int foo: Your favorite number.
+ :arg string? bar: Your favorite season.
+ :return [string]: A lovely computer-written poem for you.
+
+
+qapi:event
+----------
+
+This directive documents a QMP event. It may use any of the standard
+Sphinx or QAPI options, and the documentation body may contain
+``:memb:`` or ``:feat:`` info field list entries.
+
+Example::
+
+ .. qapi:event:: COMPUTER_IS_RUINED
+ :since: 0.1
+ :deprecated:
+
+ This event is emitted when your computer is *extremely* ruined.
+
+ :memb string reason: Diagnostics as to what caused your computer to
+ be ruined.
+ :feat sadness: When present, the diagnostic message will also
+ explain how sad the computer is as a result of your wrongdoings.
+
+Will be rendered as:
+
+.. qapi:event:: COMPUTER_IS_RUINED
+ :noindex:
+ :since: 0.1
+ :deprecated:
+
+ This event is emitted when your computer is *extremely* ruined.
+
+ :memb string reason: Diagnostics as to what caused your computer to
+ be ruined.
+ :feat sadness: When present, the diagnostic message will also explain
+ how sad the computer is as a result of your wrongdoings.
+
+
+qapi:enum
+---------
+
+This directive documents a QAPI enum. It may use any of the standard
+Sphinx or QAPI options, and the documentation body may contain
+``:value:`` or ``:feat:`` info field list entries.
+
+Example::
+
+ .. qapi:enum:: Mood
+ :ifcond: LIB_PERSONALITY
+
+ This enum represents your virtual machine's current mood!
+
+ :value Happy: Your VM is content and well-fed.
+ :value Hungry: Your VM needs food.
+ :value Melancholic: Your VM is experiencing existential angst.
+ :value Petulant: Your VM is throwing a temper tantrum.
+
+Will be rendered as:
+
+.. qapi:enum:: Mood
+ :noindex:
+ :ifcond: LIB_PERSONALITY
+
+ This enum represents your virtual machine's current mood!
+
+ :value Happy: Your VM is content and well-fed.
+ :value Hungry: Your VM needs food.
+ :value Melancholic: Your VM is experiencing existential angst.
+ :value Petulant: Your VM is throwing a temper tantrum.
+
+
+qapi:object
+-----------
+
+This directive documents a QAPI structure or union and represents a QMP
+object. It may use any of the standard Sphinx or QAPI options, and the
+documentation body may contain ``:memb:`` or ``:feat:`` info field list
+entries.
+
+Example::
+
+ .. qapi:object:: BigBlobOfStuff
+
+ This object has a bunch of disparate and unrelated things in it.
+
+ :memb int Birthday: Your birthday, represented in seconds since the
+ UNIX epoch.
+ :memb [string] Fav-Foods: A list of your favorite foods.
+ :memb boolean? Bizarre-Docs: True if the documentation reference
+ should be strange.
+
+Will be rendered as:
+
+.. qapi:object:: BigBlobOfStuff
+ :noindex:
+
+ This object has a bunch of disparate and unrelated things in it.
+
+ :memb int Birthday: Your birthday, represented in seconds since the
+ UNIX epoch.
+ :memb [string] Fav-Foods: A list of your favorite foods.
+ :memb boolean? Bizarre-Docs: True if the documentation reference
+ should be strange.
+
+
+qapi:alternate
+--------------
+
+This directive documents a QAPI alternate. It may use any of the
+standard Sphinx or QAPI options, and the documentation body may contain
+``:alt:`` or ``:feat:`` info field list entries.
+
+Example::
+
+ .. qapi:alternate:: ErrorCode
+
+ This alternate represents an Error Code from the VM.
+
+ :alt int ec: An error code, like the type you're used to.
+ :alt string em: An expletive-laced error message, if your
+ computer is feeling particularly cranky and tired of your
+ antics.
+
+Will be rendered as:
+
+.. qapi:alternate:: ErrorCode
+ :noindex:
+
+ This alternate represents an Error Code from the VM.
+
+ :alt int ec: An error code, like the type you're used to.
+ :alt string em: An expletive-laced error message, if your
+ computer is feeling particularly cranky and tired of your
+ antics.
diff --git a/docs/devel/qom.rst b/docs/devel/qom.rst
index 0889ca9..5870745 100644
--- a/docs/devel/qom.rst
+++ b/docs/devel/qom.rst
@@ -147,7 +147,7 @@ to introduce an overridden virtual function:
#include "qdev.h"
- void my_device_class_init(ObjectClass *klass, void *class_data)
+ void my_device_class_init(ObjectClass *klass, const void *class_data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->reset = my_device_reset;
@@ -249,7 +249,7 @@ class, which someone might choose to change at some point.
// do something
}
- static void my_class_init(ObjectClass *oc, void *data)
+ static void my_class_init(ObjectClass *oc, const void *data)
{
MyClass *mc = MY_CLASS(oc);
@@ -279,7 +279,7 @@ class, which someone might choose to change at some point.
// do something else here
}
- static void derived_class_init(ObjectClass *oc, void *data)
+ static void derived_class_init(ObjectClass *oc, const void *data)
{
MyClass *mc = MY_CLASS(oc);
DerivedClass *dc = DERIVED_CLASS(oc);
@@ -363,7 +363,7 @@ This is equivalent to the following:
:caption: Expansion from defining a simple type
static void my_device_finalize(Object *obj);
- static void my_device_class_init(ObjectClass *oc, void *data);
+ static void my_device_class_init(ObjectClass *oc, const void *data);
static void my_device_init(Object *obj);
static const TypeInfo my_device_info = {
diff --git a/docs/devel/rcu.txt b/docs/devel/rcu.rst
index 2e6cc60..dd07c1d 100644
--- a/docs/devel/rcu.txt
+++ b/docs/devel/rcu.rst
@@ -20,7 +20,7 @@ for the execution of all *currently running* critical sections before
proceeding, or before asynchronously executing a callback.
The key point here is that only the currently running critical sections
-are waited for; critical sections that are started _after_ the beginning
+are waited for; critical sections that are started **after** the beginning
of the wait do not extend the wait, despite running concurrently with
the updater. This is the reason why RCU is more scalable than,
for example, reader-writer locks. It is so much more scalable that
@@ -37,7 +37,7 @@ do not matter; as soon as all previous critical sections have finished,
there cannot be any readers who hold references to the data structure,
and these can now be safely reclaimed (e.g., freed or unref'ed).
-Here is a picture:
+Here is a picture::
thread 1 thread 2 thread 3
------------------- ------------------------ -------------------
@@ -58,43 +58,38 @@ that critical section.
RCU API
-=======
+-------
The core RCU API is small:
- void rcu_read_lock(void);
-
+``void rcu_read_lock(void);``
Used by a reader to inform the reclaimer that the reader is
entering an RCU read-side critical section.
- void rcu_read_unlock(void);
-
+``void rcu_read_unlock(void);``
Used by a reader to inform the reclaimer that the reader is
exiting an RCU read-side critical section. Note that RCU
read-side critical sections may be nested and/or overlapping.
- void synchronize_rcu(void);
-
+``void synchronize_rcu(void);``
Blocks until all pre-existing RCU read-side critical sections
on all threads have completed. This marks the end of the removal
phase and the beginning of reclamation phase.
Note that it would be valid for another update to come while
- synchronize_rcu is running. Because of this, it is better that
+ ``synchronize_rcu`` is running. Because of this, it is better that
the updater releases any locks it may hold before calling
- synchronize_rcu. If this is not possible (for example, because
- the updater is protected by the BQL), you can use call_rcu.
+ ``synchronize_rcu``. If this is not possible (for example, because
+ the updater is protected by the BQL), you can use ``call_rcu``.
- void call_rcu1(struct rcu_head * head,
- void (*func)(struct rcu_head *head));
-
- This function invokes func(head) after all pre-existing RCU
+``void call_rcu1(struct rcu_head * head, void (*func)(struct rcu_head *head));``
+ This function invokes ``func(head)`` after all pre-existing RCU
read-side critical sections on all threads have completed. This
marks the end of the removal phase, with func taking care
asynchronously of the reclamation phase.
- The foo struct needs to have an rcu_head structure added,
- perhaps as follows:
+ The ``foo`` struct needs to have an ``rcu_head`` structure added,
+ perhaps as follows::
struct foo {
struct rcu_head rcu;
@@ -103,8 +98,8 @@ The core RCU API is small:
long c;
};
- so that the reclaimer function can fetch the struct foo address
- and free it:
+ so that the reclaimer function can fetch the ``struct foo`` address
+ and free it::
call_rcu1(&foo.rcu, foo_reclaim);
@@ -114,29 +109,27 @@ The core RCU API is small:
g_free(fp);
}
- For the common case where the rcu_head member is the first of the
- struct, you can use the following macro.
+ ``call_rcu1`` is typically used via either the ``call_rcu`` or
+ ``g_free_rcu`` macros, which handle the common case where the
+  ``rcu_head`` member is the first field of the struct.
- void call_rcu(T *p,
- void (*func)(T *p),
- field-name);
- void g_free_rcu(T *p,
- field-name);
+``void call_rcu(T *p, void (*func)(T *p), field-name);``
+ If the ``struct rcu_head`` is the first field in the struct, you can
+ use this macro instead of ``call_rcu1``.
- call_rcu1 is typically used through these macro, in the common case
- where the "struct rcu_head" is the first field in the struct. If
- the callback function is g_free, in particular, g_free_rcu can be
- used. In the above case, one could have written simply:
+``void g_free_rcu(T *p, field-name);``
+ This is a special-case version of ``call_rcu`` where the callback
+ function is ``g_free``.
+ In the example given in ``call_rcu1``, one could have written simply::
g_free_rcu(&foo, rcu);
- typeof(*p) qatomic_rcu_read(p);
-
- qatomic_rcu_read() is similar to qatomic_load_acquire(), but it makes
- some assumptions on the code that calls it. This allows a more
- optimized implementation.
+``typeof(*p) qatomic_rcu_read(p);``
+ ``qatomic_rcu_read()`` is similar to ``qatomic_load_acquire()``, but
+ it makes some assumptions on the code that calls it. This allows a
+ more optimized implementation.
- qatomic_rcu_read assumes that whenever a single RCU critical
+ ``qatomic_rcu_read`` assumes that whenever a single RCU critical
section reads multiple shared data, these reads are either
data-dependent or need no ordering. This is almost always the
case when using RCU, because read-side critical sections typically
@@ -144,7 +137,7 @@ The core RCU API is small:
every update) until reaching a data structure of interest,
and then read from there.
- RCU read-side critical sections must use qatomic_rcu_read() to
+ RCU read-side critical sections must use ``qatomic_rcu_read()`` to
read data, unless concurrent writes are prevented by another
synchronization mechanism.
@@ -152,18 +145,17 @@ The core RCU API is small:
data structure in a single direction, opposite to the direction
in which the updater initializes it.
- void qatomic_rcu_set(p, typeof(*p) v);
-
- qatomic_rcu_set() is similar to qatomic_store_release(), though it also
- makes assumptions on the code that calls it in order to allow a more
- optimized implementation.
+``void qatomic_rcu_set(p, typeof(*p) v);``
+ ``qatomic_rcu_set()`` is similar to ``qatomic_store_release()``,
+ though it also makes assumptions on the code that calls it in
+ order to allow a more optimized implementation.
- In particular, qatomic_rcu_set() suffices for synchronization
+ In particular, ``qatomic_rcu_set()`` suffices for synchronization
with readers, if the updater never mutates a field within a
data item that is already accessible to readers. This is the
case when initializing a new copy of the RCU-protected data
- structure; just ensure that initialization of *p is carried out
- before qatomic_rcu_set() makes the data item visible to readers.
+ structure; just ensure that initialization of ``*p`` is carried out
+ before ``qatomic_rcu_set()`` makes the data item visible to readers.
If this rule is observed, writes will happen in the opposite
order as reads in the RCU read-side critical sections (or if
there is just one update), and there will be no need for other
@@ -171,58 +163,54 @@ The core RCU API is small:
The following APIs must be used before RCU is used in a thread:
- void rcu_register_thread(void);
-
+``void rcu_register_thread(void);``
Mark a thread as taking part in the RCU mechanism. Such a thread
will have to report quiescent points regularly, either manually
- or through the QemuCond/QemuSemaphore/QemuEvent APIs.
-
- void rcu_unregister_thread(void);
+ or through the ``QemuCond``/``QemuSemaphore``/``QemuEvent`` APIs.
+``void rcu_unregister_thread(void);``
Mark a thread as not taking part anymore in the RCU mechanism.
It is not a problem if such a thread reports quiescent points,
- either manually or by using the QemuCond/QemuSemaphore/QemuEvent
- APIs.
+ either manually or by using the
+ ``QemuCond``/``QemuSemaphore``/``QemuEvent`` APIs.
-Note that these APIs are relatively heavyweight, and should _not_ be
+Note that these APIs are relatively heavyweight, and should **not** be
nested.
Convenience macros
-==================
+------------------
Two macros are provided that automatically release the read lock at the
end of the scope.
- RCU_READ_LOCK_GUARD()
-
+``RCU_READ_LOCK_GUARD()``
Takes the lock and will release it at the end of the block it's
used in.
- WITH_RCU_READ_LOCK_GUARD() { code }
-
+``WITH_RCU_READ_LOCK_GUARD() { code }``
Is used at the head of a block to protect the code within the block.
-Note that 'goto'ing out of the guarded block will also drop the lock.
+Note that a ``goto`` out of the guarded block will also drop the lock.
-DIFFERENCES WITH LINUX
-======================
+Differences with Linux
+----------------------
- Waiting on a mutex is possible, though discouraged, within an RCU critical
section. This is because spinlocks are rarely (if ever) used in userspace
programming; not allowing this would prevent upgrading an RCU read-side
critical section to become an updater.
-- qatomic_rcu_read and qatomic_rcu_set replace rcu_dereference and
- rcu_assign_pointer. They take a _pointer_ to the variable being accessed.
+- ``qatomic_rcu_read`` and ``qatomic_rcu_set`` replace ``rcu_dereference`` and
+ ``rcu_assign_pointer``. They take a **pointer** to the variable being accessed.
-- call_rcu is a macro that has an extra argument (the name of the first
- field in the struct, which must be a struct rcu_head), and expects the
+- ``call_rcu`` is a macro that has an extra argument (the name of the first
+ field in the struct, which must be a struct ``rcu_head``), and expects the
type of the callback's argument to be the type of the first argument.
- call_rcu1 is the same as Linux's call_rcu.
+ ``call_rcu1`` is the same as Linux's ``call_rcu``.
-RCU PATTERNS
-============
+RCU Patterns
+------------
Many patterns using read-writer locks translate directly to RCU, with
the advantages of higher scalability and deadlock immunity.
@@ -243,28 +231,28 @@ Here are some frequently-used RCU idioms that are worth noting.
RCU list processing
--------------------
+^^^^^^^^^^^^^^^^^^^
TBD (not yet used in QEMU)
RCU reference counting
-----------------------
+^^^^^^^^^^^^^^^^^^^^^^
Because grace periods are not allowed to complete while there is an RCU
read-side critical section in progress, the RCU read-side primitives
may be used as a restricted reference-counting mechanism. For example,
-consider the following code fragment:
+consider the following code fragment::
rcu_read_lock();
p = qatomic_rcu_read(&foo);
/* do something with p. */
rcu_read_unlock();
-The RCU read-side critical section ensures that the value of "p" remains
-valid until after the rcu_read_unlock(). In some sense, it is acquiring
-a reference to p that is later released when the critical section ends.
-The write side looks simply like this (with appropriate locking):
+The RCU read-side critical section ensures that the value of ``p`` remains
+valid until after the ``rcu_read_unlock()``. In some sense, it is acquiring
+a reference to ``p`` that is later released when the critical section ends.
+The write side looks simply like this (with appropriate locking)::
qemu_mutex_lock(&foo_mutex);
old = foo;
@@ -274,7 +262,7 @@ The write side looks simply like this (with appropriate locking):
free(old);
If the processing cannot be done purely within the critical section, it
-is possible to combine this idiom with a "real" reference count:
+is possible to combine this idiom with a "real" reference count::
rcu_read_lock();
p = qatomic_rcu_read(&foo);
@@ -283,7 +271,7 @@ is possible to combine this idiom with a "real" reference count:
/* do something with p. */
foo_unref(p);
-The write side can be like this:
+The write side can be like this::
qemu_mutex_lock(&foo_mutex);
old = foo;
@@ -292,7 +280,7 @@ The write side can be like this:
synchronize_rcu();
foo_unref(old);
-or with call_rcu:
+or with ``call_rcu``::
qemu_mutex_lock(&foo_mutex);
old = foo;
@@ -301,10 +289,10 @@ or with call_rcu:
call_rcu(foo_unref, old, rcu);
In both cases, the write side only performs removal. Reclamation
-happens when the last reference to a "foo" object is dropped.
-Using synchronize_rcu() is undesirably expensive, because the
+happens when the last reference to a ``foo`` object is dropped.
+Using ``synchronize_rcu()`` is undesirably expensive, because the
last reference may be dropped on the read side. Hence you can
-use call_rcu() instead:
+use ``call_rcu()`` instead::
foo_unref(struct foo *p) {
if (qatomic_fetch_dec(&p->refcount) == 1) {
@@ -314,7 +302,7 @@ use call_rcu() instead:
Note that the same idioms would be possible with reader/writer
-locks:
+locks::
read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock);
p = foo; p = foo;
@@ -334,15 +322,15 @@ locks:
foo_unref(p);
read_unlock(&foo_rwlock);
-foo_unref could use a mechanism such as bottom halves to move deallocation
+``foo_unref`` could use a mechanism such as bottom halves to move deallocation
out of the write-side critical section.
RCU resizable arrays
---------------------
+^^^^^^^^^^^^^^^^^^^^
Resizable arrays can be used with RCU. The expensive RCU synchronization
-(or call_rcu) only needs to take place when the array is resized.
+(or ``call_rcu``) only needs to take place when the array is resized.
The two items to take care of are:
- ensuring that the old version of the array is available between removal
@@ -351,10 +339,10 @@ The two items to take care of are:
- avoiding mismatches in the read side between the array data and the
array size.
-The first problem is avoided simply by not using realloc. Instead,
+The first problem is avoided simply by not using ``realloc``. Instead,
each resize will allocate a new array and copy the old data into it.
The second problem would arise if the size and the data pointers were
-two members of a larger struct:
+two members of a larger struct::
struct mystuff {
...
@@ -364,7 +352,7 @@ two members of a larger struct:
...
};
-Instead, we store the size of the array with the array itself:
+Instead, we store the size of the array with the array itself::
struct arr {
int size;
@@ -400,7 +388,7 @@ Instead, we store the size of the array with the array itself:
}
-SOURCES
-=======
+References
+----------
-* Documentation/RCU/ from the Linux kernel
+* The `Linux kernel RCU documentation <https://docs.kernel.org/RCU/>`__
diff --git a/docs/devel/replay.rst b/docs/devel/replay.rst
index effd856..40f58d9 100644
--- a/docs/devel/replay.rst
+++ b/docs/devel/replay.rst
@@ -202,6 +202,9 @@ into the log.
Saving/restoring the VM state
-----------------------------
+Record/replay relies on VM state save and restore being complete and
+deterministic.
+
All fields in the device state structure (including virtual timers)
should be restored by loadvm to the same values they had before savevm.
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
index 9746a4e..c02fe0a 100644
--- a/docs/devel/reset.rst
+++ b/docs/devel/reset.rst
@@ -44,6 +44,26 @@ The Resettable interface handles reset types with an enum ``ResetType``:
value on each cold reset, such as RNG seed information, and which they
must not reinitialize on a snapshot-load reset.
+``RESET_TYPE_WAKEUP``
+ If the machine supports waking up from a suspended state and needs to reset
+ its devices during wake-up (from the ``MachineClass::wakeup()`` method), this
+ reset type should be used for such a request. Devices can utilize this reset
+ type to differentiate the reset requested during machine wake-up from other
+ reset requests. For example, RAM content must not be lost during wake-up, and
+ memory devices like virtio-mem that provide additional RAM must not reset
+ such state during wake-ups, but might do so during cold resets. However, this
+ reset type should not be used for wake-up detection, as not every machine
+ type issues a device reset request during wake-up.
+
+``RESET_TYPE_S390_CPU_NORMAL``
+ This is only used for S390 CPU objects; it clears interrupts, stops
+ processing, and clears the TLB, but does not touch register contents.
+
+``RESET_TYPE_S390_CPU_INITIAL``
+ This is only used for S390 CPU objects; it does everything
+ ``RESET_TYPE_S390_CPU_NORMAL`` does and also clears the PSW, prefix,
+ FPC, timer and control registers. It does not touch gprs, fprs or acrs.
+
Devices which implement reset methods must treat any unknown ``ResetType``
as equivalent to ``RESET_TYPE_COLD``; this will reduce the amount of
existing code we need to change if we add more types in future.
@@ -123,6 +143,11 @@ The *exit* phase is executed only when the last reset operation ends. Therefore
the object does not need to care how many of reset controllers it has and how
many of them have started a reset.
+DMA-capable devices are expected to cancel all outstanding DMA operations
+during either the 'enter' or 'hold' phase. IOMMUs are expected to reset during
+the 'exit' phase, and this sequencing makes sure no outstanding DMA request
+will fault.
+
Handling reset in a resettable object
-------------------------------------
@@ -191,7 +216,7 @@ in reset.
ResettablePhases parent_phases;
} MyDevClass;
- static void mydev_class_init(ObjectClass *class, void *data)
+ static void mydev_class_init(ObjectClass *class, const void *data)
{
MyDevClass *myclass = MYDEV_CLASS(class);
ResettableClass *rc = RESETTABLE_CLASS(class);
@@ -266,8 +291,8 @@ every reset child of the given resettable object. All children must be
resettable too. Additional parameters (a reset type and an opaque pointer) must
be passed to the callback too.
-In ``DeviceClass`` and ``BusClass`` the ``ResettableState`` is located
-``DeviceState`` and ``BusState`` structure. ``child_foreach()`` is implemented
+In ``DeviceClass`` and ``BusClass`` the ``ResettableState`` is located in the
+``DeviceState`` and ``BusState`` structures. ``child_foreach()`` is implemented
to follow the bus hierarchy; for a bus, it calls the function on every child
device; for a device, it calls the function on every bus child. When we reset
the main system bus, we reset the whole machine bus tree.
diff --git a/docs/devel/rust.rst b/docs/devel/rust.rst
new file mode 100644
index 0000000..dc8c441
--- /dev/null
+++ b/docs/devel/rust.rst
@@ -0,0 +1,478 @@
+.. |msrv| replace:: 1.77.0
+
+Rust in QEMU
+============
+
+Rust in QEMU is a project to enable using the Rust programming language
+to add new functionality to QEMU.
+
+Right now, the focus is on making it possible to write devices that inherit
+from ``SysBusDevice`` in `*safe*`__ Rust. Later, it may become possible
+to write other kinds of devices (e.g. PCI devices that can do DMA),
+complete boards, or backends (e.g. block device formats).
+
+__ https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html
+
+Building the Rust in QEMU code
+------------------------------
+
+The Rust in QEMU code is included in the emulators via Meson. Meson
+invokes rustc directly, building static libraries that are then linked
+together with the C code. This is completely automatic when you run
+``make`` or ``ninja``.
+
+However, QEMU's build system also tries to be easy to use for people who
+are accustomed to the more "normal" Cargo-based development workflow.
+In particular:
+
+* the set of warnings and lints that are used to build QEMU always
+ comes from the ``rust/Cargo.toml`` workspace file
+
+* it is also possible to use ``cargo`` for common Rust-specific coding
+ tasks, in particular to invoke ``clippy``, ``rustfmt`` and ``rustdoc``.
+
+To this end, QEMU includes a ``build.rs`` build script that picks up
+generated sources from QEMU's build directory and puts it in Cargo's
+output directory (typically ``rust/target/``). A vanilla invocation
+of Cargo will complain that it cannot find the generated sources,
+which can be fixed in different ways:
+
+* by using Makefile targets, provided by Meson, that run ``clippy`` or
+  ``rustdoc``::
+
+    make clippy
+    make rustdoc
+
+  A target for ``rustfmt`` is also declared in ``rust/meson.build``::
+
+    make rustfmt
+
+* by invoking ``cargo`` through the Meson `development environment`__
+ feature::
+
+ pyvenv/bin/meson devenv -w ../rust cargo clippy --tests
+ pyvenv/bin/meson devenv -w ../rust cargo fmt
+
+ If you are going to use ``cargo`` repeatedly, ``pyvenv/bin/meson devenv``
+ will enter a shell where commands like ``cargo fmt`` just work.
+
+__ https://mesonbuild.com/Commands.html#devenv
+
+* by pointing the ``MESON_BUILD_ROOT`` to the top of your QEMU build
+ tree. This third method is useful if you are using ``rust-analyzer``;
+ you can set the environment variable through the
+ ``rust-analyzer.cargo.extraEnv`` setting.
+
+As shown above, you can use the ``--tests`` option as usual to operate on test
+code. Note however that you cannot *build* or run tests via ``cargo``, because
+they need supporting C code from QEMU that Cargo does not know about. Tests can
+be run via ``meson test`` or ``make``::
+
+ make check-rust
+
+Note that doctests require all ``.o`` files from the build to be available.
+
+Supported tools
+'''''''''''''''
+
+QEMU supports rustc version 1.77.0 and newer. Notably, the following features
+are missing:
+
+* inline const expression (stable in 1.79.0), currently worked around with
+ associated constants in the ``FnCall`` trait.
+
+* associated constants have to be explicitly marked ``'static`` (`changed in
+ 1.81.0`__)
+
+* ``&raw`` (stable in 1.82.0). Use ``addr_of!`` and ``addr_of_mut!`` instead,
+ though hopefully the need for raw pointers will go down over time.
+
+* ``new_uninit`` (stable in 1.82.0). This is used internally by the ``pinned_init``
+ crate, which is planned for inclusion in QEMU, but it can be easily patched
+ out.
+
+* referencing statics in constants (stable in 1.83.0). For now use a const
+ function; this is an important limitation for QEMU's migration stream
+ architecture (VMState). Right now, VMState lacks type safety because
+ it is hard to place the ``VMStateField`` definitions in traits.
+
+* NUL-terminated file names with ``#[track_caller]`` are scheduled for
+ inclusion as ``#![feature(location_file_nul)]``, but it will be a while
+ before QEMU can use them. For now, there is special code in
+ ``util/error.c`` to support non-NUL-terminated file names.
+
+* associated const equality would be nice to have for some users of
+ ``callbacks::FnCall``, but is still experimental. ``ASSERT_IS_SOME``
+ replaces it.
+
+__ https://github.com/rust-lang/rust/pull/125258
+
+QEMU also supports version 0.60.x of bindgen, which is missing option
+``--generate-cstr``. This option requires version 0.66.x and will
+be adopted as soon as supporting these older versions is not necessary
+anymore.
+
+Writing Rust code in QEMU
+-------------------------
+
+QEMU includes four crates:
+
+* ``qemu_api`` for bindings to C code and useful functionality
+
+* ``qemu_api_macros`` defines several procedural macros that are useful when
+  writing Rust code
+
+* ``pl011`` (under ``rust/hw/char/pl011``) and ``hpet`` (under ``rust/hw/timer/hpet``)
+ are sample devices that demonstrate ``qemu_api`` and ``qemu_api_macros``, and are
+ used to further develop them. These two crates are functional\ [#issues]_ replacements
+ for the ``hw/char/pl011.c`` and ``hw/timer/hpet.c`` files.
+
+.. [#issues] The ``pl011`` crate is synchronized with ``hw/char/pl011.c``
+ as of commit 3e0f118f82. The ``hpet`` crate is synchronized as of
+ commit 1433e38cc8. Both are lacking tracing functionality.
+
+This section explains how to work with them.
+
+Status
+''''''
+
+Modules of ``qemu_api`` can be defined as:
+
+- *complete*: ready for use in new devices; if applicable, the API supports the
+ full functionality available in C
+
+- *stable*: ready for production use, the API is safe and should not undergo
+ major changes
+
+- *proof of concept*: the API is subject to change but allows working with safe
+ Rust
+
+- *initial*: the API is in its initial stages; it requires a large amount of
+ unsafe code; it might have soundness or type-safety issues
+
+The status of the modules is as follows:
+
+================ ======================
+module status
+================ ======================
+``assertions`` stable
+``bitops`` complete
+``callbacks`` complete
+``cell`` stable
+``errno`` complete
+``error`` stable
+``irq`` complete
+``log`` proof of concept
+``memory`` stable
+``module`` complete
+``qdev`` stable
+``qom`` stable
+``sysbus`` stable
+``timer`` stable
+``vmstate`` proof of concept
+``zeroable`` stable
+================ ======================
+
+.. note::
+ API stability is not a promise, if anything because the C APIs are not a stable
+ interface either. Also, ``unsafe`` interfaces may be replaced by safe interfaces
+ later.
+
+Naming convention
+'''''''''''''''''
+
+C function names usually are prefixed according to the data type that they
+apply to, for example ``timer_mod`` or ``sysbus_connect_irq``. Furthermore,
+both functions and structs sometimes have a ``qemu_`` or ``QEMU`` prefix.
+Generally speaking, these are all removed in the corresponding Rust functions:
+``QEMUTimer`` becomes ``timer::Timer``, ``timer_mod`` becomes ``Timer::modify``,
+``sysbus_connect_irq`` becomes ``SysBusDeviceMethods::connect_irq``.
+
+Sometimes however a name appears multiple times in the QOM class hierarchy,
+and the only difference is in the prefix. An example is ``qdev_realize`` and
+``sysbus_realize``. In such cases, whenever a name is not unique in
+the hierarchy, always add the prefix to the classes that are lower in
+the hierarchy; for the top class, decide on a case-by-case basis.
+
+For example:
+
+========================== =========================================
+``device_cold_reset()`` ``DeviceMethods::cold_reset()``
+``pci_device_reset()`` ``PciDeviceMethods::pci_device_reset()``
+``pci_bridge_reset()`` ``PciBridgeMethods::pci_bridge_reset()``
+========================== =========================================
+
+Here, the name is not exactly the same, but nevertheless ``PciDeviceMethods``
+adds the prefix to avoid confusion, because the functionality of
+``device_cold_reset()`` and ``pci_device_reset()`` is subtly different.
+
+In this case, however, no prefix is needed:
+
+========================== =========================================
+``device_realize()`` ``DeviceMethods::realize()``
+``sysbus_realize()`` ``SysbusDeviceMethods::sysbus_realize()``
+``pci_realize()`` ``PciDeviceMethods::pci_realize()``
+========================== =========================================
+
+Here, the lower classes do not add any functionality, and mostly
+provide extra compile-time checking; the basic *realize* functionality
+is the same for all devices. Therefore, ``DeviceMethods`` does not
+add the prefix.
+
+Whenever a name is unique in the hierarchy, instead, you should
+always remove the class name prefix.
+
+Common pitfalls
+'''''''''''''''
+
+Rust has very strict rules with respect to how you get an exclusive (``&mut``)
+reference; failure to respect those rules is a source of undefined behavior.
+In particular, even if a value is loaded from a raw mutable pointer (``*mut``),
+it *cannot* be cast to ``&mut`` unless the value was stored to the ``*mut``
+from a mutable reference. Furthermore, it is undefined behavior if any
+shared reference was created between the store to the ``*mut`` and the load::
+
+ let mut p: u32 = 42;
+ let p_mut = &mut p; // 1
+ let p_raw = p_mut as *mut u32; // 2
+
+ // p_raw keeps the mutable reference "alive"
+
+ let p_shared = &p; // 3
+ println!("access from &u32: {}", *p_shared);
+
+ // Bring back the mutable reference, its lifetime overlaps
+ // with that of a shared reference.
+ let p_mut = unsafe { &mut *p_raw }; // 4
+ println!("access from &mut 32: {}", *p_mut);
+
+ println!("access from &u32: {}", *p_shared); // 5
+
+These rules can be tested with `MIRI`__, for example.
+
+__ https://github.com/rust-lang/miri
+
+Almost all Rust code in QEMU will involve QOM objects, and pointers to these
+objects are *shared*, for example because they are part of the QOM composition
+tree. This creates exactly the above scenario:
+
+1. a QOM object is created
+
+2. a ``*mut`` is created, for example as the opaque value for a ``MemoryRegion``
+
+3. the QOM object is placed in the composition tree
+
+4. a memory access dereferences the opaque value to a ``&mut``
+
+5. but the shared reference is still present in the composition tree
+
+Because of this, QOM objects should almost always use ``&self`` instead
+of ``&mut self``; access to internal fields must use *interior mutability*
+to go from a shared reference to a ``&mut``.
+
+Whenever C code provides you with an opaque ``void *``, avoid converting it
+to a Rust mutable reference, and use a shared reference instead. The
+``qemu_api::cell`` module provides wrappers that can be used to tell the
+Rust compiler about interior mutability, and optionally to enforce locking
+rules for the "Big QEMU Lock". In the future, similar cell types might
+also be provided for ``AioContext``-based locking as well.
+
+In particular, device code will usually rely on the ``BqlRefCell`` and
+``BqlCell`` type to ensure that data is accessed correctly under the
+"Big QEMU Lock". These cell types are also known to the ``vmstate``
+crate, which is able to "look inside" them when building an in-memory
+representation of a ``struct``'s layout. Note that the same is not true
+of a ``RefCell`` or ``Mutex``.
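+
+As a rough sketch (assuming ``BqlCell``/``BqlRefCell`` mirror the
+``Cell``/``RefCell`` API; the device and field names below are made up),
+device state can then be mutated through ``&self``::
+
+    use qemu_api::cell::{BqlCell, BqlRefCell};
+
+    pub struct ExampleRegisters {
+        status: u32,
+        data: [u32; 4],
+    }
+
+    pub struct ExampleState {
+        // Interior mutability: methods take &self, which is what the QOM
+        // composition tree and C callbacks hand us.
+        irq_level: BqlCell<bool>,
+        regs: BqlRefCell<ExampleRegisters>,
+    }
+
+    impl ExampleState {
+        pub fn write_data(&self, index: usize, value: u32) {
+            // Accessing the cells asserts that the Big QEMU Lock is held.
+            let mut regs = self.regs.borrow_mut();
+            regs.data[index] = value;
+            regs.status |= 1;
+            self.irq_level.set(true);
+        }
+    }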
+
+Bindings code instead will usually use the ``Opaque`` type, which hides
+the contents of the underlying struct and can be easily converted to
+a raw pointer, for use in calls to C functions. It can be used for
+example as follows::
+
+ #[repr(transparent)]
+ #[derive(Debug, qemu_api_macros::Wrapper)]
+ pub struct Object(Opaque<bindings::Object>);
+
+where the special ``derive`` macro provides useful methods such as
+``from_raw``, ``as_ptr``, ``as_mut_ptr`` and ``raw_get``. The bindings will
+then manually check for the big QEMU lock with assertions, which allows
+the wrapper to be declared thread-safe::
+
+ unsafe impl Send for Object {}
+ unsafe impl Sync for Object {}
+
+Writing bindings to C code
+''''''''''''''''''''''''''
+
+Here are some things to keep in mind when working on the ``qemu_api`` crate.
+
+**Look at existing code**
+ Very often, similar idioms in C code correspond to similar tricks in
+ Rust bindings. If the C code uses ``offsetof``, look at qdev properties
+ or ``vmstate``. If the C code has a complex const struct, look at
+ ``MemoryRegion``. Reuse existing patterns for handling lifetimes;
+ for example use ``&T`` for QOM objects that do not need a reference
+ count (including those that can be embedded in other objects) and
+ ``Owned<T>`` for those that need it.
+
+**Use the type system**
+  Bindings often will need to access information that is specific to a type
+ (either a builtin one or a user-defined one) in order to pass it to C
+ functions. Put them in a trait and access it through generic parameters.
+ The ``vmstate`` module has examples of how to retrieve type information
+ for the fields of a Rust ``struct``.
+
+**Prefer unsafe traits to unsafe functions**
+ Unsafe traits are much easier to prove correct than unsafe functions.
+ They are an excellent place to store metadata that can later be accessed
+ by generic functions. C code usually places metadata in global variables;
+  in Rust, it can be stored in traits and then turned into ``static``
+  variables (see the sketch after this list). Often, unsafe traits can
+  be generated by procedural macros.
+
+**Document limitations due to old Rust versions**
+ If you need to settle for an inferior solution because of the currently
+ supported set of Rust versions, document it in the source and in this
+ file. This ensures that it can be fixed when the minimum supported
+ version is bumped.
+
+**Keep locking in mind**
+ When marking a type ``Sync``, be careful of whether it needs the big
+ QEMU lock. Use ``BqlCell`` and ``BqlRefCell`` for interior data,
+ or assert ``bql_locked()``.
+
+**Don't be afraid of complexity, but document and isolate it**
+ It's okay to be tricky; device code is written more often than bindings
+ code and it's important that it is idiomatic. However, you should strive
+ to isolate any tricks in a place (for example a ``struct``, a trait
+ or a macro) where it can be documented and tested. If needed, include
+ toy versions of the code in the documentation.
+
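+As a small self-contained sketch of the "unsafe traits" advice above
+(all names here are invented for illustration and are not part of
+``qemu_api``), metadata can live in an unsafe trait and be exposed
+through generic, safe code::
+
+    pub struct TypeMetadata {
+        pub name: &'static str,
+        pub instance_size: usize,
+    }
+
+    /// # Safety
+    ///
+    /// `METADATA.instance_size` must match the size of the implementing type.
+    pub unsafe trait HasTypeMetadata {
+        const METADATA: TypeMetadata;
+    }
+
+    struct MyDevice {
+        _reg: u32,
+    }
+
+    // The unsafe impl is the one place where correctness must be argued;
+    // a procedural macro could generate it and uphold the invariant.
+    unsafe impl HasTypeMetadata for MyDevice {
+        const METADATA: TypeMetadata = TypeMetadata {
+            name: "my-device",
+            instance_size: core::mem::size_of::<MyDevice>(),
+        };
+    }
+
+    // Generic code can then turn the per-type metadata into a `static`,
+    // much like C code would fill in a global table.
+    static MY_DEVICE_METADATA: TypeMetadata = MyDevice::METADATA;
+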
+Writing procedural macros
+'''''''''''''''''''''''''
+
+By convention, procedural macros are split into two functions, one
+returning ``Result<proc_macro2::TokenStream, MacroError>`` with the body of
+the procedural macro, and the second returning ``proc_macro::TokenStream``
+which is the actual procedural macro. The former's name is the same as
+the latter with the ``_or_error`` suffix. The code for the latter is more
+or less fixed; it follows this template, which varies only in the type
+after ``as`` in the invocation of ``parse_macro_input!``::
+
+ #[proc_macro_derive(Object)]
+ pub fn derive_object(input: TokenStream) -> TokenStream {
+ let input = parse_macro_input!(input as DeriveInput);
+ let expanded = derive_object_or_error(input).unwrap_or_else(Into::into);
+
+ TokenStream::from(expanded)
+ }
+
+The ``qemu_api_macros`` crate has utility functions to examine a
+``DeriveInput`` and perform common checks (e.g. looking for a struct
+with named fields). These functions return ``Result<..., MacroError>``
+and can be used easily in the procedural macro function::
+
+ fn derive_object_or_error(input: DeriveInput) ->
+ Result<proc_macro2::TokenStream, MacroError>
+ {
+ is_c_repr(&input, "#[derive(Object)]")?;
+
+ let name = &input.ident;
+ let parent = &get_fields(&input, "#[derive(Object)]")?[0].ident;
+ ...
+ }
+
+Use procedural macros with care. They are mostly useful for two purposes:
+
+* Performing consistency checks; for example ``#[derive(Object)]`` checks
+  that the structure has ``#[repr(C)]`` and that the type of the first field
+ is consistent with the ``ObjectType`` declaration.
+
+* Extracting information from Rust source code into traits, typically based
+ on types and attributes. For example, ``#[derive(TryInto)]`` builds an
+ implementation of ``TryFrom``, and it uses the ``#[repr(...)]`` attribute
+ as the ``TryFrom`` source and error types.
+
+Procedural macros can be hard to debug and test; if the code generation
+exceeds a few lines of code, it may be worthwhile to delegate work to
+"regular" declarative (``macro_rules!``) macros and write unit tests for
+those instead.
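+
+As a hedged illustration of that split (the macro below is a toy and
+does not exist in ``qemu_api_macros``), the declarative part can be
+exercised directly by a unit test::
+
+    // A toy declarative macro that a procedural macro could delegate to
+    // after parsing and validating its input.
+    macro_rules! impl_type_name {
+        ($ty:ty, $name:expr) => {
+            impl $ty {
+                pub const TYPE_NAME: &'static str = $name;
+            }
+        };
+    }
+
+    struct ExampleDevice;
+    impl_type_name!(ExampleDevice, "example-device");
+
+    #[cfg(test)]
+    mod tests {
+        use super::*;
+
+        // The generated code is tested without expanding any procedural
+        // macro at all.
+        #[test]
+        fn type_name_is_generated() {
+            assert_eq!(ExampleDevice::TYPE_NAME, "example-device");
+        }
+    }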
+
+
+Coding style
+''''''''''''
+
+Code should pass clippy and be formatted with rustfmt.
+
+Right now, only the nightly version of ``rustfmt`` is supported. This
+might change in the future. While CI checks for correct formatting via
+``cargo fmt --check``, maintainers can fix this for you when applying patches.
+
+It is expected that ``qemu_api`` provides full ``rustdoc`` documentation for
+bindings that are in their final shape or close.
+
+Adding dependencies
+-------------------
+
+Generally, the set of dependent crates is kept small. Think twice before
+adding a new external crate, especially if it comes with a large set of
+dependencies itself. Sometimes QEMU only needs a small subset of the
+functionality; see for example QEMU's ``assertions`` module.
+
+On top of this recommendation, adding external crates to QEMU is a
+slightly complicated process, mostly due to the need to teach Meson how
+to build them. While Meson has initial support for parsing ``Cargo.lock``
+files, it is still highly experimental and is therefore not used.
+
+Therefore, external crates must be added as subprojects for Meson to
+learn how to build them, as well as to the relevant ``Cargo.toml`` files.
+The versions specified in ``rust/Cargo.lock`` must match those of the
+subprojects; note that the ``rust/`` directory forms a Cargo `workspace`__,
+and therefore there is a single lock file for the whole build.
+
+__ https://doc.rust-lang.org/cargo/reference/workspaces.html#virtual-workspace
+
+Choose a version of the crate that works with QEMU's minimum supported
+Rust version (|msrv|).
+
+Second, a new ``wrap`` file must be added to teach Meson how to download the
+crate. The wrap file must be named ``NAME-SEMVER-rs.wrap``, where ``NAME``
+is the name of the crate and ``SEMVER`` is the version up to and including the
+first non-zero number. For example, a crate with version ``0.2.3`` will use
+``0.2`` for its ``SEMVER``, while a crate with version ``1.0.84`` will use ``1``.
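+
+For example, the wrap file for a hypothetical crate ``foo`` at version
+``0.2.3`` could look as follows (the field values are illustrative; copy an
+existing file under ``subprojects/`` for the exact conventions used by QEMU)::
+
+    # subprojects/foo-0.2-rs.wrap
+    [wrap-file]
+    directory = foo-0.2.3
+    source_url = https://crates.io/api/v1/crates/foo/0.2.3/download
+    source_filename = foo-0.2.3.tar.gz
+    source_hash = <sha256 of the downloaded tarball>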
+
+Third, the Meson rules to build the crate must be added at
+``subprojects/NAME-SEMVER-rs/meson.build``. Generally this includes:
+
+* ``subproject`` and ``dependency`` lines for all dependent crates
+
+* a ``static_library`` or ``rust.proc_macro`` line to perform the actual build
+
+* ``declare_dependency`` and a ``meson.override_dependency`` lines to expose
+ the result to QEMU and to other subprojects
+
+Remember to add ``native: true`` to ``dependency``, ``static_library`` and
+``meson.override_dependency`` for dependencies of procedural macros.
+If a crate is needed in both procedural macros and QEMU binaries, everything
+apart from ``subproject`` must be duplicated to build both native and
+non-native versions of the crate.
+
+It's important to specify the right compiler options. These include:
+
+* the language edition (which can be found in the ``Cargo.toml`` file)
+
+* the ``--cfg`` options (which have to be "reverse engineered" from the ``build.rs``
+ file of the crate).
+
+* usually, a ``--cap-lints allow`` argument to hide warnings from rustc
+ or clippy.
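+
+Putting the above together, the ``meson.build`` for the hypothetical ``foo``
+crate could look roughly like the following sketch (keyword arguments and
+names are illustrative; copy an existing Rust subproject for the exact
+template)::
+
+    project('foo-0.2-rs', 'rust',
+      version: '0.2.3',
+      license: 'MIT OR Apache-2.0')
+
+    subproject('bar-1-rs', required: true)
+    bar_dep = dependency('bar-1-rs')
+
+    _foo_rs = static_library(
+      'foo',
+      files('src/lib.rs'),
+      override_options: ['rust_std=2021', 'build.rust_std=2021'],
+      rust_abi: 'rust',
+      rust_args: ['--cap-lints', 'allow'],
+      dependencies: [bar_dep],
+    )
+
+    foo_dep = declare_dependency(link_with: _foo_rs, dependencies: [bar_dep])
+    meson.override_dependency('foo-0.2-rs', foo_dep)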
+
+After every change to the ``meson.build`` file you have to update the patched
+version with ``meson subprojects update --reset NAME-SEMVER-rs``. This might
+be automated in the future.
+
+Also, after every change to the ``meson.build`` file it is strongly suggested to
+do a dummy change to the ``.wrap`` file (for example adding a comment like
+``# version 2``), which will help Meson notice that the subproject is out of date.
+
+As a last step, add the new subproject to ``scripts/archive-source.sh``,
+``scripts/make-release`` and ``subprojects/.gitignore``.
diff --git a/docs/devel/style.rst b/docs/devel/style.rst
index 2f68b50..d025933 100644
--- a/docs/devel/style.rst
+++ b/docs/devel/style.rst
@@ -416,6 +416,26 @@ definitions instead of typedefs in headers and function prototypes; this
avoids problems with duplicated typedefs and reduces the need to include
headers from other headers.
+Bitfields
+---------
+
+C bitfields can be a cause of non-portability issues, especially under Windows
+where `MSVC has a different way to lay them out than GCC
+<https://gcc.gnu.org/onlinedocs/gcc/x86-Type-Attributes.html>`_, or where
+endianness matters.
+
+For this reason, we disallow the use of bitfields in packed structures and in
+any structures which are supposed to exactly match a specific layout in guest
+memory. Some existing code may use them; in those cases the layout was
+carefully checked to match the expected one.
+
+We also suggest avoiding bitfields even in structures where the exact
+layout does not matter, unless you can show that they provide a significant
+usability benefit.
+
+We encourage the usage of ``include/hw/registerfields.h`` as a safe replacement
+for bitfields.
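+
+For example, instead of declaring C bitfields, a register layout can be
+described with the ``REG32`` and ``FIELD`` macros and accessed with the
+``FIELD_EX32``/``FIELD_DP32`` helpers (a brief sketch; the register and
+field names are made up)::
+
+    #include "hw/registerfields.h"
+
+    /* 32-bit CTRL register at offset 0, with 1-bit ENABLE and 3-bit MODE */
+    REG32(CTRL, 0x0)
+        FIELD(CTRL, ENABLE, 0, 1)
+        FIELD(CTRL, MODE, 1, 3)
+
+    static uint32_t ctrl_set_mode(uint32_t val, unsigned mode)
+    {
+        /* read a field with FIELD_EX32, update one with FIELD_DP32 */
+        if (FIELD_EX32(val, CTRL, ENABLE)) {
+            val = FIELD_DP32(val, CTRL, MODE, mode);
+        }
+        return val;
+    }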
+
Reserved namespaces in C and POSIX
----------------------------------
diff --git a/docs/devel/submitting-a-patch.rst b/docs/devel/submitting-a-patch.rst
index 83e9092..f7917b8 100644
--- a/docs/devel/submitting-a-patch.rst
+++ b/docs/devel/submitting-a-patch.rst
@@ -18,7 +18,7 @@ one-shot fix, the bare minimum we ask is that:
* - Check
- Reason
- * - Patches contain Signed-off-by: Real Name <author@email>
+ * - Patches contain Signed-off-by: Your Name <author@email>
- States you are legally able to contribute the code. See :ref:`patch_emails_must_include_a_signed_off_by_line`
* - Sent as patch emails to ``qemu-devel@nongnu.org``
- The project uses an email list based workflow. See :ref:`submitting_your_patches`
@@ -235,6 +235,31 @@ to another list.) ``git send-email`` (`step-by-step setup guide
works best for delivering the patch without mangling it, but
attachments can be used as a last resort on a first-time submission.
+.. _use_git_publish:
+
+Use git-publish
+~~~~~~~~~~~~~~~
+
+If you have already configured git send-email, you can simply use `git-publish
+<https://github.com/stefanha/git-publish>`__ to send your series.
+
+::
+
+ $ git checkout master -b my-feature
+ $ # work on new commits, add your 'Signed-off-by' lines to each
+ $ git publish
+ $ ... more work, rebase on master, ...
+ $ git publish # will send a v2
+
+Each time you post a series, git-publish will create a local tag with the format
+``<branchname>-v<version>`` to record the patch series.
+
+When sending patch emails, 'git publish' will consult the output of
+'scripts/get_maintainers.pl' and automatically CC anyone listed as maintainers
+of the affected code. Generally you should accept the suggested CC list, but
+there may sometimes be scenarios where it is appropriate to cut it down (e.g. on
+certain large tree-wide cleanups), or augment it with other interested people.
+
.. _if_you_cannot_send_patch_emails:
If you cannot send patch emails
@@ -252,10 +277,7 @@ patches to the QEMU mailing list by following these steps:
#. Send your patches to the QEMU mailing list using the web-based
``git-send-email`` UI at https://git.sr.ht/~USERNAME/qemu/send-email
-`This video
-<https://spacepub.space/videos/watch/ad258d23-0ac6-488c-83fc-2bacf578de3a>`__
-shows the web-based ``git-send-email`` workflow. Documentation is
-available `here
+Documentation for sourcehut is available `here
<https://man.sr.ht/git.sr.ht/#sending-patches-upstream>`__.
.. _cc_the_relevant_maintainer:
@@ -322,23 +344,9 @@ Patch emails must include a ``Signed-off-by:`` line
Your patches **must** include a Signed-off-by: line. This is a hard
requirement because it's how you say "I'm legally okay to contribute
-this and happy for it to go into QEMU". The process is modelled after
-the `Linux kernel
-<http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/SubmittingPatches?id=f6f94e2ab1b33f0082ac22d71f66385a60d8157f#n297>`__
-policy.
-
-If you wrote the patch, make sure your "From:" and "Signed-off-by:"
-lines use the same spelling. It's okay if you subscribe or contribute to
-the list via more than one address, but using multiple addresses in one
-commit just confuses things. If someone else wrote the patch, git will
-include a "From:" line in the body of the email (different from your
-envelope From:) that will give credit to the correct author; but again,
-that author's Signed-off-by: line is mandatory, with the same spelling.
-
-There are various tooling options for automatically adding these tags
-include using ``git commit -s`` or ``git format-patch -s``. For more
-information see `SubmittingPatches 1.12
-<http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/SubmittingPatches?id=f6f94e2ab1b33f0082ac22d71f66385a60d8157f#n297>`__.
+this and happy for it to go into QEMU". For full guidance, read the
+:ref:`code-provenance` documentation.
+
.. _include_a_meaningful_cover_letter:
@@ -406,6 +414,20 @@ For more details on how QEMU's stable process works, refer to the
.. _participating_in_code_review:
+Retrieve an existing series
+---------------------------
+
+If you want to apply an existing series on top of your tree, you can simply use
+`b4 <https://github.com/mricon/b4>`__.
+
+::
+
+ b4 shazam $msg-id
+
+The message-id identifies the patch series that has been sent to the mailing
+list. You need to retrieve the "Message-Id:" header from one of the patches;
+any of them can be used, and b4 will apply the whole series.
+
Participating in Code Review
----------------------------
diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst
index d46b625..f26b837 100644
--- a/docs/devel/tcg-ops.rst
+++ b/docs/devel/tcg-ops.rst
@@ -239,7 +239,7 @@ Jumps/Labels
- | Jump to label.
- * - brcond_i32/i64 *t0*, *t1*, *cond*, *label*
+ * - brcond *t0*, *t1*, *cond*, *label*
- | Conditional jump if *t0* *cond* *t1* is true. *cond* can be:
|
@@ -261,98 +261,117 @@ Arithmetic
.. list-table::
- * - add_i32/i64 *t0*, *t1*, *t2*
+ * - add *t0*, *t1*, *t2*
- | *t0* = *t1* + *t2*
- * - sub_i32/i64 *t0*, *t1*, *t2*
+ * - sub *t0*, *t1*, *t2*
- | *t0* = *t1* - *t2*
- * - neg_i32/i64 *t0*, *t1*
+ * - neg *t0*, *t1*
- | *t0* = -*t1* (two's complement)
- * - mul_i32/i64 *t0*, *t1*, *t2*
+ * - mul *t0*, *t1*, *t2*
- | *t0* = *t1* * *t2*
- * - div_i32/i64 *t0*, *t1*, *t2*
+ * - divs *t0*, *t1*, *t2*
- | *t0* = *t1* / *t2* (signed)
| Undefined behavior if division by zero or overflow.
- * - divu_i32/i64 *t0*, *t1*, *t2*
+ * - divu *t0*, *t1*, *t2*
- | *t0* = *t1* / *t2* (unsigned)
| Undefined behavior if division by zero.
- * - rem_i32/i64 *t0*, *t1*, *t2*
+ * - rems *t0*, *t1*, *t2*
- | *t0* = *t1* % *t2* (signed)
| Undefined behavior if division by zero or overflow.
- * - remu_i32/i64 *t0*, *t1*, *t2*
+ * - remu *t0*, *t1*, *t2*
- | *t0* = *t1* % *t2* (unsigned)
| Undefined behavior if division by zero.
+ * - divs2 *q*, *r*, *nl*, *nh*, *d*
+
+ - | *q* = *nh:nl* / *d* (signed)
+ | *r* = *nh:nl* % *d*
+ | Undefined behaviour if division by zero, or the double-word
+ numerator divided by the single-word divisor does not fit
+ within the single-word quotient. The code generator will
+ pass *nh* as a simple sign-extension of *nl*, so the only
+ overflow should be *INT_MIN* / -1.
+
+ * - divu2 *q*, *r*, *nl*, *nh*, *d*
+
+ - | *q* = *nh:nl* / *d* (unsigned)
+ | *r* = *nh:nl* % *d*
+ | Undefined behaviour if division by zero, or the double-word
+ numerator divided by the single-word divisor does not fit
+ within the single-word quotient. The code generator will
+ pass 0 to *nh* to make a simple zero-extension of *nl*,
+ so overflow should never occur.
Logical
-------
.. list-table::
- * - and_i32/i64 *t0*, *t1*, *t2*
+ * - and *t0*, *t1*, *t2*
- | *t0* = *t1* & *t2*
- * - or_i32/i64 *t0*, *t1*, *t2*
+ * - or *t0*, *t1*, *t2*
- | *t0* = *t1* | *t2*
- * - xor_i32/i64 *t0*, *t1*, *t2*
+ * - xor *t0*, *t1*, *t2*
- | *t0* = *t1* ^ *t2*
- * - not_i32/i64 *t0*, *t1*
+ * - not *t0*, *t1*
- | *t0* = ~\ *t1*
- * - andc_i32/i64 *t0*, *t1*, *t2*
+ * - andc *t0*, *t1*, *t2*
- | *t0* = *t1* & ~\ *t2*
- * - eqv_i32/i64 *t0*, *t1*, *t2*
+ * - eqv *t0*, *t1*, *t2*
- | *t0* = ~(*t1* ^ *t2*), or equivalently, *t0* = *t1* ^ ~\ *t2*
- * - nand_i32/i64 *t0*, *t1*, *t2*
+ * - nand *t0*, *t1*, *t2*
- | *t0* = ~(*t1* & *t2*)
- * - nor_i32/i64 *t0*, *t1*, *t2*
+ * - nor *t0*, *t1*, *t2*
- | *t0* = ~(*t1* | *t2*)
- * - orc_i32/i64 *t0*, *t1*, *t2*
+ * - orc *t0*, *t1*, *t2*
- | *t0* = *t1* | ~\ *t2*
- * - clz_i32/i64 *t0*, *t1*, *t2*
+ * - clz *t0*, *t1*, *t2*
- | *t0* = *t1* ? clz(*t1*) : *t2*
- * - ctz_i32/i64 *t0*, *t1*, *t2*
+ * - ctz *t0*, *t1*, *t2*
- | *t0* = *t1* ? ctz(*t1*) : *t2*
- * - ctpop_i32/i64 *t0*, *t1*
+ * - ctpop *t0*, *t1*
- | *t0* = number of bits set in *t1*
|
- | With *ctpop* short for "count population", matching
- | the function name used in ``include/qemu/host-utils.h``.
+ | The name *ctpop* is short for "count population", and matches
+ the function name used in ``include/qemu/host-utils.h``.
Shifts/Rotates
@@ -360,30 +379,30 @@ Shifts/Rotates
.. list-table::
- * - shl_i32/i64 *t0*, *t1*, *t2*
+ * - shl *t0*, *t1*, *t2*
- | *t0* = *t1* << *t2*
- | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
+ | Unspecified behavior for negative or out-of-range shifts.
- * - shr_i32/i64 *t0*, *t1*, *t2*
+ * - shr *t0*, *t1*, *t2*
- | *t0* = *t1* >> *t2* (unsigned)
- | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
+ | Unspecified behavior for negative or out-of-range shifts.
- * - sar_i32/i64 *t0*, *t1*, *t2*
+ * - sar *t0*, *t1*, *t2*
- | *t0* = *t1* >> *t2* (signed)
- | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
+ | Unspecified behavior for negative or out-of-range shifts.
- * - rotl_i32/i64 *t0*, *t1*, *t2*
+ * - rotl *t0*, *t1*, *t2*
- | Rotation of *t2* bits to the left
- | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
+ | Unspecified behavior for negative or out-of-range shifts.
- * - rotr_i32/i64 *t0*, *t1*, *t2*
+ * - rotr *t0*, *t1*, *t2*
- | Rotation of *t2* bits to the right.
- | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
+ | Unspecified behavior for negative or out-of-range shifts.
Misc
@@ -391,26 +410,12 @@ Misc
.. list-table::
- * - mov_i32/i64 *t0*, *t1*
+ * - mov *t0*, *t1*
- | *t0* = *t1*
- | Move *t1* to *t0* (both operands must have the same type).
-
- * - ext8s_i32/i64 *t0*, *t1*
-
- ext8u_i32/i64 *t0*, *t1*
-
- ext16s_i32/i64 *t0*, *t1*
-
- ext16u_i32/i64 *t0*, *t1*
+ | Move *t1* to *t0*.
- ext32s_i64 *t0*, *t1*
-
- ext32u_i64 *t0*, *t1*
-
- - | 8, 16 or 32 bit sign/zero extension (both operands must have the same type)
-
- * - bswap16_i32/i64 *t0*, *t1*, *flags*
+ * - bswap16 *t0*, *t1*, *flags*
- | 16 bit byte swap on the low bits of a 32/64 bit input.
|
@@ -420,24 +425,24 @@ Misc
|
| If neither ``TCG_BSWAP_OZ`` nor ``TCG_BSWAP_OS`` are set, then the bits of *t0* above bit 15 may contain any value.
- * - bswap32_i64 *t0*, *t1*, *flags*
-
- - | 32 bit byte swap on a 64-bit value. The flags are the same as for bswap16,
- except they apply from bit 31 instead of bit 15.
+ * - bswap32 *t0*, *t1*, *flags*
- * - bswap32_i32 *t0*, *t1*, *flags*
+ - | 32 bit byte swap. The flags are the same as for bswap16, except
+ they apply from bit 31 instead of bit 15. On TCG_TYPE_I32, the
+ flags should be zero.
- bswap64_i64 *t0*, *t1*, *flags*
+ * - bswap64 *t0*, *t1*, *flags*
- - | 32/64 bit byte swap. The flags are ignored, but still present
- for consistency with the other bswap opcodes.
+ - | 64 bit byte swap. The flags are ignored, but still present
+ for consistency with the other bswap opcodes. For future
+ compatibility, the flags should be zero.
* - discard_i32/i64 *t0*
- | Indicate that the value of *t0* won't be used later. It is useful to
force dead code elimination.
- * - deposit_i32/i64 *dest*, *t1*, *t2*, *pos*, *len*
+ * - deposit *dest*, *t1*, *t2*, *pos*, *len*
- | Deposit *t2* as a bitfield into *t1*, placing the result in *dest*.
|
@@ -446,14 +451,16 @@ Misc
| *len* - the length of the bitfield
| *pos* - the position of the first bit, counting from the LSB
|
- | For example, "deposit_i32 dest, t1, t2, 8, 4" indicates a 4-bit field
+ | For example, "deposit dest, t1, t2, 8, 4" indicates a 4-bit field
at bit 8. This operation would be equivalent to
|
| *dest* = (*t1* & ~0x0f00) | ((*t2* << 8) & 0x0f00)
+ |
+ | on TCG_TYPE_I32.
- * - extract_i32/i64 *dest*, *t1*, *pos*, *len*
+ * - extract *dest*, *t1*, *pos*, *len*
- sextract_i32/i64 *dest*, *t1*, *pos*, *len*
+ sextract *dest*, *t1*, *pos*, *len*
- | Extract a bitfield from *t1*, placing the result in *dest*.
|
@@ -462,16 +469,16 @@ Misc
to the left with zeros; for sextract_*, the result will be extended
to the left with copies of the bitfield sign bit at *pos* + *len* - 1.
|
- | For example, "sextract_i32 dest, t1, 8, 4" indicates a 4-bit field
+ | For example, "sextract dest, t1, 8, 4" indicates a 4-bit field
at bit 8. This operation would be equivalent to
|
| *dest* = (*t1* << 20) >> 28
|
- | (using an arithmetic right shift).
+ | (using an arithmetic right shift) on TCG_TYPE_I32.
- * - extract2_i32/i64 *dest*, *t1*, *t2*, *pos*
+ * - extract2 *dest*, *t1*, *t2*, *pos*
- - | For N = {32,64}, extract an N-bit quantity from the concatenation
+ - | For TCG_TYPE_I{N}, extract an N-bit quantity from the concatenation
of *t2*:*t1*, beginning at *pos*. The tcg_gen_extract2_{i32,i64} expander
accepts 0 <= *pos* <= N as inputs. The backend code generator will
not see either 0 or N as inputs for these opcodes.
@@ -494,19 +501,19 @@ Conditional moves
.. list-table::
- * - setcond_i32/i64 *dest*, *t1*, *t2*, *cond*
+ * - setcond *dest*, *t1*, *t2*, *cond*
- | *dest* = (*t1* *cond* *t2*)
|
| Set *dest* to 1 if (*t1* *cond* *t2*) is true, otherwise set to 0.
- * - negsetcond_i32/i64 *dest*, *t1*, *t2*, *cond*
+ * - negsetcond *dest*, *t1*, *t2*, *cond*
- | *dest* = -(*t1* *cond* *t2*)
|
| Set *dest* to -1 if (*t1* *cond* *t2*) is true, otherwise set to 0.
- * - movcond_i32/i64 *dest*, *c1*, *c2*, *v1*, *v2*, *cond*
+ * - movcond *dest*, *c1*, *c2*, *v1*, *v2*, *cond*
- | *dest* = (*c1* *cond* *c2* ? *v1* : *v2*)
|
@@ -586,26 +593,79 @@ Multiword arithmetic support
.. list-table::
- * - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
+ * - addco *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* and in addition output to the
+ carry bit provided by the host architecture.
+
+ * - addci *t0*, *t1*, *t2*
- sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
+ - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+ input carry bit provided by the host architecture.
+ The output carry bit need not be computed.
- - | Similar to add/sub, except that the double-word inputs *t1* and *t2* are
- formed from two single-word arguments, and the double-word output *t0*
- is returned in two single-word outputs.
+ * - addcio *t0*, *t1*, *t2*
- * - mulu2_i32/i64 *t0_low*, *t0_high*, *t1*, *t2*
+ - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+ input carry bit provided by the host architecture,
+ and also compute the output carry bit.
+
+ * - addc1o *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* + 1, and in addition output to the
+ carry bit provided by the host architecture. This is akin to
+ *addcio* with a fixed carry-in value of 1.
+ | This is intended to be used by the optimization pass,
+ intermediate to complete folding of the addition chain.
+ In some cases complete folding is not possible and this
+ opcode will remain until output. If this happens, the
+ code generator will use ``tcg_out_set_carry`` and then
+ the output routine for *addcio*.
+
+ * - subbo *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* and in addition output to the
+ borrow bit provided by the host architecture.
+ | Depending on the host architecture, the carry bit may or may not be
+ identical to the borrow bit. Thus the addc\* and subb\*
+ opcodes must not be mixed.
+
+ * - subbi *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+ input borrow bit provided by the host architecture.
+ The output borrow bit need not be computed.
+
+ * - subbio *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+ input borrow bit provided by the host architecture,
+ and also compute the output borrow bit.
+
+ * - subb1o *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - 1, and in addition output to the
+ borrow bit provided by the host architecture. This is akin to
+ *subbio* with a fixed borrow-in value of 1.
+ | This is intended to be used by the optimization pass,
+ intermediate to complete folding of the subtraction chain.
+ In some cases complete folding is not possible and this
+ opcode will remain until output. If this happens, the
+ code generator will use ``tcg_out_set_borrow`` and then
+ the output routine for *subbio*.
+
+ * - mulu2 *t0_low*, *t0_high*, *t1*, *t2*
- | Similar to mul, except two unsigned inputs *t1* and *t2* yielding the full
double-word product *t0*. The latter is returned in two single-word outputs.
- * - muls2_i32/i64 *t0_low*, *t0_high*, *t1*, *t2*
+ * - muls2 *t0_low*, *t0_high*, *t1*, *t2*
- | Similar to mulu2, except the two inputs *t1* and *t2* are signed.
- * - mulsh_i32/i64 *t0*, *t1*, *t2*
+ * - mulsh *t0*, *t1*, *t2*
- muluh_i32/i64 *t0*, *t1*, *t2*
+ muluh *t0*, *t1*, *t2*
- | Provide the high part of a signed or unsigned multiply, respectively.
|
@@ -684,8 +744,6 @@ QEMU specific operations
qemu_st_i32/i64/i128 *t0*, *t1*, *flags*, *memidx*
- qemu_st8_i32 *t0*, *t1*, *flags*, *memidx*
-
- | Load data at the guest address *t1* into *t0*, or store data in *t0* at guest
address *t1*. The _i32/_i64/_i128 size applies to the size of the input/output
register *t0* only. The address *t1* is always sized according to the guest,
@@ -703,19 +761,14 @@ QEMU specific operations
64-bit memory access specified in *flags*.
|
| For qemu_ld/st_i128, these are only supported for a 64-bit host.
- |
- | For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of
- the memory operation is known to be 8-bit. This allows the backend to
- provide a different set of register constraints.
Host vector operations
----------------------
-All of the vector ops have two parameters, ``TCGOP_VECL`` & ``TCGOP_VECE``.
-The former specifies the length of the vector in log2 64-bit units; the
-latter specifies the length of the element (if applicable) in log2 8-bit units.
-E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
+All of the vector ops have two parameters, ``TCGOP_TYPE`` & ``TCGOP_VECE``.
+The former specifies the length of the vector as a TCGType; the latter
+specifies the length of the element (if applicable) in log2 8-bit units.
.. list-table::
@@ -729,7 +782,7 @@ E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
* - dup_vec *v0*, *r1*
- - | Duplicate the low N bits of *r1* into VECL/VECE copies across *v0*.
+ - | Duplicate the low N bits of *r1* into TYPE/VECE copies across *v0*.
* - dupi_vec *v0*, *c*
@@ -738,7 +791,7 @@ E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
* - dup2_vec *v0*, *r1*, *r2*
- - | Duplicate *r2*:*r1* into VECL/64 copies across *v0*. This opcode is
+ - | Duplicate *r2*:*r1* into TYPE/64 copies across *v0*. This opcode is
only present for 32-bit hosts.
* - add_vec *v0*, *v1*, *v2*
@@ -810,7 +863,7 @@ E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
.. code-block:: c
- for (i = 0; i < VECL/VECE; ++i) {
+ for (i = 0; i < TYPE/VECE; ++i) {
v0[i] = v1[i] << s2;
}
@@ -832,7 +885,7 @@ E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
.. code-block:: c
- for (i = 0; i < VECL/VECE; ++i) {
+ for (i = 0; i < TYPE/VECE; ++i) {
v0[i] = v1[i] << v2[i];
}
@@ -885,9 +938,9 @@ Assumptions
The target word size (``TCG_TARGET_REG_BITS``) is expected to be 32 bit or
64 bit. It is expected that the pointer has the same size as the word.
-On a 32 bit target, all 64 bit operations are converted to 32 bits. A
-few specific operations must be implemented to allow it (see add2_i32,
-sub2_i32, brcond2_i32).
+On a 32 bit target, all 64 bit operations are converted to 32 bits.
+A few specific operations must be implemented to allow it
+(see brcond2_i32, setcond2_i32).
On a 64 bit target, the values are transferred between 32 and 64-bit
registers using the following ops:
@@ -928,7 +981,9 @@ operation uses a constant input constraint which does not allow all
constants, it must also accept registers in order to have a fallback.
The constraint '``i``' is defined generically to accept any constant.
The constraint '``r``' is not defined generically, but is consistently
-used by each backend to indicate all registers.
+used by each backend to indicate all registers. If ``TCG_REG_ZERO``
+is defined by the backend, the constraint '``z``' is defined generically
+to map constant 0 to the hardware zero register.
The movi_i32 and movi_i64 operations must accept any constants.
diff --git a/docs/devel/tcg-plugins.rst b/docs/devel/tcg-plugins.rst
index d8725c2..9463692 100644
--- a/docs/devel/tcg-plugins.rst
+++ b/docs/devel/tcg-plugins.rst
@@ -61,11 +61,14 @@ translation event the plugin has an option to enumerate the
instructions in a block of instructions and optionally register
callbacks to some or all instructions when they are executed.
-There is also a facility to add an inline event where code to
-increment a counter can be directly inlined with the translation.
-Currently only a simple increment is supported. This is not atomic so
-can miss counts. If you want absolute precision you should use a
-callback which can then ensure atomicity itself.
+There is also a facility to add inline instructions doing various operations,
+like adding or storing an immediate value. It is also possible to execute a
+callback conditionally, with the condition being evaluated inline. All these
+inline operations are associated with a ``scoreboard``, a thread-local storage
+area that is automatically expanded when new cores/threads are created and
+that can be accessed/modified in a thread-safe way without any locking.
+Combining inline operations and conditional callbacks offers a more efficient
+way to instrument binaries than classic callbacks.
Finally when QEMU exits all the registered *atexit* callbacks are
invoked.
diff --git a/docs/devel/acpi-bits.rst b/docs/devel/testing/acpi-bits.rst
index 1ec394f..9a4d716 100644
--- a/docs/devel/acpi-bits.rst
+++ b/docs/devel/testing/acpi-bits.rst
@@ -1,6 +1,6 @@
-=============================================================================
-ACPI/SMBIOS avocado tests using biosbits
-=============================================================================
+==================================
+ACPI/SMBIOS testing using biosbits
+==================================
************
Introduction
************
@@ -30,26 +30,31 @@ OS modules are generally written using low level languages such as C and
low level assembly machine language. Writing test routines in a low level
language makes things more cumbersome. These and other reasons makes using
bios-bits very attractive for testing bioses. More details on the inspiration
-for developing biosbits and its real life uses can be found in [#a]_ and [#b]_.
+for developing biosbits and its real life uses were presented `at Plumbers
+in 2011 <Plumbers_>`__ and `at Linux.conf.au in 2012 <Linux.conf.au_>`__.
-For QEMU, we maintain a fork of bios bits in gitlab along with all the
-dependent submodules `here <https://gitlab.com/qemu-project/biosbits-bits>`__.
-This fork contains numerous fixes, a newer acpica and changes specific to
-running this avocado QEMU tests using bits. The author of this document
-is the sole maintainer of the QEMU fork of bios bits repository. For more
-information, please see author's `FOSDEM talk on this bios-bits based test
-framework <https://fosdem.org/2024/schedule/event/fosdem-2024-2262-exercising-qemu-generated-acpi-smbios-tables-using-biosbits-from-within-a-guest-vm-/>`__.
+For QEMU, we maintain a fork of bios bits in `gitlab`_, along with all
+the dependent submodules. This fork contains numerous fixes, a newer
+acpica and changes specific to running these functional QEMU tests using
+bits. The author of this document is the current maintainer of the QEMU
+fork of the bios bits repository. For more information, please see `the
+author's FOSDEM presentation <FOSDEM_>`__ on this bios-bits based test framework.
+
+.. _Plumbers: https://blog.linuxplumbersconf.org/2011/ocw/system/presentations/867/original/bits.pdf
+.. _Linux.conf.au: https://www.youtube.com/watch?v=36QIepyUuhg
+.. _gitlab: https://gitlab.com/qemu-project/biosbits-bits
+.. _FOSDEM: https://fosdem.org/2024/schedule/event/fosdem-2024-2262-exercising-qemu-generated-acpi-smbios-tables-using-biosbits-from-within-a-guest-vm-/
*********************************
Description of the test framework
*********************************
-Under the directory ``tests/avocado/``, ``acpi-bits.py`` is a QEMU avocado
-test that drives all this.
+Under the directory ``tests/functional/``, ``test_acpi_bits.py`` is a QEMU
+functional test that drives all this.
A brief description of the various test files follows.
-Under ``tests/avocado/`` as the root we have:
+Under ``tests/functional/`` as the root we have:
::
@@ -60,12 +65,12 @@ Under ``tests/avocado/`` as the root we have:
│ ├── smbios.py2
│ ├── testacpi.py2
│ └── testcpuid.py2
- ├── acpi-bits.py
+ ├── test_acpi_bits.py
-* ``tests/avocado``:
+* ``tests/functional``:
- ``acpi-bits.py``:
- This is the main python avocado test script that generates a
+ ``test_acpi_bits.py``:
+ This is the main python functional test script that generates a
biosbits iso. It then spawns a QEMU VM with it, collects the log and reports
test failures. This is the script one would be interested in if they wanted
to add or change some component of the log parsing, add a new command line
@@ -79,35 +84,22 @@ Under ``tests/avocado/`` as the root we have:
you to inspect and run the specific commands manually.
In order to run this test, please perform the following steps from the QEMU
- build directory:
- ::
-
- $ make check-venv (needed only the first time to create the venv)
- $ ./pyvenv/bin/avocado run -t acpi tests/avocado
-
- The above will run all acpi avocado tests including this one.
- In order to run the individual tests, perform the following:
+ build directory (assuming that the sources are in ".."):
::
- $ ./pyvenv/bin/avocado run tests/avocado/acpi-bits.py --tap -
-
- The above will produce output in tap format. You can omit "--tap -" in the
- end and it will produce output like the following:
- ::
+ $ export PYTHONPATH=../python:../tests/functional
+ $ export QEMU_TEST_QEMU_BINARY=$PWD/qemu-system-x86_64
+ $ python3 ../tests/functional/test_acpi_bits.py
- $ ./pyvenv/bin/avocado run tests/avocado/acpi-bits.py
- Fetching asset from tests/avocado/acpi-bits.py:AcpiBitsTest.test_acpi_smbios_bits
- JOB ID : eab225724da7b64c012c65705dc2fa14ab1defef
- JOB LOG : /home/anisinha/avocado/job-results/job-2022-10-10T17.58-eab2257/job.log
- (1/1) tests/avocado/acpi-bits.py:AcpiBitsTest.test_acpi_smbios_bits: PASS (33.09 s)
- RESULTS : PASS 1 | ERROR 0 | FAIL 0 | SKIP 0 | WARN 0 | INTERRUPT 0 | CANCEL 0
- JOB TIME : 39.22 s
+ The above will run all acpi-bits functional tests (producing output in
+ tap format).
- You can inspect the log file for more information about the run or in order
- to diagnoze issues. If you pass V=1 in the environment, more diagnostic logs
- would be found in the test log.
+ You can inspect the log files in tests/functional/x86_64/test_acpi_bits.*/
+ for more information about the run or in order to diagnose issues.
+ If you pass V=1 in the environment, more diagnostic logs will be put into
+ the test log.
-* ``tests/avocado/acpi-bits/bits-config``:
+* ``tests/functional/acpi-bits/bits-config``:
This location contains biosbits configuration files that determine how the
software runs the tests.
@@ -117,7 +109,7 @@ Under ``tests/avocado/`` as the root we have:
or actions are performed by bits. The description of the config options are
provided in the file itself.
-* ``tests/avocado/acpi-bits/bits-tests``:
+* ``tests/functional/acpi-bits/bits-tests``:
This directory contains biosbits python based tests that are run from within
the biosbits environment in the spawned VM. New additions of test cases can
@@ -155,13 +147,9 @@ Under ``tests/avocado/`` as the root we have:
(a) They are python2.7 based scripts and not python 3 scripts.
(b) They are run from within the bios bits VM and is not subjected to QEMU
build/test python script maintenance and dependency resolutions.
- (c) They need not be loaded by avocado framework when running tests.
+ (c) They need not be loaded by the test framework by accident when running
+ tests.
Author: Ani Sinha <anisinha@redhat.com>
-References:
------------
-.. [#a] https://blog.linuxplumbersconf.org/2011/ocw/system/presentations/867/original/bits.pdf
-.. [#b] https://www.youtube.com/watch?v=36QIepyUuhg
-.. [#c] https://fosdem.org/2024/schedule/event/fosdem-2024-2262-exercising-qemu-generated-acpi-smbios-tables-using-biosbits-from-within-a-guest-vm-/
diff --git a/docs/devel/testing/blkdebug.rst b/docs/devel/testing/blkdebug.rst
new file mode 100644
index 0000000..63887c9
--- /dev/null
+++ b/docs/devel/testing/blkdebug.rst
@@ -0,0 +1,177 @@
+Block I/O error injection using ``blkdebug``
+============================================
+
+..
+ Copyright (C) 2014-2015 Red Hat Inc
+
+ This work is licensed under the terms of the GNU GPL, version 2 or later. See
+ the COPYING file in the top-level directory.
+
+The ``blkdebug`` block driver is a rule-based error injection engine. It can be
+used to exercise error code paths in block drivers including ``ENOSPC`` (out of
+space) and ``EIO``.
+
+This document gives an overview of the features available in ``blkdebug``.
+
+Background
+----------
+Block drivers have many error code paths that handle I/O errors. Image formats
+are especially complex since metadata I/O errors during cluster allocation or
+while updating tables happen halfway through request processing and require
+discipline to keep image files consistent.
+
+Error injection allows test cases to trigger I/O errors at specific points.
+This way, all error paths can be tested to make sure they are correct.
+
+Rules
+-----
+The ``blkdebug`` block driver takes a list of "rules" that tell the error injection
+engine when to fail an I/O request.
+
+Each I/O request is evaluated against the rules. If a rule matches the request
+then its "action" is executed.
+
+Rules can be placed in a configuration file; the configuration file
+follows the same .ini-like format used by QEMU's ``-readconfig`` option, and
+each section of the file represents a rule.
+
+The following configuration file defines a single rule::
+
+ $ cat blkdebug.conf
+ [inject-error]
+ event = "read_aio"
+ errno = "28"
+
+This rule fails all aio read requests with ``ENOSPC`` (28). Note that the errno
+value depends on the host. On Linux, see
+``/usr/include/asm-generic/errno-base.h`` for errno values.
+
+Invoke QEMU as follows::
+
+ $ qemu-system-x86_64 \
+ -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \
+ -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0
+
+Rules support the following attributes:
+
+``event``
+ which type of operation to match (e.g. ``read_aio``, ``write_aio``,
+ ``flush_to_os``, ``flush_to_disk``). See `Events`_ for
+ information on events.
+
+``state``
+ (optional) the engine must be in this state number in order for this
+ rule to match. See `State transitions`_ for information
+ on states.
+
+``errno``
+ the numeric errno value to return when a request matches this rule.
+ The errno values depend on the host since the numeric values are not
+ standardized in the POSIX specification.
+
+``sector``
+ (optional) a sector number that the request must overlap in order to
+ match this rule
+
+``once``
+ (optional, default ``off``) only execute this action on the first
+ matching request
+
+``immediately``
+ (optional, default ``off``) return a NULL ``BlockAIOCB``
+ pointer and fail without an errno instead. This
+ exercises the code path where ``BlockAIOCB`` fails and the
+ caller's ``BlockCompletionFunc`` is not invoked.
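+
+For example, the following rule injects ``EIO`` (5, on Linux) only for the
+first read request that overlaps sector 2048::
+
+    [inject-error]
+    event = "read_aio"
+    errno = "5"
+    sector = "2048"
+    once = "on"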
+
+Events
+------
+Block drivers provide information about the type of I/O request they are about
+to make so rules can match specific types of requests. For example, the ``qcow2``
+block driver tells ``blkdebug`` when it accesses the L1 table so rules can match
+only L1 table accesses and not other metadata or guest data requests.
+
+The core events are:
+
+``read_aio``
+ guest data read
+
+``write_aio``
+ guest data write
+
+``flush_to_os``
+ write out unwritten block driver state (e.g. cached metadata)
+
+``flush_to_disk``
+ flush the host block device's disk cache
+
+See ``qapi/block-core.json:BlkdebugEvent`` for the full list of events.
+You may need to grep block driver source code to understand the
+meaning of specific events.
+
+State transitions
+-----------------
+There are cases where more power is needed to match a particular I/O request in
+a longer sequence of requests. For example::
+
+ write_aio
+ flush_to_disk
+ write_aio
+
+How do we match the 2nd ``write_aio`` but not the first? This is where state
+transitions come in.
+
+The error injection engine has an integer called the "state" that always starts
+initialized to 1. The state integer is internal to ``blkdebug`` and cannot be
+observed from outside but rules can interact with it for powerful matching
+behavior.
+
+Rules can be conditional on the current state and they can transition to a new
+state.
+
+When a rule's "state" attribute is non-zero then the current state must equal
+the attribute in order for the rule to match.
+
+For example, to match the 2nd write_aio::
+
+ [set-state]
+ event = "write_aio"
+ state = "1"
+ new_state = "2"
+
+ [inject-error]
+ event = "write_aio"
+ state = "2"
+ errno = "5"
+
+The first ``write_aio`` request matches the ``set-state`` rule and transitions from
+state 1 to state 2. Once state 2 has been entered, the ``set-state`` rule no
+longer matches since it requires state 1. But the ``inject-error`` rule now
+matches the next ``write_aio`` request and injects ``EIO`` (5).
+
+State transition rules support the following attributes:
+
+``event``
+ which type of operation to match (e.g. ``read_aio``, ``write_aio``,
+ ``flush_to_os``, ``flush_to_disk``). See `Events`_ for
+ information on events.
+
+``state``
+ (optional) the engine must be in this state number in order for this
+ rule to match
+
+``new_state``
+ transition to this state number
+
+Suspend and resume
+------------------
+Exercising code paths in block drivers may require specific ordering amongst
+concurrent requests. The "breakpoint" feature allows requests to be halted on
+a ``blkdebug`` event and resumed later. This makes it possible to achieve
+deterministic ordering when multiple requests are in flight.
+
+Breakpoints on ``blkdebug`` events are associated with a user-defined ``tag`` string.
+This tag serves as an identifier by which the request can be resumed at a later
+point.
+
+See the ``qemu-io(1)`` ``break``, ``resume``, ``remove_break``, and ``wait_break``
+commands for details.
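+
+As an illustrative sketch (the tag name ``A`` is arbitrary), a guest write
+can be suspended at the ``write_aio`` event and resumed explicitly::
+
+    $ ./qemu-io blkdebug::test.img \
+          -c 'break write_aio A' \
+          -c 'aio_write 0 512' \
+          -c 'wait_break A' \
+          -c 'resume A'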
diff --git a/docs/devel/blkverify.txt b/docs/devel/testing/blkverify.rst
index aca826c..2a71778 100644
--- a/docs/devel/blkverify.txt
+++ b/docs/devel/testing/blkverify.rst
@@ -1,8 +1,10 @@
-= Block driver correctness testing with blkverify =
+Block driver correctness testing with ``blkverify``
+===================================================
-== Introduction ==
+Introduction
+------------
-This document describes how to use the blkverify protocol to test that a block
+This document describes how to use the ``blkverify`` protocol to test that a block
driver is operating correctly.
It is difficult to test and debug block drivers against real guests. Often
@@ -11,12 +13,13 @@ of the executable. Other times obscure errors are raised by a program inside
the guest. These issues are extremely hard to trace back to bugs in the block
driver.
-Blkverify solves this problem by catching data corruption inside QEMU the first
+``blkverify`` solves this problem by catching data corruption inside QEMU the first
time bad data is read and reporting the disk sector that is corrupted.
-== How it works ==
+How it works
+------------
-The blkverify protocol has two child block devices, the "test" device and the
+The ``blkverify`` protocol has two child block devices, the "test" device and the
"raw" device. Read/write operations are mirrored to both devices so their
state should always be in sync.
@@ -25,13 +28,14 @@ contents to the "test" image. The idea is that the "raw" device will handle
read/write operations correctly and not corrupt data. It can be used as a
reference for comparison against the "test" device.
-After a mirrored read operation completes, blkverify will compare the data and
+After a mirrored read operation completes, ``blkverify`` will compare the data and
raise an error if it is not identical. This makes it possible to catch the
first instance where corrupt data is read.
-== Example ==
+Example
+-------
-Imagine raw.img has 0xcd repeated throughout its first sector:
+Imagine raw.img has 0xcd repeated throughout its first sector::
$ ./qemu-io -c 'read -v 0 512' raw.img
00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
@@ -42,7 +46,7 @@ Imagine raw.img has 0xcd repeated throughout its first sector:
read 512/512 bytes at offset 0
512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec)
-And test.img is corrupt, its first sector is zeroed when it shouldn't be:
+And test.img is corrupt, its first sector is zeroed when it shouldn't be::
$ ./qemu-io -c 'read -v 0 512' test.img
00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
@@ -53,17 +57,17 @@ And test.img is corrupt, its first sector is zeroed when it shouldn't be:
read 512/512 bytes at offset 0
512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec)
-This error is caught by blkverify:
+This error is caught by ``blkverify``::
$ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img
blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
-A more realistic scenario is verifying the installation of a guest OS:
+A more realistic scenario is verifying the installation of a guest OS::
$ ./qemu-img create raw.img 16G
$ ./qemu-img create -f qcow2 test.qcow2 16G
$ ./qemu-system-x86_64 -cdrom debian.iso \
-drive file=blkverify:raw.img:test.qcow2
-If the installation is aborted when blkverify detects corruption, use qemu-io
+If the installation is aborted when ``blkverify`` detects corruption, use ``qemu-io``
to explore the contents of the disk image at the sector in question.
diff --git a/docs/devel/ci-jobs.rst.inc b/docs/devel/testing/ci-jobs.rst.inc
index 3756bbe..f1c541c 100644
--- a/docs/devel/ci-jobs.rst.inc
+++ b/docs/devel/testing/ci-jobs.rst.inc
@@ -126,10 +126,10 @@ QEMU_JOB_PUBLISH
The job is for publishing content after a branch has been
merged into the upstream default branch.
-QEMU_JOB_AVOCADO
-~~~~~~~~~~~~~~~~
+QEMU_JOB_FUNCTIONAL
+~~~~~~~~~~~~~~~~~~~
-The job runs the Avocado integration test suite
+The job runs the functional test suite
Contributor controlled runtime variables
----------------------------------------
@@ -149,13 +149,12 @@ the jobs to be manually started from the UI
Set this variable to 2 to create the pipelines and run all
the jobs immediately, as was the historical behaviour
-QEMU_CI_AVOCADO_TESTING
-~~~~~~~~~~~~~~~~~~~~~~~
-By default, tests using the Avocado framework are not run automatically in
-the pipelines (because multiple artifacts have to be downloaded, and if
-these artifacts are not already cached, downloading them make the jobs
-reach the timeout limit). Set this variable to have the tests using the
-Avocado framework run automatically.
+QEMU_CI_FUNCTIONAL
+~~~~~~~~~~~~~~~~~~
+By default, tests using the functional framework are not run automatically
+in the pipelines (because multiple artifacts have to be downloaded, which
+might cause a lot of network traffic). Set this variable to have the tests
+using the functional framework run automatically.
Other misc variables
--------------------
diff --git a/docs/devel/ci-runners.rst.inc b/docs/devel/testing/ci-runners.rst.inc
index 67b23d3..67b23d3 100644
--- a/docs/devel/ci-runners.rst.inc
+++ b/docs/devel/testing/ci-runners.rst.inc
diff --git a/docs/devel/testing/ci.rst b/docs/devel/testing/ci.rst
new file mode 100644
index 0000000..e21d39d
--- /dev/null
+++ b/docs/devel/testing/ci.rst
@@ -0,0 +1,34 @@
+.. _ci:
+
+Continuous Integration (CI)
+===========================
+
+Continuous integration (CI) requires building the entire application and
+executing a comprehensive set of automated tests every time a set of
+changes needs to be committed [1]_. The automated tests are composed
+of unit, functional and other tests.
+
+Most of QEMU's CI is run on GitLab's infrastructure although a number
+of other CI services are used for specialised purposes. The most up to
+date information about them and their status can be found on the
+`project wiki testing page <https://wiki.qemu.org/Testing/CI>`_.
+
+These tests are also used as gating tests before merging pull requests.
+A gating test restricts the movement of code from one stage to another in a
+test/deployment pipeline. Moving to the next stage requires approval, which
+can be a manual intervention or a set of tests succeeding [2]_.
+
+In QEMU, the gating process happens during the pull request. The approval is
+done by the project leader running their own set of tests. The pull request gets
+merged when the tests succeed.
+
+.. include:: ci-jobs.rst.inc
+.. include:: ci-runners.rst.inc
+
+References
+----------
+
+.. [1] Humble, Jez & Farley, David (2010). Continuous Delivery:
+ Reliable Software Releases Through Build, Test, and Deployment, p. 55.
+.. [2] Humble, Jez & Farley, David (2010). Continuous Delivery:
+ Reliable Software Releases Through Build, Test, and Deployment, p. 122.
diff --git a/docs/devel/testing/functional.rst b/docs/devel/testing/functional.rst
new file mode 100644
index 0000000..9e56dd1
--- /dev/null
+++ b/docs/devel/testing/functional.rst
@@ -0,0 +1,380 @@
+.. _checkfunctional-ref:
+
+Functional testing with Python
+==============================
+
+The ``tests/functional`` directory hosts functional tests written in
+Python. They are usually higher level tests, and may interact with
+external resources and with various guest operating systems.
+
+The tests should be written in the style of the Python `unittest`_ framework,
+using stdio for the TAP protocol. The folder ``tests/functional/qemu_test``
+provides classes (e.g. the ``QemuBaseTest``, ``QemuUserTest`` and the
+``QemuSystemTest`` classes) and utility functions that help to get your test
+into the right shape, e.g. by replacing the 'stdout' python object to redirect
+the normal output of your test to stderr instead.
+
+Note that if you don't use one of the QemuBaseTest-based classes for your
+test, or if you spawn subprocesses from your test, you have to make sure
+that there is no TAP-incompatible output written to stdio, e.g. either by
+prefixing every line with a "# " to mark the output as a TAP comment, or
+by capturing the stdout output of subprocesses (redirecting it to
+stderr is OK).
+
+Tests based on ``qemu_test.QemuSystemTest`` can easily:
+
+ * Customize the command line arguments given to the convenience
+ ``self.vm`` attribute (a QEMUMachine instance)
+
+ * Interact with the QEMU monitor, send QMP commands and check
+ their results
+
+ * Interact with the guest OS, using the convenience console device
+ (which may be useful to assert the effectiveness and correctness of
+ command line arguments or QMP commands)
+
+ * Download (and cache) remote data files, such as firmware and kernel
+ images
+
+Running tests
+-------------
+
+You can run the functional tests simply by executing:
+
+.. code::
+
+ make check-functional
+
+It is also possible to run tests for a certain target only, for example
+the following line will only run the tests for the x86_64 target:
+
+.. code::
+
+ make check-functional-x86_64
+
+To run a single test file without the meson test runner, you can also
+execute the file directly by specifying two environment variables first,
+the PYTHONPATH that has to include the python folder and the tests/functional
+folder of the source tree, and QEMU_TEST_QEMU_BINARY that has to point
+to the QEMU binary that should be used for the test. The current working
+directory should be your build folder. For example::
+
+ $ export PYTHONPATH=../python:../tests/functional
+ $ export QEMU_TEST_QEMU_BINARY=$PWD/qemu-system-x86_64
+ $ pyvenv/bin/python3 ../tests/functional/test_file.py
+
+The test framework will automatically purge any scratch files created during
+the tests. If needing to debug a failed test, it is possible to keep these
+files around on disk by setting ``QEMU_TEST_KEEP_SCRATCH=1`` as an env
+variable. Any preserved files will be deleted the next time the test is run
+without this variable set.
+
+Logging
+-------
+
+The framework collects log files for each test in the build directory
+in the following subfolder::
+
+ <builddir>/tests/functional/<arch>/<fileid>.<classid>.<testname>/
+
+There are usually three log files:
+
+* ``base.log`` contains the generic logging information that is written
+ by the calls to the logging functions in the test code (e.g. by calling
+ the ``self.log.info()`` or ``self.log.debug()`` functions).
+* ``console.log`` contains the output of the serial console of the guest.
+* ``default.log`` contains the output of QEMU. This file could be named
+ differently if the test chooses to use a different identifier for
+ the guest VM (e.g. when the test spins up multiple VMs).
+
+Introduction to writing tests
+-----------------------------
+
+The ``tests/functional/qemu_test`` directory provides the ``qemu_test``
+Python module, containing the ``qemu_test.QemuSystemTest`` class.
+Here is a simple usage example:
+
+.. code::
+
+ #!/usr/bin/env python3
+
+ from qemu_test import QemuSystemTest
+
+ class Version(QemuSystemTest):
+
+ def test_qmp_human_info_version(self):
+ self.vm.launch()
+ res = self.vm.cmd('human-monitor-command',
+ command_line='info version')
+ self.assertRegex(res, r'^(\d+\.\d+\.\d)')
+
+ if __name__ == '__main__':
+ QemuSystemTest.main()
+
+By providing the "hash bang" line at the beginning of the script, marking
+the file as executable and by calling into QemuSystemTest.main(), the test
+can also be run stand-alone, without a test runner. On the other hand, when
+run via a test runner, the QemuSystemTest.main() function takes care of
+running the test functions in the right fashion (e.g. with the TAP output
+that is required by the meson test runner).
+
+The ``qemu_test.QemuSystemTest`` base test class
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``qemu_test.QemuSystemTest`` class has a number of characteristics
+that are worth mentioning.
+
+First of all, it attempts to give each test a ready to use QEMUMachine
+instance, available at ``self.vm``. Because many tests will tweak the
+QEMU command line, launching the QEMUMachine (by using ``self.vm.launch()``)
+is left to the test writer.
+
+The base test class also has support for tests with more than one
+QEMUMachine. The way to get machines is through the ``self.get_vm()``
+method which will return a QEMUMachine instance. The ``self.get_vm()``
+method accepts arguments that will be passed to the QEMUMachine creation
+and also an optional ``name`` attribute so you can identify a specific
+machine and get it more than once through the test's methods. A simple
+and hypothetical example follows:
+
+.. code::
+
+ from qemu_test import QemuSystemTest
+
+ class MultipleMachines(QemuSystemTest):
+ def test_multiple_machines(self):
+ first_machine = self.get_vm()
+ second_machine = self.get_vm()
+ self.get_vm(name='third_machine').launch()
+
+ first_machine.launch()
+ second_machine.launch()
+
+ first_res = first_machine.cmd(
+ 'human-monitor-command',
+ command_line='info version')
+
+ second_res = second_machine.cmd(
+ 'human-monitor-command',
+ command_line='info version')
+
+ third_res = self.get_vm(name='third_machine').cmd(
+ 'human-monitor-command',
+ command_line='info version')
+
+ self.assertEqual(first_res, second_res, third_res)
+
+At test "tear down", ``qemu_test.QemuSystemTest`` handles all the QEMUMachines
+shutdown.
+
+QEMUMachine
+-----------
+
+The QEMUMachine API is already widely used in the Python iotests,
+device-crash-test and other Python scripts. It's a wrapper around the
+execution of a QEMU binary, giving its users:
+
+ * the ability to set command line arguments to be given to the QEMU
+ binary
+
+ * a ready to use QMP connection and interface, which can be used to
+ send commands and inspect its results, as well as asynchronous
+ events
+
+ * convenience methods to set commonly used command line arguments in
+ a more succinct and intuitive way
+
+QEMU binary selection
+^^^^^^^^^^^^^^^^^^^^^
+
+The QEMU binary used for the ``self.vm`` QEMUMachine instance will
+primarily depend on the value of the ``qemu_bin`` instance attribute.
+If it is not explicitly set by the test code, its default value will
+be taken from the ``QEMU_TEST_QEMU_BINARY`` environment variable.
+
+Debugging hung QEMU
+^^^^^^^^^^^^^^^^^^^
+
+When test cases go wrong it may be helpful to debug a stalled QEMU
+process. While the QEMUMachine class owns the primary QMP monitor
+socket, it is possible to request a second QMP monitor be created
+by setting the ``QEMU_TEST_QMP_BACKDOOR`` env variable to refer
+to a UNIX socket name. The ``qmp-shell`` command can then be
+attached to the stalled QEMU to examine its live state.
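+
+A minimal sketch of this workflow (the socket path is arbitrary)::
+
+    $ export QEMU_TEST_QMP_BACKDOOR=/tmp/qmp-backdoor.sock
+    $ ./pyvenv/bin/python3 ../tests/functional/test_file.py  # hangs...
+
+    # ...then, from a second terminal, attach to the stalled QEMU:
+    $ ./scripts/qmp/qmp-shell /tmp/qmp-backdoor.sock
+    (QEMU) query-status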
+
+Attribute reference
+-------------------
+
+QemuBaseTest
+^^^^^^^^^^^^
+
+The following attributes are available on any ``qemu_test.QemuBaseTest``
+instance.
+
+arch
+""""
+
+The target architecture of the QEMU binary.
+
+Tests are also free to use this attribute value for their own needs.
+A test may, for instance, use this value when selecting the architecture
+of a kernel or disk image to boot a VM with.
+
+qemu_bin
+""""""""
+
+The preserved value of the ``QEMU_TEST_QEMU_BINARY`` environment
+variable.
+
+QemuUserTest
+^^^^^^^^^^^^
+
+The QemuUserTest class can be used for running an executable via the
+usermode emulation binaries.
+
+QemuSystemTest
+^^^^^^^^^^^^^^
+
+The QemuSystemTest class can be used for running tests via one of the
+qemu-system-* binaries.
+
+vm
+""
+
+A QEMUMachine instance, initially configured according to the given
+``qemu_bin`` parameter.
+
+cpu
+"""
+
+The cpu model that will be set for all QEMUMachine instances created
+by the test.
+
+machine
+"""""""
+
+The machine type that will be set for all QEMUMachine instances created
+by the test. By using the set_machine() function of the QemuSystemTest
+class to set this attribute, you can automatically check whether the
+machine is available and skip the test in case it is not built into the
+QEMU binary.
+
+Asset handling
+--------------
+
+Many functional tests download assets (e.g. Linux kernels, initrds,
+firmware images, etc.) from the internet to be able to run tests with
+them. This imposes additional challenges to the test framework.
+
+First there is the problem that some people might not have an
+unconstrained internet connection, so such tests should not be run by
+default when running ``make check``. To handle this situation,
+the tests that download files should only be added to the "thorough"
+speed mode in the meson.build file, while the "quick" speed mode is
+fine for functional tests that can be run without downloading files.
+``make check`` then only runs the quick functional tests along with
+the other quick tests from the other test suites. If you choose to
+run only ``make check-functional``, the "thorough" tests will be
+executed, too. And to run all functional tests along with the others,
+you can use something like::
+
+ make -j$(nproc) check SPEED=thorough
+
+The second problem with downloading files from the internet are time
+constraints. The time for downloading files should not be taken into
+account when the test is running and the timeout of the test is ticking
+(since downloading can be very slow, depending on the network bandwidth).
+This problem is solved by downloading the assets ahead of time, before
+the tests are run. This pre-caching is done with the qemu_test.Asset
+class. To use it in your test, declare an asset in your test class with
+its URL and SHA256 checksum like this::
+
+ from qemu_test import Asset
+
+ ASSET_somename = Asset(
+ ('https://www.qemu.org/assets/images/qemu_head_200.png'),
+ '34b74cad46ea28a2966c1d04e102510daf1fd73e6582b6b74523940d5da029dd')
+
+In your test function, you can then get the file name of the cached
+asset like this::
+
+ def test_function(self):
+ file_path = self.ASSET_somename.fetch()
+
+The pre-caching will be done automatically when running
+``make check-functional`` (but not when running e.g.
+``make check-functional-<target>``). In case you just want to download
+the assets without running the tests, you can do so by running::
+
+ make precache-functional
+
+The cache is populated in the ``~/.cache/qemu/download`` directory by
+default, but the location can be changed by setting the
+``QEMU_TEST_CACHE_DIR`` environment variable.
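+
+For example, to pre-populate the cache in a custom location (the path
+here is arbitrary), you could run::
+
+    QEMU_TEST_CACHE_DIR=/var/tmp/qemu-assets make precache-functional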
+
+Skipping tests
+--------------
+
+Since the test framework is based on the common Python unittest framework,
+you can use the usual Python decorators to easily skip tests under certain
+conditions, for example when a required binary is missing on the test
+system or when the tests run in a CI environment. For further information
+about those decorators, please refer to:
+
+ https://docs.python.org/3/library/unittest.html#skipping-tests-and-expected-failures
+
+While the conditions for skipping tests are often specific to each test,
+the QEMU developers have identified some recurring scenarios, and using
+environment variables has become the standard way to enable or disable
+such tests.
+
+Here is a list of the most used variables:
+
+QEMU_TEST_ALLOW_LARGE_STORAGE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Tests which fetch or produce assets considered *large* will not run
+unless ``QEMU_TEST_ALLOW_LARGE_STORAGE=1`` is exported in the
+environment.
+
+The definition of *large* is a bit arbitrary here, but it usually means an
+asset which occupies at least 1GB on disk when uncompressed.
+
+QEMU_TEST_ALLOW_UNTRUSTED_CODE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Some tests boot a kernel image or firmware that may be considered unsafe
+to run on the developer's workstation, so they are skipped by default.
+The definition of *not safe* is also arbitrary, but it usually means a
+blob whose source or build process is not publicly available.
+
+Export ``QEMU_TEST_ALLOW_UNTRUSTED_CODE=1`` in the environment to allow
+tests which make use of such assets.
+
+QEMU_TEST_FLAKY_TESTS
+^^^^^^^^^^^^^^^^^^^^^
+Some tests do not work reliably and thus are disabled by default.
+This includes tests that fail intermittently on GitLab's CI; such
+failures usually expose real issues that are rarely seen on developer
+machines due to the constraints of the CI environment. If you encounter
+a similar situation, raise a bug and mark the test as shown in the
+code snippet below:
+
+.. code::
+
+ # See https://gitlab.com/qemu-project/qemu/-/issues/nnnn
+ @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test is unstable on GitLab')
+ def test(self):
+ do_something()
+
+Tests should not live in this state forever and should either be fixed
+or eventually removed.
+
+QEMU_TEST_ALLOW_SLOW
+^^^^^^^^^^^^^^^^^^^^
+Some tests have a very long runtime and might run into timeout issues,
+e.g. if the QEMU binary has been compiled with debugging options enabled.
+To avoid these timeouts and to save some precious CPU cycles during
+normal testing, such tests are disabled by default unless the
+``QEMU_TEST_ALLOW_SLOW`` environment variable has been set.
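+
+Analogous to the flaky-test example above, such a test can be guarded
+with the standard unittest decorator (a sketch; the test body is
+hypothetical):
+
+.. code::
+
+   @skipUnless(os.getenv('QEMU_TEST_ALLOW_SLOW'), 'runtime too long')
+   def test_something_slow(self):
+       do_something_slow()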
+
+
+.. _unittest: https://docs.python.org/3/library/unittest.html
diff --git a/docs/devel/fuzzing.rst b/docs/devel/testing/fuzzing.rst
index 3bfcb33..c3ac084 100644
--- a/docs/devel/fuzzing.rst
+++ b/docs/devel/testing/fuzzing.rst
@@ -21,11 +21,12 @@ Building the fuzzers
To build the fuzzers, install a recent version of clang:
Configure with (substitute the clang binaries with the version you installed).
-Here, enable-sanitizers, is optional but it allows us to reliably detect bugs
-such as out-of-bounds accesses, use-after-frees, double-frees etc.::
+Here, enable-asan and enable-ubsan are optional, but they allow us to
+reliably detect bugs such as out-of-bounds accesses, use-after-free and
+double-free errors, etc.::
- CC=clang-8 CXX=clang++-8 /path/to/configure --enable-fuzzing \
- --enable-sanitizers
+ CC=clang-8 CXX=clang++-8 /path/to/configure \
+ --enable-fuzzing --enable-asan --enable-ubsan
Fuzz targets are built similarly to system targets::
diff --git a/docs/devel/testing/index.rst b/docs/devel/testing/index.rst
new file mode 100644
index 0000000..ccc2fc6
--- /dev/null
+++ b/docs/devel/testing/index.rst
@@ -0,0 +1,17 @@
+Testing QEMU
+------------
+
+Details about how to test QEMU and how it is integrated into our CI
+testing infrastructure.
+
+.. toctree::
+ :maxdepth: 3
+
+ main
+ qtest
+ functional
+ acpi-bits
+ ci
+ fuzzing
+ blkdebug
+ blkverify
diff --git a/docs/devel/testing.rst b/docs/devel/testing/main.rst
index af73d3d..6b18ed8 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing/main.rst
@@ -5,19 +5,32 @@ Testing in QEMU
QEMU's testing infrastructure is fairly complex as it covers
everything from unit testing and exercising specific sub-systems all
-the way to full blown acceptance tests. To get an overview of the
+the way to full blown functional tests. To get an overview of the
tests you can run ``make check-help`` from either the source or build
tree.
-Most (but not all) tests are also integrated into the meson build
-system so can be run directly from the build tree, for example:
-
-.. code::
+Most (but not all) tests are also integrated as automated tests into
+the meson build system and so can be run directly from the build tree,
+for example::
[./pyvenv/bin/]meson test --suite qemu:softfloat
will run just the softfloat tests.
+An automated test is written with one of the test frameworks using its
+generic test functions/classes. The test framework can run the tests and
+report their success or failure [1]_.
+
+An automated test has essentially three parts (illustrated by the sketch
+below):
+
+1. The initialization of the parameters, where the inputs and the
+   expected results are set up;
+2. The call to the code that should be tested;
+3. An assertion, comparing the result from the previous call with the
+   expected result set up during the initialization. If the result
+   matches the expected result, the test has been successful; otherwise,
+   it has failed.
+
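+The following minimal, generic Python unittest sketch illustrates these
+three parts (the trivial ``add()`` function stands in for the code under
+test)::
+
+    import unittest
+
+    def add(a, b):          # the (trivial) code under test
+        return a + b
+
+    class AdditionTest(unittest.TestCase):
+        def test_add(self):
+            # 1. initialization: inputs and expected result
+            a, b, expected = 2, 3, 5
+            # 2. call to the code under test
+            result = add(a, b)
+            # 3. assertion comparing the result with the expected value
+            self.assertEqual(result, expected)
+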
The rest of this document will cover the details for specific test
groups.
@@ -39,12 +52,22 @@ Before running tests, it is best to build QEMU programs first. Some tests
expect the executables to exist and will fail with obscure messages if they
cannot find them.
+.. _unit-tests:
+
Unit tests
~~~~~~~~~~
-Unit tests, which can be invoked with ``make check-unit``, are simple C tests
-that typically link to individual QEMU object files and exercise them by
-calling exported functions.
+A unit test is responsible for exercising individual software components as a
+unit, like interfaces, data structures, and functionality, uncovering errors
+within the boundaries of a component. The verification effort focuses on the
+smallest software unit, i.e. its internal processing logic and data
+structures. A unit test case should be designed to uncover errors due to
+erroneous computations, incorrect comparisons, or improper control
+flow [2]_.
+
+In QEMU, unit tests can be invoked with ``make check-unit``. They are
+simple C tests that typically link to individual QEMU object files and
+exercise them by calling exported functions.
If you are writing new code in QEMU, consider adding a unit test, especially
for utility modules that are relatively stateless or have few dependencies. To
@@ -126,6 +149,8 @@ successfully on various hosts. The following list shows some best practices:
#ifdef in the codes. If the whole test suite cannot run on Windows, disable
the build in the meson.build file.
+.. _qapi-tests:
+
QAPI schema tests
~~~~~~~~~~~~~~~~~
@@ -160,6 +185,8 @@ check-block
are in the "auto" group).
See the "QEMU iotests" section below for more information.
+.. _qemu-iotests:
+
QEMU iotests
------------
@@ -500,12 +527,6 @@ first to contribute the mapping to the ``libvirt-ci`` project:
`CI <https://www.qemu.org/docs/master/devel/ci.html>`__ documentation
page on how to trigger gitlab CI pipelines on your change.
- * Please also trigger gitlab container generation pipelines on your change
- for as many OS distros as practical to make sure that there are no
- obvious breakages when adding the new pre-requisite. Please see
- `CI <https://www.qemu.org/docs/master/devel/ci.html>`__ documentation
- page on how to trigger gitlab CI pipelines on your change.
-
For enterprise distros that default to old, end-of-life versions of the
Python runtime, QEMU uses a separate set of mappings that work with more
recent versions. These can be found in ``tests/lcitool/mappings.yml``.
@@ -634,20 +655,38 @@ Building and Testing with TSan
It is possible to build and test with TSan, with a few additional steps.
These steps are normally done automatically in the docker.
-There is a one time patch needed in clang-9 or clang-10 at this time:
+TSan is supported for both clang and gcc.
+One particularity of sanitizers is that all the code, including shared
+object dependencies, should be built with the sanitizer enabled.
+In the case of TSan, synchronization primitives from glib (GMutex for
+instance) will otherwise not be recognized, which leads to false positives.
+
+To build a TSan-instrumented version of glib:
.. code::
- sed -i 's/^const/static const/g' \
- /usr/lib/llvm-10/lib/clang/10.0.0/include/sanitizer/tsan_interface.h
+ $ git clone --depth=1 --branch=2.81.0 https://github.com/GNOME/glib.git
+ $ cd glib
+ $ CFLAGS="-O2 -g -fsanitize=thread" meson build
+ $ ninja -C build
To configure the build for TSan:
.. code::
- ../configure --enable-tsan --cc=clang-10 --cxx=clang++-10 \
+ ../configure --enable-tsan \
--disable-werror --extra-cflags="-O0"
+When executing QEMU, don't forget to point it to the TSan-instrumented glib:
+
+.. code::
+
+ $ glib_dir=/path/to/glib
+ $ export LD_LIBRARY_PATH=$glib_dir/build/gio:$glib_dir/build/glib:$glib_dir/build/gmodule:$glib_dir/build/gobject:$glib_dir/build/gthread
+ # check correct version is used
+ $ ldd build/qemu-x86_64 | grep glib
+ $ qemu-system-x86_64 ...
+
The runtime behavior of TSAN is controlled by the TSAN_OPTIONS environment
variable.
@@ -667,6 +706,8 @@ The above exitcode=0 has TSan continue without error if any warnings are found.
This allows for running the test and then checking the warnings afterwards.
If you want TSan to stop and exit with error on warnings, use exitcode=66.
+.. _tsan-suppressions:
+
TSan Suppressions
~~~~~~~~~~~~~~~~~
Keep in mind that for any data race warning, although there might be a data race
@@ -862,584 +903,21 @@ supported. To start the fuzzer, run
Alternatively, some command different from ``qemu-img info`` can be tested, by
changing the ``-c`` option.
-Integration tests using the Avocado Framework
----------------------------------------------
-
-The ``tests/avocado`` directory hosts integration tests. They're usually
-higher level tests, and may interact with external resources and with
-various guest operating systems.
-
-These tests are written using the Avocado Testing Framework (which must
-be installed separately) in conjunction with a the ``avocado_qemu.Test``
-class, implemented at ``tests/avocado/avocado_qemu``.
-
-Tests based on ``avocado_qemu.Test`` can easily:
-
- * Customize the command line arguments given to the convenience
- ``self.vm`` attribute (a QEMUMachine instance)
-
- * Interact with the QEMU monitor, send QMP commands and check
- their results
-
- * Interact with the guest OS, using the convenience console device
- (which may be useful to assert the effectiveness and correctness of
- command line arguments or QMP commands)
-
- * Interact with external data files that accompany the test itself
- (see ``self.get_data()``)
-
- * Download (and cache) remote data files, such as firmware and kernel
- images
-
- * Have access to a library of guest OS images (by means of the
- ``avocado.utils.vmimage`` library)
-
- * Make use of various other test related utilities available at the
- test class itself and at the utility library:
-
- - http://avocado-framework.readthedocs.io/en/latest/api/test/avocado.html#avocado.Test
- - http://avocado-framework.readthedocs.io/en/latest/api/utils/avocado.utils.html
-
-Running tests
-~~~~~~~~~~~~~
-
-You can run the avocado tests simply by executing:
-
-.. code::
-
- make check-avocado
-
-This involves the automatic installation, from PyPI, of all the
-necessary avocado-framework dependencies into the QEMU venv within the
-build tree (at ``./pyvenv``). Test results are also saved within the
-build tree (at ``tests/results``).
-
-Note: the build environment must be using a Python 3 stack, and have
-the ``venv`` and ``pip`` packages installed. If necessary, make sure
-``configure`` is called with ``--python=`` and that those modules are
-available. On Debian and Ubuntu based systems, depending on the
-specific version, they may be on packages named ``python3-venv`` and
-``python3-pip``.
-
-It is also possible to run tests based on tags using the
-``make check-avocado`` command and the ``AVOCADO_TAGS`` environment
-variable:
-
-.. code::
-
- make check-avocado AVOCADO_TAGS=quick
-
-Note that tags separated with commas have an AND behavior, while tags
-separated by spaces have an OR behavior. For more information on Avocado
-tags, see:
-
- https://avocado-framework.readthedocs.io/en/latest/guides/user/chapters/tags.html
-
-To run a single test file, a couple of them, or a test within a file
-using the ``make check-avocado`` command, set the ``AVOCADO_TESTS``
-environment variable with the test files or test names. To run all
-tests from a single file, use:
-
- .. code::
-
- make check-avocado AVOCADO_TESTS=$FILEPATH
-
-The same is valid to run tests from multiple test files:
-
- .. code::
-
- make check-avocado AVOCADO_TESTS='$FILEPATH1 $FILEPATH2'
-
-To run a single test within a file, use:
-
- .. code::
-
- make check-avocado AVOCADO_TESTS=$FILEPATH:$TESTCLASS.$TESTNAME
-
-The same is valid to run single tests from multiple test files:
-
- .. code::
-
- make check-avocado AVOCADO_TESTS='$FILEPATH1:$TESTCLASS1.$TESTNAME1 $FILEPATH2:$TESTCLASS2.$TESTNAME2'
-
-The scripts installed inside the virtual environment may be used
-without an "activation". For instance, the Avocado test runner
-may be invoked by running:
-
- .. code::
-
- pyvenv/bin/avocado run $OPTION1 $OPTION2 tests/avocado/
-
-Note that if ``make check-avocado`` was not executed before, it is
-possible to create the Python virtual environment with the dependencies
-needed running:
-
- .. code::
-
- make check-venv
-
-It is also possible to run tests from a single file or a single test within
-a test file. To run tests from a single file within the build tree, use:
-
- .. code::
-
- pyvenv/bin/avocado run tests/avocado/$TESTFILE
-
-To run a single test within a test file, use:
-
- .. code::
-
- pyvenv/bin/avocado run tests/avocado/$TESTFILE:$TESTCLASS.$TESTNAME
-
-Valid test names are visible in the output from any previous execution
-of Avocado or ``make check-avocado``, and can also be queried using:
-
- .. code::
-
- pyvenv/bin/avocado list tests/avocado
-
-Manual Installation
-~~~~~~~~~~~~~~~~~~~
-
-To manually install Avocado and its dependencies, run:
-
-.. code::
-
- pip install --user avocado-framework
-
-Alternatively, follow the instructions on this link:
-
- https://avocado-framework.readthedocs.io/en/latest/guides/user/chapters/installing.html
-
-Overview
-~~~~~~~~
-
-The ``tests/avocado/avocado_qemu`` directory provides the
-``avocado_qemu`` Python module, containing the ``avocado_qemu.Test``
-class. Here's a simple usage example:
-
-.. code::
-
- from avocado_qemu import QemuSystemTest
-
-
- class Version(QemuSystemTest):
- """
- :avocado: tags=quick
- """
- def test_qmp_human_info_version(self):
- self.vm.launch()
- res = self.vm.cmd('human-monitor-command',
- command_line='info version')
- self.assertRegex(res, r'^(\d+\.\d+\.\d)')
-
-To execute your test, run:
-
-.. code::
-
- avocado run version.py
-
-Tests may be classified according to a convention by using docstring
-directives such as ``:avocado: tags=TAG1,TAG2``. To run all tests
-in the current directory, tagged as "quick", run:
-
-.. code::
-
- avocado run -t quick .
-
-The ``avocado_qemu.Test`` base test class
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``avocado_qemu.Test`` class has a number of characteristics that
-are worth being mentioned right away.
-
-First of all, it attempts to give each test a ready to use QEMUMachine
-instance, available at ``self.vm``. Because many tests will tweak the
-QEMU command line, launching the QEMUMachine (by using ``self.vm.launch()``)
-is left to the test writer.
-
-The base test class has also support for tests with more than one
-QEMUMachine. The way to get machines is through the ``self.get_vm()``
-method which will return a QEMUMachine instance. The ``self.get_vm()``
-method accepts arguments that will be passed to the QEMUMachine creation
-and also an optional ``name`` attribute so you can identify a specific
-machine and get it more than once through the tests methods. A simple
-and hypothetical example follows:
-
-.. code::
-
- from avocado_qemu import QemuSystemTest
-
-
- class MultipleMachines(QemuSystemTest):
- def test_multiple_machines(self):
- first_machine = self.get_vm()
- second_machine = self.get_vm()
- self.get_vm(name='third_machine').launch()
-
- first_machine.launch()
- second_machine.launch()
-
- first_res = first_machine.cmd(
- 'human-monitor-command',
- command_line='info version')
-
- second_res = second_machine.cmd(
- 'human-monitor-command',
- command_line='info version')
-
- third_res = self.get_vm(name='third_machine').cmd(
- 'human-monitor-command',
- command_line='info version')
-
- self.assertEqual(first_res, second_res, third_res)
-
-At test "tear down", ``avocado_qemu.Test`` handles all the QEMUMachines
-shutdown.
-
-The ``avocado_qemu.LinuxTest`` base test class
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``avocado_qemu.LinuxTest`` is further specialization of the
-``avocado_qemu.Test`` class, so it contains all the characteristics of
-the later plus some extra features.
-
-First of all, this base class is intended for tests that need to
-interact with a fully booted and operational Linux guest. At this
-time, it uses a Fedora 31 guest image. The most basic example looks
-like this:
-
-.. code::
-
- from avocado_qemu import LinuxTest
-
-
- class SomeTest(LinuxTest):
-
- def test(self):
- self.launch_and_wait()
- self.ssh_command('some_command_to_be_run_in_the_guest')
-
-Please refer to tests that use ``avocado_qemu.LinuxTest`` under
-``tests/avocado`` for more examples.
-
-QEMUMachine
-~~~~~~~~~~~
-
-The QEMUMachine API is already widely used in the Python iotests,
-device-crash-test and other Python scripts. It's a wrapper around the
-execution of a QEMU binary, giving its users:
-
- * the ability to set command line arguments to be given to the QEMU
- binary
-
- * a ready to use QMP connection and interface, which can be used to
- send commands and inspect its results, as well as asynchronous
- events
-
- * convenience methods to set commonly used command line arguments in
- a more succinct and intuitive way
-
-QEMU binary selection
-^^^^^^^^^^^^^^^^^^^^^
-
-The QEMU binary used for the ``self.vm`` QEMUMachine instance will
-primarily depend on the value of the ``qemu_bin`` parameter. If it's
-not explicitly set, its default value will be the result of a dynamic
-probe in the same source tree. A suitable binary will be one that
-targets the architecture matching host machine.
-
-Based on this description, test writers will usually rely on one of
-the following approaches:
-
-1) Set ``qemu_bin``, and use the given binary
-
-2) Do not set ``qemu_bin``, and use a QEMU binary named like
- "qemu-system-${arch}", either in the current
- working directory, or in the current source tree.
-
-The resulting ``qemu_bin`` value will be preserved in the
-``avocado_qemu.Test`` as an attribute with the same name.
-
-Attribute reference
-~~~~~~~~~~~~~~~~~~~
-
-Test
-^^^^
-
-Besides the attributes and methods that are part of the base
-``avocado.Test`` class, the following attributes are available on any
-``avocado_qemu.Test`` instance.
-
-vm
-''
-
-A QEMUMachine instance, initially configured according to the given
-``qemu_bin`` parameter.
-
-arch
-''''
-
-The architecture can be used on different levels of the stack, e.g. by
-the framework or by the test itself. At the framework level, it will
-currently influence the selection of a QEMU binary (when one is not
-explicitly given).
-
-Tests are also free to use this attribute value, for their own needs.
-A test may, for instance, use the same value when selecting the
-architecture of a kernel or disk image to boot a VM with.
-
-The ``arch`` attribute will be set to the test parameter of the same
-name. If one is not given explicitly, it will either be set to
-``None``, or, if the test is tagged with one (and only one)
-``:avocado: tags=arch:VALUE`` tag, it will be set to ``VALUE``.
-
-cpu
-'''
-
-The cpu model that will be set to all QEMUMachine instances created
-by the test.
-
-The ``cpu`` attribute will be set to the test parameter of the same
-name. If one is not given explicitly, it will either be set to
-``None ``, or, if the test is tagged with one (and only one)
-``:avocado: tags=cpu:VALUE`` tag, it will be set to ``VALUE``.
-
-machine
-'''''''
-
-The machine type that will be set to all QEMUMachine instances created
-by the test.
-
-The ``machine`` attribute will be set to the test parameter of the same
-name. If one is not given explicitly, it will either be set to
-``None``, or, if the test is tagged with one (and only one)
-``:avocado: tags=machine:VALUE`` tag, it will be set to ``VALUE``.
-
-qemu_bin
-''''''''
-
-The preserved value of the ``qemu_bin`` parameter or the result of the
-dynamic probe for a QEMU binary in the current working directory or
-source tree.
-
-LinuxTest
-^^^^^^^^^
-
-Besides the attributes present on the ``avocado_qemu.Test`` base
-class, the ``avocado_qemu.LinuxTest`` adds the following attributes:
-
-distro
-''''''
-
-The name of the Linux distribution used as the guest image for the
-test. The name should match the **Provider** column on the list
-of images supported by the avocado.utils.vmimage library:
-
-https://avocado-framework.readthedocs.io/en/latest/guides/writer/libs/vmimage.html#supported-images
-
-distro_version
-''''''''''''''
-
-The version of the Linux distribution as the guest image for the
-test. The name should match the **Version** column on the list
-of images supported by the avocado.utils.vmimage library:
-
-https://avocado-framework.readthedocs.io/en/latest/guides/writer/libs/vmimage.html#supported-images
-
-distro_checksum
-'''''''''''''''
-
-The sha256 hash of the guest image file used for the test.
-
-If this value is not set in the code or by a test parameter (with the
-same name), no validation on the integrity of the image will be
-performed.
-
-Parameter reference
-~~~~~~~~~~~~~~~~~~~
-
-To understand how Avocado parameters are accessed by tests, and how
-they can be passed to tests, please refer to::
-
- https://avocado-framework.readthedocs.io/en/latest/guides/writer/chapters/writing.html#accessing-test-parameters
-
-Parameter values can be easily seen in the log files, and will look
-like the following:
-
-.. code::
-
- PARAMS (key=qemu_bin, path=*, default=./qemu-system-x86_64) => './qemu-system-x86_64
-
-Test
-^^^^
-
-arch
-''''
-
-The architecture that will influence the selection of a QEMU binary
-(when one is not explicitly given).
-
-Tests are also free to use this parameter value, for their own needs.
-A test may, for instance, use the same value when selecting the
-architecture of a kernel or disk image to boot a VM with.
-
-This parameter has a direct relation with the ``arch`` attribute. If
-not given, it will default to None.
-
-cpu
-'''
-
-The cpu model that will be set to all QEMUMachine instances created
-by the test.
-
-machine
-'''''''
-
-The machine type that will be set to all QEMUMachine instances created
-by the test.
-
-qemu_bin
-''''''''
-
-The exact QEMU binary to be used on QEMUMachine.
-
-LinuxTest
-^^^^^^^^^
-
-Besides the parameters present on the ``avocado_qemu.Test`` base
-class, the ``avocado_qemu.LinuxTest`` adds the following parameters:
-
-distro
-''''''
-
-The name of the Linux distribution used as the guest image for the
-test. The name should match the **Provider** column on the list
-of images supported by the avocado.utils.vmimage library:
-
-https://avocado-framework.readthedocs.io/en/latest/guides/writer/libs/vmimage.html#supported-images
-
-distro_version
-''''''''''''''
-
-The version of the Linux distribution as the guest image for the
-test. The name should match the **Version** column on the list
-of images supported by the avocado.utils.vmimage library:
-
-https://avocado-framework.readthedocs.io/en/latest/guides/writer/libs/vmimage.html#supported-images
-
-distro_checksum
-'''''''''''''''
-
-The sha256 hash of the guest image file used for the test.
-
-If this value is not set in the code or by this parameter no
-validation on the integrity of the image will be performed.
-
-Skipping tests
-~~~~~~~~~~~~~~
-
-The Avocado framework provides Python decorators which allow for easily skip
-tests running under certain conditions. For example, on the lack of a binary
-on the test system or when the running environment is a CI system. For further
-information about those decorators, please refer to::
-
- https://avocado-framework.readthedocs.io/en/latest/guides/writer/chapters/writing.html#skipping-tests
-
-While the conditions for skipping tests are often specifics of each one, there
-are recurring scenarios identified by the QEMU developers and the use of
-environment variables became a kind of standard way to enable/disable tests.
-
-Here is a list of the most used variables:
-
-AVOCADO_ALLOW_LARGE_STORAGE
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Tests which are going to fetch or produce assets considered *large* are not
-going to run unless that ``AVOCADO_ALLOW_LARGE_STORAGE=1`` is exported on
-the environment.
-
-The definition of *large* is a bit arbitrary here, but it usually means an
-asset which occupies at least 1GB of size on disk when uncompressed.
-
-SPEED
-^^^^^
-Tests which have a long runtime will not be run unless ``SPEED=slow`` is
-exported on the environment.
-
-The definition of *long* is a bit arbitrary here, and it depends on the
-usefulness of the test too. A unique test is worth spending more time on,
-small variations on existing tests perhaps less so. As a rough guide,
-a test or set of similar tests which take more than 100 seconds to
-complete.
-
-AVOCADO_ALLOW_UNTRUSTED_CODE
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-There are tests which will boot a kernel image or firmware that can be
-considered not safe to run on the developer's workstation, thus they are
-skipped by default. The definition of *not safe* is also arbitrary but
-usually it means a blob which either its source or build process aren't
-public available.
-
-You should export ``AVOCADO_ALLOW_UNTRUSTED_CODE=1`` on the environment in
-order to allow tests which make use of those kind of assets.
-
-AVOCADO_TIMEOUT_EXPECTED
-^^^^^^^^^^^^^^^^^^^^^^^^
-The Avocado framework has a timeout mechanism which interrupts tests to avoid the
-test suite of getting stuck. The timeout value can be set via test parameter or
-property defined in the test class, for further details::
-
- https://avocado-framework.readthedocs.io/en/latest/guides/writer/chapters/writing.html#setting-a-test-timeout
-
-Even though the timeout can be set by the test developer, there are some tests
-that may not have a well-defined limit of time to finish under certain
-conditions. For example, tests that take longer to execute when QEMU is
-compiled with debug flags. Therefore, the ``AVOCADO_TIMEOUT_EXPECTED`` variable
-has been used to determine whether those tests should run or not.
-
-QEMU_TEST_FLAKY_TESTS
-^^^^^^^^^^^^^^^^^^^^^
-Some tests are not working reliably and thus are disabled by default.
-This includes tests that don't run reliably on GitLab's CI which
-usually expose real issues that are rarely seen on developer machines
-due to the constraints of the CI environment. If you encounter a
-similar situation then raise a bug and then mark the test as shown on
-the code snippet below:
-
-.. code::
+Functional tests using Python
+-----------------------------
- # See https://gitlab.com/qemu-project/qemu/-/issues/nnnn
- @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test is unstable on GitLab')
- def test(self):
- do_something()
+A functional test focuses on the functional requirements of the software,
+attempting to find errors like incorrect functions, interface errors,
+behavior errors, and initialization and termination errors [3]_.
-You can also add ``:avocado: tags=flaky`` to the test meta-data so
-only the flaky tests can be run as a group:
+The ``tests/functional`` directory hosts functional tests written in
+Python. You can run the functional tests simply by executing:
.. code::
- env QEMU_TEST_FLAKY_TESTS=1 ./pyvenv/bin/avocado \
- run tests/avocado -filter-by-tags=flaky
-
-Tests should not live in this state forever and should either be fixed
-or eventually removed.
+ make check-functional
-
-Uninstalling Avocado
-~~~~~~~~~~~~~~~~~~~~
-
-If you've followed the manual installation instructions above, you can
-easily uninstall Avocado. Start by listing the packages you have
-installed::
-
- pip list --user
-
-And remove any package you want with::
-
- pip uninstall <package_name>
-
-If you've used ``make check-avocado``, the Python virtual environment where
-Avocado is installed will be cleaned up as part of ``make check-clean``.
+See :ref:`checkfunctional-ref` for more details.
.. _checktcg-ref:
@@ -1555,3 +1033,27 @@ coverage-html`` which will create
Further analysis can be conducted by running the ``gcov`` command
directly on the various .gcda output files. Please read the ``gcov``
documentation for more information.
+
+Flaky tests
+-----------
+
+A flaky test is defined as a test that exhibits both a passing and a failing
+result with the same code on different runs. Some usual reasons for an
+intermittent/flaky test are async wait, concurrency, and test order dependency
+[4]_.
+
+In QEMU, tests that are identified as flaky are normally disabled by
+default. Set the ``QEMU_TEST_FLAKY_TESTS`` environment variable before
+running the tests to enable them.
+
+References
+----------
+
+.. [1] Sommerville, Ian (2016). Software Engineering. p. 233.
+.. [2] Pressman, Roger S. & Maxim, Bruce R. (2020). Software Engineering,
+ A Practitioner’s Approach. p. 48, 376, 378, 381.
+.. [3] Pressman, Roger S. & Maxim, Bruce R. (2020). Software Engineering,
+ A Practitioner’s Approach. p. 388.
+.. [4] Luo, Qingzhou, et al. An empirical analysis of flaky tests.
+ Proceedings of the 22nd ACM SIGSOFT International Symposium on
+ Foundations of Software Engineering. 2014.
diff --git a/docs/devel/qgraph.rst b/docs/devel/testing/qgraph.rst
index 43342d9..43342d9 100644
--- a/docs/devel/qgraph.rst
+++ b/docs/devel/testing/qgraph.rst
diff --git a/docs/devel/qtest.rst b/docs/devel/testing/qtest.rst
index c5b8546..73ef770 100644
--- a/docs/devel/qtest.rst
+++ b/docs/devel/testing/qtest.rst
@@ -1,3 +1,5 @@
+.. _qtest:
+
========================================
QTest Device Emulation Testing Framework
========================================
diff --git a/docs/devel/uefi-vars.rst b/docs/devel/uefi-vars.rst
new file mode 100644
index 0000000..0151a26
--- /dev/null
+++ b/docs/devel/uefi-vars.rst
@@ -0,0 +1,68 @@
+==============
+UEFI variables
+==============
+
+Guest UEFI variable management
+==============================
+
+The traditional approach for UEFI variable storage in qemu guests is
+to stay as close as possible to physical hardware. That means
+providing pflash as storage and leaving the management of variables
+and flash to the guest.
+
+Secure boot support comes with the requirement that the UEFI variable
+storage must be protected against direct access by the OS. All update
+requests must pass the sanity checks. (Parts of) the firmware must
+run with a higher privilege level than the OS so this can be enforced
+by the firmware. On x86 this has been implemented using System
+Management Mode (SMM) in qemu and kvm, which again is the same
+approach taken by physical hardware. Only privileged code running in
+SMM mode is allowed to access flash storage.
+
+Communication with the firmware code running in SMM mode works by
+serializing the requests to a shared buffer, then trapping into SMM
+mode via SMI. The SMM code processes the request, stores the reply in
+the same buffer and returns.
+
+Host UEFI variable service
+==========================
+
+Instead of running the privileged code inside the guest we can run it
+on the host. The serialization protocol can be reused. The
+communication with the host uses a virtual device, which essentially
+configures the shared buffer location and size, and traps to the host
+to process the requests.
+
+The ``uefi-vars`` device implements the UEFI virtual device. It comes
+in ``uefi-vars-x86`` and ``uefi-vars-sysbus`` flavours. The device
+reimplements the handlers needed, specifically
+``EfiSmmVariableProtocol`` and ``VarCheckPolicyLibMmiHandler``. It
+also consumes events (``EfiEndOfDxeEventGroup``,
+``EfiEventReadyToBoot`` and ``EfiEventExitBootServices``).
+
+The advantage of this approach is that we do not need a special
+privilege level for the firmware to protect itself, i.e. it does not
+depend on SMM emulation on x64, which allows a bunch of complex SMM
+emulation code to be removed from the Linux kernel (CONFIG_KVM_SMM=n).
+It also allows secure boot support on arm without implementing secure
+world (el3) emulation in kvm.
+
+Of course there are also downsides. The added device increases the
+attack surface of the host, and we are adding some code duplication
+because we have to reimplement some edk2 functionality in qemu.
+
+usage on x86_64
+---------------
+
+.. code::
+
+ qemu-system-x86_64 \
+ -device uefi-vars-x86,jsonfile=/path/to/vars.json
+
+usage on aarch64
+----------------
+
+.. code::
+
+ qemu-system-aarch64 -M virt \
+ -device uefi-vars-sysbus,jsonfile=/path/to/vars.json
diff --git a/docs/devel/virtio-backends.rst b/docs/devel/virtio-backends.rst
index 9ff092e..ebddc3b 100644
--- a/docs/devel/virtio-backends.rst
+++ b/docs/devel/virtio-backends.rst
@@ -101,13 +101,12 @@ manually instantiated:
VirtIOBlock vdev;
};
- static Property virtio_blk_pci_properties[] = {
+ static const Property virtio_blk_pci_properties[] = {
DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
DEV_NVECTORS_UNSPECIFIED),
- DEFINE_PROP_END_OF_LIST(),
};
static void virtio_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
@@ -120,7 +119,7 @@ manually instantiated:
qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
}
- static void virtio_blk_pci_class_init(ObjectClass *klass, void *data)
+ static void virtio_blk_pci_class_init(ObjectClass *klass, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
diff --git a/docs/glossary.rst b/docs/glossary.rst
new file mode 100644
index 0000000..4fa044b
--- /dev/null
+++ b/docs/glossary.rst
@@ -0,0 +1,280 @@
+.. _Glossary:
+
+--------
+Glossary
+--------
+
+This section of the manual presents brief definitions of acronyms and terms used
+by QEMU developers.
+
+Accelerator
+-----------
+
+A specific API used to accelerate execution of guest instructions. It can be
+hardware-based, through a virtualization API provided by the host OS (kvm, hvf,
+whpx, ...), or software-based (tcg). See this description of `supported
+accelerators<Accelerators>`.
+
+Board
+-----
+
+Another name for :ref:`machine`.
+
+Block
+-----
+
+Block drivers are the available `disk formats and front-ends
+<block-drivers>`, and block devices `(see the Block device section on the
+options page)<sec_005finvocation>` use them to implement disks for a
+virtual machine.
+
+CFI
+---
+
+Control Flow Integrity is a hardening technique used to prevent exploits
+targeting QEMU by detecting unexpected branches during execution. QEMU `actively
+supports<cfi>` being compiled with CFI enabled.
+
+Device
+------
+
+In QEMU, a device is a piece of hardware visible to the guest. Examples include
+UARTs, PCI controllers, PCI cards, VGA controllers, and many more.
+
+QEMU is able to emulate a CPU and all the hardware interacting with it,
+including `many devices<device-emulation>`. When QEMU runs a virtual machine
+using a hardware-based accelerator, it is still responsible for emulating
+all devices in software.
+
+EDK2
+----
+
+EDK2, also known as `TianoCore <https://www.tianocore.org/>`_, is an open
+source implementation of the UEFI standard. QEMU virtual machines that boot
+a UEFI firmware usually use EDK2.
+
+gdbstub
+-------
+
+QEMU implements a `gdb server <GDB usage>`, allowing gdb to attach to it and
+debug a running virtual machine, or a program in user-mode. This allows
+debugging the guest code that is running inside QEMU.
+
+glib2
+-----
+
+`GLib2 <https://docs.gtk.org/glib/>`_ is one of the most important libraries
+we use throughout the codebase. It provides many data structures, macros,
+string and thread utilities, and portable functions across different OSes.
+It's required to build QEMU.
+
+Guest agent
+-----------
+
+The `QEMU Guest Agent <qemu-ga>` is a daemon intended to be run within virtual
+machines. It provides various services that help QEMU interact with the guest.
+
+.. _guest:
+
+Guest
+-----
+
+The guest is the architecture of the virtual machine that is emulated.
+See also :ref:`host`.
+
+Sometimes this is called the :ref:`target` architecture, but that term
+can be ambiguous.
+
+.. _host:
+
+Host
+----
+
+The host is the architecture on which QEMU runs, i.e. the native
+architecture. See also :ref:`guest`.
+
+Hypervisor
+----------
+
+The formal definition of a hypervisor is a program or API that can be used
+to manage a virtual machine. QEMU is a virtualizer that interacts with
+various hypervisors.
+
+In the context of QEMU, a hypervisor is an API provided by the host OS
+that allows virtual machines to be executed. The Linux implementation is
+KVM (Xen is supported as well). On macOS it's HVF, Windows defines WHPX,
+and NetBSD provides NVMM.
+
+.. _machine:
+
+Machine
+-------
+
+QEMU's system emulation models many different types of hardware. A machine model
+(sometimes called a board model) is the model of a complete virtual system with
+RAM, one or more CPUs, and various devices. It can be selected with the option
+``-machine`` of qemu-system. Our machine models can be found on this `page
+<system-targets-ref>`.
+
+Migration
+---------
+
+QEMU can save and restore the execution of a virtual machine between different
+host systems. This is provided by the :ref:`Migration framework<migration>`.
+
+NBD
+---
+
+The `QEMU Network Block Device server <qemu-nbd>` is a tool that can be used to
+mount and access QEMU images, providing functionality similar to a loop device.
+
+Mailing List
+------------
+
+This is `where <https://wiki.qemu.org/Contribute/MailingLists>`_ all the
+development happens! Changes are posted as series, which all developers can
+review and give feedback on.
+
+For reporting issues, our `GitLab
+<https://gitlab.com/qemu-project/qemu/-/issues>`_ tracker is the best place.
+
+.. _softmmu:
+
+MMU / softmmu
+-------------
+
+The Memory Management Unit is responsible for translating virtual addresses to
+physical addresses and managing memory protection. QEMU system mode is named
+"softmmu" precisely because it implements this in software, including a TLB
+(Translation lookaside buffer), for the guest virtual machine.
+
+QEMU user-mode does not implement a full software MMU, but "simply"
+translates virtual addresses by adding a specific offset, relying on the
+host MMU/OS instead.
+
+Monitor / QMP / HMP
+-------------------
+
+The `QEMU Monitor <QEMU monitor>` is a text interface which can be used to interact
+with a running virtual machine.
+
+QMP stands for QEMU Monitor Protocol and is a JSON-based interface.
+HMP stands for Human Monitor Protocol and is a set of text commands
+available for users who prefer natural language to JSON.
+
+MTTCG
+-----
+
+Multiple CPU support was first implemented using a round-robin algorithm
+running on a single thread. Later on, `Multi-threaded TCG <mttcg>` was developed
+to benefit from multiple cores to speed up execution.
+
+Plugins
+-------
+
+`TCG Plugins <TCG Plugins>` is an API used to instrument guest code, in system
+and user mode. The end goal is to have a similar set of functionality compared
+to `DynamoRIO <https://dynamorio.org/>`_ or `valgrind <https://valgrind.org/>`_.
+
+One key advantage of QEMU plugins is that they can be used to perform
+architecture agnostic instrumentation.
+
+Patchew
+-------
+
+`Patchew <https://patchew.org/QEMU/>`_ is a website that tracks patches on the
+Mailing List.
+
+PR
+--
+
+Once a series is reviewed and accepted by a subsystem maintainer, it will be
+included in a PR (Pull Request) that the project maintainer will merge into
+the QEMU main branch after running tests.
+
+The QEMU project doesn't currently expect most developers to directly submit
+pull requests.
+
+QCOW2
+-----
+
+QEMU Copy On Write is a disk format developed by QEMU. It provides transparent
+compression, automatic extension, and many other advantages over a raw image.
+
+qcow2 is the recommended format to use.
+
+QEMU
+----
+
+`QEMU (Quick Emulator) <https://www.qemu.org/>`_ is a generic and open source
+machine emulator and virtualizer.
+
+QOM
+---
+
+:ref:`QEMU Object Model <qom>` is an object oriented API used to define
+various devices and hardware in the QEMU codebase.
+
+Record/replay
+-------------
+
+:ref:`Record/replay <replay>` is a feature of QEMU that allows
+deterministic and reproducible execution of a virtual machine.
+
+Rust
+----
+
+`A new programming language <https://www.rust-lang.org/>`_, memory safe by
+default. Work is in progress to integrate it into the QEMU codebase for
+various subsystems.
+
+System mode
+-----------
+
+QEMU System mode provides a virtual model of an entire machine (CPU, memory and
+emulated devices) to run a guest OS. In this mode the CPU may be fully emulated,
+or it may work with a hypervisor such as KVM, Xen or Hypervisor.Framework to
+allow the guest to run directly on the host CPU.
+
+QEMU System mode is called :ref:`softmmu <softmmu>` as well.
+
+.. _target:
+
+Target
+------
+
+The term "target" can be ambiguous. In most places in QEMU it is used as a
+synonym for :ref:`guest`. For example the code for emulating Arm CPUs is in
+``target/arm/``. However in the :ref:`TCG subsystem <tcg>` "target" refers to the
+architecture which QEMU is running on, i.e. the :ref:`host`.
+
+TCG
+---
+
+TCG is the QEMU `Tiny Code Generator <tcg>`. It is the JIT (just-in-time)
+compiler we use to emulate a guest CPU in software.
+
+It is one of the accelerators supported by QEMU, and supports a lot of
+guest/host architectures.
+
+User mode
+---------
+
+QEMU User mode can launch processes compiled for one CPU on another CPU. In
+this mode the CPU is always emulated and QEMU translates system calls from
+the guest to the host kernel. It is available for Linux and BSD.
+
+VirtIO
+------
+
+VirtIO is an open standard used to define and implement virtual devices with
+minimal overhead, defining a set of data structures and hypercalls (similar
+to system calls, but targeting a hypervisor, which happens to be QEMU in our
+case). It's designed to be more efficient than emulating a real device, by
+minimizing the number of interactions between a guest VM and its hypervisor.
+
+vhost-user
+----------
+
+`Vhost-user <vhost_user>` is an interface used to implement VirtIO devices
+outside of QEMU itself.
diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt
index e17bb50..af4e839 100644
--- a/docs/igd-assign.txt
+++ b/docs/igd-assign.txt
@@ -1,44 +1,70 @@
Intel Graphics Device (IGD) assignment with vfio-pci
====================================================
-IGD has two different modes for assignment using vfio-pci:
-
-1) Universal Pass-Through (UPT) mode:
-
- In this mode the IGD device is added as a *secondary* (ie. non-primary)
- graphics device in combination with an emulated primary graphics device.
- This mode *requires* guest driver support to remove the external
- dependencies generally associated with IGD (see below). Those guest
- drivers only support this mode for Broadwell and newer IGD, according to
- Intel. Additionally, this mode by default, and as officially supported
- by Intel, does not support direct video output. The intention is to use
- this mode either to provide hardware acceleration to the emulated graphics
- or to use this mode in combination with guest-based remote access software,
- for example VNC (see below for optional output support). This mode
- theoretically has no device specific handling dependencies on vfio-pci or
- the VM firmware.
-
-2) "Legacy" mode:
-
- In this mode the IGD device is intended to be the primary and exclusive
- graphics device in the VM[1], as such QEMU does not facilitate any sort
- of remote graphics to the VM in this mode. A connected physical monitor
- is the intended output device for IGD. This mode includes several
- requirements and restrictions:
-
- * IGD must be given address 02.0 on the PCI root bus in the VM
- * The host kernel must support vfio extensions for IGD (v4.6)
- * vfio VGA support very likely needs to be enabled in the host kernel
- * The VM firmware must support specific fw_cfg enablers for IGD
- * The VM machine type must support a PCI host bridge at 00.0 (standard)
- * The VM machine type must provide or allow to be created a special
- ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
- PCI address 1f.0.
- * The IGD device must have a VGA ROM, either provided via the romfile
- option or loaded automatically through vfio (standard). rombar=0
- will disable legacy mode support.
- * Hotplug of the IGD device is not supported.
- * The IGD device must be a SandyBridge or newer model device.
+Using vfio-pci, we can pass an Intel Graphics Device (IGD) through to a guest,
+where it either serves as the primary and exclusive graphics adapter, or is
+used in combination with an emulated primary graphics device, depending on
+the configuration and guest driver support. However, IGD devices are not
+"clean" PCI devices; they use extra memory regions other than BARs. Special
+handling is required to make them work properly, including:
+
+* OpRegion for accessing Virtual BIOS Table (VBT) that contains display output
+ information.
+* Data Stolen Memory (DSM) region used as VRAM at an early stage (BIOS/UEFI)
+
+Certain guest software also depends on the following conditions to work:
+(* = required by)
+
+| Condition | Linux | Windows | VBIOS | EFI GOP |
+|---------------------------------------------|-------|---------|-------|---------|
+| #1 IGD has a valid OpRegion containing VBT | * ^1 | * | * | * |
+| #2 VID/DID of LPC bridge at 00:1f.0 matches | | | * | * |
+| #3 IGD is assigned to BDF 00:02.0 | | | * | * |
+| #4 IGD has VGA controller device class | | | * | * |
+| #5 Host's VGA ranges are mapped to IGD | | | * | |
+| #6 Guest has valid VBIOS or UEFI Option ROM | | | * | * |
+
+^1 Though the i915 driver is able to mock an OpRegion, it is still recommended
+   to use the VBT copied from the host OpRegion to prevent incorrect
+   configuration.
+
+For #1, the "x-igd-opregion=on" option exposes a copy of host IGD OpRegion to
+guest via fw_cfg, where guest firmware can set up guest OpRegion with it.
+
+For #2, the "x-igd-lpc=on" option copies the IDs of the host LPC bridge and
+host bridge to the guest. Currently this is only supported on i440fx
+machines, as q35 machines already have an ICH9 LPC bridge and overwriting
+its IDs may lead to unexpected behavior.
+
+For #3, "addr=2.0" assigns IGD to 00:02.0.
+
+For #4, the primary display must be set to IGD in host BIOS.
+
+For #5, "x-vga=on" enables guest access to standard VGA IO/MMIO ranges.
+
+For #6, a ROM is needed, provided either via the ROM BAR or the romfile=
+option; this Intel document [1] shows how to dump the VBIOS to a file. For
+the UEFI Option ROM, see the "Guest firmware" section.
+
+QEMU also provides a "Legacy" mode that implicitly enables full functionality
+on IGD. It is automatically enabled when:
+* IGD generation is 6 to 9 (Sandy Bridge to Comet Lake)
+* Machine type is i440fx
+* IGD is assigned to guest BDF 00:02.0
+* ROM BAR or romfile is present
+
+In "Legacy" mode, QEMU will automatically setup OpRegion, LPC bridge IDs and
+VGA range access, which is equivalent to:
+ x-igd-opregion=on,x-igd-lpc=on,x-vga=on
+
+By default, "Legacy" mode won't fail; it continues on error. Users can set
+"x-igd-legacy-mode=on" to force-enable legacy mode; this also checks whether
+the conditions above for legacy mode are met, and if any error occurs, QEMU
+will fail immediately. Users can also set "x-igd-legacy-mode=off" to disable
+legacy mode.
+
+In legacy mode, as the guest VGA ranges are assigned to the IGD device, all
+other graphics devices should be removed. This can be done using "-nographic"
+or "-vga none" or "-nodefaults", along with adding the device using vfio-pci.
For either mode, depending on the host kernel, the i915 driver in the host
may generate faults and errors upon re-binding to an IGD device after it
@@ -73,31 +99,39 @@ DVI, or DisplayPort) may be unsupported in some use cases. In the author's
experience, even DP to VGA adapters can be troublesome while adapters between
digital formats work well.
-Usage
-=====
-The intention is for IGD assignment to be transparent for users and thus for
-management tools like libvirt. To make use of legacy mode, simply remove all
-other graphics options and use "-nographic" and either "-vga none" or
-"-nodefaults", along with adding the device using vfio-pci:
- -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+Options
+=======
+* x-igd-opregion=[*on*|off]
+ Copy host IGD OpRegion and expose it to guest with fw_cfg
+
+* x-igd-lpc=[on|*off*]
+  Creates a dummy LPC bridge at 00:1f.0 with host VID/DID (i440fx only)
+
+* x-igd-legacy-mode=[on|off|*auto*]
+ Enable/Disable legacy mode
+
+* x-igd-gms=[hex, default 0]
+  Override the DSM region size in the GGC register; 0 means use the host
+  value. Use this only when the DSM size cannot be changed through the
+  'DVMT Pre-Allocated' option in the host BIOS.
+
-For UPT mode, retain the default emulated graphics and simply add the vfio-pci
-device making use of any other bus address other than 02.0. libvirt will
-default to assigning the device a UPT compatible address while legacy mode
-users will need to manually edit the XML if using a tool like virt-manager
-where the VM device address is not expressly specified.
+Examples
+========
+* Adding IGD with automatic legacy mode support
+ -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0
-An experimental vfio-pci option also exists to enable OpRegion, and thus
-external monitor support, for UPT mode. This can be enabled by adding
-"x-igd-opregion=on" to the vfio-pci device options for the IGD device. As
-with legacy mode, this requires the host to support features introduced in
-the v4.6 kernel. If Intel chooses to embrace this support, the option may
-be made non-experimental in the future, opening it to libvirt support.
+* Adding IGD with OpRegion and LPC ID hack, but without VGA ranges
+ (For UEFI guests)
+ -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0,x-igd-legacy-mode=off,x-igd-lpc=on,romfile=efi_oprom.rom
-Developer ABI
-=============
-Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+
+Guest firmware
+==============
+Guest firmware is responsible for setting up OpRegion and Base of Data Stolen
+Memory (BDSM) in guest address space. IGD passthrough support imposes two
+fw_cfg requirements on the VM firmware:
1) "etc/igd-opregion"
@@ -117,17 +151,117 @@ Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
Firmware must allocate a reserved memory below 4GB with required 1MB
alignment equal to this size. Additionally the base address of this
reserved region must be written to the dword BDSM register in PCI config
- space of the IGD device at offset 0x5C. As this support is related to
- running the IGD ROM, which has other dependencies on the device appearing
- at guest address 00:02.0, it's expected that this fw_cfg file is only
- relevant to a single PCI class VGA device with Intel vendor ID, appearing
- at PCI bus address 00:02.0.
+ space of the IGD device at offset 0x5C (or 0xC0 for Gen 11+ devices using
+ 64-bit BDSM). As this support is related to running the IGD ROM, which
+ has other dependencies on the device appearing at guest address 00:02.0,
+ it's expected that this fw_cfg file is only relevant to a single PCI
+ class VGA device with Intel vendor ID, appearing at PCI bus address 00:02.0.
+
+   Starting from Meteor Lake, IGD devices access stolen memory via their
+   MMIO BAR2 (LMEMBAR) and the BDSM register has been removed from config
+   space. In that case there is no need for guest firmware to allocate data
+   stolen memory in the guest address space and write it to the BDSM
+   register; the value of this fw_cfg file is 0.
+
+Upstream Seabios has OpRegion and BDSM (pre-Gen11 devices only) support.
+However, this support has not been accepted by upstream EDK2/OVMF. A
+recommended solution is to create a virtual OpROM with the following DXE
+drivers:
+
+* IgdAssignmentDxe: Sets up OpRegion and BDSM according to fw_cfg (required)
+* IntelGopDriver: Closed-source Intel GOP driver
+* PlatformGopPolicy: Protocol required by IntelGopDriver
+
+IntelGopDriver and PlatformGopPolicy are only required when enabling GOP on
+IGD.
+
+The original IgdAssignmentDxe can be found at [3]. An Intel-maintained
+version with PlatformGopPolicy for industrial computing is at [4]. There is
+also an unofficially maintained version with newer Gen11+ device support
+at [5]. You need to build them with EDK2.
+
+Intel has never released the IntelGopDriver to the public. You may contact
+Intel support to get one, as [4] suggests, if you are an Intel Premier
+Support customer, or you can try extracting it from your host firmware using
+"UEFI BIOS Updater" [6].
+
+Once you have all the required DXE drivers, an Option ROM can be generated
+with the EfiRom utility in EDK2, using
+ EfiRom -f 0x8086 -i <Device ID of your IGD> -o output.rom \
+ -e IgdAssignmentDxe.efi PlatformGOPPolicy.efi IntelGopDriver.efi
+
+
+Known issues
+============
+When using OVMF as guest firmware, you may encounter the following warning:
+warning: vfio_container_dma_map(0x55fab36ce610, 0x380010000000, 0x108000, 0x7fd336000000) = -22 (Invalid argument)
+
+Solution:
+Set the host physical address bits to the IOMMU address width using
+ -cpu host,host-phys-bits-limit=<IOMMU address width>
+Or in libvirt XML with
+ <cpu>
+ <maxphysaddr mode='passthrough' limit='<IOMMU address width>'/>
+ </cpu>
+The IOMMU address width can be determined with
+ echo $(( ((0x$(cat /sys/devices/virtual/iommu/dmar0/intel-iommu/cap) & 0x3F0000) >> 16) + 1 ))
+Refer to https://edk2.groups.io/g/devel/topic/patch_v1/102359124 for more details.
+
+
+Memory View
+===========
+IGD has its own address space. To use system RAM as VRAM, a single-level page
+table named the Global Graphics Translation Table (GTT) is used for the
+address translation. Each page table entry points to a 4KB page. The
+illustration below shows the translation flow on IGD with 64-bit GTT PTEs.
+
+(PTE_SIZE == 8) +-------------+---+
+ | Address | V | V: Valid Bit
+ +-------------+---+
+ | ... | |
+IGD:0x01ae9010 0xd740| 0x70ffc000 | 1 | Mem:0x42ba3e010^
+-----------------------> 0xd748| 0x42ba3e000 | 1 +------------------>
+(addr >> 12) * PTE_SIZE 0xd750| 0x42ba3f000 | 1 |
+ | ... | |
+ +-------------+---+
+^ The address may be remapped by IOMMU
+
+The memory region that stores the GTT is called GTT Stolen Memory (GSM); it
+is located right below the Data Stolen Memory (DSM). Accessing this region
+directly is not allowed; any access will immediately freeze the whole system.
+The only way to access it is through the second half of MMIO BAR0.
+
+The Data Stolen Memory is reserved by firmware and acts as the VRAM in pre-OS
+environments. In QEMU, guest firmware (Seabios/OVMF) is responsible for
+reserving a contiguous region and programming its base address into the BDSM
+register, then letting the VBIOS/GOP driver initialize this region. The
+illustration below shows how DSM is mapped.
+
+ IGD Addr Space Host Addr Space Guest Addr Space
+ +-------------+ +-------------+ +-------------+
+ | | | | | |
+ | | | | | |
+ | | +-------------+ +-------------+
+ | | | Data Stolen | | Data Stolen |
+ | | | (Guest) | | (Guest) |
+ | | +------------>+-------------+<------->+-------------+<--Guest BDSM
+ | | | Passthrough | | EPT | | Emulated by QEMU
+DSMSIZE+-------------+ | with IOMMU | | Mapping | | Programmed by guest FW
+ | | | | | | |
+ | | | | | | |
+ 0+-------------+--+ | | | |
+ | +-------------+ | |
+ | | Data Stolen | +-------------+
+ | | (Host) |
+ +------------>+-------------+<--Host BDSM
+ Non- | | "real" one in HW
+ Passthrough | | Programmed by host FW
+ +-------------+
Footnotes
=========
-[1] Nothing precludes adding additional emulated or assigned graphics devices
- as non-primary, other than the combination typically not working. I only
- intend to set user expectations, others are welcome to find working
- combinations or fix whatever issues prevent this from working in the common
- case.
+[1] https://www.intel.com/content/www/us/en/docs/graphics-for-linux/developer-reference/1-0/dump-video-bios.html
[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
+[3] https://web.archive.org/web/20240827012422/https://bugzilla.tianocore.org/show_bug.cgi?id=935
+    The Tianocore bugzilla has been down since Jan 2025.
+[4] https://eci.intel.com/docs/3.3/components/kvm-hypervisor.html, Patch 0001-0004
+[5] https://github.com/tomitamoeko/VfioIgdPkg
+[6] https://winraid.level1techs.com/t/tool-guide-news-uefi-bios-updater-ubu/30357
diff --git a/docs/index.rst b/docs/index.rst
index 0b9ee99..5665de8 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -3,6 +3,8 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
+.. _documentation-root:
+
================================
Welcome to QEMU's documentation!
================================
@@ -18,3 +20,4 @@ Welcome to QEMU's documentation!
interop/index
specs/index
devel/index
+ glossary
diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
index ddf8947..7536f0b 100644
--- a/docs/interop/bitmaps.rst
+++ b/docs/interop/bitmaps.rst
@@ -97,7 +97,7 @@ time.
- Persistent storage formats may impose their own requirements on bitmap names
and namespaces. Presently, only qcow2 supports persistent bitmaps. See
- docs/interop/qcow2.txt for more details on restrictions. Notably:
+ :doc:`qcow2` for more details on restrictions. Notably:
- qcow2 bitmap names are limited to between 1 and 1023 bytes long.
diff --git a/docs/interop/firmware.json b/docs/interop/firmware.json
index 57f55f6..745d21d 100644
--- a/docs/interop/firmware.json
+++ b/docs/interop/firmware.json
@@ -214,13 +214,23 @@
# PL011 UART. @verbose-static is mutually exclusive
# with @verbose-dynamic.
#
+# @host-uefi-vars: The firmware expects the host to provide a UEFI
+#                  variable store. QEMU supports that via the
+#                  "uefi-vars-sysbus" (aarch64, riscv64, loongarch64)
+#                  or "uefi-vars-x64" (x86_64) devices. The firmware
+#                  will not use flash for nvram. When loading the
+#                  firmware into flash, the 'stateless' setup should be
+#                  used. It is recommended to load the firmware into
+#                  memory, though.
+#
# Since: 3.0
##
{ 'enum' : 'FirmwareFeature',
'data' : [ 'acpi-s3', 'acpi-s4',
'amd-sev', 'amd-sev-es', 'amd-sev-snp',
'intel-tdx',
- 'enrolled-keys', 'requires-smm', 'secure-boot',
+ 'enrolled-keys', 'requires-smm',
+ 'secure-boot', 'host-uefi-vars',
'verbose-dynamic', 'verbose-static' ] }
##
diff --git a/docs/interop/index.rst b/docs/interop/index.rst
index ed65395..d830c5c 100644
--- a/docs/interop/index.rst
+++ b/docs/interop/index.rst
@@ -14,12 +14,18 @@ are useful for making QEMU interoperate with other software.
dbus-vmstate
dbus-display
live-block-operations
+ nbd
+ parallels
+ prl-xml
+ qcow2
+ qed_spec
pr-helper
qmp-spec
qemu-ga
qemu-ga-ref
qemu-qmp-ref
qemu-storage-daemon-qmp-ref
+ vfio-user
vhost-user
vhost-user-gpu
vhost-vdpa
diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst
index 691429c..6b549ed 100644
--- a/docs/interop/live-block-operations.rst
+++ b/docs/interop/live-block-operations.rst
@@ -931,8 +931,8 @@ Shutdown the guest, by issuing the ``quit`` QMP command::
}
-Live disk backup --- ``blockdev-backup`` and the deprecated``drive-backup``
----------------------------------------------------------------------------
+Live disk backup --- ``blockdev-backup`` and the deprecated ``drive-backup``
+----------------------------------------------------------------------------
The ``blockdev-backup`` (and the deprecated ``drive-backup``) allows
you to create a point-in-time snapshot.
diff --git a/docs/interop/nbd.rst b/docs/interop/nbd.rst
new file mode 100644
index 0000000..de079d3
--- /dev/null
+++ b/docs/interop/nbd.rst
@@ -0,0 +1,89 @@
+QEMU NBD protocol support
+=========================
+
+QEMU supports the NBD protocol, and has an internal NBD client (see
+``block/nbd.c``), an internal NBD server (see ``blockdev-nbd.c``), and an
+external NBD server tool (see ``qemu-nbd.c``). The common code is placed
+in ``nbd/*``.
+
+The NBD protocol is specified here:
+https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md
+
+The following paragraphs describe some specific properties of NBD
+protocol realization in QEMU.
+
+Metadata namespaces
+-------------------
+
+QEMU supports the ``base:allocation`` metadata context as defined in the
+NBD protocol specification, and also defines an additional metadata
+namespace ``qemu``.
+
+``qemu`` namespace
+------------------
+
+The ``qemu`` namespace currently contains two available metadata context
+types. The first is related to exposing the contents of a dirty
+bitmap alongside the associated disk contents. That metadata context
+is named with the following form::
+
+ qemu:dirty-bitmap:<dirty-bitmap-export-name>
+
+Each dirty-bitmap metadata context defines only one flag for extents
+in reply for ``NBD_CMD_BLOCK_STATUS``:
+
+bit 0:
+ ``NBD_STATE_DIRTY``, set when the extent is "dirty"
+
+The second is related to exposing the source of various extents within
+the image, with a single metadata context named::
+
+ qemu:allocation-depth
+
+In the allocation depth context, the entire 32-bit value represents the
+depth of the layer in a thin-provisioned backing chain that provided the
+data (0 for unallocated, 1 for the active layer, 2 for the first
+backing layer, and so forth).
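+
+As a brief illustrative sketch (a hypothetical client-side helper in Python,
+not part of the protocol), the value can be decoded like this::
+
+  def describe_allocation_depth(value):
+      # value is the 32-bit extent descriptor returned for
+      # qemu:allocation-depth by NBD_CMD_BLOCK_STATUS
+      if value == 0:
+          return "unallocated"
+      if value == 1:
+          return "active layer"
+      return "backing layer %d" % (value - 1)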
+
+For ``NBD_OPT_LIST_META_CONTEXT`` the following queries are supported
+in addition to the specific ``qemu:allocation-depth`` and
+``qemu:dirty-bitmap:<dirty-bitmap-export-name>``:
+
+``qemu:``
+  returns a list of all available metadata contexts in the namespace
+``qemu:dirty-bitmap:``
+  returns a list of all available dirty-bitmap metadata contexts
+
+Features by version
+-------------------
+
+The following list documents which QEMU version first implemented
+various features (both as a server exposing the feature, and as a
+client taking advantage of the feature when present), to make it
+easier to plan for cross-version interoperability. Note that in
+several cases, the initial release containing a feature may require
+additional patches from the corresponding stable branch to fix bugs in
+the operation of that feature.
+
+2.6
+ ``NBD_OPT_STARTTLS`` with TLS X.509 Certificates
+2.8
+ ``NBD_CMD_WRITE_ZEROES``
+2.10
+ ``NBD_OPT_GO``, ``NBD_INFO_BLOCK``
+2.11
+ ``NBD_OPT_STRUCTURED_REPLY``
+2.12
+ ``NBD_CMD_BLOCK_STATUS`` for ``base:allocation``
+3.0
+ ``NBD_OPT_STARTTLS`` with TLS Pre-Shared Keys (PSK),
+ ``NBD_CMD_BLOCK_STATUS`` for ``qemu:dirty-bitmap:``, ``NBD_CMD_CACHE``
+4.2
+ ``NBD_FLAG_CAN_MULTI_CONN`` for shareable read-only exports,
+ ``NBD_CMD_FLAG_FAST_ZERO``
+5.2
+ ``NBD_CMD_BLOCK_STATUS`` for ``qemu:allocation-depth``
+7.1
+ ``NBD_FLAG_CAN_MULTI_CONN`` for shareable writable exports
+8.2
+ ``NBD_OPT_EXTENDED_HEADERS``, ``NBD_FLAG_BLOCK_STATUS_PAYLOAD``
diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt
deleted file mode 100644
index 18efb25..0000000
--- a/docs/interop/nbd.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-QEMU supports the NBD protocol, and has an internal NBD client (see
-block/nbd.c), an internal NBD server (see blockdev-nbd.c), and an
-external NBD server tool (see qemu-nbd.c). The common code is placed
-in nbd/*.
-
-The NBD protocol is specified here:
-https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md
-
-The following paragraphs describe some specific properties of NBD
-protocol realization in QEMU.
-
-= Metadata namespaces =
-
-QEMU supports the "base:allocation" metadata context as defined in the
-NBD protocol specification, and also defines an additional metadata
-namespace "qemu".
-
-== "qemu" namespace ==
-
-The "qemu" namespace currently contains two available metadata context
-types. The first is related to exposing the contents of a dirty
-bitmap alongside the associated disk contents. That metadata context
-is named with the following form:
-
- qemu:dirty-bitmap:<dirty-bitmap-export-name>
-
-Each dirty-bitmap metadata context defines only one flag for extents
-in reply for NBD_CMD_BLOCK_STATUS:
-
- bit 0: NBD_STATE_DIRTY, set when the extent is "dirty"
-
-The second is related to exposing the source of various extents within
-the image, with a single metadata context named:
-
- qemu:allocation-depth
-
-In the allocation depth context, the entire 32-bit value represents a
-depth of which layer in a thin-provisioned backing chain provided the
-data (0 for unallocated, 1 for the active layer, 2 for the first
-backing layer, and so forth).
-
-For NBD_OPT_LIST_META_CONTEXT the following queries are supported
-in addition to the specific "qemu:allocation-depth" and
-"qemu:dirty-bitmap:<dirty-bitmap-export-name>":
-
-* "qemu:" - returns list of all available metadata contexts in the
- namespace.
-* "qemu:dirty-bitmap:" - returns list of all available dirty-bitmap
- metadata contexts.
-
-= Features by version =
-
-The following list documents which qemu version first implemented
-various features (both as a server exposing the feature, and as a
-client taking advantage of the feature when present), to make it
-easier to plan for cross-version interoperability. Note that in
-several cases, the initial release containing a feature may require
-additional patches from the corresponding stable branch to fix bugs in
-the operation of that feature.
-
-* 2.6: NBD_OPT_STARTTLS with TLS X.509 Certificates
-* 2.8: NBD_CMD_WRITE_ZEROES
-* 2.10: NBD_OPT_GO, NBD_INFO_BLOCK
-* 2.11: NBD_OPT_STRUCTURED_REPLY
-* 2.12: NBD_CMD_BLOCK_STATUS for "base:allocation"
-* 3.0: NBD_OPT_STARTTLS with TLS Pre-Shared Keys (PSK),
-NBD_CMD_BLOCK_STATUS for "qemu:dirty-bitmap:", NBD_CMD_CACHE
-* 4.2: NBD_FLAG_CAN_MULTI_CONN for shareable read-only exports,
-NBD_CMD_FLAG_FAST_ZERO
-* 5.2: NBD_CMD_BLOCK_STATUS for "qemu:allocation-depth"
-* 7.1: NBD_FLAG_CAN_MULTI_CONN for shareable writable exports
-* 8.2: NBD_OPT_EXTENDED_HEADERS, NBD_FLAG_BLOCK_STATUS_PAYLOAD
diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.rst
index bb3fadf..7b328a4 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.rst
@@ -1,41 +1,46 @@
-= License =
+Parallels Expandable Image File Format
+======================================
-Copyright (c) 2015 Denis Lunev
-Copyright (c) 2015 Vladimir Sementsov-Ogievskiy
+..
+ Copyright (c) 2015 Denis Lunev
+ Copyright (c) 2015 Vladimir Sementsov-Ogievskiy
-This work is licensed under the terms of the GNU GPL, version 2 or later.
-See the COPYING file in the top-level directory.
+ This work is licensed under the terms of the GNU GPL, version 2 or later.
+ See the COPYING file in the top-level directory.
-= Parallels Expandable Image File Format =
A Parallels expandable image file consists of three consecutive parts:
- * header
- * BAT
- * data area
+
+* header
+* BAT
+* data area
All numbers in a Parallels expandable image are stored in little-endian byte
order.
-== Definitions ==
-
- Sector A 512-byte data chunk.
+Definitions
+-----------
- Cluster A data chunk of the size specified in the image header.
- Currently, the default size is 1MiB (2048 sectors). In previous
- versions, cluster sizes of 63 sectors, 256 and 252 kilobytes were
- used.
+Sector
+ A 512-byte data chunk.
- BAT Block Allocation Table, an entity that contains information for
- guest-to-host I/O data address translation.
+Cluster
+ A data chunk of the size specified in the image header.
+ Currently, the default size is 1MiB (2048 sectors). In previous
+ versions, cluster sizes of 63 sectors, 256 and 252 kilobytes were used.
+BAT
+ Block Allocation Table, an entity that contains information for
+ guest-to-host I/O data address translation.
-== Header ==
+Header
+------
The header is placed at the start of an image and contains the following
-fields:
+fields::
-Bytes:
+ Bytes:
0 - 15: magic
Must contain "WithoutFreeSpace" or "WithouFreSpacExt".
@@ -103,44 +108,46 @@ Bytes:
ext_off must meet the same requirements as cluster offsets
defined by BAT entries (see below).
-
-== BAT ==
+BAT
+---
BAT is placed immediately after the image header. In the file, BAT is a
contiguous array of 32-bit unsigned little-endian integers with
-(bat_entries * 4) bytes size.
+``(bat_entries * 4)`` bytes size.
Each BAT entry contains an offset from the start of the file to the
-corresponding cluster. The offset set in clusters for "WithouFreSpacExt" images
-and in sectors for "WithoutFreeSpace" images.
+corresponding cluster. The offset is set in clusters for ``WithouFreSpacExt``
+images and in sectors for ``WithoutFreeSpace`` images.
If a BAT entry is zero, the corresponding cluster is not allocated and should
be considered as filled with zeroes.
Cluster offsets specified by BAT entries must meet the following requirements:
- - the value must not be lower than data offset (provided by header.data_off
- or calculated as specified above),
- - the value must be lower than the desired file size,
- - the value must be unique among all BAT entries,
- - the result of (cluster offset - data offset) must be aligned to cluster
- size.
+- the value must not be lower than data offset (provided by ``header.data_off``
+ or calculated as specified above)
+- the value must be lower than the desired file size
+- the value must be unique among all BAT entries
+- the result of ``(cluster offset - data offset)`` must be aligned to
+ cluster size
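+
+Putting the header fields and the rules above together, guest-to-image offset
+translation can be sketched as follows (a minimal Python illustration with
+hypothetical helper names, not QEMU code; the cluster size is taken from the
+``tracks`` header field, which is expressed in sectors)::
+
+  SECTOR = 512
+
+  def guest_to_file_offset(guest_off, bat, tracks, magic):
+      cluster_size = tracks * SECTOR      # cluster size in bytes
+      entry = bat[guest_off // cluster_size]
+      if entry == 0:
+          return None                     # unallocated, reads as zeroes
+      # entries are in clusters for "WithouFreSpacExt" images,
+      # in sectors for "WithoutFreeSpace" images
+      unit = cluster_size if magic == b"WithouFreSpacExt" else SECTOR
+      return entry * unit + guest_off % cluster_size
+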
-== Data Area ==
+Data Area
+---------
-The data area is an area from the data offset (provided by header.data_off or
-calculated as specified above) to the end of the file. It represents a
+The data area is an area from the data offset (provided by ``header.data_off``
+or calculated as specified above) to the end of the file. It represents a
contiguous array of clusters. Most of them are allocated by the BAT, some may
-be allocated by the ext_off field in the header while other may be allocated by
-extensions. All clusters allocated by ext_off and extensions should meet the
-same requirements as clusters specified by BAT entries.
+be allocated by the ``ext_off`` field in the header while others may be
+allocated by extensions. All clusters allocated by ``ext_off`` and extensions
+should meet the same requirements as clusters specified by BAT entries.
-== Format Extension ==
+Format Extension
+----------------
The Format Extension is an area 1 cluster in size that provides additional
format features. This cluster is addressed by the ext_off field in the header.
-The format of the Format Extension area is the following:
+The format of the Format Extension area is the following::
0 - 7: magic
Must be 0xAB234CEF23DCEA87
@@ -149,10 +156,10 @@ The format of the Format Extension area is the following:
The MD5 checksum of the entire Header Extension cluster except
the first 24 bytes.
- The above are followed by feature sections or "extensions". The last
- extension must be "End of features" (see below).
+The above are followed by feature sections or "extensions". The last
+extension must be "End of features" (see below).
-Each feature section has the following format:
+Each feature section has the following format::
0 - 7: magic
The identifier of the feature:
@@ -183,16 +190,17 @@ Each feature section has the following format:
variable: data (data_size bytes)
- The above is followed by padding to the next 8 bytes boundary, then the
- next extension starts.
+The above is followed by padding to the next 8 bytes boundary, then the
+next extension starts.
- The last extension must be "End of features" with all the fields set to 0.
+The last extension must be "End of features" with all the fields set to 0.
-=== Dirty bitmaps feature ===
+Dirty bitmaps feature
+---------------------
This feature provides a way of storing dirty bitmaps in the image. The fields
-of its data area are:
+of its data area are::
0 - 7: size
The bitmap size, should be equal to disk size in sectors.
@@ -215,7 +223,7 @@ clusters inside the Parallels image file. The offsets of these clusters are
saved in the L1 offset table specified by the feature extension. Each L1 table
entry is a 64 bit integer as described below:
-Given an offset in bytes into the bitmap data, corresponding L1 entry is
+Given an offset in bytes into the bitmap data, the corresponding L1 entry is::
l1_table[offset / cluster_size]
@@ -227,6 +235,6 @@ are assumed to be 1.
If an L1 table entry is not 0 or 1, it contains the corresponding cluster
offset (in 512b sectors). Given an offset in bytes into the bitmap data the
-offset in bytes into the image file can be obtained as follows:
+offset in bytes into the image file can be obtained as follows::
offset = l1_table[offset / cluster_size] * 512 + (offset % cluster_size)
diff --git a/docs/interop/prl-xml.rst b/docs/interop/prl-xml.rst
new file mode 100644
index 0000000..5bb63bb
--- /dev/null
+++ b/docs/interop/prl-xml.rst
@@ -0,0 +1,192 @@
+Parallels Disk Format
+=====================
+
+..
+ Copyright (c) 2015-2017, Virtuozzo, Inc.
+ Authors:
+ 2015 Denis Lunev <den@openvz.org>
+ 2015 Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+ 2016-2017 Klim Kireev <klim.kireev@virtuozzo.com>
+ 2016-2017 Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
+
+ This work is licensed under the terms of the GNU GPL, version 2 or later.
+ See the COPYING file in the top-level directory.
+
+This specification contains the minimal information about the Parallels Disk
+Format that is needed to work properly with QEMU. Nevertheless, Parallels
+Cloud Server and Parallels Desktop are able to add some unspecified nodes to
+the XML and use them, but they are for internal use and don't affect
+functionality. They also use an auxiliary XML file, ``Snapshot.xml``, which
+allows storage of optional snapshot information, but this doesn't influence
+open/read/write functionality. QEMU and other software should not use fields
+not covered in this document or the ``Snapshot.xml`` file, and must leave them
+as is.
+
+A Parallels disk consists of two parts: the set of snapshots and the disk
+descriptor file, which stores information about all files and snapshots.
+
+Definitions
+-----------
+
+Snapshot
+ a record of the contents captured at a particular time, capable
+ of storing current state. A snapshot has a UUID and a parent UUID.
+
+Snapshot image
+ an overlay representing the difference between this
+ snapshot and some earlier snapshot.
+
+Overlay
+ an image storing the different sectors between two captured states.
+
+Root image
+ a snapshot image with no parent, the root of the snapshot tree.
+
+Storage
+ the backing storage for a subset of the virtual disk. When
+ there is more than one storage in a Parallels disk then that
+ is referred to as a split image. In this case every storage
+ covers a specific address space area of the disk and has its
+ particular root image. Split images are not considered here
+ and are not supported. Each storage consists of disk
+ parameters and a list of images. The list of images always
+ contains a root image and may also contain overlays. The
+ root image can be an expandable Parallels image file or
+ plain. Overlays must be expandable.
+
+Description file
+ ``DiskDescriptor.xml`` stores information about disk parameters,
+ snapshots, and storages.
+
+Top Snapshot
+ The overlay between actual state and some previous snapshot.
+ It is not a snapshot in the classical sense because it
+ serves as the active image that the guest writes to.
+
+Sector
+ a 512-byte data chunk.
+
+Description file
+----------------
+
+All information is placed in a single XML element
+``Parallels_disk_image``.
+The element has only one attribute, ``Version``, which must be ``1.0``.
+
+The schema of ``DiskDescriptor.xml``::
+
+ <Parallels_disk_image Version="1.0">
+ <Disk_Parameters>
+ ...
+ </Disk_Parameters>
+ <StorageData>
+ ...
+ </StorageData>
+ <Snapshots>
+ ...
+ </Snapshots>
+ </Parallels_disk_image>
+
+``Disk_Parameters`` element
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``Disk_Parameters`` element describes the physical layout of the
+virtual disk and some general settings.
+
+The ``Disk_Parameters`` element MUST contain the following child elements:
+
+* ``Disk_size`` - number of sectors in the disk,
+ desired size of the disk.
+* ``Cylinders`` - number of the disk cylinders.
+* ``Heads`` - number of the disk heads.
+* ``Sectors`` - number of the disk sectors per cylinder
+  (sector size is 512 bytes).
+  Limitation: the product of the ``Heads``, ``Sectors`` and ``Cylinders``
+  values MUST be equal to the value of the ``Disk_size`` parameter
+  (see the sketch after this list).
+* ``Padding`` - must be 0. Parallels Cloud Server and Parallels Desktop may
+ use padding set to 1; however this case is not covered
+ by this specification. QEMU and other software should not open
+ such disks and should not create them.
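+
+As a minimal illustration of the geometry constraint above (a hypothetical
+validation helper written in Python with the standard ``xml.etree`` module,
+not QEMU code)::
+
+  import xml.etree.ElementTree as ET
+
+  def check_geometry(descriptor_path):
+      params = ET.parse(descriptor_path).getroot().find("Disk_Parameters")
+      disk_size = int(params.findtext("Disk_size"))
+      chs = (int(params.findtext("Cylinders")) *
+             int(params.findtext("Heads")) *
+             int(params.findtext("Sectors")))
+      # the product of Cylinders, Heads and Sectors MUST equal Disk_size
+      return chs == disk_size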
+
+``StorageData`` element
+^^^^^^^^^^^^^^^^^^^^^^^
+
+This element of the file describes the root image and all snapshot images.
+
+The ``StorageData`` element consists of the ``Storage`` child element,
+as shown below::
+
+ <StorageData>
+ <Storage>
+ ...
+ </Storage>
+ </StorageData>
+
+A ``Storage`` element has the following child elements:
+
+* ``Start`` - start sector of the storage; in the case of non-split storage
+  it equals 0.
+* ``End`` - number of the sector following the last sector; in the case of
+  non-split storage it equals ``Disk_size``.
+* ``Blocksize`` - storage cluster size, number of sectors per one cluster.
+ The cluster size for each "Compressed" (see below) image in
+ a parallels disk must be equal to this field. Note: the cluster
+ size for a Parallels Expandable Image is in the ``tracks`` field of
+ its header (see :doc:`parallels`).
+* Several ``Image`` child elements.
+
+Each ``Image`` element has the following child elements:
+
+* ``GUID`` - image identifier, UUID in curly brackets.
+  For instance, ``{12345678-9abc-def1-2345-6789abcdef12}``.
+  The GUID is used by the ``Snapshots`` element to reference images
+  (see below).
+* ``Type`` - image type of the element. It can be:
+
+ * ``Plain`` for raw files.
+ * ``Compressed`` for expanding disks.
+
+* ``File`` - path to image file. The path can be relative to
+ ``DiskDescriptor.xml`` or absolute.
+
+``Snapshots`` element
+^^^^^^^^^^^^^^^^^^^^^
+
+The ``Snapshots`` element describes the relations between snapshots in the snapshot tree.
+
+The element contains the set of ``Shot`` child elements, as shown below::
+
+ <Snapshots>
+ <TopGUID> ... </TopGUID> /* Optional child element */
+ <Shot>
+ ...
+ </Shot>
+ <Shot>
+ ...
+ </Shot>
+ ...
+ </Snapshots>
+
+Each ``Shot`` element contains the following child elements:
+
+* ``GUID`` - an image GUID.
+* ``ParentGUID`` - GUID of the image of the parent snapshot.
+
+The software may traverse snapshots from child to parent using the
+``ParentGUID`` field as a reference. The ``ParentGUID`` of the root
+snapshot is ``{00000000-0000-0000-0000-000000000000}``.
+There should be only one root snapshot.
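+
+As an illustration (a minimal sketch using Python's standard ``xml.etree``
+module, not QEMU code), the chain from a given snapshot up to the root can be
+collected like this::
+
+  import xml.etree.ElementTree as ET
+
+  ROOT_PARENT = "{00000000-0000-0000-0000-000000000000}"
+
+  def snapshot_chain(descriptor_path, guid):
+      # map each snapshot GUID to its ParentGUID
+      root = ET.parse(descriptor_path).getroot()
+      parents = {shot.findtext("GUID"): shot.findtext("ParentGUID")
+                 for shot in root.find("Snapshots").findall("Shot")}
+      chain = [guid]
+      while parents.get(chain[-1], ROOT_PARENT) != ROOT_PARENT:
+          chain.append(parents[chain[-1]])
+      return chain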
+
+The Top snapshot can be identified in two ways: via the ``TopGUID`` child
+element of the ``Snapshots`` element, or via the predefined GUID
+``{5fbaabe3-6958-40ff-92a7-860e329aab41}``. If ``TopGUID`` is defined,
+the predefined GUID is interpreted as a normal GUID. All snapshot images
+(except the Top Snapshot) should be opened read-only.
+
+There is another predefined GUID,
+``BackupID = {704718e1-2314-44c8-9087-d78ed36b0f4e}``, which is used by the
+original and some third-party software for backups. QEMU and other
+software may operate on images with ``GUID = BackupID`` as usual.
+However, it is not recommended to use this GUID for new disks.
+The Top snapshot cannot have this GUID.
diff --git a/docs/interop/prl-xml.txt b/docs/interop/prl-xml.txt
deleted file mode 100644
index cf9b3fb..0000000
--- a/docs/interop/prl-xml.txt
+++ /dev/null
@@ -1,158 +0,0 @@
-= License =
-
-Copyright (c) 2015-2017, Virtuozzo, Inc.
-Authors:
- 2015 Denis Lunev <den@openvz.org>
- 2015 Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
- 2016-2017 Klim Kireev <klim.kireev@virtuozzo.com>
- 2016-2017 Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
-
-This work is licensed under the terms of the GNU GPL, version 2 or later.
-See the COPYING file in the top-level directory.
-
-This specification contains minimal information about Parallels Disk Format,
-which is enough to proper work with QEMU. Nevertheless, Parallels Cloud Server
-and Parallels Desktop are able to add some unspecified nodes to xml and use
-them, but they are for internal work and don't affect functionality. Also it
-uses auxiliary xml "Snapshot.xml", which allows to store optional snapshot
-information, but it doesn't influence open/read/write functionality. QEMU and
-other software should not use fields not covered in this document and
-Snapshot.xml file and must leave them as is.
-
-= Parallels Disk Format =
-
-Parallels disk consists of two parts: the set of snapshots and the disk
-descriptor file, which stores information about all files and snapshots.
-
-== Definitions ==
- Snapshot a record of the contents captured at a particular time,
- capable of storing current state. A snapshot has UUID and
- parent UUID.
-
- Snapshot image an overlay representing the difference between this
- snapshot and some earlier snapshot.
-
- Overlay an image storing the different sectors between two captured
- states.
-
- Root image snapshot image with no parent, the root of snapshot tree.
-
- Storage the backing storage for a subset of the virtual disk. When
- there is more than one storage in a Parallels disk then that
- is referred to as a split image. In this case every storage
- covers specific address space area of the disk and has its
- particular root image. Split images are not considered here
- and are not supported. Each storage consists of disk
- parameters and a list of images. The list of images always
- contains a root image and may also contain overlays. The
- root image can be an expandable Parallels image file or
- plain. Overlays must be expandable.
-
- Description DiskDescriptor.xml stores information about disk parameters,
- file snapshots, storages.
-
- Top The overlay between actual state and some previous snapshot.
- Snapshot It is not a snapshot in the classical sense because it
- serves as the active image that the guest writes to.
-
- Sector a 512-byte data chunk.
-
-== Description file ==
-All information is placed in a single XML element Parallels_disk_image.
-The element has only one attribute "Version", that must be 1.0.
-Schema of DiskDescriptor.xml:
-
-<Parallels_disk_image Version="1.0">
- <Disk_Parameters>
- ...
- </Disk_Parameters>
- <StorageData>
- ...
- </StorageData>
- <Snapshots>
- ...
- </Snapshots>
-</Parallels_disk_image>
-
-== Disk_Parameters element ==
-The Disk_Parameters element describes the physical layout of the virtual disk
-and some general settings.
-
-The Disk_Parameters element MUST contain the following child elements:
- * Disk_size - number of sectors in the disk,
- desired size of the disk.
- * Cylinders - number of the disk cylinders.
- * Heads - number of the disk heads.
- * Sectors - number of the disk sectors per cylinder
- (sector size is 512 bytes)
- Limitation: Product of the Heads, Sectors and Cylinders
- values MUST be equal to the value of the Disk_size parameter.
- * Padding - must be 0. Parallels Cloud Server and Parallels Desktop may
- use padding set to 1, however this case is not covered
- by this spec, QEMU and other software should not open
- such disks and should not create them.
-
-== StorageData element ==
-This element of the file describes the root image and all snapshot images.
-
-The StorageData element consists of the Storage child element, as shown below:
-<StorageData>
- <Storage>
- ...
- </Storage>
-</StorageData>
-
-A Storage element has following child elements:
- * Start - start sector of the storage, in case of non split storage
- equals to 0.
- * End - number of sector following the last sector, in case of non
- split storage equals to Disk_size.
- * Blocksize - storage cluster size, number of sectors per one cluster.
- Cluster size for each "Compressed" (see below) image in
- parallels disk must be equal to this field. Note: cluster
- size for Parallels Expandable Image is in 'tracks' field of
- its header (see docs/interop/parallels.txt).
- * Several Image child elements.
-
-Each Image element has following child elements:
- * GUID - image identifier, UUID in curly brackets.
- For instance, {12345678-9abc-def1-2345-6789abcdef12}.
- The GUID is used by the Snapshots element to reference images
- (see below)
- * Type - image type of the element. It can be:
- "Plain" for raw files.
- "Compressed" for expanding disks.
- * File - path to image file. Path can be relative to DiskDescriptor.xml or
- absolute.
-
-== Snapshots element ==
-The Snapshots element describes the snapshot relations with the snapshot tree.
-
-The element contains the set of Shot child elements, as shown below:
-<Snapshots>
- <TopGUID> ... </TopGUID> /* Optional child element */
- <Shot>
- ...
- </Shot>
- <Shot>
- ...
- </Shot>
- ...
-</Snapshots>
-
-Each Shot element contains the following child elements:
- * GUID - an image GUID.
- * ParentGUID - GUID of the image of the parent snapshot.
-
-The software may traverse snapshots from child to parent using <ParentGUID>
-field as reference. ParentGUID of root snapshot is
-{00000000-0000-0000-0000-000000000000}. There should be only one root
-snapshot. Top snapshot could be described via two ways: via TopGUID child
-element of the Snapshots element or via predefined GUID
-{5fbaabe3-6958-40ff-92a7-860e329aab41}. If TopGUID is defined, predefined GUID is
-interpreted as usual GUID. All snapshot images (except Top Snapshot) should be
-opened read-only. There is another predefined GUID,
-BackupID = {704718e1-2314-44c8-9087-d78ed36b0f4e}, which is used by original and
-some third-party software for backup, QEMU and other software may operate with
-images with GUID = BackupID as usual, however, it is not recommended to use this
-GUID for new disks. Top snapshot cannot have this GUID.
diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.rst
index 2c46183..5948591 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.rst
@@ -1,6 +1,8 @@
-== General ==
+=======================
+Qcow2 Image File Format
+=======================
-A qcow2 image file is organized in units of constant size, which are called
+A ``qcow2`` image file is organized in units of constant size, which are called
(host) clusters. A cluster is the unit in which all allocations are done,
both for actual guest data and for image metadata.
@@ -9,10 +11,10 @@ clusters of the same size.
All numbers in qcow2 are stored in Big Endian byte order.
+Header
+------
-== Header ==
-
-The first cluster of a qcow2 image contains the file header:
+The first cluster of a qcow2 image contains the file header::
Byte 0 - 3: magic
QCOW magic string ("QFI\xfb")
@@ -38,7 +40,7 @@ The first cluster of a qcow2 image contains the file header:
within a cluster (1 << cluster_bits is the cluster size).
Must not be less than 9 (i.e. 512 byte clusters).
- Note: qemu as of today has an implementation limit of 2 MB
+ Note: QEMU as of today has an implementation limit of 2 MB
as the maximum cluster size and won't be able to open images
with larger cluster sizes.
@@ -48,7 +50,7 @@ The first cluster of a qcow2 image contains the file header:
24 - 31: size
Virtual disk size in bytes.
- Note: qemu has an implementation limit of 32 MB as
+ Note: QEMU has an implementation limit of 32 MB as
the maximum L1 table size. With a 2 MB cluster
size, it is unable to populate a virtual cluster
beyond 2 EB (61 bits); with a 512 byte cluster
@@ -87,7 +89,8 @@ The first cluster of a qcow2 image contains the file header:
For version 2, the header is exactly 72 bytes in length, and finishes here.
For version 3 or higher, the header length is at least 104 bytes, including
-the next fields through header_length.
+the next fields through ``header_length``.
+::
72 - 79: incompatible_features
Bitmask of incompatible features. An implementation must
@@ -185,7 +188,8 @@ the next fields through header_length.
of 8.
-=== Additional fields (version 3 and higher) ===
+Additional fields (version 3 and higher)
+----------------------------------------
In general, these fields are optional and may be safely ignored by the software,
as well as filled by zeros (which is equal to field absence), if software needs
@@ -193,21 +197,25 @@ to set field B, but does not care about field A which precedes B. More
formally, additional fields have the following compatibility rules:
1. If the value of the additional field must not be ignored for correct
-handling of the file, it will be accompanied by a corresponding incompatible
-feature bit.
+ handling of the file, it will be accompanied by a corresponding incompatible
+ feature bit.
2. If there are no unrecognized incompatible feature bits set, an unknown
-additional field may be safely ignored other than preserving its value when
-rewriting the image header.
+ additional field may be safely ignored other than preserving its value when
+ rewriting the image header.
+
+.. _ref_rules_3:
3. An explicit value of 0 will have the same behavior as when the field is not
-present*, if not altered by a specific incompatible bit.
+ present*, if not altered by a specific incompatible bit.
-*. A field is considered not present when header_length is less than or equal
+(*) A field is considered not present when ``header_length`` is less than or equal
to the field's offset. Also, all additional fields are not present for
version 2.
- 104: compression_type
+::
+
+ 104: compression_type
Defines the compression method used for compressed clusters.
All compressed clusters in an image use the same compression
@@ -219,8 +227,8 @@ version 2.
or must be zero (which means deflate).
Available compression type values:
- 0: deflate <https://www.ietf.org/rfc/rfc1951.txt>
- 1: zstd <http://github.com/facebook/zstd>
+ - 0: deflate <https://www.ietf.org/rfc/rfc1951.txt>
+ - 1: zstd <http://github.com/facebook/zstd>
The deflate compression type is called "zlib"
<https://www.zlib.net/> in QEMU. However, clusters with the
@@ -228,19 +236,21 @@ version 2.
105 - 111: Padding, contents defined below.
-=== Header padding ===
+Header padding
+--------------
-@header_length must be a multiple of 8, which means that if the end of the last
+``header_length`` must be a multiple of 8, which means that if the end of the last
additional field is not aligned, some padding is needed. This padding must be
zeroed, so that if some existing (or future) additional field will fall into
-the padding, it will be interpreted accordingly to point [3.] of the previous
+the padding, it will be interpreted according to point `[3.] <#ref_rules_3>`_ of the previous
paragraph, i.e. in the same manner as when this field is not present.
-=== Header extensions ===
+Header extensions
+-----------------
Directly after the image header, optional sections called header extensions can
-be stored. Each extension has a structure like the following:
+be stored. Each extension has a structure like the following::
Byte 0 - 3: Header extension type:
0x00000000 - End of the header extension area
@@ -270,17 +280,19 @@ data of compatible features that it doesn't support. Compatible features that
need space for additional data can use a header extension.
-== String header extensions ==
+String header extensions
+------------------------
Some header extensions (such as the backing file format name and the external
data file name) are just a single string. In this case, the header extension
-length is the string length and the string is not '\0' terminated. (The header
-extension padding can make it look like a string is '\0' terminated, but
+length is the string length and the string is not ``\0`` terminated. (The header
+extension padding can make it look like a string is ``\0`` terminated, but
neither is padding always necessary nor is there a guarantee that zero bytes
are used for padding.)
-== Feature name table ==
+Feature name table
+------------------
The feature name table is an optional header extension that contains the name
for features used by the image. It can be used by applications that don't know
@@ -288,7 +300,7 @@ the respective feature (e.g. because the feature was introduced only later) to
display a useful error message.
The number of entries in the feature name table is determined by the length of
-the header extension data. Each entry look like this:
+the header extension data. Each entry looks like this::
Byte 0: Type of feature (select feature bitmap)
0: Incompatible feature
@@ -302,7 +314,8 @@ the header extension data. Each entry look like this:
terminated if it has full length)
-== Bitmaps extension ==
+Bitmaps extension
+-----------------
The bitmaps extension is an optional header extension. It provides the ability
to store bitmaps related to a virtual disk. For now, there is only one bitmap
@@ -310,9 +323,9 @@ type: the dirty tracking bitmap, which tracks virtual disk changes from some
point in time.
The data of the extension should be considered consistent only if the
-corresponding auto-clear feature bit is set, see autoclear_features above.
+corresponding auto-clear feature bit is set, see ``autoclear_features`` above.
-The fields of the bitmaps extension are:
+The fields of the bitmaps extension are::
Byte 0 - 3: nb_bitmaps
The number of bitmaps contained in the image. Must be
@@ -331,15 +344,17 @@ The fields of the bitmaps extension are:
Offset into the image file at which the bitmap directory
starts. Must be aligned to a cluster boundary.
-== Full disk encryption header pointer ==
+Full disk encryption header pointer
+-----------------------------------
The full disk encryption header must be present if, and only if, the
-'crypt_method' header requires metadata. Currently this is only true
-of the 'LUKS' crypt method. The header extension must be absent for
+``crypt_method`` header requires metadata. Currently this is only true
+of the ``LUKS`` crypt method. The header extension must be absent for
other methods.
This header provides the offset at which the crypt method can store
its additional data, as well as the length of such data.
+::
Byte 0 - 7: Offset into the image file at which the encryption
header starts in bytes. Must be aligned to a cluster
@@ -357,10 +372,10 @@ The first 592 bytes of the header clusters will contain the LUKS
partition header. This is then followed by the key material data areas.
The size of the key material data areas is determined by the number of
stripes in the key slot and key size. Refer to the LUKS format
-specification ('docs/on-disk-format.pdf' in the cryptsetup source
+specification (``docs/on-disk-format.pdf`` in the cryptsetup source
package) for details of the LUKS partition header format.
-In the LUKS partition header, the "payload-offset" field will be
+In the LUKS partition header, the ``payload-offset`` field will be
calculated as normal for the LUKS spec. ie the size of the LUKS
header, plus key material regions, plus padding, relative to the
start of the LUKS header. This offset value is not required to be
@@ -369,11 +384,12 @@ context of qcow2, since the qcow2 file format itself defines where
the real payload offset is, but none the less a valid payload offset
should always be present.
-In the LUKS key slots header, the "key-material-offset" is relative
+In the LUKS key slots header, the ``key-material-offset`` is relative
to the start of the LUKS header clusters in the qcow2 container,
not the start of the qcow2 file.
Logically the layout looks like
+::
+-----------------------------+
| QCow2 header |
@@ -405,7 +421,8 @@ Logically the layout looks like
| |
+-----------------------------+
-== Data encryption ==
+Data encryption
+---------------
When an encryption method is requested in the header, the image payload
data must be encrypted/decrypted on every write/read. The image headers
@@ -413,7 +430,7 @@ and metadata are never encrypted.
The algorithms used for encryption vary depending on the method
- - AES:
+ - ``AES``:
The AES cipher, in CBC mode, with 256 bit keys.
@@ -425,7 +442,7 @@ The algorithms used for encryption vary depending on the method
supported in the command line tools for the sake of back compatibility
and data liberation.
- - LUKS:
+ - ``LUKS``:
The algorithms are specified in the LUKS header.
@@ -433,7 +450,8 @@ The algorithms used for encryption vary depending on the method
in the LUKS header, with the physical disk sector as the
input tweak.
-== Host cluster management ==
+Host cluster management
+-----------------------
qcow2 manages the allocation of host clusters by maintaining a reference count
for each host cluster. A refcount of 0 means that the cluster is free, 1 means
@@ -453,14 +471,15 @@ Although a large enough refcount table can reserve clusters past 64 PB
large), note that some qcow2 metadata such as L1/L2 tables must point
to clusters prior to that point.
-Note: qemu has an implementation limit of 8 MB as the maximum refcount
-table size. With a 2 MB cluster size and a default refcount_order of
-4, it is unable to reference host resources beyond 2 EB (61 bits); in
-the worst case, with a 512 cluster size and refcount_order of 6, it is
-unable to access beyond 32 GB (35 bits).
+.. note::
+ QEMU has an implementation limit of 8 MB as the maximum refcount
+ table size. With a 2 MB cluster size and a default refcount_order of
+ 4, it is unable to reference host resources beyond 2 EB (61 bits); in
+ the worst case, with a 512 cluster size and refcount_order of 6, it is
+ unable to access beyond 32 GB (35 bits).
Given an offset into the image file, the refcount of its cluster can be
-obtained as follows:
+obtained as follows::
refcount_block_entries = (cluster_size * 8 / refcount_bits)
@@ -470,7 +489,7 @@ obtained as follows:
refcount_block = load_cluster(refcount_table[refcount_table_index]);
return refcount_block[refcount_block_index];
-Refcount table entry:
+Refcount table entry::
Bit 0 - 8: Reserved (set to 0)
@@ -482,14 +501,15 @@ Refcount table entry:
been allocated. All refcounts managed by this refcount block
are 0.
-Refcount block entry (x = refcount_bits - 1):
+Refcount block entry ``(x = refcount_bits - 1)``::
Bit 0 - x: Reference count of the cluster. If refcount_bits implies a
sub-byte width, note that bit 0 means the least significant
bit in this context.
-== Cluster mapping ==
+Cluster mapping
+---------------
Just as for refcounts, qcow2 uses a two-level structure for the mapping of
guest clusters to host clusters. They are called L1 and L2 table.
@@ -509,7 +529,7 @@ compressed clusters to reside below 512 TB (49 bits), and this limit
cannot be relaxed without an incompatible layout change).
Given an offset into the virtual disk, the offset into the image file can be
-obtained as follows:
+obtained as follows::
l2_entries = (cluster_size / sizeof(uint64_t)) [*]
@@ -523,7 +543,7 @@ obtained as follows:
[*] this changes if Extended L2 Entries are enabled, see next section
-L1 table entry:
+L1 table entry::
Bit 0 - 8: Reserved (set to 0)
@@ -538,7 +558,7 @@ L1 table entry:
refcount is exactly one. This information is only accurate
in the active L1 table.
-L2 table entry:
+L2 table entry::
Bit 0 - 61: Cluster descriptor
@@ -555,7 +575,7 @@ L2 table entry:
mapping for guest cluster offsets), so this bit should be 1
for all allocated clusters.
-Standard Cluster Descriptor:
+Standard Cluster Descriptor::
Bit 0: If set to 1, the cluster reads as all zeros. The host
cluster offset can be used to describe a preallocation,
@@ -577,7 +597,7 @@ Standard Cluster Descriptor:
56 - 61: Reserved (set to 0)
-Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)):
+Compressed Clusters Descriptor ``(x = 62 - (cluster_bits - 8))``::
Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a
cluster or sector boundary! If cluster_bits is
@@ -601,7 +621,8 @@ file (except if bit 0 in the Standard Cluster Descriptor is set). If there is
no backing file or the backing file is smaller than the image, they shall read
zeros for all parts that are not covered by the backing file.
-== Extended L2 Entries ==
+Extended L2 Entries
+-------------------
An image uses Extended L2 Entries if bit 4 is set on the incompatible_features
field of the header.
@@ -615,6 +636,8 @@ subclusters so they are treated the same as in images without this feature.
The size of an extended L2 entry is 128 bits so the number of entries per table
is calculated using this formula:
+.. code::
+
l2_entries = (cluster_size / (2 * sizeof(uint64_t)))
The first 64 bits have the same format as the standard L2 table entry described
@@ -623,7 +646,7 @@ descriptor.
The last 64 bits contain a subcluster allocation bitmap with this format:
-Subcluster Allocation Bitmap (for standard clusters):
+Subcluster Allocation Bitmap (for standard clusters)::
Bit 0 - 31: Allocation status (one bit per subcluster)
@@ -647,13 +670,14 @@ Subcluster Allocation Bitmap (for standard clusters):
Bits are assigned starting from the least significant
one (i.e. bit x is used for subcluster x - 32).
-Subcluster Allocation Bitmap (for compressed clusters):
+Subcluster Allocation Bitmap (for compressed clusters)::
Bit 0 - 63: Reserved (set to 0)
Compressed clusters don't have subclusters,
so this field is not used.
-== Snapshots ==
+Snapshots
+---------
qcow2 supports internal snapshots. Their basic principle of operation is to
switch the active L1 table, so that a different set of host clusters are
@@ -672,7 +696,7 @@ in the image file, whose starting offset and length are given by the header
fields snapshots_offset and nb_snapshots. The entries of the snapshot table
have variable length, depending on the length of ID, name and extra data.
-Snapshot table entry:
+Snapshot table entry::
Byte 0 - 7: Offset into the image file at which the L1 table for the
snapshot starts. Must be aligned to a cluster boundary.
@@ -728,7 +752,8 @@ Snapshot table entry:
next multiple of 8.
-== Bitmaps ==
+Bitmaps
+-------
As mentioned above, the bitmaps extension provides the ability to store bitmaps
related to a virtual disk. This section describes how these bitmaps are stored.
@@ -739,20 +764,23 @@ each bitmap size is equal to the virtual disk size.
Each bit of the bitmap is responsible for strictly defined range of the virtual
disk. For bit number bit_nr the corresponding range (in bytes) will be:
+.. code::
+
[bit_nr * bitmap_granularity .. (bit_nr + 1) * bitmap_granularity - 1]
Granularity is a property of the concrete bitmap, see below.
-=== Bitmap directory ===
+Bitmap directory
+----------------
Each bitmap saved in the image is described in a bitmap directory entry. The
bitmap directory is a contiguous area in the image file, whose starting offset
-and length are given by the header extension fields bitmap_directory_offset and
-bitmap_directory_size. The entries of the bitmap directory have variable
+and length are given by the header extension fields ``bitmap_directory_offset`` and
+``bitmap_directory_size``. The entries of the bitmap directory have variable
length, depending on the lengths of the bitmap name and extra data.
-Structure of a bitmap directory entry:
+Structure of a bitmap directory entry::
Byte 0 - 7: bitmap_table_offset
Offset into the image file at which the bitmap table
@@ -833,7 +861,8 @@ Structure of a bitmap directory entry:
next multiple of 8. All bytes of the padding must be zero.
-=== Bitmap table ===
+Bitmap table
+------------
Each bitmap is stored using a one-level structure (as opposed to two-level
structures like for refcounts and guest clusters mapping) for the mapping of
@@ -843,7 +872,7 @@ Each bitmap table has a variable size (stored in the bitmap directory entry)
and may use multiple clusters, however, it must be contiguous in the image
file.
-Structure of a bitmap table entry:
+Structure of a bitmap table entry::
Bit 0: Reserved and must be zero if bits 9 - 55 are non-zero.
If bits 9 - 55 are zero:
@@ -860,11 +889,12 @@ Structure of a bitmap table entry:
56 - 63: Reserved and must be zero.
-=== Bitmap data ===
+Bitmap data
+-----------
As noted above, bitmap data is stored in separate clusters, described by the
bitmap table. Given an offset (in bytes) into the bitmap data, the offset into
-the image file can be obtained as follows:
+the image file can be obtained as follows::
image_offset(bitmap_data_offset) =
bitmap_table[bitmap_data_offset / cluster_size] +
@@ -875,7 +905,7 @@ above).
Given an offset byte_nr into the virtual disk and the bitmap's granularity, the
bit offset into the image file to the corresponding bit of the bitmap can be
-calculated like this:
+calculated like this::
bit_offset(byte_nr) =
image_offset(byte_nr / granularity / 8) * 8 +
@@ -886,21 +916,22 @@ last cluster of the bitmap data contains some unused tail bits. These bits must
be zero.
-=== Dirty tracking bitmaps ===
+Dirty tracking bitmaps
+----------------------
-Bitmaps with 'type' field equal to one are dirty tracking bitmaps.
+Bitmaps with ``type`` field equal to one are dirty tracking bitmaps.
-When the virtual disk is in use dirty tracking bitmap may be 'enabled' or
-'disabled'. While the bitmap is 'enabled', all writes to the virtual disk
+When the virtual disk is in use dirty tracking bitmap may be ``enabled`` or
+``disabled``. While the bitmap is ``enabled``, all writes to the virtual disk
should be reflected in the bitmap. A set bit in the bitmap means that the
corresponding range of the virtual disk (see above) was written to while the
-bitmap was 'enabled'. An unset bit means that this range was not written to.
+bitmap was ``enabled``. An unset bit means that this range was not written to.
The software doesn't have to sync the bitmap in the image file with its
-representation in RAM after each write or metadata change. Flag 'in_use'
+representation in RAM after each write or metadata change. Flag ``in_use``
should be set while the bitmap is not synced.
-In the image file the 'enabled' state is reflected by the 'auto' flag. If this
-flag is set, the software must consider the bitmap as 'enabled' and start
+In the image file the ``enabled`` state is reflected by the ``auto`` flag. If this
+flag is set, the software must consider the bitmap as ``enabled`` and start
tracking virtual disk changes to this bitmap from the first write to the
virtual disk. If this flag is not set then the bitmap is disabled.
diff --git a/docs/interop/qed_spec.rst b/docs/interop/qed_spec.rst
new file mode 100644
index 0000000..cd6c7d9
--- /dev/null
+++ b/docs/interop/qed_spec.rst
@@ -0,0 +1,219 @@
+===================================
+QED Image File Format Specification
+===================================
+
+The file format looks like this::
+
+ +----------+----------+----------+-----+
+ | cluster0 | cluster1 | cluster2 | ... |
+ +----------+----------+----------+-----+
+
+The first cluster begins with the ``header``. The header contains information
+about where regular clusters start; this allows the header to be extensible and
+store extra information about the image file. A regular cluster may be
+a ``data cluster``, an ``L2``, or an ``L1 table``. L1 and L2 tables are composed
+of one or more contiguous clusters.
+
+Normally the file size will be a multiple of the cluster size. If the file size
+is not a multiple, extra information after the last cluster may not be preserved
+if data is written. Legitimate extra information should use space between the header
+and the first regular cluster.
+
+All fields are little-endian.
+
+Header
+------
+
+::
+
+ Header {
+ uint32_t magic; /* QED\0 */
+
+ uint32_t cluster_size; /* in bytes */
+ uint32_t table_size; /* for L1 and L2 tables, in clusters */
+ uint32_t header_size; /* in clusters */
+
+ uint64_t features; /* format feature bits */
+ uint64_t compat_features; /* compat feature bits */
+ uint64_t autoclear_features; /* self-resetting feature bits */
+
+ uint64_t l1_table_offset; /* in bytes */
+ uint64_t image_size; /* total logical image size, in bytes */
+
+ /* if (features & QED_F_BACKING_FILE) */
+ uint32_t backing_filename_offset; /* in bytes from start of header */
+ uint32_t backing_filename_size; /* in bytes */
+ }
+
+Field descriptions:
+~~~~~~~~~~~~~~~~~~~
+
+- ``cluster_size`` must be a power of 2 in range [2^12, 2^26].
+- ``table_size`` must be a power of 2 in range [1, 16].
+- ``header_size`` is the number of clusters used by the header and any additional
+ information stored before regular clusters.
+- ``features``, ``compat_features``, and ``autoclear_features`` are file format
+ extension bitmaps. They work as follows:
+
+ - An image with unknown ``features`` bits enabled must not be opened. File format
+ changes that are not backwards-compatible must use ``features`` bits.
+ - An image with unknown ``compat_features`` bits enabled can be opened safely.
+ The unknown features are simply ignored and represent backwards-compatible
+ changes to the file format.
+  - An image with unknown ``autoclear_features`` bits enabled can be opened safely
+ after clearing the unknown bits. This allows for backwards-compatible changes
+ to the file format which degrade gracefully and can be re-enabled again by a
+ new program later.
+- ``l1_table_offset`` is the offset of the first byte of the L1 table in the image
+ file and must be a multiple of ``cluster_size``.
+- ``image_size`` is the block device size seen by the guest and must be a multiple
+ of 512 bytes.
+- ``backing_filename_offset`` and ``backing_filename_size`` describe a string in
+ (byte offset, byte size) form. It is not NUL-terminated and has no alignment constraints.
+ The string must be stored within the first ``header_size`` clusters. The backing filename
+ may be an absolute path or relative to the image file.
+
+Feature bits:
+~~~~~~~~~~~~~
+
+- ``QED_F_BACKING_FILE = 0x01``. The image uses a backing file.
+- ``QED_F_NEED_CHECK = 0x02``. The image needs a consistency check before use.
+- ``QED_F_BACKING_FORMAT_NO_PROBE = 0x04``. The backing file is a raw disk image
+ and no file format autodetection should be attempted. This should be used to
+ ensure that raw backing files are never detected as an image format if they happen
+ to contain magic constants.
+
+There are currently no defined ``compat_features`` or ``autoclear_features`` bits.
+
+Fields predicated on a feature bit are only used when that feature is set.
+The fields always take up header space, regardless of whether or not the feature
+bit is set.
+
+Tables
+------
+
+Tables provide the translation from logical offsets in the block device to cluster
+offsets in the file.
+
+::
+
+ #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t))
+
+ Table {
+ uint64_t offsets[TABLE_NOFFSETS];
+ }
+
+The tables are organized as follows::
+
+ +----------+
+ | L1 table |
+ +----------+
+ ,------' | '------.
+ +----------+ | +----------+
+ | L2 table | ... | L2 table |
+ +----------+ +----------+
+ ,------' | '------.
+ +----------+ | +----------+
+ | Data | ... | Data |
+ +----------+ +----------+
+
+A table is made up of one or more contiguous clusters. The ``table_size`` header
+field determines the table size for an image file. For example, ``cluster_size=64 KB``
+and ``table_size=4`` results in 256 KB tables.
+
+The logical image size must be less than or equal to the maximum possible size of
+clusters rooted by the L1 table:
+
+.. code::
+
+ header.image_size <= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size
+
+L1, L2, and data cluster offsets must be aligned to ``header.cluster_size``.
+The following offsets have special meanings:
+
+L2 table offsets
+~~~~~~~~~~~~~~~~
+
+- 0 - unallocated. The L2 table is not yet allocated.
+
+Data cluster offsets
+~~~~~~~~~~~~~~~~~~~~
+
+- 0 - unallocated. The data cluster is not yet allocated.
+- 1 - zero. The data cluster contents are all zeroes and no cluster is allocated.
+
+Future format extensions may wish to store per-offset information. The least
+significant 12 bits of an offset are reserved for this purpose and must be set
+to zero. Image files with ``cluster_size`` > 2^12 will have more unused bits
+which should also be zeroed.
+
+Unallocated L2 tables and data clusters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Reads to an unallocated area of the image file access the backing file. If there
+is no backing file, then zeroes are produced. The backing file may be smaller
+than the image file and reads of unallocated areas beyond the end of the backing
+file produce zeroes.
+
+Writes to an unallocated area cause a new data cluster to be allocated, and a new
+L2 table if that is also unallocated. The new data cluster is populated with data
+from the backing file (or zeroes if no backing file) and the data being written.
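+
+This allocate-on-write behaviour can be sketched as follows (illustrative
+pseudocode only; the helper functions are hypothetical and not part of the
+format)::
+
+   def write_unallocated_cluster(cluster_start, data, offset_in_cluster):
+       # Start from the backing file contents (zeroes if there is no backing
+       # file, or the read is past its end), overlay the data being written,
+       # then store the result in a newly allocated cluster.
+       buf = bytearray(read_backing(cluster_start, cluster_size))
+       buf[offset_in_cluster:offset_in_cluster + len(data)] = data
+       new_cluster = allocate_cluster()
+       write_image_file(new_cluster, buf)
+       update_l2_entry(cluster_start, new_cluster)  # allocates the L2 table first if needed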
+
+Zero data clusters
+~~~~~~~~~~~~~~~~~~
+
+Zero data clusters are a space-efficient way of storing zeroed regions of the image.
+
+Reads to a zero data cluster produce zeroes.
+
+.. note::
+ The difference between an unallocated and a zero data cluster is that zero data
+ clusters stop the reading of contents from the backing file.
+
+Writes to a zero data cluster cause a new data cluster to be allocated. The new
+data cluster is populated with zeroes and the data being written.
+
+Logical offset translation
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Logical offsets are translated into cluster offsets as follows::
+
+ table_bits table_bits cluster_bits
+ <--------> <--------> <--------------->
+ +----------+----------+-----------------+
+ | L1 index | L2 index | byte offset |
+ +----------+----------+-----------------+
+
+ Structure of a logical offset
+
+ offset_mask = ~(cluster_size - 1) # mask for the image file byte offset
+
+ def logical_to_cluster_offset(l1_index, l2_index, byte_offset):
+ l2_offset = l1_table[l1_index]
+ l2_table = load_table(l2_offset)
+ cluster_offset = l2_table[l2_index] & offset_mask
+ return cluster_offset + byte_offset
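+
+The index fields can be recovered from a logical byte offset with shifts and
+masks, as in the following illustrative pseudocode::
+
+   cluster_bits = log2(cluster_size)    # e.g. 16 for 64 KB clusters
+   table_bits = log2(TABLE_NOFFSETS)    # e.g. 15 for the example above
+
+   def split_logical_offset(offset):
+       byte_offset = offset & (cluster_size - 1)
+       l2_index = (offset >> cluster_bits) & (TABLE_NOFFSETS - 1)
+       l1_index = offset >> (cluster_bits + table_bits)
+       return l1_index, l2_index, byte_offset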
+
+Consistency checking
+--------------------
+
+This section is informational and included to provide background on the use
+of the ``QED_F_NEED_CHECK`` bit in ``features``.
+
+The ``QED_F_NEED_CHECK`` bit is used to mark an image as dirty before starting
+an operation that could leave the image in an inconsistent state if interrupted
+by a crash or power failure. A dirty image must be checked on open because its
+metadata may not be consistent.
+
+The consistency check includes the following invariants:
+
+- Each cluster is referenced once and only once. It is an inconsistency to have
+ a cluster referenced more than once by L1 or L2 tables. A cluster has been leaked
+ if it has no references.
+- Offsets must be within the image file size and must be ``cluster_size`` aligned.
+- Table offsets must be at least ``table_size`` * ``cluster_size`` bytes from the
+  end of the image file so that there is space for the entire table.
+
+The consistency check process starts from ``l1_table_offset`` and scans all L2 tables.
+After the check completes with no other errors besides leaks, the ``QED_F_NEED_CHECK``
+bit can be cleared and the image can be accessed.
diff --git a/docs/interop/qed_spec.txt b/docs/interop/qed_spec.txt
deleted file mode 100644
index 7982e05..0000000
--- a/docs/interop/qed_spec.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-=Specification=
-
-The file format looks like this:
-
- +----------+----------+----------+-----+
- | cluster0 | cluster1 | cluster2 | ... |
- +----------+----------+----------+-----+
-
-The first cluster begins with the '''header'''. The header contains information about where regular clusters start; this allows the header to be extensible and store extra information about the image file. A regular cluster may be a '''data cluster''', an '''L2''', or an '''L1 table'''. L1 and L2 tables are composed of one or more contiguous clusters.
-
-Normally the file size will be a multiple of the cluster size. If the file size is not a multiple, extra information after the last cluster may not be preserved if data is written. Legitimate extra information should use space between the header and the first regular cluster.
-
-All fields are little-endian.
-
-==Header==
- Header {
- uint32_t magic; /* QED\0 */
-
- uint32_t cluster_size; /* in bytes */
- uint32_t table_size; /* for L1 and L2 tables, in clusters */
- uint32_t header_size; /* in clusters */
-
- uint64_t features; /* format feature bits */
- uint64_t compat_features; /* compat feature bits */
- uint64_t autoclear_features; /* self-resetting feature bits */
-
- uint64_t l1_table_offset; /* in bytes */
- uint64_t image_size; /* total logical image size, in bytes */
-
- /* if (features & QED_F_BACKING_FILE) */
- uint32_t backing_filename_offset; /* in bytes from start of header */
- uint32_t backing_filename_size; /* in bytes */
- }
-
-Field descriptions:
-* ''cluster_size'' must be a power of 2 in range [2^12, 2^26].
-* ''table_size'' must be a power of 2 in range [1, 16].
-* ''header_size'' is the number of clusters used by the header and any additional information stored before regular clusters.
-* ''features'', ''compat_features'', and ''autoclear_features'' are file format extension bitmaps. They work as follows:
-** An image with unknown ''features'' bits enabled must not be opened. File format changes that are not backwards-compatible must use ''features'' bits.
-** An image with unknown ''compat_features'' bits enabled can be opened safely. The unknown features are simply ignored and represent backwards-compatible changes to the file format.
-** An image with unknown ''autoclear_features'' bits enable can be opened safely after clearing the unknown bits. This allows for backwards-compatible changes to the file format which degrade gracefully and can be re-enabled again by a new program later.
-* ''l1_table_offset'' is the offset of the first byte of the L1 table in the image file and must be a multiple of ''cluster_size''.
-* ''image_size'' is the block device size seen by the guest and must be a multiple of 512 bytes.
-* ''backing_filename_offset'' and ''backing_filename_size'' describe a string in (byte offset, byte size) form. It is not NUL-terminated and has no alignment constraints. The string must be stored within the first ''header_size'' clusters. The backing filename may be an absolute path or relative to the image file.
-
-Feature bits:
-* QED_F_BACKING_FILE = 0x01. The image uses a backing file.
-* QED_F_NEED_CHECK = 0x02. The image needs a consistency check before use.
-* QED_F_BACKING_FORMAT_NO_PROBE = 0x04. The backing file is a raw disk image and no file format autodetection should be attempted. This should be used to ensure that raw backing files are never detected as an image format if they happen to contain magic constants.
-
-There are currently no defined ''compat_features'' or ''autoclear_features'' bits.
-
-Fields predicated on a feature bit are only used when that feature is set. The fields always take up header space, regardless of whether or not the feature bit is set.
-
-==Tables==
-
-Tables provide the translation from logical offsets in the block device to cluster offsets in the file.
-
- #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t))
-
- Table {
- uint64_t offsets[TABLE_NOFFSETS];
- }
-
-The tables are organized as follows:
-
- +----------+
- | L1 table |
- +----------+
- ,------' | '------.
- +----------+ | +----------+
- | L2 table | ... | L2 table |
- +----------+ +----------+
- ,------' | '------.
- +----------+ | +----------+
- | Data | ... | Data |
- +----------+ +----------+
-
-A table is made up of one or more contiguous clusters. The table_size header field determines table size for an image file. For example, cluster_size=64 KB and table_size=4 results in 256 KB tables.
-
-The logical image size must be less than or equal to the maximum possible size of clusters rooted by the L1 table:
- header.image_size <= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size
-
-L1, L2, and data cluster offsets must be aligned to header.cluster_size. The following offsets have special meanings:
-
-===L2 table offsets===
-* 0 - unallocated. The L2 table is not yet allocated.
-
-===Data cluster offsets===
-* 0 - unallocated. The data cluster is not yet allocated.
-* 1 - zero. The data cluster contents are all zeroes and no cluster is allocated.
-
-Future format extensions may wish to store per-offset information. The least significant 12 bits of an offset are reserved for this purpose and must be set to zero. Image files with cluster_size > 2^12 will have more unused bits which should also be zeroed.
-
-===Unallocated L2 tables and data clusters===
-Reads to an unallocated area of the image file access the backing file. If there is no backing file, then zeroes are produced. The backing file may be smaller than the image file and reads of unallocated areas beyond the end of the backing file produce zeroes.
-
-Writes to an unallocated area cause a new data clusters to be allocated, and a new L2 table if that is also unallocated. The new data cluster is populated with data from the backing file (or zeroes if no backing file) and the data being written.
-
-===Zero data clusters===
-Zero data clusters are a space-efficient way of storing zeroed regions of the image.
-
-Reads to a zero data cluster produce zeroes. Note that the difference between an unallocated and a zero data cluster is that zero data clusters stop the reading of contents from the backing file.
-
-Writes to a zero data cluster cause a new data cluster to be allocated. The new data cluster is populated with zeroes and the data being written.
-
-===Logical offset translation===
-Logical offsets are translated into cluster offsets as follows:
-
- table_bits table_bits cluster_bits
- <--------> <--------> <--------------->
- +----------+----------+-----------------+
- | L1 index | L2 index | byte offset |
- +----------+----------+-----------------+
-
- Structure of a logical offset
-
- offset_mask = ~(cluster_size - 1) # mask for the image file byte offset
-
- def logical_to_cluster_offset(l1_index, l2_index, byte_offset):
- l2_offset = l1_table[l1_index]
- l2_table = load_table(l2_offset)
- cluster_offset = l2_table[l2_index] & offset_mask
- return cluster_offset + byte_offset
-
-==Consistency checking==
-
-This section is informational and included to provide background on the use of the QED_F_NEED_CHECK ''features'' bit.
-
-The QED_F_NEED_CHECK bit is used to mark an image as dirty before starting an operation that could leave the image in an inconsistent state if interrupted by a crash or power failure. A dirty image must be checked on open because its metadata may not be consistent.
-
-Consistency check includes the following invariants:
-# Each cluster is referenced once and only once. It is an inconsistency to have a cluster referenced more than once by L1 or L2 tables. A cluster has been leaked if it has no references.
-# Offsets must be within the image file size and must be ''cluster_size'' aligned.
-# Table offsets must at least ''table_size'' * ''cluster_size'' bytes from the end of the image file so that there is space for the entire table.
-
-The consistency check process starts by from ''l1_table_offset'' and scans all L2 tables. After the check completes with no other errors besides leaks, the QED_F_NEED_CHECK bit can be cleared and the image can be accessed.
diff --git a/docs/interop/qemu-ga-ref.rst b/docs/interop/qemu-ga-ref.rst
index 032d492..25f6e24 100644
--- a/docs/interop/qemu-ga-ref.rst
+++ b/docs/interop/qemu-ga-ref.rst
@@ -1,7 +1,6 @@
QEMU Guest Agent Protocol Reference
===================================
-.. contents::
- :depth: 3
-
.. qapi-doc:: qga/qapi-schema.json
+ :transmogrify:
+ :namespace: QGA
diff --git a/docs/interop/qemu-ga.rst b/docs/interop/qemu-ga.rst
index 9c73808..d16cc1b 100644
--- a/docs/interop/qemu-ga.rst
+++ b/docs/interop/qemu-ga.rst
@@ -1,3 +1,5 @@
+.. _qemu-ga:
+
QEMU Guest Agent
================
@@ -50,7 +52,7 @@ Options
.. option:: -c, --config=PATH
Configuration file path (the default is |CONFDIR|\ ``/qemu-ga.conf``,
- unless overriden by the QGA_CONF environment variable)
+ unless overridden by the QGA_CONF environment variable)
.. option:: -m, --method=METHOD
diff --git a/docs/interop/qemu-qmp-ref.rst b/docs/interop/qemu-qmp-ref.rst
index f94614a..3bc1ca1 100644
--- a/docs/interop/qemu-qmp-ref.rst
+++ b/docs/interop/qemu-qmp-ref.rst
@@ -4,6 +4,8 @@ QEMU QMP Reference Manual
=========================
.. contents::
- :depth: 3
+ :local:
.. qapi-doc:: qapi/qapi-schema.json
+ :transmogrify:
+ :namespace: QMP
diff --git a/docs/interop/qemu-storage-daemon-qmp-ref.rst b/docs/interop/qemu-storage-daemon-qmp-ref.rst
index 9fed681..dc7bde2 100644
--- a/docs/interop/qemu-storage-daemon-qmp-ref.rst
+++ b/docs/interop/qemu-storage-daemon-qmp-ref.rst
@@ -2,6 +2,8 @@ QEMU Storage Daemon QMP Reference Manual
========================================
.. contents::
- :depth: 3
+ :local:
.. qapi-doc:: storage-daemon/qapi/qapi-schema.json
+ :transmogrify:
+ :namespace: QSD
diff --git a/docs/interop/vfio-user.rst b/docs/interop/vfio-user.rst
new file mode 100644
index 0000000..0b06f02
--- /dev/null
+++ b/docs/interop/vfio-user.rst
@@ -0,0 +1,1520 @@
+.. include:: <isonum.txt>
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+================================
+vfio-user Protocol Specification
+================================
+
+.. contents:: Table of Contents
+
+Introduction
+============
+vfio-user is a protocol that allows a device to be emulated in a separate
+process outside of a Virtual Machine Monitor (VMM). vfio-user devices consist
+of a generic VFIO device type, living inside the VMM, which we call the client,
+and the core device implementation, living outside the VMM, which we call the
+server.
+
+The vfio-user specification is partly based on the
+`Linux VFIO ioctl interface <https://www.kernel.org/doc/html/latest/driver-api/vfio.html>`_.
+
+VFIO is a mature and stable API, backed by an extensively used framework. The
+existing VFIO client implementation in QEMU (``qemu/hw/vfio/``) can be largely
+re-used, though there is nothing in this specification that requires that
+particular implementation. None of the VFIO kernel modules are required for
+supporting the protocol, on either the client or server side. Some source
+definitions in VFIO are re-used for vfio-user.
+
+The main idea is to allow a virtual device to function in a separate process in
+the same host over a UNIX domain socket. A UNIX domain socket (``AF_UNIX``) is
+chosen because file descriptors can be trivially sent over it, which in turn
+allows:
+
+* Sharing of client memory for DMA with the server.
+* Sharing of server memory with the client for fast MMIO.
+* Efficient sharing of eventfds for triggering interrupts.
+
+Other socket types could be used which allow the server to run in a separate
+guest on the same host (``AF_VSOCK``) or remotely (``AF_INET``). Theoretically,
+the underlying transport does not necessarily have to be a socket; however, we do
+not examine such alternatives. In this protocol version we focus on using a UNIX
+domain socket and introduce basic support for the other two types of sockets
+without considering performance implications.
+
+While passing file descriptors is desirable for performance reasons, support for
+it is not necessary for either the client or the server in order to implement the
+protocol. There is always an in-band, message-passing fallback mechanism.
+
+Overview
+========
+
+VFIO is a framework that allows a physical device to be securely passed through
+to a user space process; the device-specific kernel driver does not drive the
+device at all. Typically, the user space process is a VMM and the device is
+passed through to it in order to achieve high performance. VFIO provides an API
+and the required functionality in the kernel. QEMU has adopted VFIO to allow a
+guest to directly access physical devices, instead of emulating them in
+software.
+
+vfio-user reuses the core VFIO concepts defined in its API, but implements them
+as messages to be sent over a socket. It does not change the kernel-based VFIO
+in any way; in fact, none of the VFIO kernel modules need to be loaded to use
+vfio-user. It is also possible for the client to concurrently use the current
+kernel-based VFIO for one device, and vfio-user for another device.
+
+VFIO Device Model
+-----------------
+
+A device under VFIO presents a standard interface to the user process. Many of
+the VFIO operations in the existing interface use the ``ioctl()`` system call, and
+references to the existing interface are called the ``ioctl()`` implementation in
+this document.
+
+The following sections describe the set of messages that implement the vfio-user
+interface over a socket. In many cases, the messages are analogous to data
+structures used in the ``ioctl()`` implementation. Messages derived from the
+``ioctl()`` will have a name derived from the ``ioctl()`` command name. E.g., the
+``VFIO_DEVICE_GET_INFO`` ``ioctl()`` command becomes a
+``VFIO_USER_DEVICE_GET_INFO`` message. The purpose of this reuse is to share as
+much code as feasible with the ``ioctl()`` implementation.
+
+Connection Initiation
+^^^^^^^^^^^^^^^^^^^^^
+
+After the client connects to the server, the initial client message is
+``VFIO_USER_VERSION`` to propose a protocol version and set of capabilities to
+apply to the session. The server replies with a compatible version and set of
+capabilities it supports, or closes the connection if it cannot support the
+advertised version.
+
+Device Information
+^^^^^^^^^^^^^^^^^^
+
+The client uses a ``VFIO_USER_DEVICE_GET_INFO`` message to query the server for
+information about the device. This information includes:
+
+* The device type and whether it supports reset (``VFIO_DEVICE_FLAGS_``),
+* the number of device regions, and
+* the number of interrupt types the device supports.
+
+Region Information
+^^^^^^^^^^^^^^^^^^
+
+The client uses ``VFIO_USER_DEVICE_GET_REGION_INFO`` messages to query the
+server for information about the device's regions. This information describes:
+
+* Read and write permissions, whether it can be memory mapped, and whether it
+ supports additional capabilities (``VFIO_REGION_INFO_CAP_``).
+* Region index, size, and offset.
+
+When a device region can be mapped by the client, the server provides a file
+descriptor which the client can ``mmap()``. The server is responsible for
+polling for client updates to memory mapped regions.
+
+Region Capabilities
+"""""""""""""""""""
+
+Some regions have additional capabilities that cannot be described adequately
+by the region info data structure. These capabilities are returned in the
+region info reply in a list similar to PCI capabilities in a PCI device's
+configuration space.
+
+Sparse Regions
+""""""""""""""
+A region can be memory-mappable in whole or in part. When only a subset of a
+region can be mapped by the client, a ``VFIO_REGION_INFO_CAP_SPARSE_MMAP``
+capability is included in the region info reply. This capability describes
+which portions can be mapped by the client.
+
+.. Note::
+ For example, in a virtual NVMe controller, sparse regions can be used so
+ that accesses to the NVMe registers (found in the beginning of BAR0) are
+ trapped (an infrequent event), while allowing direct access to the doorbells
+ (an extremely frequent event as every I/O submission requires a write to
+ BAR0), found in the next page after the NVMe registers in BAR0.
+
+Device-Specific Regions
+"""""""""""""""""""""""
+
+A device can define regions additional to the standard ones (e.g. PCI indexes
+0-8). This is achieved by including a ``VFIO_REGION_INFO_CAP_TYPE`` capability
+in the region info reply of a device-specific region. Such regions are reflected
+in ``struct vfio_user_device_info.num_regions``. Thus, for PCI devices this
+value can be equal to, or higher than, ``VFIO_PCI_NUM_REGIONS``.
+
+Region I/O via file descriptors
+-------------------------------
+
+For unmapped regions, region I/O from the client is done via
+``VFIO_USER_REGION_READ/WRITE``. As an optimization, ioeventfds or ioregionfds
+may be configured for sub-regions of some regions. A client may request
+information on these sub-regions via ``VFIO_USER_DEVICE_GET_REGION_IO_FDS``; by
+configuring the returned file descriptors as ioeventfds or ioregionfds, the
+server can be directly notified of I/O (for example, by KVM) without taking a
+trip through the client.
+
+Interrupts
+^^^^^^^^^^
+
+The client uses ``VFIO_USER_DEVICE_GET_IRQ_INFO`` messages to query the server
+for the device's interrupt types. The interrupt types are specific to the bus
+the device is attached to, and the client is expected to know the capabilities
+of each interrupt type. The server can signal an interrupt by directly injecting
+interrupts into the guest via an event file descriptor. The client configures
+how the server signals an interrupt with ``VFIO_USER_DEVICE_SET_IRQS`` messages.
+
+Device Read and Write
+^^^^^^^^^^^^^^^^^^^^^
+
+When the guest executes load or store operations to an unmapped device region,
+the client forwards these operations to the server with
+``VFIO_USER_REGION_READ`` or ``VFIO_USER_REGION_WRITE`` messages. The server
+will reply with data from the device on read operations or an acknowledgement on
+write operations. See `Read and Write Operations`_.
+
+Client memory access
+--------------------
+
+The client uses ``VFIO_USER_DMA_MAP`` and ``VFIO_USER_DMA_UNMAP`` messages to
+inform the server of the valid DMA ranges that the server can access on behalf
+of a device (typically, VM guest memory). DMA memory may be accessed by the
+server via ``VFIO_USER_DMA_READ`` and ``VFIO_USER_DMA_WRITE`` messages over the
+socket. In this case, the "DMA" part of the naming is a misnomer.
+
+Actual direct memory access of client memory from the server is possible if the
+client provides file descriptors the server can ``mmap()``. Note that ``mmap()``
+privileges cannot be revoked by the client, therefore file descriptors should
+only be exported in environments where the client trusts the server not to
+corrupt guest memory.
+
+See `Read and Write Operations`_.
+
+Client/server interactions
+==========================
+
+Socket
+------
+
+A server can serve:
+
+1) one or more clients, and/or
+2) one or more virtual devices, belonging to one or more clients.
+
+The current protocol specification requires a dedicated socket per
+client/server connection. It is a server-side implementation detail whether a
+single server handles multiple virtual devices from the same or multiple
+clients. The location of the socket is implementation-specific. Multiplexing
+clients, devices, and servers over the same socket is not supported in this
+version of the protocol.
+
+Authentication
+--------------
+
+For ``AF_UNIX``, we rely on OS mandatory access controls on the socket files,
+therefore it is up to the management layer to set up the socket as required.
+Socket types that span guests or hosts will require a proper authentication
+mechanism. Defining that mechanism is deferred to a future version of the
+protocol.
+
+Command Concurrency
+-------------------
+
+A client may pipeline multiple commands without waiting for previous command
+replies. The server will process commands in the order they are received. A
+consequence of this is that if a client issues a command with the *No_reply* bit
+set, then subsequently issues a command without *No_reply*, the older command
+will have been processed before the reply to the later command is sent by the
+server. The client must be aware of the device's capability to process
+concurrent commands if pipelining is used. For example, pipelining allows
+multiple client threads to concurrently access device regions; the client must
+ensure these accesses obey device semantics.
+
+An example is a frame buffer device, where the device may allow concurrent
+access to different areas of video memory, but may have indeterminate behavior
+if concurrent accesses are performed to command or status registers.
+
+Note that unrelated messages sent from the server to the client can appear in
+between a client to server request/reply and vice versa.
+
+Implementers should be prepared for certain commands to exhibit potentially
+unbounded latencies. For example, ``VFIO_USER_DEVICE_RESET`` may take an
+arbitrarily long time to complete; clients should take care not to block
+unnecessarily.
+
+Socket Disconnection Behavior
+-----------------------------
+The server and the client can disconnect from each other, either intentionally
+or unexpectedly. Both the client and the server need to know how to handle such
+events.
+
+Server Disconnection
+^^^^^^^^^^^^^^^^^^^^
+A server disconnecting from the client may indicate that:
+
+1) A virtual device has been restarted, either intentionally (e.g. because of a
+ device update) or unintentionally (e.g. because of a crash).
+2) A virtual device has been shut down with no intention to be restarted.
+
+It is impossible for the client to know whether or not a failure is
+intermittent or innocuous and should be retried; therefore, the client should
+reset the VFIO device when it detects the socket has been disconnected.
+Error recovery will be driven by the guest's device error handling
+behavior.
+
+Client Disconnection
+^^^^^^^^^^^^^^^^^^^^
+The client disconnecting from the server primarily means that the client
+has exited. Currently, this means that the guest is shut down so the device is
+no longer needed; therefore, the server can automatically exit. However, there
+can be cases where a client disconnection should not result in a server exit:
+
+1) A single server serving multiple clients.
+2) A multi-process QEMU upgrading itself step by step, which is not yet
+ implemented.
+
+Therefore in order for the protocol to be forward compatible, the server should
+respond to a client disconnection as follows:
+
+ - all client memory regions are unmapped and cleaned up (including closing any
+ passed file descriptors)
+ - all IRQ file descriptors passed from the old client are closed
+ - the device state should otherwise be retained
+
+The expectation is that when a client reconnects, it will re-establish IRQ and
+client memory mappings.
+
+If anything happens to the client (such as QEMU actually exiting), the control
+stack will know about it and can clean up resources accordingly.
+
+Security Considerations
+-----------------------
+
+Speaking generally, vfio-user clients should not trust servers, and vice versa.
+Standard tools and mechanisms should be used on both sides to validate input and
+protect against denial-of-service scenarios, buffer overflows, etc.
+
+Request Retry and Response Timeout
+----------------------------------
+A failed command is a command that has been successfully sent and has been
+responded to with an error code. Failure to send the command in the first place
+(e.g. because the socket is disconnected) is a different type of error examined
+earlier in the disconnect section.
+
+.. Note::
+ QEMU's VFIO retries certain operations if they fail. While this makes sense
+ for real HW, we don't know for sure whether it makes sense for virtual
+ devices.
+
+Defining a retry and timeout scheme is deferred to a future version of the
+protocol.
+
+Message sizes
+-------------
+
+Some requests have an ``argsz`` field. In a request, it defines the maximum
+expected reply payload size, which should be at least the size of the fixed
+reply payload headers defined here. The *request* payload size is defined by the
+usual ``msg_size`` field in the header, not the ``argsz`` field.
+
+In a reply, the server sets the ``argsz`` field to the size needed for the full
+reply payload. This may be less than the requested maximum size, or it may be
+larger: in that case, the full payload is not included in the reply, but the
+``argsz`` field in the reply indicates the needed size, allowing the client to
+allocate a larger buffer for holding the reply before trying again.
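+
+As an informal sketch of this retry pattern (``send_cmd`` and
+``pack_region_info`` are hypothetical helpers, not part of the protocol)::
+
+   VFIO_USER_DEVICE_GET_REGION_INFO = 5    # from the Commands table
+
+   def get_region_info(index, argsz=32):
+       reply = send_cmd(VFIO_USER_DEVICE_GET_REGION_INFO,
+                        pack_region_info(argsz=argsz, index=index))
+       if reply.argsz > argsz:
+           # The reply was truncated; retry with a buffer large enough for
+           # the full payload (e.g. region info plus its capability list).
+           return get_region_info(index, argsz=reply.argsz)
+       return reply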
+
+In addition, during negotiation (see `Version`_), the client and server may
+each specify a ``max_data_xfer_size`` value; this defines the maximum data that
+may be read or written via one of the ``VFIO_USER_DMA/REGION_READ/WRITE``
+messages; see `Read and Write Operations`_.
+
+Protocol Specification
+======================
+
+To distinguish from the base VFIO symbols, all vfio-user symbols are prefixed
+with ``vfio_user`` or ``VFIO_USER``. In this revision, all data is in the
+endianness of the host system, although this may be relaxed in future
+revisions in cases where the client and server run on different hosts
+with different endianness.
+
+Unless otherwise specified, all sizes should be presumed to be in bytes.
+
+.. _Commands:
+
+Commands
+--------
+The following table lists the VFIO message command IDs, and whether the
+message command is sent from the client or the server.
+
+====================================== ========= =================
+Name Command Request Direction
+====================================== ========= =================
+``VFIO_USER_VERSION`` 1 client -> server
+``VFIO_USER_DMA_MAP`` 2 client -> server
+``VFIO_USER_DMA_UNMAP`` 3 client -> server
+``VFIO_USER_DEVICE_GET_INFO`` 4 client -> server
+``VFIO_USER_DEVICE_GET_REGION_INFO`` 5 client -> server
+``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` 6 client -> server
+``VFIO_USER_DEVICE_GET_IRQ_INFO`` 7 client -> server
+``VFIO_USER_DEVICE_SET_IRQS`` 8 client -> server
+``VFIO_USER_REGION_READ`` 9 client -> server
+``VFIO_USER_REGION_WRITE`` 10 client -> server
+``VFIO_USER_DMA_READ`` 11 server -> client
+``VFIO_USER_DMA_WRITE`` 12 server -> client
+``VFIO_USER_DEVICE_RESET`` 13 client -> server
+``VFIO_USER_REGION_WRITE_MULTI`` 15 client -> server
+====================================== ========= =================
+
+Header
+------
+
+All messages, both command messages and reply messages, are preceded by a
+16-byte header that contains basic information about the message. The header is
+followed by message-specific data described in the sections below.
+
++----------------+--------+-------------+
+| Name | Offset | Size |
++================+========+=============+
+| Message ID | 0 | 2 |
++----------------+--------+-------------+
+| Command | 2 | 2 |
++----------------+--------+-------------+
+| Message size | 4 | 4 |
++----------------+--------+-------------+
+| Flags | 8 | 4 |
++----------------+--------+-------------+
+| | +-----+------------+ |
+| | | Bit | Definition | |
+| | +=====+============+ |
+| | | 0-3 | Type | |
+| | +-----+------------+ |
+| | | 4 | No_reply | |
+| | +-----+------------+ |
+| | | 5 | Error | |
+| | +-----+------------+ |
++----------------+--------+-------------+
+| Error | 12 | 4 |
++----------------+--------+-------------+
+| <message data> | 16 | variable |
++----------------+--------+-------------+
+
+* *Message ID* identifies the message, and is echoed in the command's reply
+ message. Message IDs belong entirely to the sender, can be re-used (even
+ concurrently) and the receiver must not make any assumptions about their
+ uniqueness.
+* *Command* specifies the command to be executed, listed in Commands_. It is
+ also set in the reply header.
+* *Message size* contains the size of the entire message, including the header.
+* *Flags* contains attributes of the message:
+
+ * The *Type* bits indicate the message type.
+
+ * *Command* (value 0x0) indicates a command message.
+ * *Reply* (value 0x1) indicates a reply message acknowledging a previous
+ command with the same message ID.
+ * *No_reply* in a command message indicates that no reply is needed for this
+ command. This is commonly used when multiple commands are sent, and only
+ the last needs acknowledgement.
+ * *Error* in a reply message indicates the command being acknowledged had
+ an error. In this case, the *Error* field will be valid.
+
+* *Error* in a reply message is an optional UNIX errno value. It may be zero
+ even if the Error bit is set in Flags. It is reserved in a command message.
+
+Each command message in Commands_ must be replied to with a reply message,
+unless the message sets the *No_Reply* bit. The reply consists of the header
+with the *Reply* bit set, plus any additional data.
+
+If an error occurs, the reply message must only include the reply header.
+
+As the header is standard in both requests and replies, it is not included in
+the command-specific specifications below; each message definition should be
+appended to the standard header, and the offsets are given from the end of the
+standard header.
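+
+As an informal sketch only, the 16-byte header can be built with Python's
+``struct`` module in host byte order; the flag constant names below are
+illustrative and not defined by this specification::
+
+   import struct
+
+   FLAGS_TYPE_COMMAND = 0x0    # Type bits 0-3
+   FLAGS_TYPE_REPLY   = 0x1
+   FLAGS_NO_REPLY     = 1 << 4
+   FLAGS_ERROR        = 1 << 5
+
+   def pack_header(msg_id, command, msg_size, flags, error=0):
+       # message ID (2), command (2), message size (4), flags (4), error (4)
+       return struct.pack("=HHIII", msg_id, command, msg_size, flags, error)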
+
+``VFIO_USER_VERSION``
+---------------------
+
+.. _Version:
+
+This is the initial message sent by the client after the socket connection is
+established; the same format is used for the server's reply.
+
+Upon establishing a connection, the client must send a ``VFIO_USER_VERSION``
+message proposing a protocol version and a set of capabilities. The server
+compares these with the versions and capabilities it supports and sends a
+``VFIO_USER_VERSION`` reply according to the following rules.
+
+* The major version in the reply must be the same as proposed. If the client
+ does not support the proposed major, it closes the connection.
+* The minor version in the reply must be equal to or less than the minor
+ version proposed.
+* The capability list must be a subset of those proposed. If the server
+ requires a capability the client did not include, it closes the connection.
+
+The protocol major version will only change when incompatible protocol changes
+are made, such as changing the message format. The minor version may change
+when compatible changes are made, such as adding new messages or capabilities.
+Both the client and server must support all minor versions less than the
+maximum minor version they support. E.g., an implementation that supports
+version 1.3 must also support 1.0 through 1.2.
+
+When making a change to this specification, the protocol version number must
+be included in the form "added in version X.Y".
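+
+As an informal illustration of the negotiation rules above, a server might
+select its reply roughly as follows (the supported version numbers are
+placeholders and the capability handling is simplified)::
+
+   SUPPORTED_MAJOR, SUPPORTED_MINOR = 0, 0    # placeholder values
+   KNOWN_CAPS = {"max_msg_fds", "max_data_xfer_size", "pgsizes",
+                 "max_dma_maps", "migration", "write_multiple"}
+
+   def negotiate(proposed_major, proposed_minor, proposed_caps):
+       if proposed_major != SUPPORTED_MAJOR:
+           return None    # caller closes the connection
+       reply_minor = min(proposed_minor, SUPPORTED_MINOR)
+       # The reply capability list must be a subset of the proposed one.
+       reply_caps = {k: v for k, v in proposed_caps.items() if k in KNOWN_CAPS}
+       return SUPPORTED_MAJOR, reply_minor, reply_caps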
+
+Request
+^^^^^^^
+
+============== ====== ====
+Name Offset Size
+============== ====== ====
+version major 0 2
+version minor 2 2
+version data 4 variable (including terminating NUL). Optional.
+============== ====== ====
+
+The version data is an optional UTF-8 encoded JSON byte array with the following
+format:
+
++--------------+--------+-----------------------------------+
+| Name | Type | Description |
++==============+========+===================================+
+| capabilities | object | Contains common capabilities that |
+| | | the sender supports. Optional. |
++--------------+--------+-----------------------------------+
+
+Capabilities:
+
++--------------------+---------+------------------------------------------------+
+| Name | Type | Description |
++====================+=========+================================================+
+| max_msg_fds | number | Maximum number of file descriptors that can be |
+| | | received by the sender in one message. |
+| | | Optional. If not specified then the receiver |
+| | | must assume a value of ``1``. |
++--------------------+---------+------------------------------------------------+
+| max_data_xfer_size | number | Maximum ``count`` for data transfer messages; |
+| | | see `Read and Write Operations`_. Optional, |
+| | | with a default value of 1048576 bytes. |
++--------------------+---------+------------------------------------------------+
+| pgsizes | number | Page sizes supported in DMA map operations |
+| | | or'ed together. Optional, with a default value |
+| | | of supporting only 4k pages. |
++--------------------+---------+------------------------------------------------+
+| max_dma_maps | number | Maximum number DMA map windows that can be |
+| | | valid simultaneously. Optional, with a |
+| | | value of 65535 (64k-1). |
++--------------------+---------+------------------------------------------------+
+| migration | object | Migration capability parameters. If missing |
+| | | then migration is not supported by the sender. |
++--------------------+---------+------------------------------------------------+
+| write_multiple | boolean | ``VFIO_USER_REGION_WRITE_MULTI`` messages |
+| | | are supported if the value is ``true``. |
++--------------------+---------+------------------------------------------------+
+
+The migration capability contains the following name/value pairs:
+
++-----------------+--------+--------------------------------------------------+
+| Name | Type | Description |
++=================+========+==================================================+
+| pgsize | number | Page size of dirty pages bitmap. The smallest |
+| | | between the client and the server is used. |
++-----------------+--------+--------------------------------------------------+
+| max_bitmap_size | number | Maximum bitmap size in ``VFIO_USER_DIRTY_PAGES`` |
+| | | and ``VFIO_DMA_UNMAP`` messages. Optional, |
+| | | with a default value of 256MB. |
++-----------------+--------+--------------------------------------------------+
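+
+For example, a sender's version data might carry capabilities like the
+following (all values are illustrative)::
+
+   {
+     "capabilities": {
+       "max_msg_fds": 8,
+       "max_data_xfer_size": 1048576,
+       "pgsizes": 4096,
+       "max_dma_maps": 65535,
+       "migration": {
+         "pgsize": 4096,
+         "max_bitmap_size": 268435456
+       },
+       "write_multiple": true
+     }
+   }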
+
+Reply
+^^^^^
+
+The same message format is used in the server's reply with the semantics
+described above.
+
+``VFIO_USER_DMA_MAP``
+---------------------
+
+This command message is sent by the client to the server to inform it of the
+memory regions the server can access. It must be sent before the server can
+perform any DMA to the client. It is normally sent directly after the version
+handshake is completed, but may also occur when memory is added to the client,
+or if the client uses a vIOMMU.
+
+Request
+^^^^^^^
+
+The request payload for this message is a structure of the following format:
+
++-------------+--------+-------------+
+| Name | Offset | Size |
++=============+========+=============+
+| argsz | 0 | 4 |
++-------------+--------+-------------+
+| flags | 4 | 4 |
++-------------+--------+-------------+
+| | +-----+------------+ |
+| | | Bit | Definition | |
+| | +=====+============+ |
+| | | 0 | readable | |
+| | +-----+------------+ |
+| | | 1 | writeable | |
+| | +-----+------------+ |
++-------------+--------+-------------+
+| offset | 8 | 8 |
++-------------+--------+-------------+
+| address | 16 | 8 |
++-------------+--------+-------------+
+| size | 24 | 8 |
++-------------+--------+-------------+
+
+* *argsz* is the size of the above structure. Note there is no reply payload,
+ so this field differs from other message types.
+* *flags* contains the following region attributes:
+
+ * *readable* indicates that the region can be read from.
+
+ * *writeable* indicates that the region can be written to.
+
+* *offset* is the file offset of the region with respect to the associated file
+  descriptor, or zero if the region is not mappable.
+* *address* is the base DMA address of the region.
+* *size* is the size of the region.
+
+This structure is 32 bytes in size, so the message size is 16 + 32 bytes.
+
+If the DMA region being added can be directly mapped by the server, a file
+descriptor must be sent as part of the message meta-data. The region can be
+mapped via the mmap() system call. On ``AF_UNIX`` sockets, the file descriptor
+must be passed as ``SCM_RIGHTS`` type ancillary data. Otherwise, if the DMA
+region cannot be directly mapped by the server, a file descriptor must not be
+sent as part of the message meta-data, and the DMA region can be accessed by the
+server using ``VFIO_USER_DMA_READ`` and ``VFIO_USER_DMA_WRITE`` messages,
+explained in `Read and Write Operations`_. A command to map over an existing
+region must be failed by the server with ``EEXIST`` set in the error field of the
+reply.
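+
+As an informal sketch of passing a mappable region over an ``AF_UNIX`` socket
+(reusing the ``pack_header`` sketch above; the command value is taken from the
+Commands table, while the flag constant names are illustrative)::
+
+   import array
+   import socket
+   import struct
+
+   VFIO_USER_DMA_MAP = 2
+   DMA_MAP_READABLE, DMA_MAP_WRITEABLE = 1 << 0, 1 << 1
+
+   def send_dma_map(sock, msg_id, fd, file_offset, dma_addr, size):
+       # argsz (4), flags (4), offset (8), address (8), size (8) = 32 bytes
+       payload = struct.pack("=IIQQQ", 32, DMA_MAP_READABLE | DMA_MAP_WRITEABLE,
+                             file_offset, dma_addr, size)
+       msg = pack_header(msg_id, VFIO_USER_DMA_MAP, 16 + len(payload), 0) + payload
+       # The region's file descriptor travels as SCM_RIGHTS ancillary data.
+       sock.sendmsg([msg], [(socket.SOL_SOCKET, socket.SCM_RIGHTS,
+                             array.array("i", [fd]))])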
+
+Reply
+^^^^^
+
+There is no payload in the reply message.
+
+``VFIO_USER_DMA_UNMAP``
+-----------------------
+
+This command message is sent by the client to the server to inform it that a
+DMA region, previously made available via a ``VFIO_USER_DMA_MAP`` command
+message, is no longer available for DMA. It typically occurs when memory is
+subtracted from the client or if the client uses a vIOMMU. The DMA region to be
+unmapped is described by the structure in the request below.
+
+Request
+^^^^^^^
+
+The request payload for this message is a structure of the following format:
+
++--------------+--------+------------------------+
+| Name | Offset | Size |
++==============+========+========================+
+| argsz | 0 | 4 |
++--------------+--------+------------------------+
+| flags | 4 | 4 |
++--------------+--------+------------------------+
+| address | 8 | 8 |
++--------------+--------+------------------------+
+| size | 16 | 8 |
++--------------+--------+------------------------+
+
+* *argsz* is the maximum size of the reply payload.
+* *flags* is unused in this version.
+* *address* is the base DMA address of the DMA region.
+* *size* is the size of the DMA region.
+
+The address and size of the DMA region being unmapped must match exactly a
+previous mapping.
+
+Reply
+^^^^^
+
+Upon receiving a ``VFIO_USER_DMA_UNMAP`` command, if the file descriptor is
+mapped then the server must release all references to that DMA region before
+replying, which potentially includes waiting for in-flight DMA transactions to
+complete.
+
+The server responds with the original DMA entry in the request.
+
+
+``VFIO_USER_DEVICE_GET_INFO``
+-----------------------------
+
+This command message is sent by the client to the server to query for basic
+information about the device.
+
+Request
+^^^^^^^
+
++-------------+--------+--------------------------+
+| Name | Offset | Size |
++=============+========+==========================+
+| argsz | 0 | 4 |
++-------------+--------+--------------------------+
+| flags | 4 | 4 |
++-------------+--------+--------------------------+
+| | +-----+-------------------------+ |
+| | | Bit | Definition | |
+| | +=====+=========================+ |
+| | | 0 | VFIO_DEVICE_FLAGS_RESET | |
+| | +-----+-------------------------+ |
+| | | 1 | VFIO_DEVICE_FLAGS_PCI | |
+| | +-----+-------------------------+ |
++-------------+--------+--------------------------+
+| num_regions | 8 | 4 |
++-------------+--------+--------------------------+
+| num_irqs | 12 | 4 |
++-------------+--------+--------------------------+
+
+* *argsz* is the maximum size of the reply payload
+* all other fields must be zero.
+
+Reply
+^^^^^
+
++-------------+--------+--------------------------+
+| Name | Offset | Size |
++=============+========+==========================+
+| argsz | 0 | 4 |
++-------------+--------+--------------------------+
+| flags | 4 | 4 |
++-------------+--------+--------------------------+
+| | +-----+-------------------------+ |
+| | | Bit | Definition | |
+| | +=====+=========================+ |
+| | | 0 | VFIO_DEVICE_FLAGS_RESET | |
+| | +-----+-------------------------+ |
+| | | 1 | VFIO_DEVICE_FLAGS_PCI | |
+| | +-----+-------------------------+ |
++-------------+--------+--------------------------+
+| num_regions | 8 | 4 |
++-------------+--------+--------------------------+
+| num_irqs | 12 | 4 |
++-------------+--------+--------------------------+
+
+* *argsz* is the size required for the full reply payload (16 bytes today)
+* *flags* contains the following device attributes.
+
+ * ``VFIO_DEVICE_FLAGS_RESET`` indicates that the device supports the
+ ``VFIO_USER_DEVICE_RESET`` message.
+ * ``VFIO_DEVICE_FLAGS_PCI`` indicates that the device is a PCI device.
+
+* *num_regions* is the number of memory regions that the device exposes.
+* *num_irqs* is the number of distinct interrupt types that the device supports.
+
+This version of the protocol only supports PCI devices. Additional devices may
+be supported in future versions.
+
+``VFIO_USER_DEVICE_GET_REGION_INFO``
+------------------------------------
+
+This command message is sent by the client to the server to query for
+information about device regions. The VFIO region info structure is defined in
+``<linux/vfio.h>`` (``struct vfio_region_info``).
+
+Request
+^^^^^^^
+
++------------+--------+------------------------------+
+| Name | Offset | Size |
++============+========+==============================+
+| argsz | 0 | 4 |
++------------+--------+------------------------------+
+| flags | 4 | 4 |
++------------+--------+------------------------------+
+| index | 8 | 4 |
++------------+--------+------------------------------+
+| cap_offset | 12 | 4 |
++------------+--------+------------------------------+
+| size | 16 | 8 |
++------------+--------+------------------------------+
+| offset | 24 | 8 |
++------------+--------+------------------------------+
+
+* *argsz* is the maximum size of the reply payload
+* *index* is the index of the memory region being queried; it is the only field
+  that is required to be set in the command message.
+* all other fields must be zero.
+
+Reply
+^^^^^
+
++------------+--------+------------------------------+
+| Name | Offset | Size |
++============+========+==============================+
+| argsz | 0 | 4 |
++------------+--------+------------------------------+
+| flags | 4 | 4 |
++------------+--------+------------------------------+
+| | +-----+-----------------------------+ |
+| | | Bit | Definition | |
+| | +=====+=============================+ |
+| | | 0 | VFIO_REGION_INFO_FLAG_READ | |
+| | +-----+-----------------------------+ |
+| | | 1 | VFIO_REGION_INFO_FLAG_WRITE | |
+| | +-----+-----------------------------+ |
+| | | 2 | VFIO_REGION_INFO_FLAG_MMAP | |
+| | +-----+-----------------------------+ |
+| | | 3 | VFIO_REGION_INFO_FLAG_CAPS | |
+| | +-----+-----------------------------+ |
++------------+--------+------------------------------+
+| index | 8 | 4 |
++------------+--------+------------------------------+
+| cap_offset | 12 | 4 |
++------------+--------+------------------------------+
+| size | 16 | 8 |
++------------+--------+------------------------------+
+| offset | 24 | 8 |
++------------+--------+------------------------------+
+
+* *argsz* is the size required for the full reply payload (region info structure
+ plus the size of any region capabilities)
+* *flags* are attributes of the region:
+
+ * ``VFIO_REGION_INFO_FLAG_READ`` allows client read access to the region.
+ * ``VFIO_REGION_INFO_FLAG_WRITE`` allows client write access to the region.
+ * ``VFIO_REGION_INFO_FLAG_MMAP`` specifies the client can mmap() the region.
+ When this flag is set, the reply will include a file descriptor in its
+ meta-data. On ``AF_UNIX`` sockets, the file descriptors will be passed as
+    ``SCM_RIGHTS`` type ancillary data (see the sketch following this list).
+ * ``VFIO_REGION_INFO_FLAG_CAPS`` indicates additional capabilities found in the
+ reply.
+
+* *index* is the index of the memory region being queried; it is the only field
+  that is required to be set in the command message.
+* *cap_offset* describes where additional region capabilities can be found.
+  *cap_offset* is relative to the beginning of the VFIO region info structure.
+  The data structure it points to is a VFIO cap header defined in
+  ``<linux/vfio.h>``.
+* *size* is the size of the region.
+* *offset* is the offset that should be given to the mmap() system call for
+ regions with the MMAP attribute. It is also used as the base offset when
+ mapping a VFIO sparse mmap area, described below.
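+
+As an informal sketch, a client that received a mappable region's file
+descriptor could map it with Python's ``mmap`` module (``fd``, ``size`` and
+``offset`` come from the reply and its ancillary data)::
+
+   import mmap
+
+   def map_region(fd, size, offset):
+       # The reply's *offset* field is the value to pass to mmap(), not a
+       # file position chosen by the client.
+       return mmap.mmap(fd, size, flags=mmap.MAP_SHARED,
+                        prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=offset)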
+
+VFIO region capabilities
+""""""""""""""""""""""""
+
+The VFIO region information can also include a capabilities list. This list is
+similar to a PCI capability list - each entry has a common header that
+identifies a capability and where the next capability in the list can be found.
+The VFIO capability header format is defined in ``<linux/vfio.h>`` (``struct
+vfio_info_cap_header``).
+
+VFIO cap header format
+""""""""""""""""""""""
+
++---------+--------+------+
+| Name | Offset | Size |
++=========+========+======+
+| id | 0 | 2 |
++---------+--------+------+
+| version | 2 | 2 |
++---------+--------+------+
+| next | 4 | 4 |
++---------+--------+------+
+
+* *id* is the capability identity.
+* *version* is a capability-specific version number.
+* *next* specifies the offset of the next capability in the capability list. It
+ is relative to the beginning of the VFIO region info structure.
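+
+As an informal sketch, the capability chain can be walked by following the
+*next* offsets (a *next* value of zero is assumed to end the list)::
+
+   import struct
+
+   def walk_caps(region_info, cap_offset):
+       # *region_info* is the raw reply payload; chain offsets are relative
+       # to its start. Yields (id, version, offset) for each capability.
+       while cap_offset:
+           cap_id, version, nxt = struct.unpack_from("=HHI", region_info, cap_offset)
+           yield cap_id, version, cap_offset
+           cap_offset = nxt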
+
+VFIO sparse mmap cap header
+"""""""""""""""""""""""""""
+
++------------------+----------------------------------+
+| Name | Value |
++==================+==================================+
+| id | VFIO_REGION_INFO_CAP_SPARSE_MMAP |
++------------------+----------------------------------+
+| version | 0x1 |
++------------------+----------------------------------+
+| next | <next> |
++------------------+----------------------------------+
+| sparse mmap info | VFIO region info sparse mmap |
++------------------+----------------------------------+
+
+This capability is defined when only a subrange of the region supports
+direct access by the client via mmap(). The VFIO sparse mmap area is defined in
+``<linux/vfio.h>`` (``struct vfio_region_sparse_mmap_area`` and ``struct
+vfio_region_info_cap_sparse_mmap``).
+
+VFIO region info cap sparse mmap
+""""""""""""""""""""""""""""""""
+
++----------+--------+------+
+| Name | Offset | Size |
++==========+========+======+
+| nr_areas | 0 | 4 |
++----------+--------+------+
+| reserved | 4 | 4 |
++----------+--------+------+
+| offset | 8 | 8 |
++----------+--------+------+
+| size | 16 | 8 |
++----------+--------+------+
+| ... | | |
++----------+--------+------+
+
+* *nr_areas* is the number of sparse mmap areas in the region.
+* *offset* and *size* describe a single area that can be mapped by the client.
+ There will be *nr_areas* pairs of offset and size. The offset will be added to
+ the base offset given in the ``VFIO_USER_DEVICE_GET_REGION_INFO`` to form the
+ offset argument of the subsequent mmap() call.
+
+The VFIO sparse mmap area is defined in ``<linux/vfio.h>`` (``struct
+vfio_region_info_cap_sparse_mmap``).
+
+
+``VFIO_USER_DEVICE_GET_REGION_IO_FDS``
+--------------------------------------
+
+Clients can access regions via ``VFIO_USER_REGION_READ/WRITE`` or, if provided, by
+``mmap()`` of a file descriptor provided by the server.
+
+``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` provides an alternative access mechanism via
+file descriptors. This is an optional feature intended for performance
+improvements where an underlying sub-system (such as KVM) supports communication
+across such file descriptors to the vfio-user server, without needing to
+round-trip through the client.
+
+The server returns an array of sub-regions for the requested region. Each
+sub-region describes a span (offset and size) of a region, along with the
+requested file descriptor notification mechanism to use. Each sub-region in the
+response message may choose to use a different method, as defined below. The
+two mechanisms supported in this specification are ioeventfds and ioregionfds.
+
+The server in addition returns a file descriptor in the ancillary data; clients
+are expected to configure each sub-region's file descriptor with the requested
+notification method. For example, a client could configure KVM with the
+requested ioeventfd via a ``KVM_IOEVENTFD`` ``ioctl()``.
+
+Request
+^^^^^^^
+
++-------------+--------+------+
+| Name | Offset | Size |
++=============+========+======+
+| argsz | 0 | 4 |
++-------------+--------+------+
+| flags | 4 | 4 |
++-------------+--------+------+
+| index | 8 | 4 |
++-------------+--------+------+
+| count | 12 | 4 |
++-------------+--------+------+
+
+* *argsz* is the maximum size of the reply payload
+* *index* is the index of the memory region being queried
+* all other fields must be zero
+
+The client must set ``flags`` to zero and specify the region being queried in
+the ``index``.
+
+Reply
+^^^^^
+
++-------------+--------+------+
+| Name | Offset | Size |
++=============+========+======+
+| argsz | 0 | 4 |
++-------------+--------+------+
+| flags | 4 | 4 |
++-------------+--------+------+
+| index | 8 | 4 |
++-------------+--------+------+
+| count | 12 | 4 |
++-------------+--------+------+
+| sub-regions | 16 | ... |
++-------------+--------+------+
+
+* *argsz* is the size of the region IO FD info structure plus the
+ total size of the sub-region array. Thus, each array entry "i" is at offset
+ i * ((argsz - 32) / count). Note that currently this is 40 bytes for both IO
+ FD types, but this is not to be relied on. As elsewhere, this indicates the
+ full reply payload size needed.
+* *flags* must be zero
+* *index* is the index of the memory region being queried
+* *count* is the number of sub-regions in the array
+* *sub-regions* is the array of Sub-Region IO FD info structures
+
+The reply message will additionally include at least one file descriptor in the
+ancillary data. Note that more than one sub-region may share the same file
+descriptor.
+
+Note that it is the client's responsibility to verify the requested values (for
+example, that the requested offset does not exceed the region's bounds).
+
+Each sub-region given in the response has one of two possible structures,
+depending on whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` or
+``VFIO_USER_IO_FD_TYPE_IOREGIONFD``:
+
+Sub-Region IO FD info format (ioeventfd)
+""""""""""""""""""""""""""""""""""""""""
+
++-----------+--------+------+
+| Name | Offset | Size |
++===========+========+======+
+| offset | 0 | 8 |
++-----------+--------+------+
+| size | 8 | 8 |
++-----------+--------+------+
+| fd_index | 16 | 4 |
++-----------+--------+------+
+| type | 20 | 4 |
++-----------+--------+------+
+| flags | 24 | 4 |
++-----------+--------+------+
+| padding | 28 | 4 |
++-----------+--------+------+
+| datamatch | 32 | 8 |
++-----------+--------+------+
+
+* *offset* is the offset of the start of the sub-region within the region
+ requested ("physical address offset" for the region)
+* *size* is the length of the sub-region. This may be zero if the access size is
+  not relevant, which may allow for optimizations.
+* *fd_index* is the index in the ancillary data of the FD to use for ioeventfd
+ notification; it may be shared.
+* *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD``
+* *flags* is any of:
+
+ * ``KVM_IOEVENTFD_FLAG_DATAMATCH``
+ * ``KVM_IOEVENTFD_FLAG_PIO``
+ * ``KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY`` (FIXME: makes sense?)
+
+* *datamatch* is the datamatch value if needed
+
+See https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt, *4.59
+KVM_IOEVENTFD* for further context on the ioeventfd-specific fields.
+
+Sub-Region IO FD info format (ioregionfd)
+"""""""""""""""""""""""""""""""""""""""""
+
++-----------+--------+------+
+| Name | Offset | Size |
++===========+========+======+
+| offset | 0 | 8 |
++-----------+--------+------+
+| size | 8 | 8 |
++-----------+--------+------+
+| fd_index | 16 | 4 |
++-----------+--------+------+
+| type | 20 | 4 |
++-----------+--------+------+
+| flags | 24 | 4 |
++-----------+--------+------+
+| padding | 28 | 4 |
++-----------+--------+------+
+| user_data | 32 | 8 |
++-----------+--------+------+
+
+* *offset* is the offset of the start of the sub-region within the region
+ requested ("physical address offset" for the region)
+* *size* is the length of the sub-region. This may be zero if the access size is
+ not relevant, which may allow for optimizations; ``KVM_IOREGION_POSTED_WRITES``
+ must be set in *flags* in this case
+* *fd_index* is the index in the ancillary data of the FD to use for ioregionfd
+ messages; it may be shared
+* *type* is ``VFIO_USER_IO_FD_TYPE_IOREGIONFD``
+* *flags* is any of:
+
+ * ``KVM_IOREGION_PIO``
+ * ``KVM_IOREGION_POSTED_WRITES``
+
+* *user_data* is an opaque value passed back to the server via a message on the
+ file descriptor
+
+For further information on the ioregionfd-specific fields, see:
+https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com/
+
+(FIXME: update with final API docs.)
+
+``VFIO_USER_DEVICE_GET_IRQ_INFO``
+---------------------------------
+
+This command message is sent by the client to the server to query for
+information about device interrupt types. The VFIO IRQ info structure is
+defined in ``<linux/vfio.h>`` (``struct vfio_irq_info``).
+
+Request
+^^^^^^^
+
++-------+--------+---------------------------+
+| Name | Offset | Size |
++=======+========+===========================+
+| argsz | 0 | 4 |
++-------+--------+---------------------------+
+| flags | 4 | 4 |
++-------+--------+---------------------------+
+| | +-----+--------------------------+ |
+| | | Bit | Definition | |
+| | +=====+==========================+ |
+| | | 0 | VFIO_IRQ_INFO_EVENTFD | |
+| | +-----+--------------------------+ |
+| | | 1 | VFIO_IRQ_INFO_MASKABLE | |
+| | +-----+--------------------------+ |
+| | | 2 | VFIO_IRQ_INFO_AUTOMASKED | |
+| | +-----+--------------------------+ |
+| | | 3 | VFIO_IRQ_INFO_NORESIZE | |
+| | +-----+--------------------------+ |
++-------+--------+---------------------------+
+| index | 8 | 4 |
++-------+--------+---------------------------+
+| count | 12 | 4 |
++-------+--------+---------------------------+
+
+* *argsz* is the maximum size of the reply payload (16 bytes today)
+* *index* is the index of the IRQ type being queried (e.g. ``VFIO_PCI_MSIX_IRQ_INDEX``)
+* all other fields must be zero
+
+Reply
+^^^^^
+
++-------+--------+---------------------------+
+| Name | Offset | Size |
++=======+========+===========================+
+| argsz | 0 | 4 |
++-------+--------+---------------------------+
+| flags | 4 | 4 |
++-------+--------+---------------------------+
+| | +-----+--------------------------+ |
+| | | Bit | Definition | |
+| | +=====+==========================+ |
+| | | 0 | VFIO_IRQ_INFO_EVENTFD | |
+| | +-----+--------------------------+ |
+| | | 1 | VFIO_IRQ_INFO_MASKABLE | |
+| | +-----+--------------------------+ |
+| | | 2 | VFIO_IRQ_INFO_AUTOMASKED | |
+| | +-----+--------------------------+ |
+| | | 3 | VFIO_IRQ_INFO_NORESIZE | |
+| | +-----+--------------------------+ |
++-------+--------+---------------------------+
+| index | 8 | 4 |
++-------+--------+---------------------------+
+| count | 12 | 4 |
++-------+--------+---------------------------+
+
+* *argsz* is the size required for the full reply payload (16 bytes today)
+* *flags* defines IRQ attributes:
+
+ * ``VFIO_IRQ_INFO_EVENTFD`` indicates the IRQ type can support server eventfd
+ signalling.
+ * ``VFIO_IRQ_INFO_MASKABLE`` indicates that the IRQ type supports the ``MASK``
+ and ``UNMASK`` actions in a ``VFIO_USER_DEVICE_SET_IRQS`` message.
+ * ``VFIO_IRQ_INFO_AUTOMASKED`` indicates the IRQ type masks itself after being
+ triggered, and the client must send an ``UNMASK`` action to receive new
+ interrupts.
+  * ``VFIO_IRQ_INFO_NORESIZE`` indicates that ``VFIO_USER_DEVICE_SET_IRQS``
+    operations set up interrupts as a set, and new sub-indexes cannot be enabled
+    without disabling the entire type.
+* *index* is the index of the IRQ type being queried
+* *count* describes the number of interrupts of the queried type.
+
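+As an illustration, a client might fill the request payload as in the
+following sketch; the wire format matches ``struct vfio_irq_info`` from
+``<linux/vfio.h>``, and the helper name is hypothetical.
+
+.. code-block:: c
+
+    #include <stdint.h>
+    #include <string.h>
+    #include <linux/vfio.h>   /* struct vfio_irq_info, VFIO_PCI_MSIX_IRQ_INDEX */
+
+    /* Hypothetical helper: build the 16-byte GET_IRQ_INFO request payload. */
+    static void vfio_user_irq_info_request(struct vfio_irq_info *info,
+                                            uint32_t index)
+    {
+        memset(info, 0, sizeof(*info));   /* all other fields must be zero */
+        info->argsz = sizeof(*info);      /* maximum reply payload size (16) */
+        info->index = index;              /* e.g. VFIO_PCI_MSIX_IRQ_INDEX */
+    }
+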
+``VFIO_USER_DEVICE_SET_IRQS``
+-----------------------------
+
+This command message is sent by the client to the server to set actions for
+device interrupt types. The VFIO IRQ set structure is defined in
+``<linux/vfio.h>`` (``struct vfio_irq_set``).
+
+Request
+^^^^^^^
+
++-------+--------+------------------------------+
+| Name | Offset | Size |
++=======+========+==============================+
+| argsz | 0 | 4 |
++-------+--------+------------------------------+
+| flags | 4 | 4 |
++-------+--------+------------------------------+
+| | +-----+-----------------------------+ |
+| | | Bit | Definition | |
+| | +=====+=============================+ |
+| | | 0 | VFIO_IRQ_SET_DATA_NONE | |
+| | +-----+-----------------------------+ |
+| | | 1 | VFIO_IRQ_SET_DATA_BOOL | |
+| | +-----+-----------------------------+ |
+| | | 2 | VFIO_IRQ_SET_DATA_EVENTFD | |
+| | +-----+-----------------------------+ |
+| | | 3 | VFIO_IRQ_SET_ACTION_MASK | |
+| | +-----+-----------------------------+ |
+| | | 4 | VFIO_IRQ_SET_ACTION_UNMASK | |
+| | +-----+-----------------------------+ |
+| | | 5 | VFIO_IRQ_SET_ACTION_TRIGGER | |
+| | +-----+-----------------------------+ |
++-------+--------+------------------------------+
+| index | 8 | 4 |
++-------+--------+------------------------------+
+| start | 12 | 4 |
++-------+--------+------------------------------+
+| count | 16 | 4 |
++-------+--------+------------------------------+
+| data | 20 | variable |
++-------+--------+------------------------------+
+
+* *argsz* is the size of the VFIO IRQ set request payload, including any *data*
+ field. Note there is no reply payload, so this field differs from other
+ message types.
+* *flags* defines the action performed on the interrupt range. The ``DATA``
+ flags describe the data field sent in the message; the ``ACTION`` flags
+ describe the action to be performed. The flags are mutually exclusive for
+ both sets.
+
+ * ``VFIO_IRQ_SET_DATA_NONE`` indicates there is no data field in the command.
+ The action is performed unconditionally.
+ * ``VFIO_IRQ_SET_DATA_BOOL`` indicates the data field is an array of boolean
+ bytes. The action is performed if the corresponding boolean is true.
+ * ``VFIO_IRQ_SET_DATA_EVENTFD`` indicates an array of event file descriptors
+ was sent in the message meta-data. These descriptors will be signalled when
+ the action defined by the action flags occurs. In ``AF_UNIX`` sockets, the
+ descriptors are sent as ``SCM_RIGHTS`` type ancillary data.
+ If no file descriptors are provided, this de-assigns the specified
+ previously configured interrupts.
+ * ``VFIO_IRQ_SET_ACTION_MASK`` indicates a masking event. It can be used with
+ ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to mask an interrupt,
+ or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the guest masks
+ the interrupt.
+ * ``VFIO_IRQ_SET_ACTION_UNMASK`` indicates an unmasking event. It can be used
+ with ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to unmask an
+ interrupt, or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the
+ guest unmasks the interrupt.
+ * ``VFIO_IRQ_SET_ACTION_TRIGGER`` indicates a triggering event. It can be used
+ with ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to trigger an
+ interrupt, or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the
+ server triggers the interrupt.
+
+* *index* is the index of the IRQ type being set up.
+* *start* is the start of the sub-index being set.
+* *count* describes the number of sub-indexes being set. As a special case, a
+ count (and start) of 0, with data flags of ``VFIO_IRQ_SET_DATA_NONE`` disables
+ all interrupts of the index.
+* *data* is an optional field included when the
+ ``VFIO_IRQ_SET_DATA_BOOL`` flag is present. It contains an array of booleans
+ that specify whether the action is to be performed on the corresponding
+ index. It's used when the action is only performed on a subset of the range
+ specified.
+
+Not all interrupt types support every combination of data and action flags.
+The client must know the capabilities of the device and IRQ index before it
+sends a ``VFIO_USER_DEVICE_SET_IRQS`` message.
+
+In typical operation, a specific IRQ may operate as follows:
+
+1. The client sends a ``VFIO_USER_DEVICE_SET_IRQS`` message with
+   ``flags=(VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_TRIGGER)`` along
+   with an eventfd. This associates the IRQ with a particular eventfd on the
+   server side (see the sketch after this list).
+
+#. The client may send a ``VFIO_USER_DEVICE_SET_IRQS`` message with
+ ``flags=(VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_MASK/UNMASK)`` along
+ with another eventfd. This associates the given eventfd with the
+ mask/unmask state on the server side.
+
+#. The server may trigger the IRQ by writing 1 to the eventfd.
+
+#. The server may mask/unmask an IRQ which will write 1 to the corresponding
+ mask/unmask eventfd, if there is one.
+
+#. A client may trigger a device IRQ itself, by sending a
+   ``VFIO_USER_DEVICE_SET_IRQS`` message with
+ ``flags=(VFIO_IRQ_SET_DATA_NONE/BOOL|VFIO_IRQ_SET_ACTION_TRIGGER)``.
+
+#. A client may mask or unmask the IRQ, by sending a
+   ``VFIO_USER_DEVICE_SET_IRQS`` message with
+ ``flags=(VFIO_IRQ_SET_DATA_NONE/BOOL|VFIO_IRQ_SET_ACTION_MASK/UNMASK)``.
+
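+As a minimal sketch of step 1 above (assuming the wire format of
+``struct vfio_irq_set`` from ``<linux/vfio.h>``; the helper name is
+hypothetical), the request payload could be built as follows. The eventfd
+itself is sent as ``SCM_RIGHTS`` ancillary data with the message, so no
+*data* field is present.
+
+.. code-block:: c
+
+    #include <stdint.h>
+    #include <string.h>
+    #include <linux/vfio.h>   /* struct vfio_irq_set, VFIO_IRQ_SET_* flags */
+
+    /* Hypothetical helper: ask the server to signal an eventfd when it
+     * triggers sub-index 0 of the given IRQ index. */
+    static void vfio_user_irq_trigger_eventfd(struct vfio_irq_set *set,
+                                              uint32_t index)
+    {
+        memset(set, 0, sizeof(*set));
+        set->argsz = sizeof(*set);   /* no data field in this request */
+        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+        set->index = index;
+        set->start = 0;
+        set->count = 1;              /* one sub-index, one eventfd */
+    }
+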
+Reply
+^^^^^
+
+There is no payload in the reply.
+
+.. _Read and Write Operations:
+
+Read and Write Operations
+-------------------------
+
+Note that all of the following operations must be supported by the client
+and/or server, even if the corresponding memory or device region has been
+shared as mappable.
+
+The ``count`` field must not exceed the value of ``max_data_xfer_size`` of the
+peer, for both reads and writes.
+
+``VFIO_USER_REGION_READ``
+-------------------------
+
+If a device region is not mappable, it's not directly accessible by the client
+via ``mmap()`` of the underlying file descriptor. In this case, a client can
+read from a device region with this message.
+
+Request
+^^^^^^^
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+
+* *offset* into the region being accessed.
+* *region* is the index of the region being accessed.
+* *count* is the size of the data to be transferred.
+
+Reply
+^^^^^
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+| data | 16 | variable |
++--------+--------+----------+
+
+* *offset* into the region accessed.
+* *region* is the index of the region accessed.
+* *count* is the size of the data transferred.
+* *data* is the data that was read from the device region.
+
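+For illustration, the 16-byte header shared by these messages (and by
+``VFIO_USER_REGION_WRITE`` below) can be modelled as the following naturally
+aligned C structure; the name is hypothetical and the tables above are
+normative. Read replies and write requests are followed immediately by
+*count* bytes of data.
+
+.. code-block:: c
+
+    #include <stdint.h>
+
+    /* Hypothetical view of the REGION_READ/REGION_WRITE header. */
+    struct vfio_user_region_io {
+        uint64_t offset;   /* offset within the region */
+        uint32_t region;   /* region index */
+        uint32_t count;    /* bytes to transfer, <= max_data_xfer_size */
+    };
+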
+``VFIO_USER_REGION_WRITE``
+--------------------------
+
+If a device region is not mappable, it's not directly accessible by the client
+via ``mmap()`` of the underlying file descriptor. In this case, a client can
+write to a device region with this message.
+
+Request
+^^^^^^^
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+| data | 16 | variable |
++--------+--------+----------+
+
+* *offset* into the region being accessed.
+* *region* is the index of the region being accessed.
+* *count* is the size of the data to be transferred.
+* *data* is the data to write.
+
+Reply
+^^^^^
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+
+* *offset* into the region accessed.
+* *region* is the index of the region accessed.
+* *count* is the size of the data transferred.
+
+``VFIO_USER_DMA_READ``
+-----------------------
+
+If the client has not shared mappable memory, the server can use this message to
+read from guest memory.
+
+Request
+^^^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| address | 0 | 8 |
++---------+--------+----------+
+| count | 8 | 8 |
++---------+--------+----------+
+
+* *address* is the client DMA memory address being accessed. This address must have
+ been previously exported to the server with a ``VFIO_USER_DMA_MAP`` message.
+* *count* is the size of the data to be transferred.
+
+Reply
+^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| address | 0 | 8 |
++---------+--------+----------+
+| count | 8 | 8 |
++---------+--------+----------+
+| data | 16 | variable |
++---------+--------+----------+
+
+* *address* is the client DMA memory address being accessed.
+* *count* is the size of the data transferred.
+* *data* is the data read.
+
+``VFIO_USER_DMA_WRITE``
+-----------------------
+
+If the client has not shared mappable memory, the server can use this message to
+write to guest memory.
+
+Request
+^^^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| address | 0 | 8 |
++---------+--------+----------+
+| count | 8 | 8 |
++---------+--------+----------+
+| data | 16 | variable |
++---------+--------+----------+
+
+* *address* is the client DMA memory address being accessed. This address must have
+ been previously exported to the server with a ``VFIO_USER_DMA_MAP`` message.
+* *count* is the size of the data to be transferred.
+* *data* is the data to write.
+
+Reply
+^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| address | 0 | 8 |
++---------+--------+----------+
+| count | 8 | 4 |
++---------+--------+----------+
+
+* *address* is the client DMA memory address being accessed.
+* *count* is the size of the data transferred.
+
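+For illustration, the header shared by ``VFIO_USER_DMA_READ`` and
+``VFIO_USER_DMA_WRITE`` can be modelled as the following naturally aligned C
+structure; the name is hypothetical and the tables above are normative. Read
+replies and write requests are followed immediately by *count* bytes of data.
+
+.. code-block:: c
+
+    #include <stdint.h>
+
+    /* Hypothetical view of the DMA_READ/DMA_WRITE header. */
+    struct vfio_user_dma_io {
+        uint64_t address;   /* client DMA address, previously DMA_MAP'ed */
+        uint64_t count;     /* bytes to transfer */
+    };
+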
+``VFIO_USER_DEVICE_RESET``
+--------------------------
+
+This command message is sent from the client to the server to reset the device.
+Neither the request nor the reply has a payload.
+
+``VFIO_USER_REGION_WRITE_MULTI``
+--------------------------------
+
+This message can be used to coalesce multiple device write operations
+into a single message. It is only used as an optimization when the
+outgoing message queue is relatively full.
+
+Request
+^^^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| wr_cnt | 0 | 8 |
++---------+--------+----------+
+| wrs | 8 | variable |
++---------+--------+----------+
+
+* *wr_cnt* is the number of device writes coalesced in the message
+* *wrs* is an array of device writes defined below
+
+Single Device Write Format
+""""""""""""""""""""""""""
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+| data | 16 | 8 |
++--------+--------+----------+
+
+* *offset* into the region being accessed.
+* *region* is the index of the region being accessed.
+* *count* is the size of the data to be transferred. This format can
+ only describe writes of 8 bytes or less.
+* *data* is the data to write.
+
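+For illustration, one element of the *wrs* array can be modelled as the
+following naturally aligned C structure; the name is hypothetical and the
+table above is normative.
+
+.. code-block:: c
+
+    #include <stdint.h>
+
+    /* Hypothetical view of a single coalesced device write. */
+    struct vfio_user_region_write_single {
+        uint64_t offset;   /* offset within the region */
+        uint32_t region;   /* region index */
+        uint32_t count;    /* size of the write, 8 bytes or less */
+        uint64_t data;     /* up to 8 bytes of data to write */
+    };
+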
+Reply
+^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| wr_cnt | 0 | 8 |
++---------+--------+----------+
+
+* *wr_cnt* is the number of device writes completed.
+
+
+Appendices
+==========
+
+Unused VFIO ``ioctl()`` commands
+--------------------------------
+
+The following VFIO commands do not have an equivalent vfio-user command:
+
+* ``VFIO_GET_API_VERSION``
+* ``VFIO_CHECK_EXTENSION``
+* ``VFIO_SET_IOMMU``
+* ``VFIO_GROUP_GET_STATUS``
+* ``VFIO_GROUP_SET_CONTAINER``
+* ``VFIO_GROUP_UNSET_CONTAINER``
+* ``VFIO_GROUP_GET_DEVICE_FD``
+* ``VFIO_IOMMU_GET_INFO``
+
+However, once support for live migration of VFIO devices is finalized, some
+of the above commands may have to be handled by the client in their
+corresponding vfio-user form. This will be addressed in a future protocol
+version.
+
+VFIO groups and containers
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The current VFIO implementation includes group and container idioms that
+describe how a device relates to the host IOMMU. In the vfio-user
+implementation, the IOMMU is implemented in software by the client, and is not
+visible to the server. The simplest approach is for the client to put each
+device into its own group and container.
+
+Backend Program Conventions
+---------------------------
+
+vfio-user backend program conventions are based on the vhost-user ones.
+
+* The backend program must not daemonize itself.
+* No assumptions must be made as to what access the backend program has on the
+ system.
+* File descriptors 0, 1 and 2 must exist, must have regular
+ stdin/stdout/stderr semantics, and can be redirected.
+* The backend program must honor the SIGTERM signal.
+* The backend program must accept the following command line options:
+
+  * ``--socket-path=PATH``: path to UNIX domain socket
+  * ``--fd=FDNUM``: file descriptor for UNIX domain socket, incompatible with
+    ``--socket-path``
+
+* The backend program must be accompanied by a JSON file stored under
+ ``/usr/share/vfio-user``.
+
+TODO add schema similar to docs/interop/vhost-user.json.
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index d8419fd..2e50f2d 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -167,6 +167,8 @@ A vring address description
Note that a ring address is an IOVA if ``VIRTIO_F_IOMMU_PLATFORM`` has
been negotiated. Otherwise it is a user address.
+.. _memory_region_description:
+
Memory region description
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,7 +182,7 @@ Memory region description
:user address: a 64-bit user address
-:mmap offset: 64-bit offset where region starts in the mapped memory
+:mmap offset: a 64-bit offset where region starts in the mapped memory
When the ``VHOST_USER_PROTOCOL_F_XEN_MMAP`` protocol feature has been
successfully negotiated, the memory region description contains two extra
@@ -190,7 +192,7 @@ fields at the end.
| guest address | size | user address | mmap offset | xen mmap flags | domid |
+---------------+------+--------------+-------------+----------------+-------+
-:xen mmap flags: 32-bit bit field
+:xen mmap flags: a 32-bit bit field
- Bit 0 is set for Xen foreign memory mapping.
- Bit 1 is set for Xen grant memory mapping.
@@ -211,7 +213,7 @@ Single memory region description
:padding: 64-bit
-A region is represented by Memory region description.
+:region: region is represented by :ref:`Memory region description <memory_region_description>`.
Multiple Memory regions description
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -224,7 +226,7 @@ Multiple Memory regions description
:padding: 32-bit
-A region is represented by Memory region description.
+:regions: regions field contains 8 regions of type :ref:`Memory region description <memory_region_description>`.
Log description
^^^^^^^^^^^^^^^
@@ -233,9 +235,9 @@ Log description
| log size | log offset |
+----------+------------+
-:log size: size of area used for logging
+:log size: a 64-bit size of area used for logging
-:log offset: offset from start of supplied file descriptor where
+:log offset: a 64-bit offset from start of supplied file descriptor where
logging starts (i.e. where guest address 0 would be
logged)
@@ -382,7 +384,7 @@ the kernel implementation.
The communication consists of the *front-end* sending message requests and
the *back-end* sending message replies. Most of the requests don't require
-replies. Here is a list of the ones that do:
+replies, except for the following requests:
* ``VHOST_USER_GET_FEATURES``
* ``VHOST_USER_GET_PROTOCOL_FEATURES``
@@ -1239,11 +1241,11 @@ Front-end message types
(*a vring descriptor index for split virtqueues* vs. *vring descriptor
indices for packed virtqueues*).
- When and as long as all of a device’s vrings are stopped, it is
+ When and as long as all of a device's vrings are stopped, it is
*suspended*, see :ref:`Suspended device state
<suspended_device_state>`.
- The request payload’s *num* field is currently reserved and must be
+ The request payload's *num* field is currently reserved and must be
set to 0.
``VHOST_USER_SET_VRING_KICK``
@@ -1662,7 +1664,7 @@ Front-end message types
:reply payload: ``u64``
Front-end and back-end negotiate a channel over which to transfer the
- back-end’s internal state during migration. Either side (front-end or
+ back-end's internal state during migration. Either side (front-end or
back-end) may create the channel. The nature of this channel is not
restricted or defined in this document, but whichever side creates it
must create a file descriptor that is provided to the respectively
@@ -1714,7 +1716,7 @@ Front-end message types
:request payload: N/A
:reply payload: ``u64``
- After transferring the back-end’s internal state during migration (see
+ After transferring the back-end's internal state during migration (see
the :ref:`Migrating back-end state <migrating_backend_state>`
section), check whether the back-end was able to successfully fully
process the state.
diff --git a/docs/meson.build b/docs/meson.build
index 322452c..3676f81 100644
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -54,7 +54,6 @@ if build_docs
'qemu-pr-helper.8': (have_tools ? 'man8' : ''),
'qemu-storage-daemon.1': (have_tools ? 'man1' : ''),
'qemu-trace-stap.1': (stap.found() ? 'man1' : ''),
- 'virtfs-proxy-helper.1': (have_virtfs_proxy_helper ? 'man1' : ''),
'qemu.1': 'man1',
'qemu-block-drivers.7': 'man7',
'qemu-cpu-models.7': 'man7'
diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
index a47aad0..ab21428 100644
--- a/docs/pcie_sriov.txt
+++ b/docs/pcie_sriov.txt
@@ -52,9 +52,11 @@ setting up a BAR for a VF.
...
/* Add and initialize the SR/IOV capability */
- pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
- vf_devid, initial_vfs, total_vfs,
- fun_offset, stride);
+ if (!pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
+ vf_devid, initial_vfs, total_vfs,
+ fun_offset, stride, errp)) {
+ return;
+ }
/* Set up individual VF BARs (parameters as for normal BARs) */
pcie_sriov_pf_init_vf_bar( ... )
diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index 5f763aa..204a574 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -15,7 +15,7 @@ not a straightforward operation.
This document attempts to give an overview of the L2 and refcount
caches, and how to configure them.
-Please refer to the docs/interop/qcow2.txt file for an in-depth
+Please refer to the docs/interop/qcow2.rst file for an in-depth
technical description of the qcow2 file format.
diff --git a/docs/specs/acpi_hest_ghes.rst b/docs/specs/acpi_hest_ghes.rst
index 68f1fbe..c3e9f8d 100644
--- a/docs/specs/acpi_hest_ghes.rst
+++ b/docs/specs/acpi_hest_ghes.rst
@@ -67,8 +67,10 @@ Design Details
(3) The address registers table contains N Error Block Address entries
and N Read Ack Register entries. The size for each entry is 8-byte.
The Error Status Data Block table contains N Error Status Data Block
- entries. The size for each entry is 4096(0x1000) bytes. The total size
- for the "etc/hardware_errors" fw_cfg blob is (N * 8 * 2 + N * 4096) bytes.
+ entries. The size for each entry is defined in the source code as
+ ACPI_GHES_MAX_RAW_DATA_LENGTH (currently 1024 bytes). The total size
+ for the "etc/hardware_errors" fw_cfg blob is
+ (N * 8 * 2 + N * ACPI_GHES_MAX_RAW_DATA_LENGTH) bytes.
N is the number of the kinds of hardware error sources.
(4) QEMU generates the ACPI linker/loader script for the firmware. The
diff --git a/docs/specs/aspeed-intc.rst b/docs/specs/aspeed-intc.rst
new file mode 100644
index 0000000..9cefd7f
--- /dev/null
+++ b/docs/specs/aspeed-intc.rst
@@ -0,0 +1,136 @@
+===========================
+ASPEED Interrupt Controller
+===========================
+
+AST2700
+-------
+There are a total of 480 interrupt sources in the AST2700. Because the
+processor supports a limited number of interrupt lines, sources with interrupt
+numbers greater than 127 are merged in groups of 32.
+
+There are two levels of interrupt controllers, INTC (CPU Die) and INTCIO
+(I/O Die).
+
+Interrupt Mapping
+-----------------
+- INTC: Handles interrupt sources 0 - 127 and integrates signals from INTCIO.
+- INTCIO: Handles interrupt sources 128 - 319 independently.
+
+QEMU Support
+------------
+Currently, only GIC 192 to 201 are supported. Their source interrupts come
+from INTCIO and are connected to INTC at input pin 0; INTC output pins 0 to 9
+are connected to GIC 192-201.
+
+Design for GICINT 196
+---------------------
+The orgate has interrupt sources ranging from 0 to 31, with its output pin
+connected to INTCIO "T0 GICINT_196". The output pin is then connected to INTC
+"GIC_192_201" at bit 4, and its bit 4 output pin is connected to GIC 196.
+
+INTC GIC_192_201 Output Pin Mapping
+-----------------------------------
+The INTC GIC_192_201 design has 10 output pins, mapped as follows:
+
+==== ====
+Bit GIC
+==== ====
+0 192
+1 193
+2 194
+3 195
+4 196
+5 197
+6 198
+7 199
+8 200
+9 201
+==== ====
+
+AST2700 A0
+----------
+It has only one INTC controller, and currently only GIC 128-136 are supported.
+To support both AST2700 A1 and AST2700 A0, there are 10 OR gates in the INTC,
+with gates 1 to 9 supporting GIC 128-136.
+
+Design for GICINT 132
+---------------------
+The orgate has interrupt sources ranging from 0 to 31, with its output pin
+connected to INTC. The output pin is then connected to GIC 132.
+
+Block Diagram of GICINT 196 for AST2700 A1 and GICINT 132 for AST2700 A0
+------------------------------------------------------------------------
+
+.. code-block::
+
+ |-------------------------------------------------------------------------------------------------------|
+ | AST2700 A1 Design |
+ | To GICINT196 |
+ | |
+ | ETH1 |-----------| |--------------------------| |--------------| |
+ | -------->|0 | | INTCIO | | orgates[0] | |
+ | ETH2 | 4| orgates[0]------>|inpin[0]-------->outpin[0]|------->| 0 | |
+ | -------->|1 5| orgates[1]------>|inpin[1]-------->outpin[1]|------->| 1 | |
+ | ETH3 | 6| orgates[2]------>|inpin[2]-------->outpin[2]|------->| 2 | |
+ | -------->|2 19| orgates[3]------>|inpin[3]-------->outpin[3]|------->| 3 OR[0:9] |-----| |
+ | UART0 | 20|-->orgates[4]------>|inpin[4]-------->outpin[4]|------->| 4 | | |
+ | -------->|7 21| orgates[5]------>|inpin[5]-------->outpin[5]|------->| 5 | | |
+ | UART1 | 22| orgates[6]------>|inpin[6]-------->outpin[6]|------->| 6 | | |
+ | -------->|8 23| orgates[7]------>|inpin[7]-------->outpin[7]|------->| 7 | | |
+ | UART2 | 24| orgates[8]------>|inpin[8]-------->outpin[8]|------->| 8 | | |
+ | -------->|9 25| orgates[9]------>|inpin[9]-------->outpin[9]|------->| 9 | | |
+ | UART3 | 26| |--------------------------| |--------------| | |
+ | ---------|10 27| | |
+ | UART5 | 28| | |
+ | -------->|11 29| | |
+ | UART6 | | | |
+ | -------->|12 30| |-----------------------------------------------------------------------| |
+ | UART7 | 31| | |
+ | -------->|13 | | |
+ | UART8 | OR[0:31] | | |------------------------------| |----------| |
+ | -------->|14 | | | INTC | | GIC | |
+ | UART9 | | | |inpin[0:0]--------->outpin[0] |---------->|192 | |
+ | -------->|15 | | |inpin[0:1]--------->outpin[1] |---------->|193 | |
+ | UART10 | | | |inpin[0:2]--------->outpin[2] |---------->|194 | |
+ | -------->|16 | | |inpin[0:3]--------->outpin[3] |---------->|195 | |
+ | UART11 | | |--------------> |inpin[0:4]--------->outpin[4] |---------->|196 | |
+ | -------->|17 | |inpin[0:5]--------->outpin[5] |---------->|197 | |
+ | UART12 | | |inpin[0:6]--------->outpin[6] |---------->|198 | |
+ | -------->|18 | |inpin[0:7]--------->outpin[7] |---------->|199 | |
+ | |-----------| |inpin[0:8]--------->outpin[8] |---------->|200 | |
+ | |inpin[0:9]--------->outpin[9] |---------->|201 | |
+ |-------------------------------------------------------------------------------------------------------|
+ |-------------------------------------------------------------------------------------------------------|
+ | ETH1 |-----------| orgates[1]------->|inpin[1]----------->outpin[10]|---------->|128 | |
+ | -------->|0 | orgates[2]------->|inpin[2]----------->outpin[11]|---------->|129 | |
+ | ETH2 | 4| orgates[3]------->|inpin[3]----------->outpin[12]|---------->|130 | |
+ | -------->|1 5| orgates[4]------->|inpin[4]----------->outpin[13]|---------->|131 | |
+ | ETH3 | 6|---->orgates[5]------->|inpin[5]----------->outpin[14]|---------->|132 | |
+ | -------->|2 19| orgates[6]------->|inpin[6]----------->outpin[15]|---------->|133 | |
+ | UART0 | 20| orgates[7]------->|inpin[7]----------->outpin[16]|---------->|134 | |
+ | -------->|7 21| orgates[8]------->|inpin[8]----------->outpin[17]|---------->|135 | |
+ | UART1 | 22| orgates[9]------->|inpin[9]----------->outpin[18]|---------->|136 | |
+ | -------->|8 23| |------------------------------| |----------| |
+ | UART2 | 24| |
+ | -------->|9 25| AST2700 A0 Design |
+ | UART3 | 26| |
+ | -------->|10 27| |
+ | UART5 | 28| |
+ | -------->|11 29| GICINT132 |
+ | UART6 | | |
+ | -------->|12 30| |
+ | UART7 | 31| |
+ | -------->|13 | |
+ | UART8 | OR[0:31] | |
+ | -------->|14 | |
+ | UART9 | | |
+ | -------->|15 | |
+ | UART10 | | |
+ | -------->|16 | |
+ | UART11 | | |
+ | -------->|17 | |
+ | UART12 | | |
+ | -------->|18 | |
+ | |-----------| |
+ | |
+ |-------------------------------------------------------------------------------------------------------|
diff --git a/docs/specs/fw_cfg.rst b/docs/specs/fw_cfg.rst
index 5ad47a9..31ae315 100644
--- a/docs/specs/fw_cfg.rst
+++ b/docs/specs/fw_cfg.rst
@@ -54,11 +54,11 @@ Data Register
-------------
* Read/Write (writes ignored as of QEMU v2.4, but see the DMA interface)
-* Location: platform dependent (IOport [#]_ or MMIO)
+* Location: platform dependent (IOport\ [#placement]_ or MMIO)
* Width: 8-bit (if IOport), 8/16/32/64-bit (if MMIO)
* Endianness: string-preserving
-.. [#]
+.. [#placement]
On platforms where the data register is exposed as an IOport, its
port number will always be one greater than the port number of the
selector register. In other words, the two ports overlap, and can not
diff --git a/docs/specs/index.rst b/docs/specs/index.rst
index be899b4..f19d73c 100644
--- a/docs/specs/index.rst
+++ b/docs/specs/index.rst
@@ -35,3 +35,7 @@ guest hardware that is specific to QEMU.
vmcoreinfo
vmgenid
rapl-msr
+ rocker
+ riscv-iommu
+ riscv-aia
+ aspeed-intc
diff --git a/docs/specs/pci-ids.rst b/docs/specs/pci-ids.rst
index c0a3dec..261b0f3 100644
--- a/docs/specs/pci-ids.rst
+++ b/docs/specs/pci-ids.rst
@@ -77,13 +77,17 @@ PCI devices (other than virtio):
1b36:0008
PCIe host bridge
1b36:0009
- PCI Expander Bridge (-device pxb)
+ PCI Expander Bridge (``-device pxb``)
1b36:000a
PCI-PCI bridge (multiseat)
1b36:000b
- PCIe Expander Bridge (-device pxb-pcie)
+ PCIe Expander Bridge (``-device pxb-pcie``)
+1b36:000c
+ PCIe Root Port (``-device pcie-root-port``)
1b36:000d
PCI xhci usb host adapter
+1b36:000e
+ PCIe-to-PCI bridge (``-device pcie-pci-bridge``)
1b36:000f
mdpy (mdev sample device), ``linux/samples/vfio-mdev/mdpy.c``
1b36:0010
@@ -94,6 +98,8 @@ PCI devices (other than virtio):
PCI ACPI ERST device (``-device acpi-erst``)
1b36:0013
PCI UFS device (``-device ufs``)
+1b36:0014
+ PCI RISC-V IOMMU device
All these devices are documented in :doc:`index`.
diff --git a/docs/specs/rapl-msr.rst b/docs/specs/rapl-msr.rst
index 1202ee8..aaf0db9 100644
--- a/docs/specs/rapl-msr.rst
+++ b/docs/specs/rapl-msr.rst
@@ -9,11 +9,12 @@ The consumption is reported via MSRs (model specific registers) like
MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64 bits
registers that represent the accumulated energy consumption in micro Joules.
-Thanks to the MSR Filtering patch [#a]_ not all MSRs are handled by KVM. Some
-of them can now be handled by the userspace (QEMU). It uses a mechanism called
-"MSR filtering" where a list of MSRs is given at init time of a VM to KVM so
-that a callback is put in place. The design of this patch uses only this
-mechanism for handling the MSRs between guest/host.
+Thanks to KVM's `MSR filtering <msr-filter-patch_>`__ functionality,
+not all MSRs are handled by KVM. Some of them can now be handled by the
+userspace (QEMU); a list of MSRs is given at VM creation time to KVM, and
+a userspace exit occurs when they are accessed.
+
+.. _msr-filter-patch: https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/
At the moment the following MSRs are involved:
@@ -92,9 +93,12 @@ found by the sysconf system call. A typical value of clock ticks per second is
package has 4 cores, 400 ticks maximum can be scheduled on all the cores
of the package for a period of 1 second.
-The /proc/[pid]/stat [#b]_ is a sysfs file that can give the executed time of a
-process with the [pid] as the process ID. It gives the amount of ticks the
-process has been scheduled in userspace (utime) and kernel space (stime).
+`/proc/[pid]/stat <stat_>`__ is a procfs file that can give the executed
+time of a process with the [pid] as the process ID. It gives the amount
+of ticks the process has been scheduled in userspace (utime) and kernel
+space (stime).
+
+.. _stat: https://man7.org/linux/man-pages/man5/proc.5.html
By reading those metrics for a thread, one can calculate the ratio of time the
package has spent executing the thread.
@@ -148,8 +152,3 @@ Current Limitations
- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the
moment.
-References
-----------
-
-.. [#a] https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/
-.. [#b] https://man7.org/linux/man-pages/man5/proc.5.html
diff --git a/docs/specs/riscv-aia.rst b/docs/specs/riscv-aia.rst
new file mode 100644
index 0000000..8097e2f
--- /dev/null
+++ b/docs/specs/riscv-aia.rst
@@ -0,0 +1,83 @@
+.. _riscv-aia:
+
+RISC-V AIA support for RISC-V machines
+======================================
+
+AIA (Advanced Interrupt Architecture) support is implemented in the ``virt``
+RISC-V machine for TCG and KVM accelerators.
+
+The support consists of two main modes:
+
+- "aia=aplic": adds one or more APLIC (Advanced Platform Level Interrupt Controller)
+ devices
+- "aia=aplic-imsic": adds one or more APLIC device and an IMSIC (Incoming MSI
+ Controller) device for each CPU
+
+From a user standpoint, these modes behave the same regardless of the accelerator
+used. From a developer standpoint, the accelerator settings change what is being
+emulated in userspace versus what is being emulated by an in-kernel irqchip.
+
+When running TCG, all controllers are emulated in userspace, including machine mode
+(m-mode) APLIC and IMSIC (when applicable).
+
+When running KVM:
+
+- no m-mode is provided, so there is no m-mode APLIC or IMSIC emulation regardless of
+ the AIA mode chosen
+- with "aia=aplic", s-mode APLIC will be emulated by userspace
+- with "aia=aplic-imsic" there are two possibilities. If no additional KVM option
+ is provided there will be no APLIC or IMSIC emulation in userspace, and the virtual
+ machine will use the provided in-kernel APLIC and IMSIC controllers. If the user
+ chooses to use the irqchip in split mode via "-accel kvm,kernel-irqchip=split",
+ s-mode APLIC will be emulated while using the s-mode IMSIC from the irqchip
+
+The following table summarizes how the AIA and accelerator options define what
+is emulated in userspace:
+
+
+.. list-table:: How AIA and accel options change controller emulation
+ :widths: 25 25 25 25 25 25 25
+ :header-rows: 1
+
+ * - Accel
+ - Accel props
+ - AIA type
+ - APLIC m-mode
+ - IMSIC m-mode
+ - APLIC s-mode
+ - IMSIC s-mode
+ * - tcg
+ - ---
+ - aplic
+ - emul
+ - n/a
+ - emul
+ - n/a
+ * - tcg
+ - ---
+ - aplic-imsic
+ - emul
+ - emul
+ - emul
+ - emul
+ * - kvm
+ - ---
+ - aplic
+ - n/a
+ - n/a
+ - emul
+ - n/a
+ * - kvm
+ - none
+ - aplic-imsic
+ - n/a
+ - n/a
+ - in-kernel
+ - in-kernel
+ * - kvm
+ - irqchip=split
+ - aplic-imsic
+ - n/a
+ - n/a
+ - emul
+ - in-kernel
diff --git a/docs/specs/riscv-iommu.rst b/docs/specs/riscv-iommu.rst
new file mode 100644
index 0000000..991d376
--- /dev/null
+++ b/docs/specs/riscv-iommu.rst
@@ -0,0 +1,116 @@
+.. _riscv-iommu:
+
+RISC-V IOMMU support for RISC-V machines
+========================================
+
+QEMU implements a RISC-V IOMMU emulation based on the RISC-V IOMMU spec
+version 1.0 `iommu1.0.0`_.
+
+The emulation includes a PCI reference device (riscv-iommu-pci) and a platform
+bus device (riscv-iommu-sys) that QEMU RISC-V boards can use. The 'virt'
+RISC-V machine is compatible with both devices.
+
+riscv-iommu-pci reference device
+--------------------------------
+
+This device implements the RISC-V IOMMU emulation as recommended by the section
+"Integrating an IOMMU as a PCIe device" of `iommu1.0.0`_: a PCI device with base
+class 08h, sub-class 06h and programming interface 00h.
+
+As a reference device it doesn't implement anything outside of the specification,
+so it uses a generic default PCI ID given by QEMU: 1b36:0014.
+
+To include the device in the 'virt' machine:
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 -M virt -device riscv-iommu-pci,[optional_pci_opts] (...)
+
+This will add a RISC-V IOMMU PCI device to the board, honoring any additional
+PCI parameters (such as the PCI bus address). The behavior of the RISC-V IOMMU
+is defined by the spec, but its operation is OS dependent.
+
+As of this writing the existing Linux kernel support (`linux-v8`_, not yet merged)
+does not include features like VFIO passthrough. The IOMMU emulation
+was tested using a public Ventana Micro Systems kernel repository in
+`ventana-linux`_. This kernel is based on `linux-v8`_ with additional patches that
+enable features like KVM VFIO passthrough with irqbypass. Until the upstream kernel
+support is feature complete, feel free to use the kernel available in the Ventana
+Micro Systems mirror.
+
+The current Linux kernel support will use the IOMMU device to create IOMMU groups
+with any eligible cards available in the system, regardless of factors such as the
+order in which the devices are added in the command line.
+
+This means that these command lines are equivalent as far as the current
+IOMMU kernel driver is concerned:
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 \
+ -M virt,aia=aplic-imsic,aia-guests=5 \
+ -device riscv-iommu-pci,addr=1.0,vendor-id=0x1efd,device-id=0xedf1 \
+ -device e1000e,netdev=net1 -netdev user,id=net1,net=192.168.0.0/24 \
+ -device e1000e,netdev=net2 -netdev user,id=net2,net=192.168.200.0/24 \
+ (...)
+
+ $ qemu-system-riscv64 \
+ -M virt,aia=aplic-imsic,aia-guests=5 \
+ -device e1000e,netdev=net1 -netdev user,id=net1,net=192.168.0.0/24 \
+ -device e1000e,netdev=net2 -netdev user,id=net2,net=192.168.200.0/24 \
+ -device riscv-iommu-pci,addr=1.0,vendor-id=0x1efd,device-id=0xedf1 \
+ (...)
+
+Both will create IOMMU groups for the two e1000e cards.
+
+Another thing to note about `linux-v8`_ and `ventana-linux`_ is that the kernel driver
+only recognizes an IOMMU that identifies as a Rivos device, i.e. one that uses the
+Rivos vendor ID. To use the riscv-iommu-pci device with the existing kernel support
+we need to emulate a Rivos PCI IOMMU by setting 'vendor-id' and 'device-id':
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 -M virt \
+ -device riscv-iommu-pci,vendor-id=0x1efd,device-id=0xedf1 (...)
+
+Several options are available to control the capabilities of the device, namely:
+
+- "bus": the bus that the IOMMU device uses
+- "ioatc-limit": size of the Address Translation Cache (default to 2Mb)
+- "intremap": enable/disable MSI support
+- "ats": enable ATS support
+- "off" (Out-of-reset translation mode: 'on' for DMA disabled, 'off' for 'BARE' (passthrough))
+- "s-stage": enable s-stage support
+- "g-stage": enable g-stage support
+- "hpm-counters": number of hardware performance counters available. Maximum value is 31.
+ Default value is 31. Use 0 (zero) to disable HPM support
+
+riscv-iommu-sys device
+----------------------
+
+This device implements the RISC-V IOMMU emulation as a platform bus device that
+RISC-V boards can use.
+
+For the 'virt' board the device is disabled by default. To enable it use the
+'iommu-sys' machine option:
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 -M virt,iommu-sys=on (...)
+
+There are no options to configure the capabilities of this device in the 'virt'
+board using the QEMU command line. The device is configured with the following
+riscv-iommu options:
+
+- "ioatc-limit": default value (2Mb)
+- "intremap": enabled
+- "ats": enabled
+- "off": on (DMA disabled)
+- "s-stage": enabled
+- "g-stage": enabled
+
+.. _iommu1.0.0: https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0.0/riscv-iommu.pdf
+
+.. _linux-v8: https://lore.kernel.org/linux-riscv/cover.1718388908.git.tjeznach@rivosinc.com/
+
+.. _ventana-linux: https://github.com/ventanamicro/linux/tree/dev-upstream
diff --git a/docs/specs/rocker.txt b/docs/specs/rocker.rst
index 1857b31..3a7fc6a 100644
--- a/docs/specs/rocker.txt
+++ b/docs/specs/rocker.rst
@@ -1,23 +1,23 @@
Rocker Network Switch Register Programming Guide
-Copyright (c) Scott Feldman <sfeldma@gmail.com>
-Copyright (c) Neil Horman <nhorman@tuxdriver.com>
-Version 0.11, 12/29/2014
+************************************************
-LICENSE
-=======
+..
+ Copyright (c) Scott Feldman <sfeldma@gmail.com>
+ Copyright (c) Neil Horman <nhorman@tuxdriver.com>
+ Version 0.11, 12/29/2014
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
-SECTION 1: Introduction
-=======================
+Introduction
+============
Overview
--------
@@ -29,25 +29,25 @@ software.
Notations and Conventions
-------------------------
-o In register descriptions, [n:m] indicates a range from bit n to bit m,
-inclusive.
-o Use of leading 0x indicates a hexadecimal number.
-o Use of leading 0b indicates a binary number.
-o The use of RSVD or Reserved indicates that a bit or field is reserved for
-future use.
-o Field width is in bytes, unless otherwise noted.
-o Register are (R) read-only, (R/W) read/write, (W) write-only, or (COR) clear
-on read
-o TLV values in network-byte-order are designated with (N).
+* In register descriptions, [n:m] indicates a range from bit n to bit m,
+ inclusive.
+* Use of leading 0x indicates a hexadecimal number.
+* Use of leading 0b indicates a binary number.
+* The use of RSVD or Reserved indicates that a bit or field is reserved for
+ future use.
+* Field width is in bytes, unless otherwise noted.
+* Registers are (R) read-only, (R/W) read/write, (W) write-only, or (COR) clear
+ on read
+* TLV values in network-byte-order are designated with (N).
-SECTION 2: PCI Configuration Registers
-======================================
+PCI Configuration Registers
+===========================
PCI Configuration Space
-----------------------
-Each switch instance registers as a PCI device with PCI configuration space:
+Each switch instance registers as a PCI device with PCI configuration space::
offset width description value
---------------------------------------------
@@ -74,11 +74,10 @@ Each switch instance registers as a PCI device with PCI configuration space:
0x41 1 Retry count
0x42 2 Reserved
+ * Assigned by sub-system implementation
-* Assigned by sub-system implementation
-
-SECTION 3: Memory-Mapped Register Space
-=======================================
+Memory-Mapped Register Space
+============================
There are two memory-mapped BARs. BAR0 maps device register space and is
0x2000 in size. BAR1 maps MSI-X vector and PBA tables and is also 0x2000 in
@@ -89,7 +88,7 @@ byte registers with one 4-byte access, and 8 byte registers with either two
4-byte accesses or a single 8-byte access. In the case of two 4-byte accesses,
access must be lower and then upper 4-bytes, in that order.
-BAR0 device register space is organized as follows:
+BAR0 device register space is organized as follows::
offset description
------------------------------------------------------
@@ -105,7 +104,7 @@ Reads to reserved registers read back as 0.
No fancy stuff like write-combining is enabled on any of the registers.
-BAR1 MSI-X register space is organized as follows:
+BAR1 MSI-X register space is organized as follows::
offset description
------------------------------------------------------
@@ -113,8 +112,8 @@ BAR1 MSI-X register space is organized as follows:
0x1000-0x1fff MSI-X PBA table
-SECTION 4: Interrupts, DMA, and Endianness
-==========================================
+Interrupts, DMA, and Endianness
+===============================
PCI Interrupts
--------------
@@ -122,7 +121,7 @@ PCI Interrupts
The device supports only MSI-X interrupts. BAR1 memory-mapped region contains
the MSI-X vector and PBA tables, with support for up to 256 MSI-X vectors.
-The vector assignment is:
+The vector assignment is::
vector description
-----------------------------------------------------
@@ -134,7 +133,7 @@ The vector assignment is:
Tx vector is even
Rx vector is odd
-A MSI-X vector table entry is 16 bytes:
+A MSI-X vector table entry is 16 bytes::
field offset width description
-------------------------------------------------------------
@@ -170,7 +169,7 @@ ring, and hardware will set this bit when the descriptor is complete.
Descriptor ring sizes must be a power of 2 and range from 2 to 64K entries.
Descriptor rings' base address must be 8-byte aligned. Descriptors must be
packed within ring. Each descriptor in each ring must also be aligned on an 8
-byte boundary. Each descriptor ring will have these registers:
+byte boundary. Each descriptor ring will have these registers::
DMA_DESC_xxx_BASE_ADDR, offset 0x1000 + (x * 32), 64-bit, (R/W)
DMA_DESC_xxx_SIZE, offset 0x1008 + (x * 32), 32-bit, (R/W)
@@ -180,7 +179,7 @@ byte boundary. Each descriptor ring will have these registers:
DMA_DESC_xxx_CREDITS, offset 0x1018 + (x * 32), 32-bit, (R/W)
DMA_DESC_xxx_RSVD1, offset 0x101c + (x * 32), 32-bit, (R/W)
-Where x is descriptor ring index:
+Where x is descriptor ring index::
index ring
--------------------
@@ -203,14 +202,14 @@ written past TAIL. To do so would wrap the ring. An empty ring is when HEAD
== TAIL. A full ring is when HEAD is one position behind TAIL. Both HEAD and
TAIL increment and modulo wrap at the ring size.
-CTRL register bits:
+CTRL register bits::
bit name description
------------------------------------------------------------------------
[0] CTRL_RESET Reset the descriptor ring
[1:31] Reserved
-All descriptor types share some common fields:
+All descriptor types share some common fields::
field width description
-------------------------------------------------------------------
@@ -234,7 +233,7 @@ filled in by the switch. Likewise, the switch will ignore unknown fields
filled in by software.
Descriptor payload buffer is 8-byte aligned and TLVs are 8-byte aligned. The
-value within a TLV is also 8-byte aligned. The (packed, 8 byte) TLV header is:
+value within a TLV is also 8-byte aligned. The (packed, 8 byte) TLV header is::
field width description
-----------------------------
@@ -246,7 +245,7 @@ The alignment requirements for descriptors and TLVs are to avoid unaligned
access exceptions in software. Note that the payload for each TLV is also
8 byte aligned.
-Figure 1 shows an example descriptor buffer with two TLVs.
+Figure 1 shows an example descriptor buffer with two TLVs::
<------- 8 bytes ------->
@@ -316,11 +315,11 @@ network packet data. All non-network-packet TLV multi-byte values will be LE.
TLV values in network-byte-order are designated with (N).
-SECTION 5: Test Registers
-=========================
+Test Registers
+==============
Rocker has several test registers to support troubleshooting register access,
-interrupt generation, and DMA operations:
+interrupt generation, and DMA operations::
TEST_REG, offset 0x0010, 32-bit (R/W)
TEST_REG64, offset 0x0018, 64-bit (R/W)
@@ -338,7 +337,7 @@ for that vector.
To test basic DMA operations, allocate a DMA-able host buffer and put the
buffer address into TEST_DMA_ADDR and size into TEST_DMA_SIZE. Then, write to
-TEST_DMA_CTRL to manipulate the buffer contents. TEST_DMA_CTRL operations are:
+TEST_DMA_CTRL to manipulate the buffer contents. TEST_DMA_CTRL operations are::
operation value description
-----------------------------------------------------------
@@ -351,14 +350,14 @@ issue exists. In particular, buffers that start on odd-8-byte boundary and/or
span multiple PAGE sizes should be tested.
-SECTION 6: Ports
-================
+Ports
+=====
Physical and Logical Ports
------------------------------------
The switch supports up to 62 physical (front-panel) ports. Register
-PORT_PHYS_COUNT returns the actual number of physical ports available:
+PORT_PHYS_COUNT returns the actual number of physical ports available::
PORT_PHYS_COUNT, offset 0x0304, 32-bit, (R)
@@ -369,7 +368,7 @@ Front-panel ports and logical tunnel ports are mapped into a single 32-bit port
space. A special CPU port is assigned port 0. The front-panel ports are
mapped to ports 1-62. A special loopback port is assigned port 63. Logical
tunnel ports are assigned ports 0x0001000-0x0001ffff.
-To summarize the port assignments:
+To summarize the port assignments::
port mapping
-------------------------------------------------------
@@ -391,14 +390,14 @@ set/get the mode for front-panel ports, see port settings, below.
Port Settings
-------------
-Link status for all front-panel ports is available via PORT_PHYS_LINK_STATUS:
+Link status for all front-panel ports is available via PORT_PHYS_LINK_STATUS::
PORT_PHYS_LINK_STATUS, offset 0x0310, 64-bit, (R)
Value is port bitmap. Bits 0 and 63 always read 0. Bits 1-62
read 1 for link UP and 0 for link DOWN for respective front-panel ports.
-Other properties for front-panel ports are available via DMA CMD descriptors:
+Other properties for front-panel ports are available via DMA CMD descriptors::
Get PORT_SETTINGS descriptor:
@@ -438,7 +437,7 @@ Port Enable
-----------
Front-panel ports are initially disabled, which means port ingress and egress
-packets will be dropped. To enable or disable a port, use PORT_PHYS_ENABLE:
+packets will be dropped. To enable or disable a port, use PORT_PHYS_ENABLE::
PORT_PHYS_ENABLE: offset 0x0318, 64-bit, (R/W)
@@ -447,15 +446,15 @@ packets will be dropped. To enable or disable a port, use PORT_PHYS_ENABLE:
Default is 0.
-SECTION 7: Switch Control
-=========================
+Switch Control
+==============
This section covers switch-wide register settings.
Control
-------
-This register is used for low level control of the switch.
+This register is used for low level control of the switch::
CONTROL: offset 0x0300, 32-bit, (W)
@@ -468,18 +467,18 @@ Switch ID
---------
The switch has a SWITCH_ID to be used by software to uniquely identify the
-switch:
+switch::
SWITCH_ID: offset 0x0320, 64-bit, (R)
Value is opaque to switch software and no special encoding is implied.
-SECTION 8: Events
-=================
+Events
+======
Non-I/O asynchronous events from the device are notified to the host using the
-event ring. The TLV structure for events is:
+event ring. The TLV structure for events is::
field width description
---------------------------------------------------
@@ -491,7 +490,7 @@ event ring. The TLV structure for events is:
Link Changed Event
------------------
-When link status changes on a physical port, this event is generated.
+When link status changes on a physical port, this event is generated::
field width description
---------------------------------------------------
@@ -510,6 +509,8 @@ driver should install to the device the MAC/VLAN on the port into the bridge
table. Once installed, the MAC/VLAN is known on the port and this event will
no longer be generated.
+::
+
field width description
---------------------------------------------------
INFO <nest>
@@ -518,8 +519,8 @@ no longer be generated.
VLAN 2 VLAN ID
-SECTION 9: CPU Packet Processing
-================================
+CPU Packet Processing
+=====================
Ingress packets directed to the host CPU for further processing are delivered
in the DMA RX ring. Likewise, host CPU originating packets destined to egress
@@ -540,7 +541,7 @@ software that Tx is complete and software resources (e.g. skb) backing packet
can be released.
Figure 2 shows an example 3-fragment packet queued with one Tx descriptor. A
-TLV is used for each packet fragment.
+TLV is used for each packet fragment::
pkt frag 1
+–––––––+ +–+
@@ -570,7 +571,7 @@ TLV is used for each packet fragment.
fig 2.
-The TLVs for Tx descriptor buffer are:
+The TLVs for Tx descriptor buffer are::
field width description
---------------------------------------------------------------------
@@ -600,7 +601,7 @@ The TLVs for Tx descriptor buffer are:
TX_FRAG_ADDR 8 DMA address of packet fragment
TX_FRAG_LEN 2 Packet fragment length
-Possible status return codes in descriptor on completion are:
+Possible status return codes in descriptor on completion are::
DESC_COMP_ERR reason
--------------------------------------------------------------------
@@ -623,7 +624,7 @@ worst-case packet size. A single Rx descriptor will contain the entire Rx
packet data in one RX_FRAG. Other Rx TLVs describe and hardware offloads
performed on the packet, such as checksum validation.
-The TLVs for Rx descriptor buffer are:
+The TLVs for Rx descriptor buffer are::
field width description
---------------------------------------------------
@@ -649,7 +650,7 @@ The TLVs for Rx descriptor buffer are:
Offload forward RX_FLAG indicates the device has already forwarded the packet
so the host CPU should not also forward the packet.
-Possible status return codes in descriptor on completion are:
+Possible status return codes in descriptor on completion are::
DESC_COMP_ERR reason
--------------------------------------------------------------------
@@ -660,14 +661,14 @@ Possible status return codes in descriptor on completion are:
packet data TLV and other TLVs.
-SECTION 10: OF-DPA Mode
-======================
+OF-DPA Mode
+===========
OF-DPA mode allows the switch to offload flow packet processing functions to
hardware. An OpenFlow controller would communicate with an OpenFlow agent
installed on the switch. The OpenFlow agent would (directly or indirectly)
communicate with the Rocker switch driver, which in turn would program switch
-hardware with flow functionality, as defined in OF-DPA. The block diagram is:
+hardware with flow functionality, as defined in OF-DPA. The block diagram is::
+–––––––––––––––----–––+
| OF |
@@ -696,14 +697,14 @@ OF-DPA Flow Table Interface
There are commands to add, modify, delete, and get stats of flow table entries.
The commands are issued using the DMA CMD descriptor ring. The following
-commands are defined:
+commands are defined::
CMD_ADD: add an entry to flow table
CMD_MOD: modify an entry in flow table
CMD_DEL: delete an entry from flow table
CMD_GET_STATS: get stats for flow entry
-TLVs for add and modify commands are:
+TLVs for add and modify commands are::
field width description
----------------------------------------------------
@@ -723,14 +724,14 @@ TLVs for add and modify commands are:
Additional TLVs based on flow table ID:
-Table ID 0: ingress port
+Table ID 0: ingress port::
field width description
----------------------------------------------------
OF_DPA_IN_PPORT 4 ingress physical port number
OF_DPA_GOTO_TBL 2 goto table ID; zero to drop
-Table ID 10: vlan
+Table ID 10: vlan::
field width description
----------------------------------------------------
@@ -740,7 +741,7 @@ Table ID 10: vlan
OF_DPA_GOTO_TBL 2 goto table ID; zero to drop
OF_DPA_NEW_VLAN_ID 2 (N) new vlan ID
-Table ID 20: termination mac
+Table ID 20: termination mac::
field width description
----------------------------------------------------
@@ -757,7 +758,7 @@ Table ID 20: termination mac
OF_DPA_OUT_PPORT 2 if specified, must be
controller, set zero otherwise
-Table ID 30: unicast routing
+Table ID 30: unicast routing::
field width description
----------------------------------------------------
@@ -772,7 +773,7 @@ Table ID 30: unicast routing
OF_DPA_GROUP_ID 4 data for GROUP action must
be an L3 Unicast group entry
-Table ID 40: multicast routing
+Table ID 40: multicast routing::
field width description
----------------------------------------------------
@@ -797,7 +798,7 @@ Table ID 40: multicast routing
OF_DPA_GROUP_ID 4 data for GROUP action must
be an L3 multicast group entry
-Table ID 50: bridging
+Table ID 50: bridging::
field width description
----------------------------------------------------
@@ -818,7 +819,7 @@ Table ID 50: bridging
restricted to CONTROLLER,
set to 0 otherwise
-Table ID 60: acl policy
+Table ID 60: acl policy::
field width description
----------------------------------------------------
@@ -890,7 +891,7 @@ Table ID 60: acl policy
dropped (all other instructions
ignored)
-TLVs for flow delete and get stats command are:
+TLVs for flow delete and get stats command are::
field width description
---------------------------------------------------
@@ -898,7 +899,7 @@ TLVs for flow delete and get stats command are:
OF_DPA_COOKIE 8 Cookie
On completion of get stats command, the descriptor buffer is written back with
-the following TLVs:
+the following TLVs::
field width description
---------------------------------------------------
@@ -906,7 +907,7 @@ the following TLVs:
OF_DPA_STAT_RX_PKTS 8 Received packets
OF_DPA_STAT_TX_PKTS 8 Transmit packets
-Possible status return codes in descriptor on completion are:
+Possible status return codes in descriptor on completion are::
DESC_COMP_ERR command reason
--------------------------------------------------------------------
@@ -928,14 +929,14 @@ Group Table Interface
There are commands to add, modify, delete, and get stats of group table
entries. The commands are issued using the DMA CMD descriptor ring. The
-following commands are defined:
+following commands are defined::
CMD_ADD: add an entry to group table
CMD_MOD: modify an entry in group table
CMD_DEL: delete an entry from group table
CMD_GET_STATS: get stats for group entry
-TLVs for add and modify commands are:
+TLVs for add and modify commands are::
field width description
-----------------------------------------------------------
@@ -969,7 +970,7 @@ TLVs for add and modify commands are:
FLOW_SRC_MAC 6 (types 1, 2, 5)
FLOW_DST_MAC 6 (types 1, 2)
-TLVs for flow delete and get stats command are:
+TLVs for flow delete and get stats command are::
field width description
-----------------------------------------------------------
@@ -977,7 +978,7 @@ TLVs for flow delete and get stats command are:
FLOW_GROUP_ID 2 Flow group ID
On completion of get stats command, the descriptor buffer is written back with
-the following TLVs:
+the following TLVs::
field width description
---------------------------------------------------
@@ -986,7 +987,7 @@ the following TLVs:
FLOW_STAT_REF_COUNT 4 Flow reference count
FLOW_STAT_BUCKET_COUNT 4 Flow bucket count
-Possible status return codes in descriptor on completion are:
+Possible status return codes in descriptor on completion are::
DESC_COMP_ERR command reason
--------------------------------------------------------------------
diff --git a/docs/specs/tpm.rst b/docs/specs/tpm.rst
index 1ad36ad..b630a35 100644
--- a/docs/specs/tpm.rst
+++ b/docs/specs/tpm.rst
@@ -205,8 +205,8 @@ to be used with the passthrough backend or the swtpm backend.
QEMU files related to TPM backends:
- ``backends/tpm.c``
- - ``include/sysemu/tpm.h``
- - ``include/sysemu/tpm_backend.h``
+ - ``include/system/tpm.h``
+ - ``include/system/tpm_backend.h``
The QEMU TPM passthrough device
-------------------------------
@@ -240,7 +240,7 @@ PCRs.
QEMU files related to the TPM passthrough device:
- ``backends/tpm/tpm_passthrough.c``
- ``backends/tpm/tpm_util.c``
- - ``include/sysemu/tpm_util.h``
+ - ``include/system/tpm_util.h``
Command line to start QEMU with the TPM passthrough device using the host's
@@ -301,7 +301,7 @@ command.
QEMU files related to the TPM emulator device:
- ``backends/tpm/tpm_emulator.c``
- ``backends/tpm/tpm_util.c``
- - ``include/sysemu/tpm_util.h``
+ - ``include/system/tpm_util.h``
The following commands start the swtpm with a UnixIO control channel over
a socket interface. They do not need to be run as root.
diff --git a/docs/sphinx-static/theme_overrides.css b/docs/sphinx-static/theme_overrides.css
index 965ecac..b225bf7 100644
--- a/docs/sphinx-static/theme_overrides.css
+++ b/docs/sphinx-static/theme_overrides.css
@@ -18,8 +18,8 @@ h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend {
.rst-content dl:not(.docutils) dt {
border-top: none;
- border-left: solid 3px #ccc;
- background-color: #f0f0f0;
+ border-left: solid 5px #bcc6d2;
+ background-color: #eaedf1;
color: black;
}
@@ -208,3 +208,97 @@ div[class^="highlight"] pre {
color: inherit;
}
}
+
+/* QAPI domain theming */
+
+/* most content in a QAPI object definition should not eclipse about
+ 80ch, but nested field lists are explicitly exempt due to their
+ two-column nature */
+.qapi dd *:not(dl) {
+ max-width: 80ch;
+}
+
+/* but the content column itself should still be less than ~80ch. */
+.qapi .field-list dd {
+ max-width: 80ch;
+}
+
+.qapi-infopips {
+ margin-bottom: 1em;
+}
+
+.qapi-infopip {
+ display: inline-block;
+ padding: 0em 0.5em 0em 0.5em;
+ margin: 0.25em;
+}
+
+.qapi-deprecated,.qapi-unstable {
+ background-color: #fffef5;
+ border: solid #fff176 6px;
+ font-weight: bold;
+ padding: 8px;
+ border-radius: 15px;
+ margin: 5px;
+}
+
+.qapi-unstable::before {
+ content: '🚧 ';
+}
+
+.qapi-deprecated::before {
+ content: '⚠️ ';
+}
+
+.qapi-ifcond::before {
+ /* gaze ye into the crystal ball to determine feature availability */
+ content: '🔮 ';
+}
+
+.qapi-ifcond {
+ background-color: #f9f5ff;
+ border: solid #dac2ff 6px;
+ padding: 8px;
+ border-radius: 15px;
+ margin: 5px;
+}
+
+/* code blocks */
+.qapi div[class^="highlight"] {
+ width: fit-content;
+ background-color: #fffafd;
+ border: 2px solid #ffe1f3;
+}
+
+/* note, warning, etc. */
+.qapi .admonition {
+ width: fit-content;
+}
+
+/* pad the top of the field-list so the text doesn't start directly at
+ the top border; primarily for the field list labels, but adjust the
+ field bodies as well for parity. */
+dl.field-list > dt:first-of-type, dl.field-list > dd:first-of-type {
+ padding-top: 0.3em;
+}
+
+dl.field-list > dt:last-of-type, dl.field-list > dd:last-of-type {
+ padding-bottom: 0.3em;
+}
+
+/* pad the field list labels so they don't crash into the border */
+dl.field-list > dt {
+ padding-left: 0.5em;
+ padding-right: 0.5em;
+}
+
+/* Add a little padding between field list sections */
+dl.field-list > dd:not(:last-child) {
+ padding-bottom: 1em;
+}
+
+/* Sphinx 3.x: unresolved xrefs */
+.rst-content *:not(a) > code.xref {
+ font-weight: 400;
+ color: #333333;
+}
diff --git a/docs/sphinx/compat.py b/docs/sphinx/compat.py
new file mode 100644
index 0000000..9cf7fe0
--- /dev/null
+++ b/docs/sphinx/compat.py
@@ -0,0 +1,230 @@
+"""
+Sphinx cross-version compatibility goop
+"""
+
+import re
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Optional,
+ Type,
+)
+
+from docutils import nodes
+from docutils.nodes import Element, Node, Text
+from docutils.statemachine import StringList
+
+import sphinx
+from sphinx import addnodes, util
+from sphinx.directives import ObjectDescription
+from sphinx.environment import BuildEnvironment
+from sphinx.roles import XRefRole
+from sphinx.util import docfields
+from sphinx.util.docutils import (
+ ReferenceRole,
+ SphinxDirective,
+ switch_source_input,
+)
+from sphinx.util.typing import TextlikeNode
+
+
+MAKE_XREF_WORKAROUND = sphinx.version_info[:3] < (4, 1, 0)
+
+
+SpaceNode: Callable[[str], Node]
+KeywordNode: Callable[[str, str], Node]
+
+if sphinx.version_info[:3] >= (4, 0, 0):
+ SpaceNode = addnodes.desc_sig_space
+ KeywordNode = addnodes.desc_sig_keyword
+else:
+ SpaceNode = Text
+ KeywordNode = addnodes.desc_annotation
+
+
+def nested_parse_with_titles(
+ directive: SphinxDirective, content_node: Element
+) -> None:
+ """
+ This helper preserves error parsing context across sphinx versions.
+ """
+
+ # necessary so that the child nodes get the right source/line set
+ content_node.document = directive.state.document
+
+ try:
+ # Modern sphinx (6.2.0+) supports proper offsetting for
+ # nested parse error context management
+ util.nodes.nested_parse_with_titles(
+ directive.state,
+ directive.content,
+ content_node,
+ content_offset=directive.content_offset,
+ )
+ except TypeError:
+ # No content_offset argument. Fall back to SSI method.
+ with switch_source_input(directive.state, directive.content):
+ util.nodes.nested_parse_with_titles(
+ directive.state, directive.content, content_node
+ )
+
+
+# ###########################################
+# xref compatibility hacks for Sphinx < 4.1 #
+# ###########################################
+
+# When we require >= Sphinx 4.1, the following function and the
+# subsequent 3 compatibility classes can be removed. Anywhere in
+# qapi_domain that uses one of these Compat* types can be switched to
+# using the garden-variety lib-provided classes with no trickery.
+
+
+def _compat_make_xref( # pylint: disable=unused-argument
+ self: sphinx.util.docfields.Field,
+ rolename: str,
+ domain: str,
+ target: str,
+ innernode: Type[TextlikeNode] = addnodes.literal_emphasis,
+ contnode: Optional[Node] = None,
+ env: Optional[BuildEnvironment] = None,
+ inliner: Any = None,
+ location: Any = None,
+) -> Node:
+ """
+ Compatibility workaround for Sphinx versions prior to 4.1.0.
+
+ Older sphinx versions do not use the domain's XRefRole for parsing
+ and formatting cross-references, so we need to perform this magick
+ ourselves to avoid needing to write the parser/formatter in two
+ separate places.
+
+ This workaround isn't brick-for-brick compatible with modern Sphinx
+ versions, because we do not have access to the parent directive's
+ state during this parsing like we do in more modern versions.
+
+ It's no worse than what pre-Sphinx 4.1.0 does, so... oh well!
+ """
+
+ # Yes, this function is gross. Pre-4.1 support is a miracle.
+ # pylint: disable=too-many-locals
+
+ assert env
+ # Note: Sphinx's own code ignores the type warning here, too.
+ if not rolename:
+ return contnode or innernode(target, target) # type: ignore[call-arg]
+
+ # Get the role instance, but don't *execute it* - we lack the
+ # correct state to do so. Instead, we'll just use its public
+ # methods to do our reference formatting, and emulate the rest.
+ role = env.get_domain(domain).roles[rolename]
+ assert isinstance(role, XRefRole)
+
+ # XRefRole features not supported by this compatibility shim;
+ # these were not supported in Sphinx 3.x either, so nothing of
+ # value is really lost.
+ assert not target.startswith("!")
+ assert not re.match(ReferenceRole.explicit_title_re, target)
+ assert not role.lowercase
+ assert not role.fix_parens
+
+ # Code below based mostly on sphinx.roles.XRefRole; run() and
+ # create_xref_node()
+ options = {
+ "refdoc": env.docname,
+ "refdomain": domain,
+ "reftype": rolename,
+ "refexplicit": False,
+ "refwarn": role.warn_dangling,
+ }
+ refnode = role.nodeclass(target, **options)
+ title, target = role.process_link(env, refnode, False, target, target)
+ refnode["reftarget"] = target
+ classes = ["xref", domain, f"{domain}-{rolename}"]
+ refnode += role.innernodeclass(target, title, classes=classes)
+
+ # This is the very gross part of the hack. Normally,
+ # result_nodes takes a document object to which we would pass
+ # self.inliner.document. Prior to Sphinx 4.1, we don't *have* an
+ # inliner to pass, so we have nothing to pass here. However, the
+ # actual implementation of role.result_nodes in this case
+ # doesn't actually use that argument, so this winds up being
+ # ... fine. Rest easy at night knowing this code only runs under
+ # old versions of Sphinx, so at least it won't change in the
+ # future on us and lead to surprising new failures.
+ # Gross, I know.
+ result_nodes, _messages = role.result_nodes(
+ None, # type: ignore
+ env,
+ refnode,
+ is_ref=True,
+ )
+ return nodes.inline(target, "", *result_nodes)
+
+
+class CompatField(docfields.Field):
+ if MAKE_XREF_WORKAROUND:
+ make_xref = _compat_make_xref
+
+
+class CompatGroupedField(docfields.GroupedField):
+ if MAKE_XREF_WORKAROUND:
+ make_xref = _compat_make_xref
+
+
+class CompatTypedField(docfields.TypedField):
+ if MAKE_XREF_WORKAROUND:
+ make_xref = _compat_make_xref
+
+
+# ################################################################
+# Nested parsing error location fix for Sphinx 5.3.0 <= x < 6.2.0 #
+# ################################################################
+
+# When we require Sphinx 4.x, the TYPE_CHECKING hack where we avoid
+# subscripting ObjectDescription at runtime can be removed in favor of
+# just always subscripting the class.
+
+# When we require Sphinx > 6.2.0, the rest of this compatibility hack
+# can be dropped and QAPIObject can just inherit directly from
+# ObjectDescription[Signature].
+
+SOURCE_LOCATION_FIX = (5, 3, 0) <= sphinx.version_info[:3] < (6, 2, 0)
+
+Signature = str
+
+
+if TYPE_CHECKING:
+ _BaseClass = ObjectDescription[Signature]
+else:
+ _BaseClass = ObjectDescription
+
+
+class ParserFix(_BaseClass):
+
+ _temp_content: StringList
+ _temp_offset: int
+ _temp_node: Optional[addnodes.desc_content]
+
+ def before_content(self) -> None:
+ # Work around a sphinx bug and parse the content ourselves.
+ self._temp_content = self.content
+ self._temp_offset = self.content_offset
+ self._temp_node = None
+
+ if SOURCE_LOCATION_FIX:
+ self._temp_node = addnodes.desc_content()
+ self.state.nested_parse(
+ self.content, self.content_offset, self._temp_node
+ )
+            # Sphinx will try to parse the content block itself;
+            # give it nothingness to parse instead.
+ self.content = StringList()
+ self.content_offset = 0
+
+ def transform_content(self, content_node: addnodes.desc_content) -> None:
+ # Sphinx workaround: Inject our parsed content and restore state.
+ if self._temp_node:
+ content_node += self._temp_node.children
+ self.content = self._temp_content
+ self.content_offset = self._temp_offset
diff --git a/docs/sphinx/depfile.py b/docs/sphinx/depfile.py
index e74be6a..d3c774d 100644
--- a/docs/sphinx/depfile.py
+++ b/docs/sphinx/depfile.py
@@ -31,6 +31,9 @@ def get_infiles(env):
for path in Path(static_path).rglob('*'):
yield str(path)
+ # also include kdoc script
+ yield str(env.config.kerneldoc_bin[1])
+
def write_depfile(app, exception):
if exception:
diff --git a/docs/sphinx/qapi_domain.py b/docs/sphinx/qapi_domain.py
new file mode 100644
index 0000000..ebc46a7
--- /dev/null
+++ b/docs/sphinx/qapi_domain.py
@@ -0,0 +1,1055 @@
+"""
+QAPI domain extension.
+"""
+
+# The best laid plans of mice and men, ...
+# pylint: disable=too-many-lines
+
+from __future__ import annotations
+
+import re
+import types
+from typing import (
+ TYPE_CHECKING,
+ List,
+ NamedTuple,
+ Tuple,
+ Type,
+ cast,
+)
+
+from docutils import nodes
+from docutils.parsers.rst import directives
+from sphinx import addnodes
+from sphinx.directives import ObjectDescription
+from sphinx.domains import (
+ Domain,
+ Index,
+ IndexEntry,
+ ObjType,
+)
+from sphinx.locale import _, __
+from sphinx.roles import XRefRole
+from sphinx.util import logging
+from sphinx.util.docutils import SphinxDirective
+from sphinx.util.nodes import make_id, make_refnode
+
+from compat import (
+ CompatField,
+ CompatGroupedField,
+ CompatTypedField,
+ KeywordNode,
+ ParserFix,
+ Signature,
+ SpaceNode,
+)
+
+
+if TYPE_CHECKING:
+ from typing import (
+ AbstractSet,
+ Any,
+ Dict,
+ Iterable,
+ Optional,
+ Union,
+ )
+
+ from docutils.nodes import Element, Node
+ from sphinx.addnodes import desc_signature, pending_xref
+ from sphinx.application import Sphinx
+ from sphinx.builders import Builder
+ from sphinx.environment import BuildEnvironment
+ from sphinx.util.typing import OptionSpec
+
+
+logger = logging.getLogger(__name__)
+
+
+def _unpack_field(
+ field: nodes.Node,
+) -> Tuple[nodes.field_name, nodes.field_body]:
+ """
+ docutils helper: unpack a field node in a type-safe manner.
+ """
+ assert isinstance(field, nodes.field)
+ assert len(field.children) == 2
+ assert isinstance(field.children[0], nodes.field_name)
+ assert isinstance(field.children[1], nodes.field_body)
+ return (field.children[0], field.children[1])
+
+
+class ObjectEntry(NamedTuple):
+ docname: str
+ node_id: str
+ objtype: str
+ aliased: bool
+
+
+class QAPIXRefRole(XRefRole):
+
+ def process_link(
+ self,
+ env: BuildEnvironment,
+ refnode: Element,
+ has_explicit_title: bool,
+ title: str,
+ target: str,
+ ) -> tuple[str, str]:
+ refnode["qapi:namespace"] = env.ref_context.get("qapi:namespace")
+ refnode["qapi:module"] = env.ref_context.get("qapi:module")
+
+ # Cross-references that begin with a tilde adjust the title to
+ # only show the reference without a leading module, even if one
+ # was provided. This is a Sphinx-standard syntax; give it
+ # priority over QAPI-specific type markup below.
+ hide_module = False
+ if target.startswith("~"):
+ hide_module = True
+ target = target[1:]
+
+ # Type names that end with "?" are considered optional
+ # arguments and should be documented as such, but it's not
+ # part of the xref itself.
+ if target.endswith("?"):
+ refnode["qapi:optional"] = True
+ target = target[:-1]
+
+        # Type names wrapped in brackets denote lists. Strip the
+ # brackets and remember to add them back later.
+ if target.startswith("[") and target.endswith("]"):
+ refnode["qapi:array"] = True
+ target = target[1:-1]
+
+ if has_explicit_title:
+ # Don't mess with the title at all if it was explicitly set.
+ # Explicit title syntax for references is e.g.
+ # :qapi:type:`target <explicit title>`
+ # and this explicit title overrides everything else here.
+ return title, target
+
+ title = target
+ if hide_module:
+ title = target.split(".")[-1]
+
+ return title, target
+
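+    # As a rough illustration of the two hooks in this class: a reference
+    # written as :qapi:type:`[BlockdevOptions]?` is resolved against the
+    # bare name "BlockdevOptions", while result_nodes() below re-adds the
+    # literal brackets and a trailing ", optional" marker around the
+    # resolved cross-reference.
+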
+ def result_nodes(
+ self,
+ document: nodes.document,
+ env: BuildEnvironment,
+ node: Element,
+ is_ref: bool,
+ ) -> Tuple[List[nodes.Node], List[nodes.system_message]]:
+
+ # node here is the pending_xref node (or whatever nodeclass was
+ # configured at XRefRole class instantiation time).
+ results: List[nodes.Node] = [node]
+
+ if node.get("qapi:array"):
+ results.insert(0, nodes.literal("[", "["))
+ results.append(nodes.literal("]", "]"))
+
+ if node.get("qapi:optional"):
+ results.append(nodes.Text(", "))
+ results.append(nodes.emphasis("?", "optional"))
+
+ return results, []
+
+
+class QAPIDescription(ParserFix):
+ """
+ Generic QAPI description.
+
+ This is meant to be an abstract class, not instantiated
+ directly. This class handles the abstract details of indexing, the
+ TOC, and reference targets for QAPI descriptions.
+ """
+
+ def handle_signature(self, sig: str, signode: desc_signature) -> Signature:
+ # pylint: disable=unused-argument
+
+ # Do nothing. The return value here is the "name" of the entity
+ # being documented; for QAPI, this is the same as the
+ # "signature", which is just a name.
+
+ # Normally this method must also populate signode with nodes to
+ # render the signature; here we do nothing instead - the
+ # subclasses will handle this.
+ return sig
+
+ def get_index_text(self, name: Signature) -> Tuple[str, str]:
+ """Return the text for the index entry of the object."""
+
+ # NB: this is used for the global index, not the QAPI index.
+ return ("single", f"{name} (QMP {self.objtype})")
+
+ def _get_context(self) -> Tuple[str, str]:
+ namespace = self.options.get(
+ "namespace", self.env.ref_context.get("qapi:namespace", "")
+ )
+ modname = self.options.get(
+ "module", self.env.ref_context.get("qapi:module", "")
+ )
+
+ return namespace, modname
+
+ def _get_fqn(self, name: Signature) -> str:
+ namespace, modname = self._get_context()
+
+ # If we're documenting a module, don't include the module as
+ # part of the FQN; we ARE the module!
+ if self.objtype == "module":
+ modname = ""
+
+ if modname:
+ name = f"{modname}.{name}"
+ if namespace:
+ name = f"{namespace}:{name}"
+ return name
+
+ def add_target_and_index(
+ self, name: Signature, sig: str, signode: desc_signature
+ ) -> None:
+ # pylint: disable=unused-argument
+
+ # name is the return value of handle_signature.
+ # sig is the original, raw text argument to handle_signature.
+ # For QAPI, these are identical, currently.
+
+ assert self.objtype
+
+ if not (fullname := signode.get("fullname", "")):
+ fullname = self._get_fqn(name)
+
+ node_id = make_id(
+ self.env, self.state.document, self.objtype, fullname
+ )
+ signode["ids"].append(node_id)
+
+ self.state.document.note_explicit_target(signode)
+ domain = cast(QAPIDomain, self.env.get_domain("qapi"))
+ domain.note_object(fullname, self.objtype, node_id, location=signode)
+
+ if "no-index-entry" not in self.options:
+ arity, indextext = self.get_index_text(name)
+ assert self.indexnode is not None
+ if indextext:
+ self.indexnode["entries"].append(
+ (arity, indextext, node_id, "", None)
+ )
+
+ @staticmethod
+ def split_fqn(name: str) -> Tuple[str, str, str]:
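+        # For example, a fully qualified name like
+        # "QMP:block-core.query-block" splits into
+        # ("QMP", "block-core", "query-block"); any missing piece is
+        # returned as an empty string.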
+ if ":" in name:
+ ns, name = name.split(":")
+ else:
+ ns = ""
+
+ if "." in name:
+ module, name = name.split(".")
+ else:
+ module = ""
+
+ return (ns, module, name)
+
+ def _object_hierarchy_parts(
+ self, sig_node: desc_signature
+ ) -> Tuple[str, ...]:
+ if "fullname" not in sig_node:
+ return ()
+ return self.split_fqn(sig_node["fullname"])
+
+ def _toc_entry_name(self, sig_node: desc_signature) -> str:
+ # This controls the name in the TOC and on the sidebar.
+
+ # This is the return type of _object_hierarchy_parts().
+ toc_parts = cast(Tuple[str, ...], sig_node.get("_toc_parts", ()))
+ if not toc_parts:
+ return ""
+
+ config = self.env.app.config
+ namespace, modname, name = toc_parts
+
+ if config.toc_object_entries_show_parents == "domain":
+ ret = name
+ if modname and modname != self.env.ref_context.get(
+ "qapi:module", ""
+ ):
+ ret = f"{modname}.{name}"
+ if namespace and namespace != self.env.ref_context.get(
+ "qapi:namespace", ""
+ ):
+ ret = f"{namespace}:{ret}"
+ return ret
+ if config.toc_object_entries_show_parents == "hide":
+ return name
+ if config.toc_object_entries_show_parents == "all":
+ return sig_node.get("fullname", name)
+ return ""
+
+
+class QAPIObject(QAPIDescription):
+ """
+ Description of a generic QAPI object.
+
+ It's not used directly, but is instead subclassed by specific directives.
+ """
+
+ # Inherit some standard options from Sphinx's ObjectDescription
+ option_spec: OptionSpec = ( # type:ignore[misc]
+ ObjectDescription.option_spec.copy()
+ )
+ option_spec.update(
+ {
+ # Context overrides:
+ "namespace": directives.unchanged,
+ "module": directives.unchanged,
+ # These are QAPI originals:
+ "since": directives.unchanged,
+ "ifcond": directives.unchanged,
+ "deprecated": directives.flag,
+ "unstable": directives.flag,
+ }
+ )
+
+ doc_field_types = [
+ # :feat name: descr
+ CompatGroupedField(
+ "feature",
+ label=_("Features"),
+ names=("feat",),
+ can_collapse=False,
+ ),
+ ]
+
+ def get_signature_prefix(self) -> List[nodes.Node]:
+ """Return a prefix to put before the object name in the signature."""
+ assert self.objtype
+ return [
+ KeywordNode("", self.objtype.title()),
+ SpaceNode(" "),
+ ]
+
+ def get_signature_suffix(self) -> List[nodes.Node]:
+ """Return a suffix to put after the object name in the signature."""
+ ret: List[nodes.Node] = []
+
+ if "since" in self.options:
+ ret += [
+ SpaceNode(" "),
+ addnodes.desc_sig_element(
+ "", f"(Since: {self.options['since']})"
+ ),
+ ]
+
+ return ret
+
+ def handle_signature(self, sig: str, signode: desc_signature) -> Signature:
+ """
+ Transform a QAPI definition name into RST nodes.
+
+ This method was originally intended for handling function
+ signatures. In the QAPI domain, however, we only pass the
+ definition name as the directive argument and handle everything
+ else in the content body with field lists.
+
+ As such, the only argument here is "sig", which is just the QAPI
+ definition name.
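+
+        For example, a ``.. qapi:command:: query-block`` directive with a
+        ``:since: 0.14`` option would produce a signature line rendered
+        roughly as "Command query-block (Since: 0.14)".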
+ """
+ # No module or domain info allowed in the signature!
+ assert ":" not in sig
+ assert "." not in sig
+
+ namespace, modname = self._get_context()
+ signode["fullname"] = self._get_fqn(sig)
+ signode["namespace"] = namespace
+ signode["module"] = modname
+
+ sig_prefix = self.get_signature_prefix()
+ if sig_prefix:
+ signode += addnodes.desc_annotation(
+ str(sig_prefix), "", *sig_prefix
+ )
+ signode += addnodes.desc_name(sig, sig)
+ signode += self.get_signature_suffix()
+
+ return sig
+
+ def _add_infopips(self, contentnode: addnodes.desc_content) -> None:
+ # Add various eye-catches and things that go below the signature
+ # bar, but precede the user-defined content.
+ infopips = nodes.container()
+ infopips.attributes["classes"].append("qapi-infopips")
+
+ def _add_pip(
+ source: str, content: Union[str, List[nodes.Node]], classname: str
+ ) -> None:
+ node = nodes.container(source)
+ if isinstance(content, str):
+ node.append(nodes.Text(content))
+ else:
+ node.extend(content)
+ node.attributes["classes"].extend(["qapi-infopip", classname])
+ infopips.append(node)
+
+ if "deprecated" in self.options:
+ _add_pip(
+ ":deprecated:",
+ f"This {self.objtype} is deprecated.",
+ "qapi-deprecated",
+ )
+
+ if "unstable" in self.options:
+ _add_pip(
+ ":unstable:",
+ f"This {self.objtype} is unstable/experimental.",
+ "qapi-unstable",
+ )
+
+ if self.options.get("ifcond", ""):
+ ifcond = self.options["ifcond"]
+ _add_pip(
+ f":ifcond: {ifcond}",
+ [
+ nodes.emphasis("", "Availability"),
+ nodes.Text(": "),
+ nodes.literal(ifcond, ifcond),
+ ],
+ "qapi-ifcond",
+ )
+
+ if infopips.children:
+ contentnode.insert(0, infopips)
+
+ def _validate_field(self, field: nodes.field) -> None:
+ """Validate field lists in this QAPI Object Description."""
+ name, _ = _unpack_field(field)
+ allowed_fields = set(self.env.app.config.qapi_allowed_fields)
+
+ field_label = name.astext()
+ if field_label in allowed_fields:
+ # Explicitly allowed field list name, OK.
+ return
+
+ try:
+ # split into field type and argument (if provided)
+ # e.g. `:arg type name: descr` is
+ # field_type = "arg", field_arg = "type name".
+ field_type, field_arg = field_label.split(None, 1)
+ except ValueError:
+ # No arguments provided
+ field_type = field_label
+ field_arg = ""
+
+ typemap = self.get_field_type_map()
+ if field_type in typemap:
+ # This is a special docfield, yet-to-be-processed. Catch
+ # correct names, but incorrect arguments. This mismatch WILL
+ # cause Sphinx to render this field incorrectly (without a
+ # warning), which is never what we want.
+ typedesc = typemap[field_type][0]
+ if typedesc.has_arg != bool(field_arg):
+ msg = f"docfield field list type {field_type!r} "
+ if typedesc.has_arg:
+ msg += "requires an argument."
+ else:
+ msg += "takes no arguments."
+ logger.warning(msg, location=field)
+ else:
+ # This is unrecognized entirely. It's valid rST to use
+ # arbitrary fields, but let's ensure the documentation
+ # writer has done this intentionally.
+ valid = ", ".join(sorted(set(typemap) | allowed_fields))
+ msg = (
+ f"Unrecognized field list name {field_label!r}.\n"
+ f"Valid fields for qapi:{self.objtype} are: {valid}\n"
+ "\n"
+ "If this usage is intentional, please add it to "
+ "'qapi_allowed_fields' in docs/conf.py."
+ )
+ logger.warning(msg, location=field)
+
+ def transform_content(self, content_node: addnodes.desc_content) -> None:
+ # This hook runs after before_content and the nested parse, but
+ # before the DocFieldTransformer is executed.
+ super().transform_content(content_node)
+
+ self._add_infopips(content_node)
+
+ # Validate field lists.
+ for child in content_node:
+ if isinstance(child, nodes.field_list):
+ for field in child.children:
+ assert isinstance(field, nodes.field)
+ self._validate_field(field)
+
+
+class SpecialTypedField(CompatTypedField):
+ def make_field(self, *args: Any, **kwargs: Any) -> nodes.field:
+ ret = super().make_field(*args, **kwargs)
+
+ # Look for the characteristic " -- " text node that Sphinx
+ # inserts for each TypedField entry ...
+ for node in ret.traverse(lambda n: str(n) == " -- "):
+ par = node.parent
+ if par.children[0].astext() != "q_dummy":
+ continue
+
+ # If the first node's text is q_dummy, this is a dummy
+ # field we want to strip down to just its contents.
+ del par.children[:-1]
+
+ return ret
+
+
+class QAPICommand(QAPIObject):
+ """Description of a QAPI Command."""
+
+ doc_field_types = QAPIObject.doc_field_types.copy()
+ doc_field_types.extend(
+ [
+ # :arg TypeName ArgName: descr
+ SpecialTypedField(
+ "argument",
+ label=_("Arguments"),
+ names=("arg",),
+ typerolename="type",
+ can_collapse=False,
+ ),
+ # :error: descr
+ CompatField(
+ "error",
+ label=_("Errors"),
+ names=("error", "errors"),
+ has_arg=False,
+ ),
+ # :return TypeName: descr
+ CompatGroupedField(
+ "returnvalue",
+ label=_("Return"),
+ rolename="type",
+ names=("return",),
+ can_collapse=True,
+ ),
+ ]
+ )
+
+
+class QAPIEnum(QAPIObject):
+ """Description of a QAPI Enum."""
+
+ doc_field_types = QAPIObject.doc_field_types.copy()
+ doc_field_types.extend(
+ [
+ # :value name: descr
+ CompatGroupedField(
+ "value",
+ label=_("Values"),
+ names=("value",),
+ can_collapse=False,
+ )
+ ]
+ )
+
+
+class QAPIAlternate(QAPIObject):
+ """Description of a QAPI Alternate."""
+
+ doc_field_types = QAPIObject.doc_field_types.copy()
+ doc_field_types.extend(
+ [
+ # :alt type name: descr
+ CompatTypedField(
+ "alternative",
+ label=_("Alternatives"),
+ names=("alt",),
+ typerolename="type",
+ can_collapse=False,
+ ),
+ ]
+ )
+
+
+class QAPIObjectWithMembers(QAPIObject):
+ """Base class for Events/Structs/Unions"""
+
+ doc_field_types = QAPIObject.doc_field_types.copy()
+ doc_field_types.extend(
+ [
+ # :member type name: descr
+ SpecialTypedField(
+ "member",
+ label=_("Members"),
+ names=("memb",),
+ typerolename="type",
+ can_collapse=False,
+ ),
+ ]
+ )
+
+
+class QAPIEvent(QAPIObjectWithMembers):
+ # pylint: disable=too-many-ancestors
+ """Description of a QAPI Event."""
+
+
+class QAPIJSONObject(QAPIObjectWithMembers):
+ # pylint: disable=too-many-ancestors
+ """Description of a QAPI Object: structs and unions."""
+
+
+class QAPIModule(QAPIDescription):
+ """
+ Directive to mark description of a new module.
+
+ This directive doesn't generate any special formatting, and is just
+ a pass-through for the content body. Named section titles are
+ allowed in the content body.
+
+ Use this directive to create entries for the QAPI module in the
+    global index and the QAPI index, as well as to associate subsequent
+ definitions with the module they are defined in for purposes of
+ search and QAPI index organization.
+
+ :arg: The name of the module.
+ :opt no-index: Don't add cross-reference targets or index entries.
+ :opt no-typesetting: Don't render the content body (but preserve any
+        cross-reference target IDs in the squelched output).
+
+ Example::
+
+ .. qapi:module:: block-core
+ :no-index:
+ :no-typesetting:
+
+ Lorem ipsum, dolor sit amet ...
+ """
+
+ def run(self) -> List[Node]:
+ modname = self.arguments[0].strip()
+ self.env.ref_context["qapi:module"] = modname
+ ret = super().run()
+
+ # ObjectDescription always creates a visible signature bar. We
+ # want module items to be "invisible", however.
+
+ # Extract the content body of the directive:
+ assert isinstance(ret[-1], addnodes.desc)
+ desc_node = ret.pop(-1)
+ assert isinstance(desc_node.children[1], addnodes.desc_content)
+ ret.extend(desc_node.children[1].children)
+
+ # Re-home node_ids so anchor refs still work:
+ node_ids: List[str]
+ if node_ids := [
+ node_id
+ for el in desc_node.children[0].traverse(nodes.Element)
+ for node_id in cast(List[str], el.get("ids", ()))
+ ]:
+ target_node = nodes.target(ids=node_ids)
+ ret.insert(1, target_node)
+
+ return ret
+
+
+class QAPINamespace(SphinxDirective):
+ has_content = False
+ required_arguments = 1
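+
+    # Illustrative usage from an rST document (the namespace name "QMP"
+    # is only an example):
+    #
+    #   .. qapi:namespace:: QMP
+    #
+    # Subsequent definitions in the document then inherit this namespace
+    # via ref_context for indexing and cross-reference resolution.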
+
+ def run(self) -> List[Node]:
+ namespace = self.arguments[0].strip()
+ self.env.ref_context["qapi:namespace"] = namespace
+
+ return []
+
+
+class QAPIIndex(Index):
+ """
+ Index subclass to provide the QAPI definition index.
+ """
+
+ # pylint: disable=too-few-public-methods
+
+ name = "index"
+ localname = _("QAPI Index")
+ shortname = _("QAPI Index")
+ namespace = ""
+
+ def generate(
+ self,
+ docnames: Optional[Iterable[str]] = None,
+ ) -> Tuple[List[Tuple[str, List[IndexEntry]]], bool]:
+ assert isinstance(self.domain, QAPIDomain)
+ content: Dict[str, List[IndexEntry]] = {}
+ collapse = False
+
+ for objname, obj in self.domain.objects.items():
+ if docnames and obj.docname not in docnames:
+ continue
+
+ ns, _mod, name = QAPIDescription.split_fqn(objname)
+
+ if self.namespace != ns:
+ continue
+
+ # Add an alphabetical entry:
+ entries = content.setdefault(name[0].upper(), [])
+ entries.append(
+ IndexEntry(
+ name, 0, obj.docname, obj.node_id, obj.objtype, "", ""
+ )
+ )
+
+ # Add a categorical entry:
+ category = obj.objtype.title() + "s"
+ entries = content.setdefault(category, [])
+ entries.append(
+ IndexEntry(name, 0, obj.docname, obj.node_id, "", "", "")
+ )
+
+ # Sort entries within each category alphabetically
+ for category in content:
+ content[category] = sorted(content[category])
+
+ # Sort the categories themselves; type names first, ABC entries last.
+ sorted_content = sorted(
+ content.items(),
+ key=lambda x: (len(x[0]) == 1, x[0]),
+ )
+ return sorted_content, collapse
+
+
+class QAPIDomain(Domain):
+ """QAPI language domain."""
+
+ name = "qapi"
+ label = "QAPI"
+
+ # This table associates cross-reference object types (key) with an
+ # ObjType instance, which defines the valid cross-reference roles
+ # for each object type.
+ #
+ # e.g., the :qapi:type: cross-reference role can refer to enum,
+    # struct, union, or alternate objects; but :qapi:any: can refer to
+ # anything. Each object also gets its own targeted cross-reference role.
+ object_types: Dict[str, ObjType] = {
+ "module": ObjType(_("module"), "mod", "any"),
+ "command": ObjType(_("command"), "cmd", "any"),
+ "event": ObjType(_("event"), "event", "any"),
+ "enum": ObjType(_("enum"), "enum", "type", "any"),
+ "object": ObjType(_("object"), "obj", "type", "any"),
+ "alternate": ObjType(_("alternate"), "alt", "type", "any"),
+ }
+
+ # Each of these provides a rST directive,
+ # e.g. .. qapi:module:: block-core
+ directives = {
+ "namespace": QAPINamespace,
+ "module": QAPIModule,
+ "command": QAPICommand,
+ "event": QAPIEvent,
+ "enum": QAPIEnum,
+ "object": QAPIJSONObject,
+ "alternate": QAPIAlternate,
+ }
+
+ # These are all cross-reference roles; e.g.
+ # :qapi:cmd:`query-block`. The keys correlate to the names used in
+ # the object_types table values above.
+ roles = {
+ "mod": QAPIXRefRole(),
+ "cmd": QAPIXRefRole(),
+ "event": QAPIXRefRole(),
+ "enum": QAPIXRefRole(),
+ "obj": QAPIXRefRole(), # specifically structs and unions.
+ "alt": QAPIXRefRole(),
+ # reference any data type (excludes modules, commands, events)
+ "type": QAPIXRefRole(),
+ "any": QAPIXRefRole(), # reference *any* type of QAPI object.
+ }
+
+ # Moved into the data property at runtime;
+ # this is the internal index of reference-able objects.
+ initial_data: Dict[str, Dict[str, Tuple[Any]]] = {
+ "objects": {}, # fullname -> ObjectEntry
+ }
+
+ # Index pages to generate; each entry is an Index class.
+ indices = [
+ QAPIIndex,
+ ]
+
+ @property
+ def objects(self) -> Dict[str, ObjectEntry]:
+ ret = self.data.setdefault("objects", {})
+ return ret # type: ignore[no-any-return]
+
+ def setup(self) -> None:
+ namespaces = set(self.env.app.config.qapi_namespaces)
+ for namespace in namespaces:
+ new_index: Type[QAPIIndex] = types.new_class(
+ f"{namespace}Index", bases=(QAPIIndex,)
+ )
+ new_index.name = f"{namespace.lower()}-index"
+ new_index.localname = _(f"{namespace} Index")
+ new_index.shortname = _(f"{namespace} Index")
+ new_index.namespace = namespace
+
+ self.indices.append(new_index)
+
+ super().setup()
+
+ def note_object(
+ self,
+ name: str,
+ objtype: str,
+ node_id: str,
+ aliased: bool = False,
+ location: Any = None,
+ ) -> None:
+ """Note a QAPI object for cross reference."""
+ if name in self.objects:
+ other = self.objects[name]
+ if other.aliased and aliased is False:
+                # The original definition was found. Override it!
+ pass
+ elif other.aliased is False and aliased:
+ # The original definition is already registered.
+ return
+ else:
+ # duplicated
+ logger.warning(
+ __(
+ "duplicate object description of %s, "
+ "other instance in %s, use :no-index: for one of them"
+ ),
+ name,
+ other.docname,
+ location=location,
+ )
+ self.objects[name] = ObjectEntry(
+ self.env.docname, node_id, objtype, aliased
+ )
+
+ def clear_doc(self, docname: str) -> None:
+ for fullname, obj in list(self.objects.items()):
+ if obj.docname == docname:
+ del self.objects[fullname]
+
+ def merge_domaindata(
+ self, docnames: AbstractSet[str], otherdata: Dict[str, Any]
+ ) -> None:
+ for fullname, obj in otherdata["objects"].items():
+ if obj.docname in docnames:
+ # Sphinx's own python domain doesn't appear to bother to
+ # check for collisions. Assert they don't happen and
+ # we'll fix it if/when the case arises.
+ assert fullname not in self.objects, (
+ "bug - collision on merge?"
+ f" {fullname=} {obj=} {self.objects[fullname]=}"
+ )
+ self.objects[fullname] = obj
+
+ def find_obj(
+ self, namespace: str, modname: str, name: str, typ: Optional[str]
+ ) -> List[Tuple[str, ObjectEntry]]:
+ """
+ Find a QAPI object for "name", maybe using contextual information.
+
+ Returns a list of (name, object entry) tuples.
+
+ :param namespace: The current namespace context (if any!) under
+ which we are searching.
+ :param modname: The current module context (if any!) under
+ which we are searching.
+ :param name: The name of the x-ref to resolve; may or may not
+ include leading context.
+        :param typ: The role name of the x-ref we're resolving, if
+ provided. This is absent for "any" role lookups.
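+
+        For example, with namespace "QMP" and module "block-core" in the
+        search context, a bare target such as "query-block" is first
+        looked up verbatim, then as "QMP:block-core.query-block", and
+        only then via the fuzzy searches below.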
+ """
+ if not name:
+ return []
+
+ # ##
+ # what to search for
+ # ##
+
+ parts = list(QAPIDescription.split_fqn(name))
+ explicit = tuple(bool(x) for x in parts)
+
+ # Fill in the blanks where possible:
+ if namespace and not parts[0]:
+ parts[0] = namespace
+ if modname and not parts[1]:
+ parts[1] = modname
+
+ implicit_fqn = ""
+ if all(parts):
+ implicit_fqn = f"{parts[0]}:{parts[1]}.{parts[2]}"
+
+ if typ is None:
+ # :any: lookup, search everything:
+ objtypes: List[str] = list(self.object_types)
+ else:
+ # type is specified and will be a role (e.g. obj, mod, cmd)
+ # convert this to eligible object types (e.g. command, module)
+ # using the QAPIDomain.object_types table.
+ objtypes = self.objtypes_for_role(typ, [])
+
+ # ##
+ # search!
+ # ##
+
+ def _search(needle: str) -> List[str]:
+ if (
+ needle
+ and needle in self.objects
+ and self.objects[needle].objtype in objtypes
+ ):
+ return [needle]
+ return []
+
+ if found := _search(name):
+ # Exact match!
+ pass
+ elif found := _search(implicit_fqn):
+ # Exact match using contextual information to fill in the gaps.
+ pass
+ else:
+ # No exact hits, perform applicable fuzzy searches.
+ searches = []
+
+ esc = tuple(re.escape(s) for s in parts)
+
+ # Try searching for ns:*.name or ns:name
+ if explicit[0] and not explicit[1]:
+ searches.append(f"^{esc[0]}:([^\\.]+\\.)?{esc[2]}$")
+ # Try searching for *:module.name or module.name
+ if explicit[1] and not explicit[0]:
+ searches.append(f"(^|:){esc[1]}\\.{esc[2]}$")
+ # Try searching for context-ns:*.name or context-ns:name
+ if parts[0] and not (explicit[0] or explicit[1]):
+ searches.append(f"^{esc[0]}:([^\\.]+\\.)?{esc[2]}$")
+ # Try searching for *:context-mod.name or context-mod.name
+ if parts[1] and not (explicit[0] or explicit[1]):
+ searches.append(f"(^|:){esc[1]}\\.{esc[2]}$")
+ # Try searching for *:name, *.name, or name
+ if not (explicit[0] or explicit[1]):
+ searches.append(f"(^|:|\\.){esc[2]}$")
+
+ for search in searches:
+ if found := [
+ oname
+ for oname in self.objects
+ if re.search(search, oname)
+ and self.objects[oname].objtype in objtypes
+ ]:
+ break
+
+ matches = [(oname, self.objects[oname]) for oname in found]
+ if len(matches) > 1:
+ matches = [m for m in matches if not m[1].aliased]
+ return matches
+
+ def resolve_xref(
+ self,
+ env: BuildEnvironment,
+ fromdocname: str,
+ builder: Builder,
+ typ: str,
+ target: str,
+ node: pending_xref,
+ contnode: Element,
+ ) -> nodes.reference | None:
+ namespace = node.get("qapi:namespace")
+ modname = node.get("qapi:module")
+ matches = self.find_obj(namespace, modname, target, typ)
+
+ if not matches:
+ # Normally, we could pass warn_dangling=True to QAPIXRefRole(),
+ # but that will trigger on references to these built-in types,
+ # which we'd like to ignore instead.
+
+ # Take care of that warning here instead, so long as the
+ # reference isn't to one of our built-in core types.
+ if target not in (
+ "string",
+ "number",
+ "int",
+ "boolean",
+ "null",
+ "value",
+ "q_empty",
+ ):
+ logger.warning(
+ __("qapi:%s reference target not found: %r"),
+ typ,
+ target,
+ type="ref",
+ subtype="qapi",
+ location=node,
+ )
+ return None
+
+ if len(matches) > 1:
+ logger.warning(
+ __("more than one target found for cross-reference %r: %s"),
+ target,
+ ", ".join(match[0] for match in matches),
+ type="ref",
+ subtype="qapi",
+ location=node,
+ )
+
+ name, obj = matches[0]
+ return make_refnode(
+ builder, fromdocname, obj.docname, obj.node_id, contnode, name
+ )
+
+ def resolve_any_xref(
+ self,
+ env: BuildEnvironment,
+ fromdocname: str,
+ builder: Builder,
+ target: str,
+ node: pending_xref,
+ contnode: Element,
+ ) -> List[Tuple[str, nodes.reference]]:
+ results: List[Tuple[str, nodes.reference]] = []
+ matches = self.find_obj(
+ node.get("qapi:namespace"), node.get("qapi:module"), target, None
+ )
+ for name, obj in matches:
+ rolename = self.role_for_objtype(obj.objtype)
+ assert rolename is not None
+ role = f"qapi:{rolename}"
+ refnode = make_refnode(
+ builder, fromdocname, obj.docname, obj.node_id, contnode, name
+ )
+ results.append((role, refnode))
+ return results
+
+
+def setup(app: Sphinx) -> Dict[str, Any]:
+ app.setup_extension("sphinx.directives")
+ app.add_config_value(
+ "qapi_allowed_fields",
+ set(),
+ "env", # Setting impacts parsing phase
+ types=set,
+ )
+ app.add_config_value(
+ "qapi_namespaces",
+ set(),
+ "env",
+ types=set,
+ )
+ app.add_domain(QAPIDomain)
+
+ return {
+ "version": "1.0",
+ "env_version": 1,
+ "parallel_read_safe": True,
+ "parallel_write_safe": True,
+ }
diff --git a/docs/sphinx/qapidoc.py b/docs/sphinx/qapidoc.py
index 738b245..8011ac9 100644
--- a/docs/sphinx/qapidoc.py
+++ b/docs/sphinx/qapidoc.py
@@ -2,6 +2,7 @@
#
# QEMU qapidoc QAPI file parsing extension
#
+# Copyright (c) 2024-2025 Red Hat
# Copyright (c) 2020 Linaro
#
# This work is licensed under the terms of the GNU GPLv2 or later.
@@ -24,446 +25,438 @@ The Sphinx documentation on writing extensions is at:
https://www.sphinx-doc.org/en/master/development/index.html
"""
+from __future__ import annotations
+
+
+__version__ = "2.0"
+
+from contextlib import contextmanager
import os
+from pathlib import Path
import re
import sys
-import textwrap
-from typing import List
+from typing import TYPE_CHECKING
from docutils import nodes
-from docutils.parsers.rst import Directive, directives
-from docutils.statemachine import ViewList
-from qapi.error import QAPIError, QAPISemError
-from qapi.gen import QAPISchemaVisitor
-from qapi.schema import QAPISchema
-
+from docutils.parsers.rst import directives
+from docutils.statemachine import StringList
+from qapi.error import QAPIError
+from qapi.parser import QAPIDoc
+from qapi.schema import (
+ QAPISchema,
+ QAPISchemaArrayType,
+ QAPISchemaCommand,
+ QAPISchemaDefinition,
+ QAPISchemaEnumMember,
+ QAPISchemaEvent,
+ QAPISchemaFeature,
+ QAPISchemaMember,
+ QAPISchemaObjectType,
+ QAPISchemaObjectTypeMember,
+ QAPISchemaType,
+ QAPISchemaVisitor,
+)
+from qapi.source import QAPISourceInfo
from sphinx import addnodes
from sphinx.directives.code import CodeBlock
from sphinx.errors import ExtensionError
-from sphinx.util.docutils import switch_source_input
+from sphinx.util import logging
+from sphinx.util.docutils import SphinxDirective, switch_source_input
from sphinx.util.nodes import nested_parse_with_titles
+from qapidoc_legacy import QAPISchemaGenRSTVisitor # type: ignore
-__version__ = "1.0"
+if TYPE_CHECKING:
+ from typing import (
+ Any,
+ Generator,
+ List,
+ Optional,
+ Sequence,
+ Union,
+ )
-def dedent(text: str) -> str:
- # Adjust indentation to make description text parse as paragraph.
+ from sphinx.application import Sphinx
+ from sphinx.util.typing import ExtensionMetadata
- lines = text.splitlines(True)
- if re.match(r"\s+", lines[0]):
- # First line is indented; description started on the line after
- # the name. dedent the whole block.
- return textwrap.dedent(text)
- # Descr started on same line. Dedent line 2+.
- return lines[0] + textwrap.dedent("".join(lines[1:]))
+logger = logging.getLogger(__name__)
-# Disable black auto-formatter until re-enabled:
-# fmt: off
+class Transmogrifier:
+ # pylint: disable=too-many-public-methods
+ # Field names used for different entity types:
+ field_types = {
+ "enum": "value",
+ "struct": "memb",
+ "union": "memb",
+ "event": "memb",
+ "command": "arg",
+ "alternate": "alt",
+ }
-class QAPISchemaGenRSTVisitor(QAPISchemaVisitor):
- """A QAPI schema visitor which generates docutils/Sphinx nodes
-
- This class builds up a tree of docutils/Sphinx nodes corresponding
- to documentation for the various QAPI objects. To use it, first
- create a QAPISchemaGenRSTVisitor object, and call its
- visit_begin() method. Then you can call one of the two methods
- 'freeform' (to add documentation for a freeform documentation
- chunk) or 'symbol' (to add documentation for a QAPI symbol). These
- will cause the visitor to build up the tree of document
- nodes. Once you've added all the documentation via 'freeform' and
- 'symbol' method calls, you can call 'get_document_nodes' to get
- the final list of document nodes (in a form suitable for returning
- from a Sphinx directive's 'run' method).
- """
- def __init__(self, sphinx_directive):
- self._cur_doc = None
- self._sphinx_directive = sphinx_directive
- self._top_node = nodes.section()
- self._active_headings = [self._top_node]
-
- def _make_dlitem(self, term, defn):
- """Return a dlitem node with the specified term and definition.
-
- term should be a list of Text and literal nodes.
- defn should be one of:
- - a string, which will be handed to _parse_text_into_node
- - a list of Text and literal nodes, which will be put into
- a paragraph node
- """
- dlitem = nodes.definition_list_item()
- dlterm = nodes.term('', '', *term)
- dlitem += dlterm
- if defn:
- dldef = nodes.definition()
- if isinstance(defn, list):
- dldef += nodes.paragraph('', '', *defn)
- else:
- self._parse_text_into_node(defn, dldef)
- dlitem += dldef
- return dlitem
-
- def _make_section(self, title):
- """Return a section node with optional title"""
- section = nodes.section(ids=[self._sphinx_directive.new_serialno()])
- if title:
- section += nodes.title(title, title)
- return section
-
- def _nodes_for_ifcond(self, ifcond, with_if=True):
- """Return list of Text, literal nodes for the ifcond
-
- Return a list which gives text like ' (If: condition)'.
- If with_if is False, we don't return the "(If: " and ")".
- """
-
- doc = ifcond.docgen()
- if not doc:
- return []
- doc = nodes.literal('', doc)
- if not with_if:
- return [doc]
+ def __init__(self) -> None:
+ self._curr_ent: Optional[QAPISchemaDefinition] = None
+ self._result = StringList()
+ self.indent = 0
- nodelist = [nodes.Text(' ('), nodes.strong('', 'If: ')]
- nodelist.append(doc)
- nodelist.append(nodes.Text(')'))
- return nodelist
+ @property
+ def result(self) -> StringList:
+ return self._result
- def _nodes_for_one_member(self, member):
- """Return list of Text, literal nodes for this member
+ @property
+ def entity(self) -> QAPISchemaDefinition:
+ assert self._curr_ent is not None
+ return self._curr_ent
- Return a list of doctree nodes which give text like
- 'name: type (optional) (If: ...)' suitable for use as the
- 'term' part of a definition list item.
- """
- term = [nodes.literal('', member.name)]
- if member.type.doc_type():
- term.append(nodes.Text(': '))
- term.append(nodes.literal('', member.type.doc_type()))
- if member.optional:
- term.append(nodes.Text(' (optional)'))
- if member.ifcond.is_present():
- term.extend(self._nodes_for_ifcond(member.ifcond))
- return term
-
- def _nodes_for_variant_when(self, branches, variant):
- """Return list of Text, literal nodes for variant 'when' clause
-
- Return a list of doctree nodes which give text like
- 'when tagname is variant (If: ...)' suitable for use in
- the 'branches' part of a definition list.
- """
- term = [nodes.Text(' when '),
- nodes.literal('', branches.tag_member.name),
- nodes.Text(' is '),
- nodes.literal('', '"%s"' % variant.name)]
- if variant.ifcond.is_present():
- term.extend(self._nodes_for_ifcond(variant.ifcond))
- return term
-
- def _nodes_for_members(self, doc, what, base=None, branches=None):
- """Return list of doctree nodes for the table of members"""
- dlnode = nodes.definition_list()
- for section in doc.args.values():
- term = self._nodes_for_one_member(section.member)
- # TODO drop fallbacks when undocumented members are outlawed
- if section.text:
- defn = dedent(section.text)
- else:
- defn = [nodes.Text('Not documented')]
+ @property
+ def member_field_type(self) -> str:
+ return self.field_types[self.entity.meta]
- dlnode += self._make_dlitem(term, defn)
+ # General-purpose rST generation functions
- if base:
- dlnode += self._make_dlitem([nodes.Text('The members of '),
- nodes.literal('', base.doc_type())],
- None)
+ def get_indent(self) -> str:
+ return " " * self.indent
- if branches:
- for v in branches.variants:
- if v.type.name == 'q_empty':
- continue
- assert not v.type.is_implicit()
- term = [nodes.Text('The members of '),
- nodes.literal('', v.type.doc_type())]
- term.extend(self._nodes_for_variant_when(branches, v))
- dlnode += self._make_dlitem(term, None)
-
- if not dlnode.children:
- return []
-
- section = self._make_section(what)
- section += dlnode
- return [section]
-
- def _nodes_for_enum_values(self, doc):
- """Return list of doctree nodes for the table of enum values"""
- seen_item = False
- dlnode = nodes.definition_list()
- for section in doc.args.values():
- termtext = [nodes.literal('', section.member.name)]
- if section.member.ifcond.is_present():
- termtext.extend(self._nodes_for_ifcond(section.member.ifcond))
- # TODO drop fallbacks when undocumented members are outlawed
- if section.text:
- defn = dedent(section.text)
- else:
- defn = [nodes.Text('Not documented')]
-
- dlnode += self._make_dlitem(termtext, defn)
- seen_item = True
-
- if not seen_item:
- return []
-
- section = self._make_section('Values')
- section += dlnode
- return [section]
-
- def _nodes_for_arguments(self, doc, arg_type):
- """Return list of doctree nodes for the arguments section"""
- if arg_type and not arg_type.is_implicit():
- assert not doc.args
- section = self._make_section('Arguments')
- dlnode = nodes.definition_list()
- dlnode += self._make_dlitem(
- [nodes.Text('The members of '),
- nodes.literal('', arg_type.name)],
- None)
- section += dlnode
- return [section]
-
- return self._nodes_for_members(doc, 'Arguments')
-
- def _nodes_for_features(self, doc):
- """Return list of doctree nodes for the table of features"""
- seen_item = False
- dlnode = nodes.definition_list()
- for section in doc.features.values():
- dlnode += self._make_dlitem(
- [nodes.literal('', section.member.name)], dedent(section.text))
- seen_item = True
-
- if not seen_item:
- return []
-
- section = self._make_section('Features')
- section += dlnode
- return [section]
-
- def _nodes_for_example(self, exampletext):
- """Return list of doctree nodes for a code example snippet"""
- return [nodes.literal_block(exampletext, exampletext)]
-
- def _nodes_for_sections(self, doc):
- """Return list of doctree nodes for additional sections"""
- nodelist = []
- for section in doc.sections:
- if section.tag and section.tag == 'TODO':
- # Hide TODO: sections
- continue
-
- if not section.tag:
- # Sphinx cannot handle sectionless titles;
- # Instead, just append the results to the prior section.
- container = nodes.container()
- self._parse_text_into_node(section.text, container)
- nodelist += container.children
- continue
-
- snode = self._make_section(section.tag)
- if section.tag.startswith('Example'):
- snode += self._nodes_for_example(dedent(section.text))
- else:
- self._parse_text_into_node(dedent(section.text), snode)
- nodelist.append(snode)
- return nodelist
-
- def _nodes_for_if_section(self, ifcond):
- """Return list of doctree nodes for the "If" section"""
- nodelist = []
- if ifcond.is_present():
- snode = self._make_section('If')
- snode += nodes.paragraph(
- '', '', *self._nodes_for_ifcond(ifcond, with_if=False)
- )
- nodelist.append(snode)
- return nodelist
-
- def _add_doc(self, typ, sections):
- """Add documentation for a command/object/enum...
-
- We assume we're documenting the thing defined in self._cur_doc.
- typ is the type of thing being added ("Command", "Object", etc)
+ @contextmanager
+ def indented(self) -> Generator[None]:
+ self.indent += 1
+ try:
+ yield
+ finally:
+ self.indent -= 1
- sections is a list of nodes for sections to add to the definition.
- """
+ def add_line_raw(self, line: str, source: str, *lineno: int) -> None:
+ """Append one line of generated reST to the output."""
- doc = self._cur_doc
- snode = nodes.section(ids=[self._sphinx_directive.new_serialno()])
- snode += nodes.title('', '', *[nodes.literal(doc.symbol, doc.symbol),
- nodes.Text(' (' + typ + ')')])
- self._parse_text_into_node(doc.body.text, snode)
- for s in sections:
- if s is not None:
- snode += s
- self._add_node_to_current_heading(snode)
-
- def visit_enum_type(self, name, info, ifcond, features, members, prefix):
- doc = self._cur_doc
- self._add_doc('Enum',
- self._nodes_for_enum_values(doc)
- + self._nodes_for_features(doc)
- + self._nodes_for_sections(doc)
- + self._nodes_for_if_section(ifcond))
-
- def visit_object_type(self, name, info, ifcond, features,
- base, members, branches):
- doc = self._cur_doc
- if base and base.is_implicit():
- base = None
- self._add_doc('Object',
- self._nodes_for_members(doc, 'Members', base, branches)
- + self._nodes_for_features(doc)
- + self._nodes_for_sections(doc)
- + self._nodes_for_if_section(ifcond))
-
- def visit_alternate_type(self, name, info, ifcond, features,
- alternatives):
- doc = self._cur_doc
- self._add_doc('Alternate',
- self._nodes_for_members(doc, 'Members')
- + self._nodes_for_features(doc)
- + self._nodes_for_sections(doc)
- + self._nodes_for_if_section(ifcond))
-
- def visit_command(self, name, info, ifcond, features, arg_type,
- ret_type, gen, success_response, boxed, allow_oob,
- allow_preconfig, coroutine):
- doc = self._cur_doc
- self._add_doc('Command',
- self._nodes_for_arguments(doc, arg_type)
- + self._nodes_for_features(doc)
- + self._nodes_for_sections(doc)
- + self._nodes_for_if_section(ifcond))
-
- def visit_event(self, name, info, ifcond, features, arg_type, boxed):
- doc = self._cur_doc
- self._add_doc('Event',
- self._nodes_for_arguments(doc, arg_type)
- + self._nodes_for_features(doc)
- + self._nodes_for_sections(doc)
- + self._nodes_for_if_section(ifcond))
-
- def symbol(self, doc, entity):
- """Add documentation for one symbol to the document tree
-
- This is the main entry point which causes us to add documentation
- nodes for a symbol (which could be a 'command', 'object', 'event',
- etc). We do this by calling 'visit' on the schema entity, which
- will then call back into one of our visit_* methods, depending
- on what kind of thing this symbol is.
- """
- self._cur_doc = doc
- entity.visit(self)
- self._cur_doc = None
+ # NB: Sphinx uses zero-indexed lines; subtract one.
+ lineno = tuple((n - 1 for n in lineno))
- def _start_new_heading(self, heading, level):
- """Start a new heading at the specified heading level
+ if line.strip():
+ # not a blank line
+ self._result.append(
+ self.get_indent() + line.rstrip("\n"), source, *lineno
+ )
+ else:
+ self._result.append("", source, *lineno)
+
+ def add_line(self, content: str, info: QAPISourceInfo) -> None:
+ # NB: We *require* an info object; this works out OK because we
+ # don't document built-in objects that don't have
+ # one. Everything else should.
+ self.add_line_raw(content, info.fname, info.line)
+
+ def add_lines(
+ self,
+ content: str,
+ info: QAPISourceInfo,
+ ) -> None:
+ lines = content.splitlines(True)
+ for i, line in enumerate(lines):
+ self.add_line_raw(line, info.fname, info.line + i)
+
+ def ensure_blank_line(self) -> None:
+ # Empty document -- no blank line required.
+ if not self._result:
+ return
+
+ # Last line isn't blank, add one.
+ if self._result[-1].strip(): # pylint: disable=no-member
+ fname, line = self._result.info(-1)
+ assert isinstance(line, int)
+ # New blank line is credited to one-after the current last line.
+ # +2: correct for zero/one index, then increment by one.
+ self.add_line_raw("", fname, line + 2)
+
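+    # Emit a single rST field-list entry. For an assumed command argument
+    # "device" of QAPI type "str", this produces a line like
+    # ":arg str device: the device to act on"; when no type is supplied,
+    # the ":kind name: body" form is emitted instead.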
+ def add_field(
+ self,
+ kind: str,
+ name: str,
+ body: str,
+ info: QAPISourceInfo,
+ typ: Optional[str] = None,
+ ) -> None:
+ if typ:
+ text = f":{kind} {typ} {name}: {body}"
+ else:
+ text = f":{kind} {name}: {body}"
+ self.add_lines(text, info)
+
+ def format_type(
+ self, ent: Union[QAPISchemaDefinition | QAPISchemaMember]
+ ) -> Optional[str]:
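+        # For instance, an optional member whose type is an array of
+        # "str" is formatted as "[str]?"; enum members and features carry
+        # no type annotation, so None is returned for them.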
+ if isinstance(ent, (QAPISchemaEnumMember, QAPISchemaFeature)):
+ return None
+
+ qapi_type = ent
+ optional = False
+ if isinstance(ent, QAPISchemaObjectTypeMember):
+ qapi_type = ent.type
+ optional = ent.optional
+
+ if isinstance(qapi_type, QAPISchemaArrayType):
+ ret = f"[{qapi_type.element_type.doc_type()}]"
+ else:
+ assert isinstance(qapi_type, QAPISchemaType)
+ tmp = qapi_type.doc_type()
+ assert tmp
+ ret = tmp
+ if optional:
+ ret += "?"
+
+ return ret
+
+ def generate_field(
+ self,
+ kind: str,
+ member: QAPISchemaMember,
+ body: str,
+ info: QAPISourceInfo,
+ ) -> None:
+ typ = self.format_type(member)
+ self.add_field(kind, member.name, body, info, typ)
+
+ # Transmogrification helpers
+
+ def visit_paragraph(self, section: QAPIDoc.Section) -> None:
+ # Squelch empty paragraphs.
+ if not section.text:
+ return
+
+ self.ensure_blank_line()
+ self.add_lines(section.text, section.info)
+ self.ensure_blank_line()
+
+ def visit_member(self, section: QAPIDoc.ArgSection) -> None:
+ # FIXME: ifcond for members
+ # TODO: features for members (documented at entity-level,
+ # but sometimes defined per-member. Should we add such
+ # information to member descriptions when we can?)
+ assert section.member
+ self.generate_field(
+ self.member_field_type,
+ section.member,
+ # TODO drop fallbacks when undocumented members are outlawed
+ section.text if section.text else "Not documented",
+ section.info,
+ )
- Create a new section whose title is 'heading' and which is placed
- in the docutils node tree as a child of the most recent level-1
- heading. Subsequent document sections (commands, freeform doc chunks,
- etc) will be placed as children of this new heading section.
+ def visit_feature(self, section: QAPIDoc.ArgSection) -> None:
+ # FIXME - ifcond for features is not handled at all yet!
+ # Proposal: decorate the right-hand column with some graphical
+ # element to indicate conditional availability?
+ assert section.text # Guaranteed by parser.py
+ assert section.member
+
+ self.generate_field("feat", section.member, section.text, section.info)
+
+ def visit_returns(self, section: QAPIDoc.Section) -> None:
+ assert isinstance(self.entity, QAPISchemaCommand)
+ rtype = self.entity.ret_type
+ # q_empty can produce None, but we won't be documenting anything
+ # without an explicit return statement in the doc block, and we
+ # should not have any such explicit statements when there is no
+ # return value.
+ assert rtype
+
+ typ = self.format_type(rtype)
+ assert typ
+ assert section.text
+ self.add_field("return", typ, section.text, section.info)
+
+ def visit_errors(self, section: QAPIDoc.Section) -> None:
+ # FIXME: the formatting for errors may be inconsistent and may
+ # or may not require different newline placement to ensure
+ # proper rendering as a nested list.
+ self.add_lines(f":error:\n{section.text}", section.info)
+
+ def preamble(self, ent: QAPISchemaDefinition) -> None:
"""
- if len(self._active_headings) < level:
- raise QAPISemError(self._cur_doc.info,
- 'Level %d subheading found outside a '
- 'level %d heading'
- % (level, level - 1))
- snode = self._make_section(heading)
- self._active_headings[level - 1] += snode
- self._active_headings = self._active_headings[:level]
- self._active_headings.append(snode)
-
- def _add_node_to_current_heading(self, node):
- """Add the node to whatever the current active heading is"""
- self._active_headings[-1] += node
-
- def freeform(self, doc):
- """Add a piece of 'freeform' documentation to the document tree
-
- A 'freeform' document chunk doesn't relate to any particular
- symbol (for instance, it could be an introduction).
-
- If the freeform document starts with a line of the form
- '= Heading text', this is a section or subsection heading, with
- the heading level indicated by the number of '=' signs.
+ Generate option lines for QAPI entity directives.
"""
+ if ent.doc and ent.doc.since:
+ assert ent.doc.since.kind == QAPIDoc.Kind.SINCE
+ # Generated from the entity's docblock; info location is exact.
+ self.add_line(f":since: {ent.doc.since.text}", ent.doc.since.info)
+
+ if ent.ifcond.is_present():
+ doc = ent.ifcond.docgen()
+ assert ent.info
+ # Generated from entity definition; info location is approximate.
+ self.add_line(f":ifcond: {doc}", ent.info)
+
+ # Hoist special features such as :deprecated: and :unstable:
+ # into the options block for the entity. If, in the future, new
+ # special features are added, qapi-domain will chirp about
+ # unrecognized options and fail until they are handled in
+ # qapi-domain.
+ for feat in ent.features:
+ if feat.is_special():
+ # FIXME: handle ifcond if present. How to display that
+ # information is TBD.
+ # Generated from entity def; info location is approximate.
+ assert feat.info
+ self.add_line(f":{feat.name}:", feat.info)
+
+ self.ensure_blank_line()
+
+ def _insert_member_pointer(self, ent: QAPISchemaDefinition) -> None:
+
+ def _get_target(
+ ent: QAPISchemaDefinition,
+ ) -> Optional[QAPISchemaDefinition]:
+ if isinstance(ent, (QAPISchemaCommand, QAPISchemaEvent)):
+ return ent.arg_type
+ if isinstance(ent, QAPISchemaObjectType):
+ return ent.base
+ return None
+
+ target = _get_target(ent)
+ if target is not None and not target.is_implicit():
+ assert ent.info
+ self.add_field(
+ self.member_field_type,
+ "q_dummy",
+ f"The members of :qapi:type:`{target.name}`.",
+ ent.info,
+ "q_dummy",
+ )
- # QAPIDoc documentation says free-form documentation blocks
- # must have only a body section, nothing else.
- assert not doc.sections
- assert not doc.args
- assert not doc.features
- self._cur_doc = doc
-
- text = doc.body.text
- if re.match(r'=+ ', text):
- # Section/subsection heading (if present, will always be
- # the first line of the block)
- (heading, _, text) = text.partition('\n')
- (leader, _, heading) = heading.partition(' ')
- self._start_new_heading(heading, len(leader))
- if text == '':
- return
-
- node = self._make_section(None)
- self._parse_text_into_node(text, node)
- self._add_node_to_current_heading(node)
- self._cur_doc = None
-
- def _parse_text_into_node(self, doctext, node):
- """Parse a chunk of QAPI-doc-format text into the node
-
- The doc comment can contain most inline rST markup, including
- bulleted and enumerated lists.
- As an extra permitted piece of markup, @var will be turned
- into ``var``.
- """
+ if isinstance(ent, QAPISchemaObjectType) and ent.branches is not None:
+ for variant in ent.branches.variants:
+ if variant.type.name == "q_empty":
+ continue
+ assert ent.info
+ self.add_field(
+ self.member_field_type,
+ "q_dummy",
+ f" When ``{ent.branches.tag_member.name}`` is "
+ f"``{variant.name}``: "
+ f"The members of :qapi:type:`{variant.type.name}`.",
+ ent.info,
+ "q_dummy",
+ )
+
+ def visit_sections(self, ent: QAPISchemaDefinition) -> None:
+ sections = ent.doc.all_sections if ent.doc else []
+
+ # Determine the index location at which we should generate
+ # documentation for "The members of ..." pointers. This should
+ # go at the end of the members section(s) if any. Note that
+ # index 0 is assumed to be a plain intro section, even if it is
+ # empty; and that a members section if present will always
+ # immediately follow the opening PLAIN section.
+ gen_index = 1
+ if len(sections) > 1:
+ while sections[gen_index].kind == QAPIDoc.Kind.MEMBER:
+ gen_index += 1
+ if gen_index >= len(sections):
+ break
+
+ # Add sections in source order:
+ for i, section in enumerate(sections):
+ # @var is translated to ``var``:
+ section.text = re.sub(r"@([\w-]+)", r"``\1``", section.text)
+
+ if section.kind == QAPIDoc.Kind.PLAIN:
+ self.visit_paragraph(section)
+ elif section.kind == QAPIDoc.Kind.MEMBER:
+ assert isinstance(section, QAPIDoc.ArgSection)
+ self.visit_member(section)
+ elif section.kind == QAPIDoc.Kind.FEATURE:
+ assert isinstance(section, QAPIDoc.ArgSection)
+ self.visit_feature(section)
+ elif section.kind in (QAPIDoc.Kind.SINCE, QAPIDoc.Kind.TODO):
+ # Since is handled in preamble, TODO is skipped intentionally.
+ pass
+ elif section.kind == QAPIDoc.Kind.RETURNS:
+ self.visit_returns(section)
+ elif section.kind == QAPIDoc.Kind.ERRORS:
+ self.visit_errors(section)
+ else:
+ assert False
+
+ # Generate "The members of ..." entries if necessary:
+ if i == gen_index - 1:
+ self._insert_member_pointer(ent)
+
+ self.ensure_blank_line()
+
+ # Transmogrification core methods
+
+ def visit_module(self, path: str) -> None:
+ name = Path(path).stem
+ # module directives are credited to the first line of a module file.
+ self.add_line_raw(f".. qapi:module:: {name}", path, 1)
+ self.ensure_blank_line()
+
+ def visit_freeform(self, doc: QAPIDoc) -> None:
+ # TODO: Once the old qapidoc transformer is deprecated, freeform
+        # sections can be updated to pure rST, and this transform removed.
+ #
+ # For now, translate our micro-format into rST. Code adapted
+ # from Peter Maydell's freeform().
+
+ assert len(doc.all_sections) == 1, doc.all_sections
+ body = doc.all_sections[0]
+ text = body.text
+ info = doc.info
+
+ if re.match(r"=+ ", text):
+ # Section/subsection heading (if present, will always be the
+ # first line of the block)
+ (heading, _, text) = text.partition("\n")
+ (leader, _, heading) = heading.partition(" ")
+ # Implicit +1 for heading in the containing .rst doc
+ level = len(leader) + 1
+
+ # https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#sections
+ markers = ' #*=_^"'
+ overline = level <= 2
+ marker = markers[level]
+
+ self.ensure_blank_line()
+ # This credits all 2 or 3 lines to the single source line.
+ if overline:
+ self.add_line(marker * len(heading), info)
+ self.add_line(heading, info)
+ self.add_line(marker * len(heading), info)
+ self.ensure_blank_line()
+
+ # Eat blank line(s) and advance info
+            trimmed = text.lstrip("\n")
+            info = info.next_line(len(text) - len(trimmed) + 1)
+            text = trimmed
+
+ self.add_lines(text, info)
+ self.ensure_blank_line()
+
+ def visit_entity(self, ent: QAPISchemaDefinition) -> None:
+ assert ent.info
- # Handle the "@var means ``var`` case
- doctext = re.sub(r'@([\w-]+)', r'``\1``', doctext)
-
- rstlist = ViewList()
- for line in doctext.splitlines():
- # The reported line number will always be that of the start line
- # of the doc comment, rather than the actual location of the error.
- # Being more precise would require overhaul of the QAPIDoc class
- # to track lines more exactly within all the sub-parts of the doc
- # comment, as well as counting lines here.
- rstlist.append(line, self._cur_doc.info.fname,
- self._cur_doc.info.line)
- # Append a blank line -- in some cases rST syntax errors get
- # attributed to the line after one with actual text, and if there
- # isn't anything in the ViewList corresponding to that then Sphinx
- # 1.6's AutodocReporter will then misidentify the source/line location
- # in the error message (usually attributing it to the top-level
- # .rst file rather than the offending .json file). The extra blank
- # line won't affect the rendered output.
- rstlist.append("", self._cur_doc.info.fname, self._cur_doc.info.line)
- self._sphinx_directive.do_parse(rstlist, node)
-
- def get_document_nodes(self):
- """Return the list of docutils nodes which make up the document"""
- return self._top_node.children
-
-
-# Turn the black formatter on for the rest of the file.
-# fmt: on
+ try:
+ self._curr_ent = ent
+
+ # Squish structs and unions together into an "object" directive.
+ meta = ent.meta
+ if meta in ("struct", "union"):
+ meta = "object"
+
+ # This line gets credited to the start of the /definition/.
+ self.add_line(f".. qapi:{meta}:: {ent.name}", ent.info)
+ with self.indented():
+ self.preamble(ent)
+ self.visit_sections(ent)
+ finally:
+ self._curr_ent = None
+
+ def set_namespace(self, namespace: str, source: str, lineno: int) -> None:
+ self.add_line_raw(
+ f".. qapi:namespace:: {namespace}", source, lineno + 1
+ )
+ self.ensure_blank_line()
class QAPISchemaGenDepVisitor(QAPISchemaVisitor):
@@ -474,22 +467,22 @@ class QAPISchemaGenDepVisitor(QAPISchemaVisitor):
schema file associated with each module in the QAPI input.
"""
- def __init__(self, env, qapidir):
+ def __init__(self, env: Any, qapidir: str) -> None:
self._env = env
self._qapidir = qapidir
- def visit_module(self, name):
+ def visit_module(self, name: str) -> None:
if name != "./builtin":
qapifile = self._qapidir + "/" + name
self._env.note_dependency(os.path.abspath(qapifile))
super().visit_module(name)
-class NestedDirective(Directive):
- def run(self):
+class NestedDirective(SphinxDirective):
+ def run(self) -> Sequence[nodes.Node]:
raise NotImplementedError
- def do_parse(self, rstlist, node):
+ def do_parse(self, rstlist: StringList, node: nodes.Node) -> None:
"""
Parse rST source lines and add them to the specified node
@@ -508,18 +501,110 @@ class QAPIDocDirective(NestedDirective):
required_argument = 1
optional_arguments = 1
- option_spec = {"qapifile": directives.unchanged_required}
+ option_spec = {
+ "qapifile": directives.unchanged_required,
+ "namespace": directives.unchanged,
+ "transmogrify": directives.flag,
+ }
has_content = False
- def new_serialno(self):
+ def new_serialno(self) -> str:
"""Return a unique new ID string suitable for use as a node's ID"""
env = self.state.document.settings.env
return "qapidoc-%d" % env.new_serialno("qapidoc")
- def run(self):
+ def transmogrify(self, schema: QAPISchema) -> nodes.Element:
+ logger.info("Transmogrifying QAPI to rST ...")
+ vis = Transmogrifier()
+ modules = set()
+
+ if "namespace" in self.options:
+ vis.set_namespace(
+ self.options["namespace"], *self.get_source_info()
+ )
+
+ for doc in schema.docs:
+ module_source = doc.info.fname
+ if module_source not in modules:
+ vis.visit_module(module_source)
+ modules.add(module_source)
+
+ if doc.symbol:
+ ent = schema.lookup_entity(doc.symbol)
+ assert isinstance(ent, QAPISchemaDefinition)
+ vis.visit_entity(ent)
+ else:
+ vis.visit_freeform(doc)
+
+ logger.info("Transmogrification complete.")
+
+ contentnode = nodes.section()
+ content = vis.result
+ titles_allowed = True
+
+ logger.info("Transmogrifier running nested parse ...")
+ with switch_source_input(self.state, content):
+ if titles_allowed:
+ node: nodes.Element = nodes.section()
+ node.document = self.state.document
+ nested_parse_with_titles(self.state, content, contentnode)
+ else:
+ node = nodes.paragraph()
+ node.document = self.state.document
+ self.state.nested_parse(content, 0, contentnode)
+ logger.info("Transmogrifier's nested parse completed.")
+
+ if self.env.app.verbosity >= 2 or os.environ.get("DEBUG"):
+ argname = "_".join(Path(self.arguments[0]).parts)
+ name = Path(argname).stem + ".ir"
+ self.write_intermediate(content, name)
+
+ sys.stdout.flush()
+ return contentnode
+
+ def write_intermediate(self, content: StringList, filename: str) -> None:
+ logger.info(
+ "writing intermediate rST for '%s' to '%s'",
+ self.arguments[0],
+ filename,
+ )
+
+ srctree = Path(self.env.app.config.qapidoc_srctree).resolve()
+ outlines = []
+ lcol_width = 0
+
+ for i, line in enumerate(content):
+ src, lineno = content.info(i)
+ srcpath = Path(src).resolve()
+ srcpath = srcpath.relative_to(srctree)
+
+ lcol = f"{srcpath}:{lineno:04d}"
+ lcol_width = max(lcol_width, len(lcol))
+ outlines.append((lcol, line))
+
+ with open(filename, "w", encoding="UTF-8") as outfile:
+ for lcol, rcol in outlines:
+ outfile.write(lcol.rjust(lcol_width))
+ outfile.write(" |")
+ if rcol:
+ outfile.write(f" {rcol}")
+ outfile.write("\n")
+
+ def legacy(self, schema: QAPISchema) -> nodes.Element:
+ vis = QAPISchemaGenRSTVisitor(self)
+ vis.visit_begin(schema)
+ for doc in schema.docs:
+ if doc.symbol:
+ vis.symbol(doc, schema.lookup_entity(doc.symbol))
+ else:
+ vis.freeform(doc)
+ return vis.get_document_node() # type: ignore
+
+ def run(self) -> Sequence[nodes.Node]:
env = self.state.document.settings.env
qapifile = env.config.qapidoc_srctree + "/" + self.arguments[0]
qapidir = os.path.dirname(qapifile)
+ transmogrify = "transmogrify" in self.options
try:
schema = QAPISchema(qapifile)
@@ -527,20 +612,18 @@ class QAPIDocDirective(NestedDirective):
# First tell Sphinx about all the schema files that the
# output documentation depends on (including 'qapifile' itself)
schema.visit(QAPISchemaGenDepVisitor(env, qapidir))
-
- vis = QAPISchemaGenRSTVisitor(self)
- vis.visit_begin(schema)
- for doc in schema.docs:
- if doc.symbol:
- vis.symbol(doc, schema.lookup_entity(doc.symbol))
- else:
- vis.freeform(doc)
- return vis.get_document_nodes()
except QAPIError as err:
# Launder QAPI parse errors into Sphinx extension errors
# so they are displayed nicely to the user
raise ExtensionError(str(err)) from err
+ if transmogrify:
+ contentnode = self.transmogrify(schema)
+ else:
+ contentnode = self.legacy(schema)
+
+ return contentnode.children
+
class QMPExample(CodeBlock, NestedDirective):
"""
@@ -591,7 +674,7 @@ class QMPExample(CodeBlock, NestedDirective):
)
return node
- def admonition_wrap(self, *content) -> List[nodes.Node]:
+ def admonition_wrap(self, *content: nodes.Node) -> List[nodes.Node]:
title = "Example:"
if "title" in self.options:
title = f"{title} {self.options['title']}"
@@ -637,8 +720,9 @@ class QMPExample(CodeBlock, NestedDirective):
return self.admonition_wrap(*content_nodes)
-def setup(app):
+def setup(app: Sphinx) -> ExtensionMetadata:
"""Register qapi-doc directive with Sphinx"""
+ app.setup_extension("qapi_domain")
app.add_config_value("qapidoc_srctree", None, "env")
app.add_directive("qapi-doc", QAPIDocDirective)
app.add_directive("qmp-example", QMPExample)
diff --git a/docs/sphinx/qapidoc_legacy.py b/docs/sphinx/qapidoc_legacy.py
new file mode 100644
index 0000000..13520f4
--- /dev/null
+++ b/docs/sphinx/qapidoc_legacy.py
@@ -0,0 +1,440 @@
+# coding=utf-8
+# type: ignore
+#
+# QEMU qapidoc QAPI file parsing extension
+#
+# Copyright (c) 2020 Linaro
+#
+# This work is licensed under the terms of the GNU GPLv2 or later.
+# See the COPYING file in the top-level directory.
+
+"""
+qapidoc is a Sphinx extension that implements the qapi-doc directive
+
+The purpose of this extension is to read the documentation comments
+in QAPI schema files, and insert them all into the current document.
+
+It implements one new rST directive, "qapi-doc::".
+Each qapi-doc:: directive takes one argument, which is the
+pathname of the schema file to process, relative to the source tree.
+
+The docs/conf.py file must set the qapidoc_srctree config value to
+the root of the QEMU source tree.
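+
+For example (an illustrative sketch, not an excerpt from the QEMU tree),
+a document might include::
+
+    .. qapi-doc:: qapi/qapi-schema.json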
+
+The Sphinx documentation on writing extensions is at:
+https://www.sphinx-doc.org/en/master/development/index.html
+"""
+
+import re
+import textwrap
+
+from docutils import nodes
+from docutils.statemachine import ViewList
+from qapi.error import QAPISemError
+from qapi.gen import QAPISchemaVisitor
+from qapi.parser import QAPIDoc
+
+
+def dedent(text: str) -> str:
+ # Adjust indentation to make description text parse as paragraph.
+
+ lines = text.splitlines(True)
+ if re.match(r"\s+", lines[0]):
+ # First line is indented; description started on the line after
+ # the name. dedent the whole block.
+ return textwrap.dedent(text)
+
+ # Descr started on same line. Dedent line 2+.
+ return lines[0] + textwrap.dedent("".join(lines[1:]))
+
+
+class QAPISchemaGenRSTVisitor(QAPISchemaVisitor):
+ """A QAPI schema visitor which generates docutils/Sphinx nodes
+
+ This class builds up a tree of docutils/Sphinx nodes corresponding
+ to documentation for the various QAPI objects. To use it, first
+ create a QAPISchemaGenRSTVisitor object, and call its
+ visit_begin() method. Then you can call one of the two methods
+ 'freeform' (to add documentation for a freeform documentation
+ chunk) or 'symbol' (to add documentation for a QAPI symbol). These
+ will cause the visitor to build up the tree of document
+ nodes. Once you've added all the documentation via 'freeform' and
+    'symbol' method calls, you can call 'get_document_node' to get
+    the root docutils node whose children make up the generated
+    documentation.
+ """
+ def __init__(self, sphinx_directive):
+ self._cur_doc = None
+ self._sphinx_directive = sphinx_directive
+ self._top_node = nodes.section()
+ self._active_headings = [self._top_node]
+
+ def _make_dlitem(self, term, defn):
+ """Return a dlitem node with the specified term and definition.
+
+ term should be a list of Text and literal nodes.
+ defn should be one of:
+ - a string, which will be handed to _parse_text_into_node
+ - a list of Text and literal nodes, which will be put into
+ a paragraph node
+ """
+ dlitem = nodes.definition_list_item()
+ dlterm = nodes.term('', '', *term)
+ dlitem += dlterm
+ if defn:
+ dldef = nodes.definition()
+ if isinstance(defn, list):
+ dldef += nodes.paragraph('', '', *defn)
+ else:
+ self._parse_text_into_node(defn, dldef)
+ dlitem += dldef
+ return dlitem
+
+ def _make_section(self, title):
+ """Return a section node with optional title"""
+ section = nodes.section(ids=[self._sphinx_directive.new_serialno()])
+ if title:
+ section += nodes.title(title, title)
+ return section
+
+ def _nodes_for_ifcond(self, ifcond, with_if=True):
+ """Return list of Text, literal nodes for the ifcond
+
+ Return a list which gives text like ' (If: condition)'.
+ If with_if is False, we don't return the "(If: " and ")".
+ """
+
+ doc = ifcond.docgen()
+ if not doc:
+ return []
+ doc = nodes.literal('', doc)
+ if not with_if:
+ return [doc]
+
+ nodelist = [nodes.Text(' ('), nodes.strong('', 'If: ')]
+ nodelist.append(doc)
+ nodelist.append(nodes.Text(')'))
+ return nodelist
+
+ def _nodes_for_one_member(self, member):
+ """Return list of Text, literal nodes for this member
+
+ Return a list of doctree nodes which give text like
+ 'name: type (optional) (If: ...)' suitable for use as the
+ 'term' part of a definition list item.
+ """
+ term = [nodes.literal('', member.name)]
+ if member.type.doc_type():
+ term.append(nodes.Text(': '))
+ term.append(nodes.literal('', member.type.doc_type()))
+ if member.optional:
+ term.append(nodes.Text(' (optional)'))
+ if member.ifcond.is_present():
+ term.extend(self._nodes_for_ifcond(member.ifcond))
+ return term
+
+ def _nodes_for_variant_when(self, branches, variant):
+ """Return list of Text, literal nodes for variant 'when' clause
+
+ Return a list of doctree nodes which give text like
+ 'when tagname is variant (If: ...)' suitable for use in
+ the 'branches' part of a definition list.
+ """
+ term = [nodes.Text(' when '),
+ nodes.literal('', branches.tag_member.name),
+ nodes.Text(' is '),
+ nodes.literal('', '"%s"' % variant.name)]
+ if variant.ifcond.is_present():
+ term.extend(self._nodes_for_ifcond(variant.ifcond))
+ return term
+
+ def _nodes_for_members(self, doc, what, base=None, branches=None):
+ """Return list of doctree nodes for the table of members"""
+ dlnode = nodes.definition_list()
+ for section in doc.args.values():
+ term = self._nodes_for_one_member(section.member)
+ # TODO drop fallbacks when undocumented members are outlawed
+ if section.text:
+ defn = dedent(section.text)
+ else:
+ defn = [nodes.Text('Not documented')]
+
+ dlnode += self._make_dlitem(term, defn)
+
+ if base:
+ dlnode += self._make_dlitem([nodes.Text('The members of '),
+ nodes.literal('', base.doc_type())],
+ None)
+
+ if branches:
+ for v in branches.variants:
+ if v.type.name == 'q_empty':
+ continue
+ assert not v.type.is_implicit()
+ term = [nodes.Text('The members of '),
+ nodes.literal('', v.type.doc_type())]
+ term.extend(self._nodes_for_variant_when(branches, v))
+ dlnode += self._make_dlitem(term, None)
+
+ if not dlnode.children:
+ return []
+
+ section = self._make_section(what)
+ section += dlnode
+ return [section]
+
+ def _nodes_for_enum_values(self, doc):
+ """Return list of doctree nodes for the table of enum values"""
+ seen_item = False
+ dlnode = nodes.definition_list()
+ for section in doc.args.values():
+ termtext = [nodes.literal('', section.member.name)]
+ if section.member.ifcond.is_present():
+ termtext.extend(self._nodes_for_ifcond(section.member.ifcond))
+ # TODO drop fallbacks when undocumented members are outlawed
+ if section.text:
+ defn = dedent(section.text)
+ else:
+ defn = [nodes.Text('Not documented')]
+
+ dlnode += self._make_dlitem(termtext, defn)
+ seen_item = True
+
+ if not seen_item:
+ return []
+
+ section = self._make_section('Values')
+ section += dlnode
+ return [section]
+
+ def _nodes_for_arguments(self, doc, arg_type):
+ """Return list of doctree nodes for the arguments section"""
+ if arg_type and not arg_type.is_implicit():
+ assert not doc.args
+ section = self._make_section('Arguments')
+ dlnode = nodes.definition_list()
+ dlnode += self._make_dlitem(
+ [nodes.Text('The members of '),
+ nodes.literal('', arg_type.name)],
+ None)
+ section += dlnode
+ return [section]
+
+ return self._nodes_for_members(doc, 'Arguments')
+
+ def _nodes_for_features(self, doc):
+ """Return list of doctree nodes for the table of features"""
+ seen_item = False
+ dlnode = nodes.definition_list()
+ for section in doc.features.values():
+ dlnode += self._make_dlitem(
+ [nodes.literal('', section.member.name)], dedent(section.text))
+ seen_item = True
+
+ if not seen_item:
+ return []
+
+ section = self._make_section('Features')
+ section += dlnode
+ return [section]
+
+ def _nodes_for_sections(self, doc):
+ """Return list of doctree nodes for additional sections"""
+ nodelist = []
+ for section in doc.sections:
+ if section.kind == QAPIDoc.Kind.TODO:
+ # Hide TODO: sections
+ continue
+
+ if section.kind == QAPIDoc.Kind.PLAIN:
+ # Sphinx cannot handle sectionless titles;
+ # Instead, just append the results to the prior section.
+ container = nodes.container()
+ self._parse_text_into_node(section.text, container)
+ nodelist += container.children
+ continue
+
+ snode = self._make_section(section.kind.name.title())
+ self._parse_text_into_node(dedent(section.text), snode)
+ nodelist.append(snode)
+ return nodelist
+
+ def _nodes_for_if_section(self, ifcond):
+ """Return list of doctree nodes for the "If" section"""
+ nodelist = []
+ if ifcond.is_present():
+ snode = self._make_section('If')
+ snode += nodes.paragraph(
+ '', '', *self._nodes_for_ifcond(ifcond, with_if=False)
+ )
+ nodelist.append(snode)
+ return nodelist
+
+ def _add_doc(self, typ, sections):
+ """Add documentation for a command/object/enum...
+
+ We assume we're documenting the thing defined in self._cur_doc.
+ typ is the type of thing being added ("Command", "Object", etc)
+
+ sections is a list of nodes for sections to add to the definition.
+ """
+
+ doc = self._cur_doc
+ snode = nodes.section(ids=[self._sphinx_directive.new_serialno()])
+ snode += nodes.title('', '', *[nodes.literal(doc.symbol, doc.symbol),
+ nodes.Text(' (' + typ + ')')])
+ self._parse_text_into_node(doc.body.text, snode)
+ for s in sections:
+ if s is not None:
+ snode += s
+ self._add_node_to_current_heading(snode)
+
+ def visit_enum_type(self, name, info, ifcond, features, members, prefix):
+ doc = self._cur_doc
+ self._add_doc('Enum',
+ self._nodes_for_enum_values(doc)
+ + self._nodes_for_features(doc)
+ + self._nodes_for_sections(doc)
+ + self._nodes_for_if_section(ifcond))
+
+ def visit_object_type(self, name, info, ifcond, features,
+ base, members, branches):
+ doc = self._cur_doc
+ if base and base.is_implicit():
+ base = None
+ self._add_doc('Object',
+ self._nodes_for_members(doc, 'Members', base, branches)
+ + self._nodes_for_features(doc)
+ + self._nodes_for_sections(doc)
+ + self._nodes_for_if_section(ifcond))
+
+ def visit_alternate_type(self, name, info, ifcond, features,
+ alternatives):
+ doc = self._cur_doc
+ self._add_doc('Alternate',
+ self._nodes_for_members(doc, 'Members')
+ + self._nodes_for_features(doc)
+ + self._nodes_for_sections(doc)
+ + self._nodes_for_if_section(ifcond))
+
+ def visit_command(self, name, info, ifcond, features, arg_type,
+ ret_type, gen, success_response, boxed, allow_oob,
+ allow_preconfig, coroutine):
+ doc = self._cur_doc
+ self._add_doc('Command',
+ self._nodes_for_arguments(doc, arg_type)
+ + self._nodes_for_features(doc)
+ + self._nodes_for_sections(doc)
+ + self._nodes_for_if_section(ifcond))
+
+ def visit_event(self, name, info, ifcond, features, arg_type, boxed):
+ doc = self._cur_doc
+ self._add_doc('Event',
+ self._nodes_for_arguments(doc, arg_type)
+ + self._nodes_for_features(doc)
+ + self._nodes_for_sections(doc)
+ + self._nodes_for_if_section(ifcond))
+
+ def symbol(self, doc, entity):
+ """Add documentation for one symbol to the document tree
+
+ This is the main entry point which causes us to add documentation
+ nodes for a symbol (which could be a 'command', 'object', 'event',
+ etc). We do this by calling 'visit' on the schema entity, which
+ will then call back into one of our visit_* methods, depending
+ on what kind of thing this symbol is.
+ """
+ self._cur_doc = doc
+ entity.visit(self)
+ self._cur_doc = None
+
+ def _start_new_heading(self, heading, level):
+ """Start a new heading at the specified heading level
+
+ Create a new section whose title is 'heading' and which is placed
+ in the docutils node tree as a child of the most recent level-1
+ heading. Subsequent document sections (commands, freeform doc chunks,
+ etc) will be placed as children of this new heading section.
+ """
+ if len(self._active_headings) < level:
+ raise QAPISemError(self._cur_doc.info,
+ 'Level %d subheading found outside a '
+ 'level %d heading'
+ % (level, level - 1))
+ snode = self._make_section(heading)
+ self._active_headings[level - 1] += snode
+ self._active_headings = self._active_headings[:level]
+ self._active_headings.append(snode)
+ return snode
+
+ def _add_node_to_current_heading(self, node):
+ """Add the node to whatever the current active heading is"""
+ self._active_headings[-1] += node
+
+ def freeform(self, doc):
+ """Add a piece of 'freeform' documentation to the document tree
+
+ A 'freeform' document chunk doesn't relate to any particular
+ symbol (for instance, it could be an introduction).
+
+ If the freeform document starts with a line of the form
+ '= Heading text', this is a section or subsection heading, with
+ the heading level indicated by the number of '=' signs.
+ """
+
+ # QAPIDoc documentation says free-form documentation blocks
+ # must have only a body section, nothing else.
+ assert not doc.sections
+ assert not doc.args
+ assert not doc.features
+ self._cur_doc = doc
+
+ text = doc.body.text
+ if re.match(r'=+ ', text):
+ # Section/subsection heading (if present, will always be
+ # the first line of the block)
+ (heading, _, text) = text.partition('\n')
+ (leader, _, heading) = heading.partition(' ')
+ node = self._start_new_heading(heading, len(leader))
+ if text == '':
+ return
+ else:
+ node = nodes.container()
+
+ self._parse_text_into_node(text, node)
+ self._cur_doc = None
+
+ def _parse_text_into_node(self, doctext, node):
+ """Parse a chunk of QAPI-doc-format text into the node
+
+ The doc comment can contain most inline rST markup, including
+ bulleted and enumerated lists.
+ As an extra permitted piece of markup, @var will be turned
+ into ``var``.
+ """
+
+        # Handle the "@var means ``var``" case
+ doctext = re.sub(r'@([\w-]+)', r'``\1``', doctext)
+
+ rstlist = ViewList()
+ for line in doctext.splitlines():
+ # The reported line number will always be that of the start line
+ # of the doc comment, rather than the actual location of the error.
+ # Being more precise would require overhaul of the QAPIDoc class
+ # to track lines more exactly within all the sub-parts of the doc
+ # comment, as well as counting lines here.
+ rstlist.append(line, self._cur_doc.info.fname,
+ self._cur_doc.info.line)
+ # Append a blank line -- in some cases rST syntax errors get
+ # attributed to the line after one with actual text, and if there
+ # isn't anything in the ViewList corresponding to that then Sphinx
+ # 1.6's AutodocReporter will then misidentify the source/line location
+ # in the error message (usually attributing it to the top-level
+ # .rst file rather than the offending .json file). The extra blank
+ # line won't affect the rendered output.
+ rstlist.append("", self._cur_doc.info.fname, self._cur_doc.info.line)
+ self._sphinx_directive.do_parse(rstlist, node)
+
+ def get_document_node(self):
+ """Return the root docutils node which makes up the document"""
+ return self._top_node
diff --git a/docs/sphinx/qmp_lexer.py b/docs/sphinx/qmp_lexer.py
index a59de8a..7b3b808 100644
--- a/docs/sphinx/qmp_lexer.py
+++ b/docs/sphinx/qmp_lexer.py
@@ -24,7 +24,7 @@ class QMPExampleMarkersLexer(RegexLexer):
'root': [
(r'-> ', token.Generic.Prompt),
(r'<- ', token.Generic.Prompt),
- (r' ?\.{3} ?', token.Generic.Prompt),
+ (r'\.{3}( .* \.{3})?', token.Comment.Multiline),
]
}
diff --git a/docs/spin/tcg-exclusive.promela b/docs/spin/tcg-exclusive.promela
index c91cfca..1d03af8 100644
--- a/docs/spin/tcg-exclusive.promela
+++ b/docs/spin/tcg-exclusive.promela
@@ -1,6 +1,6 @@
/*
* This model describes the implementation of exclusive sections in
- * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start,
+ * cpu-common.c (start_exclusive, end_exclusive, cpu_exec_start,
* cpu_exec_end).
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
@@ -65,7 +65,7 @@
}
#define COND_BROADCAST(c) c++
-// this is the logic from cpus-common.c
+// this is the logic from cpu-common.c
mutex_t mutex;
cond_t exclusive_cond;
diff --git a/docs/system/arm/aspeed.rst b/docs/system/arm/aspeed.rst
index 6733ffd..43d27d8 100644
--- a/docs/system/arm/aspeed.rst
+++ b/docs/system/arm/aspeed.rst
@@ -1,12 +1,11 @@
-Aspeed family boards (``*-bmc``, ``ast2500-evb``, ``ast2600-evb``, ``ast2700-evb``)
-===================================================================================
+Aspeed family boards (``ast2500-evb``, ``ast2600-evb``, ``ast2700-evb``, ``bletchley-bmc``, ``fuji-bmc``, ``fby35-bmc``, ``fp5280g2-bmc``, ``g220a-bmc``, ``palmetto-bmc``, ``qcom-dc-scm-v1-bmc``, ``qcom-firework-bmc``, ``quanta-q71l-bmc``, ``rainier-bmc``, ``romulus-bmc``, ``sonorapass-bmc``, ``supermicrox11-bmc``, ``supermicrox11spi-bmc``, ``tiogapass-bmc``, ``witherspoon-bmc``, ``yosemitev2-bmc``)
+=================================================================================================================================================================================================================================================================================================================================================================================================================================
The QEMU Aspeed machines model BMCs of various OpenPOWER systems and
Aspeed evaluation boards. They are based on different releases of the
Aspeed SoC : the AST2400 integrating an ARM926EJ-S CPU (400MHz), the
AST2500 with an ARM1176JZS CPU (800MHz), the AST2600
-with dual cores ARM Cortex-A7 CPUs (1.2GHz) and more recently the AST2700
-with quad cores ARM Cortex-A35 64 bits CPUs (1.6GHz)
+with dual-core ARM Cortex-A7 CPUs (1.2GHz).
The SoC comes with RAM, Gigabit ethernet, USB, SD/MMC, USB, SPI, I2C,
etc.
@@ -15,7 +14,8 @@ AST2400 SoC based machines :
- ``palmetto-bmc`` OpenPOWER Palmetto POWER8 BMC
- ``quanta-q71l-bmc`` OpenBMC Quanta BMC
-- ``supermicrox11-bmc`` Supermicro X11 BMC
+- ``supermicrox11-bmc`` Supermicro X11 BMC (ARM926EJ-S)
+- ``supermicrox11spi-bmc`` Supermicro X11 SPI BMC (ARM1176)
AST2500 SoC based machines :
@@ -31,7 +31,6 @@ AST2500 SoC based machines :
AST2600 SoC based machines :
- ``ast2600-evb`` Aspeed AST2600 Evaluation board (Cortex-A7)
-- ``tacoma-bmc`` OpenPOWER Witherspoon POWER9 AST2600 BMC
- ``rainier-bmc`` IBM Rainier POWER10 BMC
- ``fuji-bmc`` Facebook Fuji BMC
- ``bletchley-bmc`` Facebook Bletchley BMC
@@ -39,10 +38,6 @@ AST2600 SoC based machines :
- ``qcom-dc-scm-v1-bmc`` Qualcomm DC-SCM V1 BMC
- ``qcom-firework-bmc`` Qualcomm Firework BMC
-AST2700 SoC based machines :
-
-- ``ast2700-evb`` Aspeed AST2700 Evaluation board (Cortex-A35)
-
Supported devices
-----------------
@@ -105,6 +100,9 @@ or directly from the ASPEED Forked OpenBMC GitHub release repository :
https://github.com/AspeedTech-BMC/openbmc/releases
+Booting from a kernel image
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
To boot a kernel directly from a Linux build tree:
.. code-block:: bash
@@ -114,16 +112,10 @@ To boot a kernel directly from a Linux build tree:
-dtb arch/arm/boot/dts/aspeed-ast2600-evb.dtb \
-initrd rootfs.cpio
-To boot the machine from the flash image, use an MTD drive :
-
-.. code-block:: bash
-
- $ qemu-system-arm -M romulus-bmc -nic user \
- -drive file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd -nographic
+Booting from a flash image
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Options specific to Aspeed machines are :
-
- * ``boot-emmc`` to set or unset boot from eMMC (AST2600).
+The Aspeed-specific machine options for booting from a flash image are :
* ``execute-in-place`` which emulates the boot from the CE0 flash
device by using the FMC controller to load the instructions, and
@@ -134,10 +126,12 @@ Options specific to Aspeed machines are :
* ``spi-model`` to change the default SPI Flash model.
- * ``bmc-console`` to change the default console device. Most of the
- machines use the ``UART5`` device for a boot console, which is
- mapped on ``/dev/ttyS4`` under Linux, but it is not always the
- case.
+To boot the machine from the flash image, use an MTD drive :
+
+.. code-block:: bash
+
+ $ qemu-system-arm -M romulus-bmc -nic user \
+ -drive file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd -nographic
To use other flash models, for instance a different FMC chip and a
bigger (64M) SPI for the ``ast2500-evb`` machine, run :
@@ -169,6 +163,78 @@ In that case, the machine boots fetching instructions from the FMC0
device. It is slower to start but closer to what HW does. Using the
machine option ``execute-in-place`` has a similar effect.
+Booting from an eMMC image
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Aspeed-specific machine options for booting from an eMMC image
+are :
+
+ * ``boot-emmc`` to set or unset boot from eMMC (AST2600).
+
+Only the ``ast2600-evb`` and ``rainier-bmc`` machines support booting
+from an eMMC device. In this case, the machine assumes that the
+eMMC image includes special boot partitions. Such an image can be
+built this way :
+
+.. code-block:: bash
+
+ $ dd if=/dev/zero of=mmc-bootarea.img count=2 bs=1M
+ $ dd if=u-boot-spl.bin of=mmc-bootarea.img conv=notrunc
+ $ dd if=u-boot.bin of=mmc-bootarea.img conv=notrunc count=64 bs=1K
+ $ cat mmc-bootarea.img obmc-phosphor-image.wic > mmc.img
+ $ truncate --size 16GB mmc.img
+
+Boot the ``rainier-bmc`` machine with :
+
+.. code-block:: bash
+
+ $ qemu-system-arm -M rainier-bmc \
+ -drive file=mmc.img,format=raw,if=sd,index=2 \
+ -nographic
+
+The ``boot-emmc`` option can be set or unset to change the machine's
+default boot mode: SPI or eMMC. This can be useful to boot the
+``ast2600-evb`` machine from an eMMC device (default being SPI) or to
+boot the ``rainier-bmc`` machine from a flash device (default being
+eMMC).
+
+As an example, here is how to boot the ``rainier-bmc`` machine from
+the flash device with ``boot-emmc=false`` and let the machine use an
+eMMC image :
+
+.. code-block:: bash
+
+ $ qemu-system-arm -M rainier-bmc,boot-emmc=false \
+ -drive file=flash.img,format=raw,if=mtd \
+ -drive file=mmc.img,format=raw,if=sd,index=2 \
+ -nographic
+
+Note that in this case the eMMC device must not have boot partitions,
+otherwise its contents will not be accessible to the machine. This
+limitation is due to the use of the ``-drive`` interface.
+
+Ideally, one should be able to define the eMMC device and the
+associated backend directly on the command line, such as :
+
+.. code-block:: bash
+
+ -blockdev node-name=emmc0,driver=file,filename=mmc.img \
+ -device emmc,bus=sdhci-bus.2,drive=emmc0,boot-partition-size=1048576,boot-config=8
+
+This is not yet supported (as of QEMU-10.0). Work is needed to
+refactor the sdhci bus model.
+
+Other booting options
+^^^^^^^^^^^^^^^^^^^^^
+
+Other Aspeed-specific machine options are :
+
+ * ``bmc-console`` to change the default console device. Most of the
+ machines use the ``UART5`` device for a boot console, which is
+ mapped on ``/dev/ttyS4`` under Linux, but it is not always the
+ case.
+
To change the boot console and use device ``UART3`` (``/dev/ttyS2``
under Linux), use :
@@ -176,8 +242,78 @@ under Linux), use :
-M ast2500-evb,bmc-console=uart3
+Aspeed 2700 family boards (``ast2700-evb``, ``ast2700fc``)
+==================================================================
+
+The QEMU Aspeed machines model BMCs of Aspeed evaluation boards.
+They are based on the AST2700 SoC, which has quad-core 64-bit
+ARM Cortex-A35 CPUs (1.6GHz).
+
+The SoC comes with RAM, Gigabit ethernet, USB, SD/MMC, SPI, I2C,
+etc.
+
+AST2700 SoC based machines :
+
-Boot the AST2700 machine from the flash image, use an MTD drive :
+- ``ast2700-evb`` Aspeed AST2700 Evaluation board (Cortex-A35)
+- ``ast2700fc`` Aspeed AST2700 Evaluation board (Cortex-A35 + Cortex-M4)
+
+Supported devices
+-----------------
+ * Interrupt Controller
+ * Timer Controller
+ * RTC Controller
+ * I2C Controller
+ * System Control Unit (SCU)
+ * SRAM mapping
+ * X-DMA Controller (basic interface)
+ * Static Memory Controller (SMC or FMC) - Only SPI Flash support
+ * SPI Memory Controller
+ * USB 2.0 Controller
+ * SD/MMC storage controllers
+ * SDRAM controller (dummy interface for basic settings and training)
+ * Watchdog Controller
+ * GPIO Controller (Master only)
+ * UART
+ * Ethernet controllers
+ * Front LEDs (PCA9552 on I2C bus)
+ * LPC Peripheral Controller (a subset of subdevices are supported)
+ * Hash/Crypto Engine (HACE) - Hash support only. TODO: Crypto
+ * ADC
+ * eMMC Boot Controller (dummy)
+ * PECI Controller (minimal)
+ * I3C Controller
+ * Internal Bridge Controller (SLI dummy)
+
+Missing devices
+---------------
+ * PWM and Fan Controller
+ * Slave GPIO Controller
+ * Super I/O Controller
+ * PCI-Express 1 Controller
+ * Graphic Display Controller
+ * MCTP Controller
+ * Mailbox Controller
+ * Virtual UART
+ * eSPI Controller
+
+Boot options
+------------
+
+Images can be downloaded from the ASPEED Forked OpenBMC GitHub release repository :
+
+ https://github.com/AspeedTech-BMC/openbmc/releases
+
+Booting the ast2700-evb machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two supported methods for booting the AST2700 machine from a
+flash image:
+
+Manual boot using ``-device loader``:
+
+This method causes all four CPU cores to start execution from address
+``0x430000000``, which corresponds to the BL31 image load address.
.. code-block:: bash
@@ -197,6 +333,89 @@ Boot the AST2700 machine from the flash image, use an MTD drive :
-drive file=${IMGDIR}/image-bmc,format=raw,if=mtd \
-nographic
+Boot using a virtual boot ROM (``-bios``):
+
+If users do not specify the ``-bios`` option, QEMU will attempt to load the
+default vbootrom image ``ast27x0_bootrom.bin`` from either the current working
+directory or the ``pc-bios`` directory within the QEMU source tree.
+
+.. code-block:: bash
+
+ $ qemu-system-aarch64 -M ast2700-evb \
+ -drive file=image-bmc,format=raw,if=mtd \
+ -nographic
+
+The ``-bios`` option allows users to specify a custom path for the vbootrom
+image to be loaded during boot. For example, the following loads the vbootrom
+image from the ``${HOME}`` directory :
+
+.. code-block:: bash
+
+ -bios ${HOME}/ast27x0_bootrom.bin
+
+Booting the ast2700fc machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The AST2700 features four Cortex-A35 primary processors and two Cortex-M4
+coprocessors. The **ast2700-evb** machine emulates only the four Cortex-A35
+primary processors, while **ast2700fc** extends it with support for the two
+Cortex-M4 coprocessors.
+
+Steps to boot the ``ast2700fc`` machine:
+
+1. Ensure you have the following AST2700A1 binaries available in a directory
+
+ * u-boot-nodtb.bin
+ * u-boot.dtb
+ * bl31.bin
+ * optee/tee-raw.bin
+ * image-bmc
+ * zephyr-aspeed-ssp.elf (for SSP firmware, CPU 5)
+ * zephyr-aspeed-tsp.elf (for TSP firmware, CPU 6)
+
+2. Execute the following command to start the ``ast2700fc`` machine:
+
+.. code-block:: bash
+
+ IMGDIR=ast2700-default
+ UBOOT_SIZE=$(stat --format=%s -L ${IMGDIR}/u-boot-nodtb.bin)
+
+ $ qemu-system-aarch64 -M ast2700fc \
+ -device loader,force-raw=on,addr=0x400000000,file=${IMGDIR}/u-boot-nodtb.bin \
+ -device loader,force-raw=on,addr=$((0x400000000 + ${UBOOT_SIZE})),file=${IMGDIR}/u-boot.dtb \
+ -device loader,force-raw=on,addr=0x430000000,file=${IMGDIR}/bl31.bin \
+ -device loader,force-raw=on,addr=0x430080000,file=${IMGDIR}/optee/tee-raw.bin \
+ -device loader,cpu-num=0,addr=0x430000000 \
+ -device loader,cpu-num=1,addr=0x430000000 \
+ -device loader,cpu-num=2,addr=0x430000000 \
+ -device loader,cpu-num=3,addr=0x430000000 \
+ -drive file=${IMGDIR}/image-bmc,if=mtd,format=raw \
+ -device loader,file=${IMGDIR}/zephyr-aspeed-ssp.elf,cpu-num=4 \
+ -device loader,file=${IMGDIR}/zephyr-aspeed-tsp.elf,cpu-num=5 \
+ -serial pty -serial pty -serial pty \
+ -snapshot \
+ -S -nographic
+
+After launching QEMU, serial devices will be automatically redirected.
+Example output:
+
+.. code-block:: bash
+
+ char device redirected to /dev/pts/55 (label serial0)
+ char device redirected to /dev/pts/56 (label serial1)
+ char device redirected to /dev/pts/57 (label serial2)
+
+- serial0: Console for the four Cortex-A35 primary processors.
+- serial1 and serial2: Consoles for the two Cortex-M4 coprocessors.
+
+Use ``tio`` or another terminal emulator to connect to the consoles:
+
+.. code-block:: bash
+
+ $ tio /dev/pts/55
+ $ tio /dev/pts/56
+ $ tio /dev/pts/57
+
+
Aspeed minibmc family boards (``ast1030-evb``)
==================================================================
@@ -257,51 +476,3 @@ To boot a kernel directly from a Zephyr build tree:
$ qemu-system-arm -M ast1030-evb -nographic \
-kernel zephyr.elf
-
-Facebook Yosemite v3.5 Platform and CraterLake Server (``fby35``)
-==================================================================
-
-Facebook has a series of multi-node compute server designs named
-Yosemite. The most recent version released was
-`Yosemite v3 <https://www.opencompute.org/documents/ocp-yosemite-v3-platform-design-specification-1v16-pdf>`__.
-
-Yosemite v3.5 is an iteration on this design, and is very similar: there's a
-baseboard with a BMC, and 4 server slots. The new server board design termed
-"CraterLake" includes a Bridge IC (BIC), with room for expansion boards to
-include various compute accelerators (video, inferencing, etc). At the moment,
-only the first server slot's BIC is included.
-
-Yosemite v3.5 is itself a sled which fits into a 40U chassis, and 3 sleds
-can be fit into a chassis. See `here <https://www.opencompute.org/products/423/wiwynn-yosemite-v3-server>`__
-for an example.
-
-In this generation, the BMC is an AST2600 and each BIC is an AST1030. The BMC
-runs `OpenBMC <https://github.com/facebook/openbmc>`__, and the BIC runs
-`OpenBIC <https://github.com/facebook/openbic>`__.
-
-Firmware images can be retrieved from the Github releases or built from the
-source code, see the README's for instructions on that. This image uses the
-"fby35" machine recipe from OpenBMC, and the "yv35-cl" target from OpenBIC.
-Some reference images can also be found here:
-
-.. code-block:: bash
-
- $ wget https://github.com/facebook/openbmc/releases/download/openbmc-e2294ff5d31d/fby35.mtd
- $ wget https://github.com/peterdelevoryas/OpenBIC/releases/download/oby35-cl-2022.13.01/Y35BCL.elf
-
-Since this machine has multiple SoC's, each with their own serial console, the
-recommended way to run it is to allocate a pseudoterminal for each serial
-console and let the monitor use stdio. Also, starting in a paused state is
-useful because it allows you to attach to the pseudoterminals before the boot
-process starts.
-
-.. code-block:: bash
-
- $ qemu-system-arm -machine fby35 \
- -drive file=fby35.mtd,format=raw,if=mtd \
- -device loader,file=Y35BCL.elf,addr=0,cpu-num=2 \
- -serial pty -serial pty -serial mon:stdio \
- -display none -S
- $ screen /dev/tty0 # In a separate TMUX pane, terminal window, etc.
- $ screen /dev/tty1
- $ (qemu) c # Start the boot process once screen is setup.
diff --git a/docs/system/arm/bananapi_m2u.rst b/docs/system/arm/bananapi_m2u.rst
index 587b488..03cc561 100644
--- a/docs/system/arm/bananapi_m2u.rst
+++ b/docs/system/arm/bananapi_m2u.rst
@@ -125,16 +125,15 @@ And then boot it.
$ qemu-system-arm -M bpim2u -nographic -sd sd.img
-Banana Pi M2U integration tests
-"""""""""""""""""""""""""""""""
+Banana Pi M2U functional tests
+""""""""""""""""""""""""""""""
-The Banana Pi M2U machine has several integration tests included.
+The Banana Pi M2U machine has several functional tests included.
To run the whole set of tests, build QEMU from source and simply
provide the following command:
.. code-block:: bash
$ cd qemu-build-dir
- $ AVOCADO_ALLOW_LARGE_STORAGE=yes tests/venv/bin/avocado \
- --verbose --show=app,console run -t machine:bpim2u \
- ../tests/avocado/boot_linux_console.py
+ $ QEMU_TEST_ALLOW_LARGE_STORAGE=1 \
+ pyvenv/bin/meson test --suite thorough func-arm-arm_bpim2u
diff --git a/docs/system/arm/cpu-features.rst b/docs/system/arm/cpu-features.rst
index a5fb929..37d5dfd 100644
--- a/docs/system/arm/cpu-features.rst
+++ b/docs/system/arm/cpu-features.rst
@@ -219,8 +219,11 @@ Below is the list of TCG VCPU features and their descriptions.
``pauth-qarma3``
When ``pauth`` is enabled, select the architected QARMA3 algorithm.
-Without either ``pauth-impdef`` or ``pauth-qarma3`` enabled,
-the architected QARMA5 algorithm is used. The architected QARMA5
+``pauth-qarma5``
+ When ``pauth`` is enabled, select the architected QARMA5 algorithm.
+
+Without ``pauth-impdef``, ``pauth-qarma3`` or ``pauth-qarma5`` enabled,
+the QEMU impdef algorithm is used. The architected QARMA5
and QARMA3 algorithms have good cryptographic properties, but can
be quite slow to emulate. The impdef algorithm used by QEMU is
non-cryptographic but significantly faster.
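+
+For example, a guest could be started with the architected QARMA5
+algorithm selected explicitly (an illustrative sketch, assuming the
+usual ``-cpu`` boolean property syntax for these features; a bootable
+guest payload would still need to be supplied) :
+
+.. code-block:: bash
+
+   $ qemu-system-aarch64 -M virt -nographic -cpu max,pauth=on,pauth-qarma5=on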
diff --git a/docs/system/arm/cubieboard.rst b/docs/system/arm/cubieboard.rst
index 58c4a2d..90d24c7 100644
--- a/docs/system/arm/cubieboard.rst
+++ b/docs/system/arm/cubieboard.rst
@@ -15,4 +15,5 @@ Emulated devices:
- USB controller
- SATA controller
- TWI (I2C) controller
+- SPI controller
- Watchdog timer
diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index 3ab6e72..78c2fd2 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -3,8 +3,8 @@
A-profile CPU architecture support
==================================
-QEMU's TCG emulation includes support for the Armv5, Armv6, Armv7 and
-Armv8 versions of the A-profile architecture. It also has support for
+QEMU's TCG emulation includes support for the Armv5, Armv6, Armv7,
+Armv8 and Armv9 versions of the A-profile architecture. It also has support for
the following architecture extensions:
- FEAT_AA32BF16 (AArch32 BFloat16 instructions)
@@ -20,12 +20,14 @@ the following architecture extensions:
- FEAT_AA64EL3 (Support for AArch64 at EL3)
- FEAT_AdvSIMD (Advanced SIMD Extension)
- FEAT_AES (AESD and AESE instructions)
+- FEAT_AFP (Alternate floating-point behavior)
- FEAT_Armv9_Crypto (Armv9 Cryptographic Extension)
- FEAT_ASID16 (16 bit ASID)
- FEAT_BBM at level 2 (Translation table break-before-make levels)
- FEAT_BF16 (AArch64 BFloat16 instructions)
- FEAT_BTI (Branch Target Identification)
- FEAT_CCIDX (Extended cache index)
+- FEAT_CMOW (Control for cache maintenance permission)
- FEAT_CRC32 (CRC32 instructions)
- FEAT_Crypto (Cryptographic Extension)
- FEAT_CSV2 (Cache speculation variant 2)
@@ -36,6 +38,7 @@ the following architecture extensions:
- FEAT_CSV3 (Cache speculation variant 3)
- FEAT_DGH (Data gathering hint)
- FEAT_DIT (Data Independent Timing instructions)
+- FEAT_DoubleLock (Double Lock)
- FEAT_DPB (DC CVAP instruction)
- FEAT_DPB2 (DC CVADP instruction)
- FEAT_Debugv8p1 (Debug with VHE)
@@ -45,6 +48,7 @@ the following architecture extensions:
- FEAT_DotProd (Advanced SIMD dot product instructions)
- FEAT_DoubleFault (Double Fault Extension)
- FEAT_E0PD (Preventing EL0 access to halves of address maps)
+- FEAT_EBF16 (AArch64 Extended BFloat16 instructions)
- FEAT_ECV (Enhanced Counter Virtualization)
- FEAT_EL0 (Support for execution at EL0)
- FEAT_EL1 (Support for execution at EL1)
@@ -86,12 +90,13 @@ the following architecture extensions:
- FEAT_LSE2 (Large System Extensions v2)
- FEAT_LVA (Large Virtual Address space)
- FEAT_MixedEnd (Mixed-endian support)
-- FEAT_MixdEndEL0 (Mixed-endian support at EL0)
+- FEAT_MixedEndEL0 (Mixed-endian support at EL0)
- FEAT_MOPS (Standardization of memory operations)
- FEAT_MTE (Memory Tagging Extension)
- FEAT_MTE2 (Memory Tagging Extension)
- FEAT_MTE3 (MTE Asymmetric Fault Handling)
- FEAT_MTE_ASYM_FAULT (Memory tagging asymmetric faults)
+- FEAT_MTE_ASYNC (Asynchronous reporting of Tag Check Fault)
- FEAT_NMI (Non-maskable Interrupt)
- FEAT_NV (Nested Virtualization)
- FEAT_NV2 (Enhanced nested virtualization support)
@@ -113,6 +118,7 @@ the following architecture extensions:
- FEAT_RDM (Advanced SIMD rounding double multiply accumulate instructions)
- FEAT_RME (Realm Management Extension) (NB: support status in QEMU is experimental)
- FEAT_RNG (Random number generator)
+- FEAT_RPRES (Increased precision of FRECPE and FRSQRTE)
- FEAT_S2FWB (Stage 2 forced Write-Back)
- FEAT_SB (Speculation Barrier)
- FEAT_SEL2 (Secure EL2)
@@ -135,6 +141,7 @@ the following architecture extensions:
- FEAT_SVE2 (Scalable Vector Extension version 2)
- FEAT_SPECRES (Speculation restriction instructions)
- FEAT_SSBS (Speculative Store Bypass Safe)
+- FEAT_SSBS2 (MRS and MSR instructions for SSBS version 2)
- FEAT_TGran16K (Support for 16KB memory translation granule size at stage 1)
- FEAT_TGran4K (Support for 4KB memory translation granule size at stage 1)
- FEAT_TGran64K (Support for 64KB memory translation granule size at stage 1)
@@ -149,9 +156,10 @@ the following architecture extensions:
- FEAT_VMID16 (16-bit VMID)
- FEAT_WFxT (WFE and WFI instructions with timeout)
- FEAT_XNX (Translation table stage 2 Unprivileged Execute-never)
+- FEAT_XS (XS attribute)
For information on the specifics of these extensions, please refer
-to the `Armv8-A Arm Architecture Reference Manual
+to the `Arm Architecture Reference Manual for A-profile architecture
<https://developer.arm.com/documentation/ddi0487/latest>`_.
When a specific named CPU is being emulated, only those features which
diff --git a/docs/system/arm/exynos.rst b/docs/system/arm/exynos.rst
new file mode 100644
index 0000000..86894bc
--- /dev/null
+++ b/docs/system/arm/exynos.rst
@@ -0,0 +1,9 @@
+Exynos4 boards (``nuri``, ``smdkc210``)
+=======================================
+
+These are machines which use the Samsung Exynos4210 SoC, which has Cortex-A9 CPUs.
+
+``nuri`` models the Samsung NURI board.
+
+``smdkc210`` models the Samsung SMDKC210 board.
+
diff --git a/docs/system/arm/fby35.rst b/docs/system/arm/fby35.rst
new file mode 100644
index 0000000..e19274e
--- /dev/null
+++ b/docs/system/arm/fby35.rst
@@ -0,0 +1,52 @@
+Facebook Yosemite v3.5 Platform and CraterLake Server (``fby35``)
+==================================================================
+
+Facebook has a series of multi-node compute server designs named
+Yosemite. The most recent version released was
+`Yosemite v3 <https://www.opencompute.org/documents/ocp-yosemite-v3-platform-design-specification-1v16-pdf>`__.
+
+Yosemite v3.5 is an iteration on this design, and is very similar: there's a
+baseboard with a BMC, and 4 server slots. The new server board design termed
+"CraterLake" includes a Bridge IC (BIC), with room for expansion boards to
+include various compute accelerators (video, inferencing, etc). At the moment,
+only the first server slot's BIC is included.
+
+Yosemite v3.5 is itself a sled which fits into a 40U chassis, and 3 sleds
+can be fit into a chassis. See `here <https://www.opencompute.org/products-chiplets/237/wiwynn-yosemite-v3-server>`__
+for an example.
+
+In this generation, the BMC is an AST2600 and each BIC is an AST1030. The BMC
+runs `OpenBMC <https://github.com/facebook/openbmc>`__, and the BIC runs
+`OpenBIC <https://github.com/facebook/openbic>`__.
+
+Firmware images can be retrieved from the Github releases or built from the
+source code, see the README's for instructions on that. This image uses the
+"fby35" machine recipe from OpenBMC, and the "yv35-cl" target from OpenBIC.
+Some reference images can also be found here:
+
+.. code-block:: bash
+
+ $ wget https://github.com/facebook/openbmc/releases/download/openbmc-e2294ff5d31d/fby35.mtd
+ $ wget https://github.com/peterdelevoryas/OpenBIC/releases/download/oby35-cl-2022.13.01/Y35BCL.elf
+
+Since this machine has multiple SoC's, each with their own serial console, the
+recommended way to run it is to allocate a pseudoterminal for each serial
+console and let the monitor use stdio. Also, starting in a paused state is
+useful because it allows you to attach to the pseudoterminals before the boot
+process starts.
+
+.. code-block:: bash
+
+ $ qemu-system-arm -machine fby35 \
+ -drive file=fby35.mtd,format=raw,if=mtd \
+ -device loader,file=Y35BCL.elf,addr=0,cpu-num=2 \
+ -serial pty -serial pty -serial mon:stdio \
+ -display none -S
+ $ screen /dev/tty0 # In a separate TMUX pane, terminal window, etc.
+ $ screen /dev/tty1
+ $ (qemu) c # Start the boot process once screen is setup.
+
+This machine model supports emulation of booting from the CE0 flash device by
+setting the ``execute-in-place`` machine option. When this option is used, the
+CPU fetches instructions by reading CE0 directly rather than from a ROM
+preloaded at machine init time. As a result, execution is slower.
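+
+A minimal sketch of enabling this mode follows; it assumes the option is a
+boolean machine property (written here as ``execute-in-place=true``) and reuses
+the flash image from the earlier example.
+
+.. code-block:: bash
+
+   $ qemu-system-arm -machine fby35,execute-in-place=true \
+       -drive file=fby35.mtd,format=raw,if=mtd \
+       -serial pty -serial pty -serial mon:stdio \
+       -display none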
diff --git a/docs/system/arm/gumstix.rst b/docs/system/arm/gumstix.rst
deleted file mode 100644
index cb37313..0000000
--- a/docs/system/arm/gumstix.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Gumstix Connex and Verdex (``connex``, ``verdex``)
-==================================================
-
-These machines model the Gumstix Connex and Verdex boards.
-The Connex has a PXA255 CPU and the Verdex has a PXA270.
-
-Implemented devices:
-
- * NOR flash
- * SMC91C111 ethernet
- * Interrupt controller
- * DMA
- * Timer
- * GPIO
- * MMC/SD card
- * Fast infra-red communications port (FIR)
- * LCD controller
- * Synchronous serial ports (SPI)
- * PCMCIA interface
- * I2C
- * I2S
diff --git a/docs/system/arm/imx8mp-evk.rst b/docs/system/arm/imx8mp-evk.rst
new file mode 100644
index 0000000..b2f7d29
--- /dev/null
+++ b/docs/system/arm/imx8mp-evk.rst
@@ -0,0 +1,62 @@
+NXP i.MX 8M Plus Evaluation Kit (``imx8mp-evk``)
+================================================
+
+The ``imx8mp-evk`` machine models the i.MX 8M Plus Evaluation Kit, based on an
+i.MX 8M Plus SoC.
+
+Supported devices
+-----------------
+
+The ``imx8mp-evk`` machine implements the following devices:
+
+ * Up to 4 Cortex-A53 cores
+ * Generic Interrupt Controller (GICv3)
+ * 4 UARTs
+ * 3 USDHC Storage Controllers
+ * 1 Designware PCI Express Controller
+ * 1 Ethernet Controller
+ * 2 Designware USB 3 Controllers
+ * 5 GPIO Controllers
+ * 6 I2C Controllers
+ * 3 SPI Controllers
+ * 3 Watchdogs
+ * 6 General Purpose Timers
+ * Secure Non-Volatile Storage (SNVS) including an RTC
+ * Clock Tree
+
+Boot options
+------------
+
+The ``imx8mp-evk`` machine can start a Linux kernel directly using the standard
+``-kernel`` functionality.
+
+Direct Linux Kernel Boot
+''''''''''''''''''''''''
+
+Probably the easiest way to get started with a whole Linux system on the machine
+is to generate an image with Buildroot. Version 2024.11.1 was tested at the time
+of writing; the process involves two steps. First, run the following commands in
+the top-level directory of the Buildroot source tree:
+
+.. code-block:: bash
+
+ $ make freescale_imx8mpevk_defconfig
+ $ make
+
+Once the build has finished successfully, there is an ``output/image`` subfolder.
+Navigate into it and resize the SD card image to a power of two:
+
+.. code-block:: bash
+
+ $ qemu-img resize sdcard.img 256M
+
+Now that everything is prepared, the machine can be started as follows:
+
+.. code-block:: bash
+
+ $ qemu-system-aarch64 -M imx8mp-evk -smp 4 -m 3G \
+ -display none -serial null -serial stdio \
+ -kernel Image \
+ -dtb imx8mp-evk.dtb \
+ -append "root=/dev/mmcblk2p2" \
+ -drive file=sdcard.img,if=sd,bus=2,format=raw,id=mmcblk2
diff --git a/docs/system/arm/mainstone.rst b/docs/system/arm/mainstone.rst
deleted file mode 100644
index 05310f4..0000000
--- a/docs/system/arm/mainstone.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-Intel Mainstone II board (``mainstone``)
-========================================
-
-The ``mainstone`` board emulates the Intel Mainstone II development
-board, which uses a PXA270 CPU.
-
-Emulated devices:
-
-- Flash memory
-- Keypad
-- MMC controller
-- 91C111 ethernet
-- PIC
-- Timer
-- DMA
-- GPIO
-- FIR
-- Serial
-- LCD controller
-- SSP
-- USB controller
-- RTC
-- PCMCIA
-- I2C
-- I2S
diff --git a/docs/system/arm/mcimx6ul-evk.rst b/docs/system/arm/mcimx6ul-evk.rst
new file mode 100644
index 0000000..8871138
--- /dev/null
+++ b/docs/system/arm/mcimx6ul-evk.rst
@@ -0,0 +1,5 @@
+NXP MCIMX6UL-EVK (``mcimx6ul-evk``)
+===================================
+
+The ``mcimx6ul-evk`` machine models the NXP i.MX6UltraLite Evaluation Kit
+MCIMX6UL-EVK development board. It has a single Cortex-A7 CPU.
diff --git a/docs/system/arm/mcimx7d-sabre.rst b/docs/system/arm/mcimx7d-sabre.rst
new file mode 100644
index 0000000..c5d35af
--- /dev/null
+++ b/docs/system/arm/mcimx7d-sabre.rst
@@ -0,0 +1,5 @@
+NXP MCIMX7D Sabre (``mcimx7d-sabre``)
+=====================================
+
+The ``mcimx7d-sabre`` machine models the NXP SABRE Board MCIMX7SABRE,
+based on an i.MX7Dual SoC.
diff --git a/docs/system/arm/nseries.rst b/docs/system/arm/nseries.rst
deleted file mode 100644
index cd9edf5..0000000
--- a/docs/system/arm/nseries.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-Nokia N800 and N810 tablets (``n800``, ``n810``)
-================================================
-
-Nokia N800 and N810 internet tablets (known also as RX-34 and RX-44 /
-48) emulation supports the following elements:
-
-- Texas Instruments OMAP2420 System-on-chip (ARM1136 core)
-
-- RAM and non-volatile OneNAND Flash memories
-
-- Display connected to EPSON remote framebuffer chip and OMAP on-chip
- display controller and a LS041y3 MIPI DBI-C controller
-
-- TI TSC2301 (in N800) and TI TSC2005 (in N810) touchscreen
- controllers driven through SPI bus
-
-- National Semiconductor LM8323-controlled qwerty keyboard driven
- through |I2C| bus
-
-- Secure Digital card connected to OMAP MMC/SD host
-
-- Three OMAP on-chip UARTs and on-chip STI debugging console
-
-- Mentor Graphics \"Inventra\" dual-role USB controller embedded in a
- TI TUSB6010 chip - only USB host mode is supported
-
-- TI TMP105 temperature sensor driven through |I2C| bus
-
-- TI TWL92230C power management companion with an RTC on
- |I2C| bus
-
-- Nokia RETU and TAHVO multi-purpose chips with an RTC, connected
- through CBUS
diff --git a/docs/system/arm/nuvoton.rst b/docs/system/arm/nuvoton.rst
index 0424cae..e4827fb 100644
--- a/docs/system/arm/nuvoton.rst
+++ b/docs/system/arm/nuvoton.rst
@@ -1,12 +1,13 @@
-Nuvoton iBMC boards (``*-bmc``, ``npcm750-evb``, ``quanta-gsj``)
-================================================================
+Nuvoton iBMC boards (``kudo-bmc``, ``mori-bmc``, ``npcm750-evb``, ``quanta-gbs-bmc``, ``quanta-gsj``, ``npcm845-evb``)
+======================================================================================================================
-The `Nuvoton iBMC`_ chips (NPCM7xx) are a family of ARM-based SoCs that are
+The `Nuvoton iBMC`_ chips are a family of Arm-based SoCs that are
designed to be used as Baseboard Management Controllers (BMCs) in various
-servers. They all feature one or two ARM Cortex-A9 CPU cores, as well as an
-assortment of peripherals targeted for either Enterprise or Data Center /
-Hyperscale applications. The former is a superset of the latter, so NPCM750 has
-all the peripherals of NPCM730 and more.
+servers. Currently there are two families: the NPCM7XX series and the
+NPCM8XX series. The NPCM7XX series features one or two Arm Cortex-A9 CPU cores,
+while the NPCM8XX series features four Arm Cortex-A35 CPU cores. Each series
+contains a different assortment of peripherals targeted at either Enterprise or
+Data Center / Hyperscale applications.
.. _Nuvoton iBMC: https://www.nuvoton.com/products/cloud-computing/ibmc/
@@ -27,6 +28,11 @@ There are also two more SoCs, NPCM710 and NPCM705, which are single-core
variants of NPCM750 and NPCM730, respectively. These are currently not
supported by QEMU.
+The NPCM8xx SoC is the successor of the NPCM7xx SoC. It has 4 Cortex-A35 cores.
+The following machines are based on this chip:
+
+- ``npcm845-evb`` Nuvoton NPCM845 Evaluation board
+
Supported devices
-----------------
@@ -62,6 +68,8 @@ Missing devices
* System Wake-up Control (SWC)
* Shared memory (SHM)
* eSPI slave interface
+ * Block-transfer interface (8XX only)
+ * Virtual UART (8XX only)
* Ethernet controller (GMAC)
* USB device (USBD)
@@ -76,6 +84,11 @@ Missing devices
* Video capture
* Encoding compression engine
* Security features
+ * I3C buses (8XX only)
+ * Temperature sensor interface (8XX only)
+ * Virtual UART (8XX only)
+ * Flash monitor (8XX only)
+ * JTAG master (8XX only)
Boot options
------------
diff --git a/docs/system/arm/orangepi.rst b/docs/system/arm/orangepi.rst
index 9afa542..d81f6c3 100644
--- a/docs/system/arm/orangepi.rst
+++ b/docs/system/arm/orangepi.rst
@@ -119,7 +119,7 @@ Orange Pi PC images
Note that the mainline kernel does not have a root filesystem. You may provide it
with an official Orange Pi PC image from the official website:
- http://www.orangepi.org/downloadresources/
+ http://www.orangepi.org/html/serviceAndSupport/index.html
Another possibility is to run an Armbian image for Orange Pi PC which
can be downloaded from:
@@ -213,7 +213,7 @@ including the Orange Pi PC. NetBSD 9.0 is known to work best for the Orange Pi P
board and provides a fully working system with serial console, networking and storage.
For the Orange Pi PC machine, get the 'evbarm-earmv7hf' based image from:
- https://cdn.netbsd.org/pub/NetBSD/NetBSD-9.0/evbarm-earmv7hf/binary/gzimg/armv7.img.gz
+ https://archive.netbsd.org/pub/NetBSD-archive/NetBSD-9.0/evbarm-earmv7hf/binary/gzimg/armv7.img.gz
The image requires manually installing U-Boot in the image. Build U-Boot with
the orangepi_pc_defconfig configuration as described in the previous section.
@@ -252,14 +252,14 @@ and set the following environment variables before booting:
Optionally you may save the environment variables to SD card with 'saveenv'.
To continue booting simply give the 'boot' command and NetBSD boots.
-Orange Pi PC integration tests
-""""""""""""""""""""""""""""""
+Orange Pi PC functional tests
+"""""""""""""""""""""""""""""
-The Orange Pi PC machine has several integration tests included.
+The Orange Pi PC machine has several functional tests included.
To run the whole set of tests, build QEMU from source and simply
-provide the following command:
+provide the following command from the build directory:
.. code-block:: bash
- $ AVOCADO_ALLOW_LARGE_STORAGE=yes avocado --show=app,console run \
- -t machine:orangepi-pc tests/avocado/boot_linux_console.py
+ $ QEMU_TEST_ALLOW_LARGE_STORAGE=1 \
+ pyvenv/bin/meson test --suite thorough func-arm-arm_orangepi
diff --git a/docs/system/arm/palm.rst b/docs/system/arm/palm.rst
deleted file mode 100644
index 61bc8d3..0000000
--- a/docs/system/arm/palm.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Palm Tungsten|E PDA (``cheetah``)
-=================================
-
-The Palm Tungsten|E PDA (codename \"Cheetah\") emulation includes the
-following elements:
-
-- Texas Instruments OMAP310 System-on-chip (ARM925T core)
-
-- ROM and RAM memories (ROM firmware image can be loaded with
- -option-rom)
-
-- On-chip LCD controller
-
-- On-chip Real Time Clock
-
-- TI TSC2102i touchscreen controller / analog-digital converter /
- Audio CODEC, connected through MicroWire and |I2S| buses
-
-- GPIO-connected matrix keypad
-
-- Secure Digital card connected to OMAP MMC/SD host
-
-- Three on-chip UARTs
diff --git a/docs/system/arm/stm32.rst b/docs/system/arm/stm32.rst
index 3b640f3..511e3eb 100644
--- a/docs/system/arm/stm32.rst
+++ b/docs/system/arm/stm32.rst
@@ -1,5 +1,5 @@
-STMicroelectronics STM32 boards (``netduino2``, ``netduinoplus2``, ``stm32vldiscovery``)
-========================================================================================
+STMicroelectronics STM32 boards (``netduino2``, ``netduinoplus2``, ``olimex-stm32-h405``, ``stm32vldiscovery``)
+===============================================================================================================
The `STM32`_ chips are a family of 32-bit ARM-based microcontroller by
STMicroelectronics.
@@ -36,6 +36,7 @@ Supported devices
* SPI controller
* System configuration (SYSCFG)
* Timer controller (TIMER)
+ * Reset and Clock Controller (RCC) (STM32F4 only, reset and enable only)
Missing devices
---------------
@@ -53,7 +54,7 @@ Missing devices
* Power supply configuration (PWR)
* Random Number Generator (RNG)
* Real-Time Clock (RTC) controller
- * Reset and Clock Controller (RCC)
+ * Reset and Clock Controller (RCC) (features other than reset and enable)
* Secure Digital Input/Output (SDIO) interface
* USB OTG
* Watchdog controller (IWDG, WWDG)
diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index e67e7f0..6a719b9 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -1,3 +1,5 @@
+.. _arm-virt:
+
'virt' generic virtual platform (``virt``)
==========================================
@@ -19,6 +21,10 @@ of the 5.0 release and ``virt-5.0`` of the 5.1 release. Migration
is not guaranteed to work between different QEMU releases for
the non-versioned ``virt`` machine type.
+VM migration is not guaranteed when using ``-cpu max``, as the set of supported
+features may change between QEMU versions. To ensure your VM can be migrated, it
+is recommended to use another CPU model instead.
+
Supported devices
"""""""""""""""""
@@ -64,11 +70,11 @@ Supported guest CPU types:
- ``cortex-a76`` (64-bit)
- ``cortex-a710`` (64-bit)
- ``a64fx`` (64-bit)
-- ``host`` (with KVM only)
+- ``host`` (with KVM and HVF only)
- ``neoverse-n1`` (64-bit)
- ``neoverse-v1`` (64-bit)
- ``neoverse-n2`` (64-bit)
-- ``max`` (same as ``host`` for KVM; best possible emulation with TCG)
+- ``max`` (same as ``host`` for KVM and HVF; best possible emulation with TCG)
Note that the default is ``cortex-a15``, so for an AArch64 guest you must
specify a CPU type.
@@ -138,6 +144,10 @@ highmem-mmio
Set ``on``/``off`` to enable/disable the high memory region for PCI MMIO.
The default is ``on``.
+highmem-mmio-size
+ Set the high memory region size for PCI MMIO. Must be a power of 2 and
+ greater than or equal to the default size (512G).
+
gic-version
Specify the version of the Generic Interrupt Controller (GIC) to provide.
Valid values are:
@@ -167,10 +177,18 @@ iommu
``smmuv3``
Create an SMMUv3
+default-bus-bypass-iommu
+ Set ``on``/``off`` to enable/disable `bypass_iommu
+ <https://gitlab.com/qemu-project/qemu/-/blob/master/docs/bypass-iommu.txt>`_
+ for the default root bus.
+
ras
Set ``on``/``off`` to enable/disable reporting host memory errors to a guest
using ACPI and guest external abort exceptions. The default is off.
+acpi
+ Set ``on``/``off``/``auto`` to enable/disable ACPI.
+
dtb-randomness
Set ``on``/``off`` to pass random seeds via the guest DTB
rng-seed and kaslr-seed nodes (in both "/chosen" and
@@ -184,6 +202,14 @@ dtb-randomness
dtb-kaslr-seed
A deprecated synonym for dtb-randomness.
+x-oem-id
+ Set string (up to 6 bytes) to override the default value of field OEMID in the
+ ACPI table header.
+
+x-oem-table-id
+ Set string (up to 8 bytes) to override the default value of field OEM Table ID
+ in the ACPI table header (see the combined example below).
+
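+The sketch below is a hypothetical command line combining several of these
+machine options; the values (OEM strings, MMIO size, kernel and initrd paths)
+are illustrative placeholders and should be adjusted to your configuration.
+
+.. code-block:: bash
+
+   $ qemu-system-aarch64 -M virt,acpi=on,ras=on,highmem-mmio-size=1T,x-oem-id=QEMUVM,x-oem-table-id=QEMUVM01 \
+       -cpu cortex-a57 -m 4G -nographic \
+       -kernel Image -initrd rootfs.cpio.gz
+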
Linux guest kernel configuration
""""""""""""""""""""""""""""""""
diff --git a/docs/system/arm/vmapple.rst b/docs/system/arm/vmapple.rst
new file mode 100644
index 0000000..35c329e
--- /dev/null
+++ b/docs/system/arm/vmapple.rst
@@ -0,0 +1,65 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+VMApple machine emulation
+=========================
+
+VMApple is the device model that the macOS built-in hypervisor, called
+"Virtualization.framework", exposes to Apple Silicon macOS guests. The "vmapple"
+machine model in QEMU implements the same device model, but does not use any
+code from Virtualization.framework.
+
+Prerequisites
+-------------
+
+To run the vmapple machine model, you need to:
+
+ * Run on Apple Silicon
+ * Run on macOS 12.0 or above
+ * Have an already installed copy of a Virtualization.Framework macOS 12 virtual
+ machine. Note that versions newer than 12.x are currently NOT supported on
+ the guest side. This guide assumes that you installed it using the
+ `macosvm <https://github.com/s-u/macosvm>`__ CLI.
+
+First, we need to extract the UUID from the virtual machine that you installed.
+You can do this by running the shell script in contrib/vmapple/uuid.sh on the
+macosvm.json file.
+
+.. code-block:: bash
+ :caption: uuid.sh script to extract the UUID from a macosvm.json file
+
+ $ contrib/vmapple/uuid.sh "path/to/macosvm.json"
+
+Now we also need to trim the aux partition. It contains metadata that we can just discard:
+
+.. code-block:: bash
+ :caption: Command to trim the aux file
+
+ $ dd if="aux.img" of="aux.img.trimmed" bs=$(( 0x4000 )) skip=1
+
+How to run
+----------
+
+Then, we can launch QEMU with the Virtualization.Framework pre-boot environment
+and the readily installed target disk images. It is recommended to forward the
+VM's ssh and vnc ports to the host for better interactive access to the target
+system:
+
+.. code-block:: bash
+ :caption: Example execution command line
+
+ $ UUID="$(contrib/vmapple/uuid.sh 'macosvm.json')"
+ $ AVPBOOTER="/System/Library/Frameworks/Virtualization.framework/Resources/AVPBooter.vmapple2.bin"
+ $ AUX="aux.img.trimmed"
+ $ DISK="disk.img"
+ $ qemu-system-aarch64 \
+ -serial mon:stdio \
+ -m 4G \
+ -accel hvf \
+ -M vmapple,uuid="$UUID" \
+ -bios "$AVPBOOTER" \
+ -drive file="$AUX",if=pflash,format=raw \
+ -drive file="$DISK",if=pflash,format=raw \
+ -drive file="$AUX",if=none,id=aux,format=raw \
+ -drive file="$DISK",if=none,id=root,format=raw \
+ -device vmapple-virtio-blk-pci,variant=aux,drive=aux \
+ -device vmapple-virtio-blk-pci,variant=root,drive=root \
+ -netdev user,id=net0,ipv6=off,hostfwd=tcp::2222-:22,hostfwd=tcp::5901-:5900 \
+ -device virtio-net-pci,netdev=net0
+
diff --git a/docs/system/arm/xlnx-versal-virt.rst b/docs/system/arm/xlnx-versal-virt.rst
index 0bafc76..c5f35f2 100644
--- a/docs/system/arm/xlnx-versal-virt.rst
+++ b/docs/system/arm/xlnx-versal-virt.rst
@@ -178,6 +178,9 @@ Run the following at the U-Boot prompt:
fdt set /chosen/dom0 reg <0x00000000 0x40000000 0x0 0x03100000>
booti 30000000 - 20000000
+The emulated OSPI flash model can be changed with the ``ospi-flash`` machine
+option, as in the sketch below.
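+
+The following is a hypothetical invocation; the flash model name
+``mt35xu01g`` is an assumption and should be replaced by a flash device model
+supported by your QEMU build.
+
+.. code-block:: bash
+
+   $ qemu-system-aarch64 -M xlnx-versal-virt,ospi-flash=mt35xu01g \
+       -m 8G -serial mon:stdio -display none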
+
BBRAM File Backend
""""""""""""""""""
BBRAM can have an optional file backend, which must be a seekable
diff --git a/docs/system/arm/xlnx-zcu102.rst b/docs/system/arm/xlnx-zcu102.rst
new file mode 100644
index 0000000..534cd1d
--- /dev/null
+++ b/docs/system/arm/xlnx-zcu102.rst
@@ -0,0 +1,19 @@
+Xilinx ZynqMP ZCU102 (``xlnx-zcu102``)
+======================================
+
+The ``xlnx-zcu102`` board models the Xilinx ZynqMP ZCU102 board.
+This board has 4 Cortex-A53 CPUs and 2 Cortex-R5F CPUs.
+
+Machine-specific options
+""""""""""""""""""""""""
+
+The following machine-specific options are supported (a combined example
+follows the list):
+
+secure
+ Set ``on``/``off`` to enable/disable emulating a guest CPU which implements the
+ Arm Security Extensions (TrustZone). The default is ``off``.
+
+virtualization
+ Set ``on``/``off`` to enable/disable emulating a guest CPU which implements the
+ Arm Virtualization Extensions. The default is ``off``.
+
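+A hypothetical command line enabling both options might look like this; the
+kernel, DTB and console arguments are placeholders:
+
+.. code-block:: bash
+
+   $ qemu-system-aarch64 -M xlnx-zcu102,secure=on,virtualization=on \
+       -m 4G -serial mon:stdio -display none \
+       -kernel Image -dtb zynqmp-zcu102-rev1.0.dtb -append "console=ttyPS0"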
diff --git a/docs/system/arm/xscale.rst b/docs/system/arm/xscale.rst
deleted file mode 100644
index e239136..0000000
--- a/docs/system/arm/xscale.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-Sharp XScale-based PDA models (``akita``, ``borzoi``, ``spitz``, ``terrier``, ``tosa``)
-=======================================================================================
-
-The Sharp Zaurus are PDAs based on XScale, able to run Linux ('SL series').
-
-The SL-6000 (\"Tosa\"), released in 2005, uses a PXA255 System-on-chip.
-
-The SL-C3000 (\"Spitz\"), SL-C1000 (\"Akita\"), SL-C3100 (\"Borzoi\") and
-SL-C3200 (\"Terrier\") use a PXA270.
-
-The clamshell PDA models emulation includes the following peripherals:
-
-- Intel PXA255/PXA270 System-on-chip (ARMv5TE core)
-
-- NAND Flash memory - not in \"Tosa\"
-
-- IBM/Hitachi DSCM microdrive in a PXA PCMCIA slot - not in \"Akita\"
-
-- On-chip OHCI USB controller - not in \"Tosa\"
-
-- On-chip LCD controller
-
-- On-chip Real Time Clock
-
-- TI ADS7846 touchscreen controller on SSP bus
-
-- Maxim MAX1111 analog-digital converter on |I2C| bus
-
-- GPIO-connected keyboard controller and LEDs
-
-- Secure Digital card connected to PXA MMC/SD host
-
-- Three on-chip UARTs
-
-- WM8750 audio CODEC on |I2C| and |I2S| buses
diff --git a/docs/system/bootindex.rst b/docs/system/bootindex.rst
index 8b057f8..5e1b33e 100644
--- a/docs/system/bootindex.rst
+++ b/docs/system/bootindex.rst
@@ -49,10 +49,11 @@ Limitations
-----------
Some firmware has limitations on which devices can be considered for
-booting. For instance, the PC BIOS boot specification allows only one
-disk to be bootable. If boot from disk fails for some reason, the BIOS
+booting. For instance, the x86 PC BIOS boot specification allows only one
+disk to be bootable. If boot from disk fails for some reason, the x86 BIOS
won't retry booting from other disk. It can still try to boot from
-floppy or net, though.
+floppy or net, though. In the case of the s390x BIOS, it will try up to
+8 devices in total, any number of which may be disks or virtio-net devices.
Sometimes, firmware cannot map the device path QEMU wants firmware to
boot from to a boot method. It doesn't happen for devices the firmware
diff --git a/docs/system/confidential-guest-support.rst b/docs/system/confidential-guest-support.rst
index 0c490db..66129fb 100644
--- a/docs/system/confidential-guest-support.rst
+++ b/docs/system/confidential-guest-support.rst
@@ -38,6 +38,7 @@ Supported mechanisms
Currently supported confidential guest mechanisms are:
* AMD Secure Encrypted Virtualization (SEV) (see :doc:`i386/amd-memory-encryption`)
+* Intel Trust Domain Extension (TDX) (see :doc:`i386/tdx`)
* POWER Protected Execution Facility (PEF) (see :ref:`power-papr-protected-execution-facility-pef`)
* s390x Protected Virtualization (PV) (see :doc:`s390x/protvirt`)
diff --git a/docs/system/cpu-hotplug.rst b/docs/system/cpu-hotplug.rst
index 015ce2b..cc50937 100644
--- a/docs/system/cpu-hotplug.rst
+++ b/docs/system/cpu-hotplug.rst
@@ -33,23 +33,23 @@ vCPU hotplug
{
"return": [
{
- "type": "IvyBridge-IBRS-x86_64-cpu",
- "vcpus-count": 1,
"props": {
- "socket-id": 1,
- "core-id": 0,
+ "core-id": 1,
+ "socket-id": 0,
"thread-id": 0
- }
+ },
+ "type": "IvyBridge-IBRS-x86_64-cpu",
+ "vcpus-count": 1
},
{
- "qom-path": "/machine/unattached/device[0]",
- "type": "IvyBridge-IBRS-x86_64-cpu",
- "vcpus-count": 1,
"props": {
- "socket-id": 0,
"core-id": 0,
+ "socket-id": 0,
"thread-id": 0
- }
+ },
+ "qom-path": "/machine/unattached/device[0]",
+ "type": "IvyBridge-IBRS-x86_64-cpu",
+ "vcpus-count": 1
}
]
}
@@ -58,18 +58,18 @@ vCPU hotplug
(4) The ``query-hotpluggable-cpus`` command returns an object for CPUs
that are present (containing a "qom-path" member) or which may be
hot-plugged (no "qom-path" member). From its output in step (3), we
- can see that ``IvyBridge-IBRS-x86_64-cpu`` is present in socket 0,
- while hot-plugging a CPU into socket 1 requires passing the listed
+ can see that ``IvyBridge-IBRS-x86_64-cpu`` is present in socket 0 core 0,
+ while hot-plugging a CPU into socket 0 core 1 requires passing the listed
properties to QMP ``device_add``::
- (QEMU) device_add id=cpu-2 driver=IvyBridge-IBRS-x86_64-cpu socket-id=1 core-id=0 thread-id=0
+ (QEMU) device_add id=cpu-2 driver=IvyBridge-IBRS-x86_64-cpu socket-id=0 core-id=1 thread-id=0
{
"execute": "device_add",
"arguments": {
- "socket-id": 1,
+ "core-id": 1,
"driver": "IvyBridge-IBRS-x86_64-cpu",
"id": "cpu-2",
- "core-id": 0,
+ "socket-id": 0,
"thread-id": 0
}
}
@@ -83,34 +83,32 @@ vCPU hotplug
(QEMU) query-cpus-fast
{
- "execute": "query-cpus-fast",
"arguments": {}
+ "execute": "query-cpus-fast",
}
{
"return": [
{
- "qom-path": "/machine/unattached/device[0]",
- "target": "x86_64",
- "thread-id": 11534,
"cpu-index": 0,
"props": {
- "socket-id": 0,
"core-id": 0,
+ "socket-id": 0,
"thread-id": 0
},
- "arch": "x86"
+ "qom-path": "/machine/unattached/device[0]",
+ "target": "x86_64",
+ "thread-id": 28957
},
{
- "qom-path": "/machine/peripheral/cpu-2",
- "target": "x86_64",
- "thread-id": 12106,
"cpu-index": 1,
"props": {
- "socket-id": 1,
- "core-id": 0,
+ "core-id": 1,
+ "socket-id": 0,
"thread-id": 0
},
- "arch": "x86"
+ "qom-path": "/machine/peripheral/cpu-2",
+ "target": "x86_64",
+ "thread-id": 29095
}
]
}
@@ -123,10 +121,10 @@ From the 'qmp-shell', invoke the QMP ``device_del`` command::
(QEMU) device_del id=cpu-2
{
- "execute": "device_del",
"arguments": {
"id": "cpu-2"
}
+ "execute": "device_del",
}
{
"return": {}
diff --git a/docs/system/cpu-models-x86.rst.inc b/docs/system/cpu-models-x86.rst.inc
index ba27b56..6a770ca 100644
--- a/docs/system/cpu-models-x86.rst.inc
+++ b/docs/system/cpu-models-x86.rst.inc
@@ -71,6 +71,16 @@ mixture of host CPU models between machines, if live migration
compatibility is required, use the newest CPU model that is compatible
across all desired hosts.
+``ClearwaterForest``
+ Intel Xeon Processor (ClearwaterForest, 2025)
+
+``SierraForest``, ``SierraForest-v2``
+ Intel Xeon Processor (SierraForest, 2024). SierraForest-v2 mitigates
+ the GDS and RFDS vulnerabilities with stepping 3.
+
+``GraniteRapids``, ``GraniteRapids-v2``
+ Intel Xeon Processor (GraniteRapids, 2024)
+
``Cascadelake-Server``, ``Cascadelake-Server-noTSX``
Intel Xeon Processor (Cascade Lake, 2019), with "stepping" levels 6
or 7 only. (The Cascade Lake Xeon processor with *stepping 5 is
@@ -181,7 +191,7 @@ features are included if using "Host passthrough" or "Host model".
CVE-2018-12127, [MSBDS] CVE-2018-12126).
This is an MSR (Model-Specific Register) feature rather than a CPUID feature,
- so it will not appear in the Linux ``/proc/cpuinfo`` in the host or
+ therefore it will not appear in the Linux ``/proc/cpuinfo`` in the host or
guest. Instead, the host kernel uses it to populate the MDS
vulnerability file in ``sysfs``.
@@ -189,10 +199,10 @@ features are included if using "Host passthrough" or "Host model".
affected} in the ``/sys/devices/system/cpu/vulnerabilities/mds`` file.
``taa-no``
- Recommended to inform that the guest that the host is ``not``
+ Recommended to inform the guest that the host is ``not``
vulnerable to CVE-2019-11135, TSX Asynchronous Abort (TAA).
- This too is an MSR feature, so it does not show up in the Linux
+ This is also an MSR feature, therefore it does not show up in the Linux
``/proc/cpuinfo`` in the host or guest.
It should only be enabled for VMs if the host reports ``Not affected``
@@ -214,7 +224,7 @@ features are included if using "Host passthrough" or "Host model".
By disabling TSX, KVM-based guests can avoid paying the price of
mitigating TSX-based attacks.
- Note that ``tsx-ctrl`` too is an MSR feature, so it does not show
+ Note that ``tsx-ctrl`` is also an MSR feature, therefore it does not show
up in the Linux ``/proc/cpuinfo`` in the host or guest.
To validate that Intel TSX is indeed disabled for the guest, there are
@@ -223,6 +233,38 @@ features are included if using "Host passthrough" or "Host model".
``/sys/devices/system/cpu/vulnerabilities/tsx_async_abort`` file in
the guest should report ``Mitigation: TSX disabled``.
+``bhi-no``
+ Recommended to inform the guest that the host is ``not``
+ vulnerable to CVE-2022-0001, Branch History Injection (BHI).
+
+ This is also an MSR feature, therefore it does not show up in the Linux
+ ``/proc/cpuinfo`` in the host or guest.
+
+ It should only be enabled for VMs if the host reports
+ ``BHI: Not affected`` in the
+ ``/sys/devices/system/cpu/vulnerabilities/spectre_v2`` file.
+
+``gds-no``
+ Recommended to inform the guest that the host is ``not``
+ vulnerable to CVE-2022-40982, Gather Data Sampling (GDS).
+
+ This is also an MSR feature, therefore it does not show up in the Linux
+ ``/proc/cpuinfo`` in the host or guest.
+
+ It should only be enabled for VMs if the host reports ``Not affected``
+ in the ``/sys/devices/system/cpu/vulnerabilities/gather_data_sampling``
+ file.
+
+``rfds-no``
+ Recommended to inform the guest that the host is ``not``
+ vulnerable to CVE-2023-28746, Register File Data Sampling (RFDS).
+
+ This is also an MSR feature, therefore it does not show up in the Linux
+ ``/proc/cpuinfo`` in the host or guest.
+
+ It should only be enabled for VMs if the host reports ``Not affected``
+ in the ``/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling``
+ file. A combined example of exposing these flags is shown below.
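+
+A hypothetical command line exposing these "not affected" bits to a guest; only
+set the flags that your host actually reports as not affected:
+
+.. code-block:: bash
+
+   $ qemu-system-x86_64 -accel kvm -m 4G -smp 4 \
+       -cpu GraniteRapids,bhi-no=on,gds-no=on,rfds-no=on
+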
Preferred CPU models for AMD x86 hosts
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index f197774..9113816 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -85,7 +85,9 @@ Emulated Devices
devices/can.rst
devices/ccid.rst
devices/cxl.rst
+ devices/vfio-user.rst
devices/ivshmem.rst
+ devices/ivshmem-flat.rst
devices/keyboard.rst
devices/net.rst
devices/nvme.rst
diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
index 882b036..e307caf 100644
--- a/docs/system/devices/cxl.rst
+++ b/docs/system/devices/cxl.rst
@@ -308,7 +308,7 @@ A very simple setup with just one directly attached CXL Type 3 Persistent Memory
-object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
-device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
- -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+ -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0,sn=0x1 \
-M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
A very simple setup with just one directly attached CXL Type 3 Volatile Memory device::
@@ -349,13 +349,13 @@ the CXL Type3 device directly attached (no switches).::
-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
-device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \
-device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
- -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+ -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0,sn=0x1 \
-device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \
- -device cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
+ -device cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1,sn=0x2 \
-device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \
- -device cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
+ -device cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2,sn=0x3 \
-device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \
- -device cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
+ -device cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3,sn=0x4 \
-M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.targets.1=cxl.2,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k
An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave::
@@ -375,13 +375,13 @@ An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave::
-device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \
-device cxl-upstream,bus=root_port0,id=us0 \
-device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \
- -device cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \
+ -device cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0,sn=0x1 \
-device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \
- -device cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \
+ -device cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1,sn=0x2 \
-device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \
- -device cxl-type3,bus=swport2,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \
+ -device cxl-type3,bus=swport2,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2,sn=0x3 \
-device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \
- -device cxl-type3,bus=swport3,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \
+ -device cxl-type3,bus=swport3,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3,sn=0x4 \
-M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k
Deprecations
diff --git a/docs/system/devices/igb.rst b/docs/system/devices/igb.rst
index 04e79df..71f31cb 100644
--- a/docs/system/devices/igb.rst
+++ b/docs/system/devices/igb.rst
@@ -57,11 +57,12 @@ directory:
meson test qtest-x86_64/qos-test
-ethtool can test register accesses, interrupts, etc. It is automated as an
+ethtool can test register accesses, interrupts, etc. It is automated as a
-Avocado test and can be ran with the following command:
+functional test and can be run from the build directory with the following
+command:
.. code:: shell
- make check-avocado AVOCADO_TESTS=tests/avocado/netdev-ethtool.py
+ pyvenv/bin/meson test --suite thorough func-x86_64-netdev_ethtool
References
==========
diff --git a/docs/system/devices/ivshmem-flat.rst b/docs/system/devices/ivshmem-flat.rst
new file mode 100644
index 0000000..1f97052
--- /dev/null
+++ b/docs/system/devices/ivshmem-flat.rst
@@ -0,0 +1,33 @@
+Inter-VM Shared Memory Flat Device
+----------------------------------
+
+The ivshmem-flat device is meant to be used on machines that lack a PCI bus,
+which makes them unsuitable for the traditional ivshmem device modeled as a PCI
+device. Machines with a Cortex-M MCU are good candidates for the ivshmem-flat
+device. Since the flat version maps the control and status registers directly
+into memory, it requires only a tiny "device driver" to interact with other
+VMs, which is useful for RTOSes like Zephyr that usually run on
+resource-constrained targets.
+
+Similar to the ivshmem device, the ivshmem-flat device supports both peer
+notification via HW interrupts and Inter-VM shared memory. This allows the
+device to be used together with the traditional ivshmem, enabling communication
+between, for instance, an aarch64 VM (using the traditional ivshmem device and
+running Linux), and an arm VM (using the ivshmem-flat device and running Zephyr
+instead).
+
+The ivshmem-flat device does not support the ``memdev`` option (see ivshmem.rst
+for more details). It relies on the ivshmem server to create and distribute the
+shared memory file descriptor and the eventfd(s) used to notify (interrupt) the
+peers. Therefore, an ivshmem server must be up and running before the device can
+be created (see the sketch at the end of this section).
+
+Although ivshmem-flat supports both peer notification (interrupts) and shared
+memory, the interrupt mechanism is optional. If no input IRQ is specified for
+the device, it is disabled, preventing the VM from notifying or being notified
+by other VMs (a warning is displayed to inform the user that the IRQ mechanism
+is disabled). The shared memory region is always present.
+
+The offsets of the MMRs (INTRMASK, INTRSTATUS, IVPOSITION, and DOORBELL
+registers) within the MMR region, and their functions, follow the ivshmem
+specification, so they work exactly as in the ivshmem PCI device (see
+./specs/ivshmem-spec.txt).
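+
+The sketch below shows one possible way to bring up a peer. The
+``ivshmem-server`` flags are standard, but the ``ivshmem-flat`` device
+properties (here a ``chardev`` pointing at the server socket) and the machine
+name are assumptions and must be adapted to the device's actual options and to
+a machine without a PCI bus.
+
+.. code-block:: bash
+
+   # Start the ivshmem server with a 4M shared memory region and one vector.
+   $ ivshmem-server -S /tmp/ivshmem_socket -l 4M -n 1
+
+   # Connect the flat device to the server via a socket chardev (assumed API).
+   $ qemu-system-arm -M some-cortex-m-machine \
+       -chardev socket,path=/tmp/ivshmem_socket,id=ivshm \
+       -device ivshmem-flat,chardev=ivshm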
diff --git a/docs/system/devices/net.rst b/docs/system/devices/net.rst
index 2ab516d..a3efbdc 100644
--- a/docs/system/devices/net.rst
+++ b/docs/system/devices/net.rst
@@ -77,6 +77,106 @@ When using the ``'-netdev user,hostfwd=...'`` option, TCP or UDP
connections can be redirected from the host to the guest. It allows for
example to redirect X11, telnet or SSH connections.
+Using passt as the user mode network stack
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+passt_ can be used as a simple replacement for SLIRP (``-net user``).
+passt doesn't require any capability or privilege. It offers better
+performance than ``-net user``, full IPv6 support and better security,
+as it is a daemon that does not run in the QEMU process context.
+
+passt can be connected to QEMU either by using a socket
+(``-netdev stream``) or using the vhost-user interface (``-netdev vhost-user``).
+See `passt(1)`_ for more details on passt.
+
+.. _passt: https://passt.top/
+.. _passt(1): https://passt.top/builds/latest/web/passt.1.html
+
+To use the socket-based passt interface:
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Start passt as a daemon::
+
+ passt --socket ~/passt.socket
+
+If ``--socket`` is not provided, passt will print the path of the UNIX domain
+socket QEMU can connect to (``/tmp/passt_1.socket``, ``/tmp/passt_2.socket``,
+...). Then you can connect your QEMU instance to passt:
+
+.. parsed-literal::
+ |qemu_system| [...OPTIONS...] -device virtio-net-pci,netdev=netdev0 -netdev stream,id=netdev0,server=off,addr.type=unix,addr.path=~/passt.socket
+
+Where ``~/passt.socket`` is the UNIX socket created by passt to
+communicate with QEMU.
+
+To use the vhost-user interface:
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Start passt with ``--vhost-user``::
+
+ passt --vhost-user --socket ~/passt.socket
+
+Then to connect QEMU:
+
+.. parsed-literal::
+ |qemu_system| [...OPTIONS...] -m $RAMSIZE -chardev socket,id=chr0,path=~/passt.socket -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0
+
+Where ``$RAMSIZE`` is the memory size of your VM; the ``-m`` value and ``-object memory-backend-memfd,size=`` must match.
+
+Migration of passt:
+^^^^^^^^^^^^^^^^^^^
+
+When passt is connected to QEMU using the vhost-user interface it can
+be migrated with QEMU and the network connections are not interrupted.
+
+As passt runs with no privileges, it relies on passt-repair to save and
+load the TCP connection state, using the TCP_REPAIR socket option.
+The passt-repair helper needs the CAP_NET_ADMIN capability, or to run as root.
+If passt-repair is not available, TCP connections will not be preserved.
+
+Example of migration of a guest on the same host
+________________________________________________
+
+Before being able to run passt-repair, the CAP_NET_ADMIN capability must be set
+on the file, run as root::
+
+ setcap cap_net_admin+eip ./passt-repair
+
+Start passt for the source side::
+
+ passt --vhost-user --socket ~/passt_src.socket --repair-path ~/passt-repair_src.socket
+
+Where ``~/passt-repair_src.socket`` is the UNIX socket created by passt to
+communicate with passt-repair. The default value is the ``--socket`` path
+appended with ``.repair``.
+
+Start passt-repair::
+
+ passt-repair ~/passt-repair_src.socket
+
+Start the source-side QEMU with a monitor to be able to send the migrate command:
+
+.. parsed-literal::
+ |qemu_system| [...OPTIONS...] [...VHOST USER OPTIONS...] -monitor stdio
+
+Start passt for the destination side::
+
+ passt --vhost-user --socket ~/passt_dst.socket --repair-path ~/passt-repair_dst.socket
+
+Start passt-repair::
+
+ passt-repair ~/passt-repair_dst.socket
+
+Start QEMU with the ``-incoming`` parameter:
+
+.. parsed-literal::
+ |qemu_system| [...OPTIONS...] [...VHOST USER OPTIONS...] -incoming tcp:localhost:4444
+
+Then, in the source QEMU monitor, the migration can be started::
+
+ (qemu) migrate tcp:localhost:4444
+
+A separate passt-repair instance must be started for every migration. In the
+case of a failed migration, passt-repair also needs to be restarted before
+trying again.
+
Hubs
~~~~
diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index d2b1ca9..6509b35 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -53,6 +53,13 @@ parameters.
Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
previously used.
+``ocp`` (default: ``off``)
+ The Open Compute Project defines the Datacenter NVMe SSD Specification, which
+ sits on top of NVMe. It describes additional commands and NVMe behaviors
+ specific to the datacenter. When this option is ``on``, OCP features such as
+ the SMART / Health information extended log become available in the
+ controller. We emulate version 5 of this log page. See the sketch below for
+ how the option might be enabled.
+
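+A hypothetical invocation enabling OCP emulation; the drive file, id and serial
+number are placeholders:
+
+.. code-block:: bash
+
+   $ qemu-system-x86_64 -drive file=nvm.img,if=none,id=nvm,format=raw \
+       -device nvme,serial=deadbeef,drive=nvm,ocp=on
+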
Additional Namespaces
---------------------
diff --git a/docs/system/devices/vfio-user.rst b/docs/system/devices/vfio-user.rst
new file mode 100644
index 0000000..b6dcaa5
--- /dev/null
+++ b/docs/system/devices/vfio-user.rst
@@ -0,0 +1,26 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+=========
+vfio-user
+=========
+
+QEMU includes a ``vfio-user`` client. The ``vfio-user`` specification allows for
+implementing (PCI) devices in userspace outside of QEMU; it is similar to
+``vhost-user`` in this respect (see :doc:`vhost-user`), but can emulate arbitrary
+PCI devices, not just ``virtio``. Whereas ``vfio`` is handled by the host
+kernel, ``vfio-user``, while similar in implementation, is handled entirely in
+userspace.
+
+For example, SPDK includes a virtual PCI NVMe controller implementation; by
+setting up a ``vfio-user`` UNIX socket between QEMU and SPDK, a VM can send NVMe
+I/O to the SPDK process.
+
+Presuming a suitable ``vfio-user`` server has opened a socket at
+``/tmp/vfio-user.sock``, a device can be configured, for example, with:
+
+.. code-block:: console
+
+   -device '{"driver": "vfio-user-pci","socket": {"path": "/tmp/vfio-user.sock", "type": "unix"}}'
+
+See `libvfio-user <https://github.com/nutanix/libvfio-user/>`_ for further
+information.
diff --git a/docs/system/devices/virtio-gpu.rst b/docs/system/devices/virtio-gpu.rst
index cb73dd7..b7eb0fc 100644
--- a/docs/system/devices/virtio-gpu.rst
+++ b/docs/system/devices/virtio-gpu.rst
@@ -71,6 +71,17 @@ representation back to OpenGL API calls.
.. _Gallium3D: https://www.freedesktop.org/wiki/Software/gallium/
.. _virglrenderer: https://gitlab.freedesktop.org/virgl/virglrenderer/
+Translation of Vulkan API calls has been supported since the release of
+`virglrenderer`_ v1.0.0, using the `venus`_ protocol. The ``Venus`` virtio-gpu
+capability set ("capset") requires host blob support (``hostmem`` and ``blob``
+fields) and should be enabled using the ``venus`` field. The ``hostmem`` field
+specifies the size of the virtio-gpu host memory window, typically between 256M
+and 8G.
+
+.. parsed-literal::
+ -device virtio-gpu-gl,hostmem=8G,blob=true,venus=true
+
+.. _venus: https://gitlab.freedesktop.org/virgl/venus-protocol/
+
virtio-gpu rutabaga
-------------------
diff --git a/docs/system/gdb.rst b/docs/system/gdb.rst
index 4228cb5..d50470b 100644
--- a/docs/system/gdb.rst
+++ b/docs/system/gdb.rst
@@ -20,7 +20,7 @@ connection, use the ``-gdb dev`` option instead of ``-s``. See
.. parsed-literal::
- |qemu_system| -s -S -kernel bzImage -hda rootdisk.img -append "root=/dev/hda"
+ |qemu_system| -s -S -kernel bzImage -drive file=rootdisk.img,format=raw -append "root=/dev/sda"
QEMU will launch but will silently wait for gdb to connect.
diff --git a/docs/system/i386/hyperv.rst b/docs/system/i386/hyperv.rst
index 2505dc4..1c1de77 100644
--- a/docs/system/i386/hyperv.rst
+++ b/docs/system/i386/hyperv.rst
@@ -262,14 +262,19 @@ Supplementary features
``hv-passthrough``
In some cases (e.g. during development) it may make sense to use QEMU in
'pass-through' mode and give Windows guests all enlightenments currently
- supported by KVM. This pass-through mode is enabled by "hv-passthrough" CPU
- flag.
+ supported by KVM.
Note: ``hv-passthrough`` flag only enables enlightenments which are known to QEMU
(have corresponding 'hv-' flag) and copies ``hv-spinlocks`` and ``hv-vendor-id``
values from KVM to QEMU. ``hv-passthrough`` overrides all other 'hv-' settings on
- the command line. Also, enabling this flag effectively prevents migration as the
- list of enabled enlightenments may differ between target and destination hosts.
+ the command line.
+
+ Note: ``hv-passthrough`` does not enable ``hv-syndbg``, which can prevent certain
+ Windows guests from booting when used without proper configuration. If needed,
+ ``hv-syndbg`` can be enabled additionally.
+
+ Note: ``hv-passthrough`` effectively prevents migration as the list of enabled
+ enlightenments may differ between target and destination hosts.
``hv-enforce-cpuid``
By default, KVM allows the guest to use all currently supported Hyper-V
@@ -278,6 +283,36 @@ Supplementary features
feature alters this behavior and only allows the guest to use exposed Hyper-V
enlightenments.
+Recommendations
+---------------
+
+To achieve the best performance of Windows and Hyper-V guests, and unless there
+are specific requirements (e.g. migration to older QEMU/KVM versions or
+emulating a specific Hyper-V version), it is recommended to enable all currently
+implemented Hyper-V enlightenments, with the following exceptions (a combined
+example follows the list):
+
+- ``hv-syndbg``, ``hv-passthrough``, ``hv-enforce-cpuid`` should not be enabled
+ in production configurations as these are debugging/development features.
+- ``hv-reset`` can be avoided as modern Hyper-V versions don't expose it.
+- ``hv-evmcs`` can (and should) be enabled on Intel CPUs only. While the feature
+ is only used in nested configurations (Hyper-V, WSL2), enabling it for regular
+ Windows guests should not have any negative effects.
+- ``hv-no-nonarch-coresharing`` must only be enabled if vCPUs are properly pinned
+ so no non-architectural core sharing is possible.
+- ``hv-vendor-id``, ``hv-version-id-build``, ``hv-version-id-major``,
+ ``hv-version-id-minor``, ``hv-version-id-spack``, ``hv-version-id-sbranch``,
+ ``hv-version-id-snumber`` can be left unchanged; guests are not supposed to
+ behave differently when a different Hyper-V version is presented to them.
+- ``hv-crash`` must only be enabled if the crash information is consumed via
+ QAPI by higher levels of the virtualization stack. Enabling this feature
+ effectively prevents Windows from creating dumps upon crashes.
+- ``hv-reenlightenment`` can only be used on hardware which supports TSC
+ scaling or when guest migration is not needed.
+- ``hv-spinlocks`` should be set to e.g. 0xfff when host CPUs are overcommitted
+ (meaning there are other scheduled tasks or guests) and can be left unchanged
+ from the default value (0xffffffff) otherwise.
+- ``hv-avic``/``hv-apicv`` should not be enabled if the hardware does not
+ support APIC virtualization (Intel APICv, AMD AVIC).
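+
+A minimal sketch of a production-oriented command line following these
+recommendations; the ``hv-*`` flags shown are an illustrative subset and should
+be adjusted to your host's capabilities, and the disk image is a placeholder:
+
+.. code-block:: bash
+
+   $ qemu-system-x86_64 -accel kvm -m 4G -smp 4 \
+       -cpu host,hv-relaxed,hv-vapic,hv-time,hv-vpindex,hv-runtime,hv-synic,hv-stimer,hv-frequencies,hv-tlbflush,hv-ipi \
+       -drive file=windows.qcow2,if=virtio
+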
Useful links
------------
diff --git a/docs/system/i386/nitro-enclave.rst b/docs/system/i386/nitro-enclave.rst
new file mode 100644
index 0000000..7317f54
--- /dev/null
+++ b/docs/system/i386/nitro-enclave.rst
@@ -0,0 +1,78 @@
+'nitro-enclave' virtual machine (``nitro-enclave``)
+===================================================
+
+``nitro-enclave`` is a machine type which emulates an *AWS nitro enclave*
+virtual machine. `AWS nitro enclaves`_ is an Amazon EC2 feature that allows
+creating isolated execution environments, called enclaves, from Amazon EC2
+instances which are used for processing highly sensitive data. Enclaves have
+no persistent storage and no external networking. The enclave VMs are based
+on Firecracker microvm with a vhost-vsock device for communication with the
+parent EC2 instance that spawned it and a Nitro Secure Module (NSM) device
+for cryptographic attestation. The parent instance VM always has CID 3 while
+the enclave VM gets a dynamic CID. Enclaves use an EIF (`Enclave Image Format`_)
+file which contains the necessary kernel, cmdline and ramdisk(s) to boot.
+
+In QEMU, ``nitro-enclave`` is a machine type based on ``microvm``, similar to how
+AWS nitro enclaves are based on the `Firecracker`_ microvm. This is useful for
+local testing of EIF files using QEMU instead of running real AWS Nitro Enclaves,
+which can be difficult to debug because of their security-focused design. The
+vsock device emulation is done using vhost-user-vsock, which means another
+process that does the userspace emulation, such as `vhost-device-vsock`_ from
+the rust-vmm crate, must run alongside nitro-enclave for vsock communication to
+work.
+
+``libcbor`` and ``gnutls`` are required dependencies for nitro-enclave machine
+support to be added when building QEMU from source.
+
+.. _AWS nitro enclaves: https://docs.aws.amazon.com/enclaves/latest/user/nitro-enclave.html
+.. _Enclave Image Format: https://github.com/aws/aws-nitro-enclaves-image-format
+.. _vhost-device-vsock: https://github.com/rust-vmm/vhost-device/tree/main/vhost-device-vsock
+.. _Firecracker: https://firecracker-microvm.github.io
+
+Using the nitro-enclave machine type
+------------------------------------
+
+Machine-specific options
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+It supports the following machine-specific options:
+
+- nitro-enclave.vsock=string (required) (Id of the chardev from '-chardev' option that vhost-user-vsock device will use)
+- nitro-enclave.id=string (optional) (Set enclave identifier)
+- nitro-enclave.parent-role=string (optional) (Set parent instance IAM role ARN)
+- nitro-enclave.parent-id=string (optional) (Set parent instance identifier)
+
+
+Running a nitro-enclave VM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First, run `vhost-device-vsock`__ (or a similar tool that supports vhost-user-vsock).
+The forward-cid option below with value 1 forwards all connections from the enclave
+VM to the host machine and the forward-listen (port numbers separated by '+') is used
+for forwarding connections from the host machine to the enclave VM::
+
+ $ vhost-device-vsock \
+ --vm guest-cid=4,forward-cid=1,forward-listen=9001+9002,socket=/tmp/vhost4.socket
+
+__ https://github.com/rust-vmm/vhost-device/tree/main/vhost-device-vsock#using-the-vsock-backend
+
+Now run the necessary applications on the host machine so that the nitro-enclave
+VM applications' vsock communication works. For example, the nitro-enclave VM's
+init process connects to CID 3 and sends a single-byte hello heartbeat (0xB7) to
+let the parent VM know that it booted, expecting a heartbeat (0xB7) response. So
+you must run an AF_VSOCK server on the host machine that listens on port 9000 and
+sends the heartbeat back after receiving it, for the enclave VM to boot
+successfully. You should run all the applications on the host machine that would
+typically be running in the parent EC2 VM for successful communication with the
+enclave VM.
+
+Then run the nitro-enclave VM using the following command, where ``hello-world.eif``
+is an EIF file you would use to spawn a real AWS nitro enclave virtual machine::
+
+ $ qemu-system-x86_64 -M nitro-enclave,vsock=c,id=hello-world \
+ -kernel hello-world.eif -nographic -m 4G --enable-kvm -cpu host \
+ -chardev socket,id=c,path=/tmp/vhost4.socket
+
+In this example, the nitro-enclave VM has CID 4. If there are applications that
+connect to the enclave VM, run them on the host machine after the enclave VM starts.
+You need to modify the applications to connect to CID 1 (instead of the enclave
+VM's CID) and use the forward-listen (e.g., 9001+9002) option of vhost-device-vsock
+to forward the ports they connect to.
diff --git a/docs/system/i386/tdx.rst b/docs/system/i386/tdx.rst
new file mode 100644
index 0000000..8131750
--- /dev/null
+++ b/docs/system/i386/tdx.rst
@@ -0,0 +1,161 @@
+Intel Trusted Domain eXtension (TDX)
+====================================
+
+Intel Trusted Domain eXtensions (TDX) refers to an Intel technology that extends
+Virtual Machine Extensions (VMX) and Multi-Key Total Memory Encryption (MKTME)
+with a new kind of virtual machine guest called a Trust Domain (TD). A TD runs
+in a CPU mode that is designed to protect the confidentiality of its memory
+contents and its CPU state from any other software, including the hosting
+Virtual Machine Monitor (VMM), unless explicitly shared by the TD itself.
+
+Prerequisites
+-------------
+
+To run a TD, the physical machine needs to have the TDX module loaded and
+initialized, and the KVM hypervisor must support TDX and have it enabled. If
+those requirements are met, ``KVM_CAP_VM_TYPES`` will report support for
+``KVM_X86_TDX_VM``.
+
+Trust Domain Virtual Firmware (TDVF)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Trust Domain Virtual Firmware (TDVF) is required to provide TD services to boot
+TD Guest OS. TDVF needs to be copied to guest private memory and measured before
+the TD boots.
+
+KVM vcpu ioctl ``KVM_TDX_INIT_MEM_REGION`` can be used to populate the TDVF
+content into its private memory.
+
+Since TDX doesn't support read-only memslots, TDVF cannot be mapped as a pflash
+device; it actually works as RAM. The ``-bios`` option is used to load TDVF.
+
+OVMF is the open source firmware that implements TDVF support. Thus the
+command line option to specify and load TDVF is ``-bios OVMF.fd``.
+
+Feature Configuration
+---------------------
+
+Unlike a non-TDX VM, the CPU features (enumerated by CPUID or MSR) of a TD are
+not under full control of the VMM. The VMM can only configure a subset of the
+features of a TD via the ``KVM_TDX_INIT_VM`` command of the VM scope
+``MEMORY_ENCRYPT_OP`` ioctl.
+
+The configurable features have three types:
+
+- Attributes:
+ - PKS (bit 30) controls whether Supervisor Protection Keys is exposed to TD,
+ which determines related CPUID bit and CR4 bit;
+ - PERFMON (bit 63) controls whether PMU is exposed to TD.
+
+- XSAVE related features (XFAM):
+ XFAM is a 64b mask, which has the same format as XCR0 or IA32_XSS MSR. It
+ determines the set of extended features available for use by the guest TD.
+
+- CPUID features:
+ Only some bits of some CPUID leaves are directly configurable by VMM.
+
+What features can be configured is reported via TDX capabilities.
+
+TDX capabilities
+~~~~~~~~~~~~~~~~
+
+The VM scope ``MEMORY_ENCRYPT_OP`` ioctl provides command ``KVM_TDX_CAPABILITIES``
+to get the TDX capabilities from KVM. It returns a data structure of
+``struct kvm_tdx_capabilities``, which tells the supported configuration of
+attributes, XFAM and CPUIDs.
+
+TD attributes
+~~~~~~~~~~~~~
+
+QEMU supports configuring raw 64-bit TD attributes directly via the "attributes"
+property of the "tdx-guest" object. Note that it is the user's responsibility to
+provide a valid value, because some bits may not be supported by current QEMU or
+KVM yet.
+
+QEMU also supports configuring the individual attribute bits that it knows
+about, via properties of the "tdx-guest" object,
+e.g. "sept-ve-disable" (bit 28).
+
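+A hypothetical sketch of both styles follows; the machine type, firmware path
+and the exact set of accepted bits depend on the TDX module, KVM and QEMU
+versions in use:
+
+.. code-block:: bash
+
+   # Set an individual attribute bit via its named property.
+   $ qemu-system-x86_64 -accel kvm -cpu host \
+       -object tdx-guest,id=tdx0,sept-ve-disable=on \
+       -machine q35,kernel-irqchip=split,confidential-guest-support=tdx0 \
+       -bios OVMF.fd
+
+   # Or pass the raw 64-bit attributes value directly (bit 28 = 0x10000000).
+   $ qemu-system-x86_64 -accel kvm -cpu host \
+       -object tdx-guest,id=tdx0,attributes=0x10000000 \
+       -machine q35,kernel-irqchip=split,confidential-guest-support=tdx0 \
+       -bios OVMF.fd
+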
+MSR-based features
+~~~~~~~~~~~~~~~~~~
+
+Current KVM doesn't support MSR-based feature (e.g., MSR_IA32_ARCH_CAPABILITIES)
+configuration for TDX; enabling it in QEMU is future work, once KVM adds
+support for it.
+
+Feature check
+~~~~~~~~~~~~~
+
+QEMU checks whether the final (CPU) features, determined by the given CPU model
+and explicit feature adjustments of "+featureA/-featureB", can be supported.
+It can produce a "feature not supported" warning like
+
+ "warning: host doesn't support requested feature: CPUID.07H:EBX.intel-pt [bit 25]"
+
+It can also produce a warning like
+
+ "warning: TDX forcibly sets the feature: CPUID.80000007H:EDX.invtsc [bit 8]"
+
+if a fixed-1 feature is explicitly requested to be disabled. This is newly
+added to QEMU for TDX because TDX has fixed-1 features that are forcibly enabled
+by the TDX module and cannot be disabled by the VMM.
+
+Launching a TD (TDX VM)
+-----------------------
+
+To launch a TD, the necessary command line options are the tdx-guest object and
+a split kernel-irqchip, as below:
+
+.. parsed-literal::
+
+ |qemu_system_x86| \\
+ -accel kvm \\
+ -cpu host \\
+ -object tdx-guest,id=tdx0 \\
+ -machine ...,confidential-guest-support=tdx0 \\
+ -bios OVMF.fd \\
+
+Restrictions
+------------
+
+ - kernel-irqchip must be split;
+
+ This is set by default for TDX guest if kernel-irqchip is left on its default
+ 'auto' setting.
+
+ - No readonly support for private memory;
+
+ - No SMM support: SMM support requires manipulating the guest register states
+ which is not allowed;
+
+Debugging
+---------
+
+Bit 0 of the TD attributes is the DEBUG bit, which decides whether the TD runs in
+off-TD debug mode. When in off-TD debug mode, the TD's VCPU state and private
+memory are accessible via given SEAMCALLs. This requires KVM to expose APIs to
+invoke those SEAMCALLs and corresponding QEMU changes.
+
+It's targeted as future work.
+
+TD attestation
+--------------
+
+In a TD guest, the attestation process is used to verify the TDX guest's
+trustworthiness to other entities before provisioning secrets to the guest.
+
+TD attestation is initiated first by calling TDG.MR.REPORT inside TD to get the
+REPORT. Then the REPORT data needs to be converted into a remotely verifiable
+Quote by SGX Quoting Enclave (QE).
+
+Adding support for TD attestation in QEMU is future work, since current KVM
+lacks the necessary support.
+
+Live Migration
+--------------
+
+Future work.
+
+References
+----------
+
+- `TDX Homepage <https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html>`__
+
+- `SGX QE <https://github.com/intel/SGXDataCenterAttestationPrimitives/tree/master/QuoteGeneration>`__
diff --git a/docs/system/i386/xenpvh.rst b/docs/system/i386/xenpvh.rst
new file mode 100644
index 0000000..354250f
--- /dev/null
+++ b/docs/system/i386/xenpvh.rst
@@ -0,0 +1,49 @@
+Xen PVH machine (``xenpvh``)
+============================
+
+Xen supports a spectrum of types of guests that vary in how they depend
+on HW virtualization features, emulation models and paravirtualization.
+PVH is a mode that uses HW virtualization features (like HVM) but tries
+to avoid emulation models and instead use passthrough or
+paravirtualized devices.
+
+QEMU can be used to provide PV virtio devices on an emulated PCIe controller.
+That is the purpose of this minimal machine.
+
+Supported devices
+-----------------
+
+The x86 Xen PVH QEMU machine provides the following devices:
+
+- RAM
+- GPEX host bridge
+- virtio-pci devices
+
+The idea is to only connect virtio-pci devices, but in theory any compatible
+PCI device model will work, depending on Xen and guest support.
+
+Running
+-------
+
+The Xen tools will typically construct a command line and launch QEMU
+for you when needed, but here is an example of what it can look like in
+case you need to construct one manually:
+
+.. code-block:: console
+
+ qemu-system-i386 -xen-domid 3 -no-shutdown \
+ -chardev socket,id=libxl-cmd,path=/var/run/xen/qmp-libxl-3,server=on,wait=off \
+ -mon chardev=libxl-cmd,mode=control \
+ -chardev socket,id=libxenstat-cmd,path=/var/run/xen/qmp-libxenstat-3,server=on,wait=off \
+ -mon chardev=libxenstat-cmd,mode=control \
+ -nodefaults \
+ -no-user-config \
+ -xen-attach -name g0 \
+ -vnc none \
+ -display none \
+ -device virtio-net-pci,id=nic0,netdev=net0,mac=00:16:3e:5c:81:78 \
+ -netdev type=tap,id=net0,ifname=vif3.0-emu,br=xenbr0,script=no,downscript=no \
+ -smp 4,maxcpus=4 \
+ -nographic \
+ -machine xenpvh,ram-low-base=0,ram-low-size=2147483648,ram-high-base=4294967296,ram-high-size=2147483648,pci-ecam-base=824633720832,pci-ecam-size=268435456,pci-mmio-base=4026531840,pci-mmio-size=33554432,pci-mmio-high-base=824902156288,pci-mmio-high-size=68719476736 \
+ -m 4096
diff --git a/docs/system/images.rst b/docs/system/images.rst
index d000bd6..a555117 100644
--- a/docs/system/images.rst
+++ b/docs/system/images.rst
@@ -82,4 +82,6 @@ VM snapshots currently have the following known limitations:
- A few device drivers still have incomplete snapshot support so their
state is not saved or restored properly (in particular USB).
+.. _block-drivers:
+
.. include:: qemu-block-drivers.rst.inc
diff --git a/docs/system/index.rst b/docs/system/index.rst
index c21065e..718e9d3 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -39,3 +39,4 @@ or Hypervisor.Framework.
multi-process
confidential-guest-support
vm-templating
+ sriov
diff --git a/docs/system/introduction.rst b/docs/system/introduction.rst
index 746707e..338d374 100644
--- a/docs/system/introduction.rst
+++ b/docs/system/introduction.rst
@@ -169,7 +169,7 @@ would default to it anyway.
.. code::
- -cpu max,pauth-impdef=on \
+ -cpu max \
-smp 4 \
-accel tcg \
diff --git a/docs/system/linuxboot.rst b/docs/system/linuxboot.rst
index 5db2e56..2328b4a 100644
--- a/docs/system/linuxboot.rst
+++ b/docs/system/linuxboot.rst
@@ -11,7 +11,7 @@ The syntax is:
.. parsed-literal::
- |qemu_system| -kernel bzImage -hda rootdisk.img -append "root=/dev/hda"
+ |qemu_system| -kernel bzImage -drive file=rootdisk.img,format=raw -append "root=/dev/sda"
Use ``-kernel`` to provide the Linux kernel image and ``-append`` to
give the kernel command line arguments. The ``-initrd`` option can be
@@ -23,8 +23,8 @@ virtual serial port and the QEMU monitor to the console with the
.. parsed-literal::
- |qemu_system| -kernel bzImage -hda rootdisk.img \
- -append "root=/dev/hda console=ttyS0" -nographic
+ |qemu_system| -kernel bzImage -drive file=rootdisk.img,format=raw \
+ -append "root=/dev/sda console=ttyS0" -nographic
Use Ctrl-a c to switch between the serial console and the monitor (see
:ref:`GUI_keys`).
diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst
index 06d034b..7845878 100644
--- a/docs/system/loongarch/virt.rst
+++ b/docs/system/loongarch/virt.rst
@@ -12,14 +12,15 @@ Supported devices
-----------------
The ``virt`` machine supports:
-- Gpex host bridge
-- Ls7a RTC device
-- Ls7a IOAPIC device
-- ACPI GED device
-- Fw_cfg device
-- PCI/PCIe devices
-- Memory device
-- CPU device. Type: la464.
+
+* Gpex host bridge
+* Ls7a RTC device
+* Ls7a IOAPIC device
+* ACPI GED device
+* Fw_cfg device
+* PCI/PCIe devices
+* Memory device
+* CPU device. Type: la464.
CPU and machine Type
--------------------
@@ -39,13 +40,7 @@ can be accessed by following steps.
.. code-block:: bash
- ./configure --disable-rdma --prefix=/usr \
- --target-list="loongarch64-softmmu" \
- --disable-libiscsi --disable-libnfs --disable-libpmem \
- --disable-glusterfs --enable-libusb --enable-usb-redir \
- --disable-opengl --disable-xen --enable-spice \
- --enable-debug --disable-capstone --disable-kvm \
- --enable-profiler
+ ./configure --target-list="loongarch64-softmmu"
make -j8
(2) Set cross tools:
@@ -53,9 +48,7 @@ can be accessed by following steps.
.. code-block:: bash
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-6.3-cross-tools-gcc-glibc.tar.xz
-
tar -vxf loongarch64-clfs-6.3-cross-tools-gcc-glibc.tar.xz -C /opt
-
export PATH=/opt/cross-tools/bin:$PATH
export LD_LIBRARY_PATH=/opt/cross-tools/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/cross-tools/loongarch64-unknown-linux-gnu/lib/:$LD_LIBRARY_PATH
@@ -64,7 +57,7 @@ Note: You need get the latest cross-tools at https://github.com/loongson/build-t
(3) Build BIOS:
- See: https://github.com/tianocore/edk2-platforms/tree/master/Platform/Loongson/LoongArchQemuPkg#readme
+ See: https://github.com/tianocore/edk2/tree/master/OvmfPkg/LoongArchVirt#readme
Note: To build the release version of the bios, set --buildtarget=RELEASE,
the bios file path: Build/LoongArchQemu/RELEASE_GCC5/FV/QEMU_EFI.fd
@@ -74,13 +67,9 @@ Note: To build the release version of the bios, set --buildtarget=RELEASE,
.. code-block:: bash
git clone https://github.com/loongson/linux.git
-
cd linux
-
git checkout loongarch-next
-
make ARCH=loongarch CROSS_COMPILE=loongarch64-unknown-linux-gnu- loongson3_defconfig
-
make ARCH=loongarch CROSS_COMPILE=loongarch64-unknown-linux-gnu- -j32
Note: The branch of linux source code is loongarch-next.
diff --git a/docs/system/ppc/amigang.rst b/docs/system/ppc/amigang.rst
index e2c9cb7..21bb14e 100644
--- a/docs/system/ppc/amigang.rst
+++ b/docs/system/ppc/amigang.rst
@@ -21,6 +21,7 @@ Emulated devices
* VIA VT82C686B south bridge
* PCI VGA compatible card (guests may need other card instead)
* PS/2 keyboard and mouse
+ * 4 KiB NVRAM (use ``-drive if=mtd,format=raw,file=nvram.bin`` to keep contents persistent)
Firmware
--------
@@ -54,14 +55,14 @@ To boot the system run:
-cdrom "A1 Linux Net Installer.iso" \
-device ati-vga,model=rv100,romfile=VGABIOS-lgpl-latest.bin
-From the firmware menu that appears select ``Boot sequence`` →
-``Amiga Multiboot Options`` and set ``Boot device 1`` to
-``Onboard VIA IDE CDROM``. Then hit escape until the main screen appears again,
-hit escape once more and from the exit menu that appears select either
-``Save settings and exit`` or ``Use settings for this session only``. It may
-take a long time loading the kernel into memory but eventually it boots and the
-installer becomes visible. The ``ati-vga`` RV100 emulation is not
-complete yet so only frame buffer works, DRM and 3D is not available.
+If a firmware menu appears, select ``Boot sequence`` → ``Amiga Multiboot Options``
+and set ``Boot device 1`` to ``Onboard VIA IDE CDROM``. Then hit escape until
+the main screen appears again, hit escape once more and from the exit menu that
+appears select either ``Save settings and exit`` or ``Use settings for this
+session only``. It may take a long time loading the kernel into memory but
+eventually it boots and the installer becomes visible. The ``ati-vga`` RV100
+emulation is not complete yet so only frame buffer works, DRM and 3D is not
+available.
Genesi/bPlan Pegasos II (``pegasos2``)
======================================
diff --git a/docs/system/ppc/embedded.rst b/docs/system/ppc/embedded.rst
index af3b3d9..5cb7d98 100644
--- a/docs/system/ppc/embedded.rst
+++ b/docs/system/ppc/embedded.rst
@@ -4,6 +4,5 @@ Embedded family boards
- ``bamboo`` bamboo
- ``mpc8544ds`` mpc8544ds
- ``ppce500`` generic paravirt e500 platform
-- ``ref405ep`` ref405ep
- ``sam460ex`` aCube Sam460ex
- ``virtex-ml507`` Xilinx Virtex ML507 reference design
diff --git a/docs/system/ppc/powermac.rst b/docs/system/ppc/powermac.rst
index 04334ba..3eac81c 100644
--- a/docs/system/ppc/powermac.rst
+++ b/docs/system/ppc/powermac.rst
@@ -4,8 +4,8 @@ PowerMac family boards (``g3beige``, ``mac99``)
Use the executable ``qemu-system-ppc`` to simulate a complete PowerMac
PowerPC system.
-- ``g3beige`` Heathrow based PowerMAC
-- ``mac99`` Mac99 based PowerMAC
+- ``g3beige`` Heathrow based PowerMac
+- ``mac99`` Mac99 based PowerMac
Supported devices
-----------------
diff --git a/docs/system/ppc/powernv.rst b/docs/system/ppc/powernv.rst
index 09f3965..f3ec2cc 100644
--- a/docs/system/ppc/powernv.rst
+++ b/docs/system/ppc/powernv.rst
@@ -181,7 +181,7 @@ connected to a remote QEMU machine acting as BMC, using these options
.. code-block:: bash
- -chardev socket,id=ipmi0,host=localhost,port=9002,reconnect=10 \
+ -chardev socket,id=ipmi0,host=localhost,port=9002,reconnect-ms=10000 \
-device ipmi-bmc-extern,id=bmc0,chardev=ipmi0 \
-device isa-ipmi-bt,bmc=bmc0,irq=10 \
-nodefaults
@@ -195,6 +195,13 @@ Use a MTD drive to add a PNOR to the machine, and get a NVRAM :
-drive file=./witherspoon.pnor,format=raw,if=mtd
+If no mtd drive is provided, the powernv platform will create a default
+PNOR device using a tiny formatted PNOR in pc-bios/pnv-pnor.bin opened
+read-only (PNOR changes will be persistent across reboots but not across
+invocations of QEMU). If no defaults are used, an erased 128MB PNOR is
+provided (which skiboot will probably not recognize since it is not
+formatted).
+
Maintainer contact information
------------------------------
diff --git a/docs/system/ppc/pseries.rst b/docs/system/ppc/pseries.rst
index a876d89..bbc51aa 100644
--- a/docs/system/ppc/pseries.rst
+++ b/docs/system/ppc/pseries.rst
@@ -14,10 +14,19 @@ virtualization capabilities.
Supported devices
=================
- * Multi processor support for many Power processors generations: POWER7,
- POWER7+, POWER8, POWER8NVL, POWER9, and Power10. Support for POWER5+ exists,
- but its state is unknown.
- * Interrupt Controller, XICS (POWER8) and XIVE (POWER9 and Power10)
+ * Multi processor support for many generations of Power processors:
+
+   - POWER7, POWER7+
+   - POWER8, POWER8NVL
+   - POWER9
+   - Power10
+   - Power11
+   - Support for POWER5+ also exists, given a suitable kernel and userspace
+
+ * Interrupt Controller
+
+   - XICS (POWER8)
+   - XIVE (supported by POWER9, Power10 and Power11)
+
* vPHB PCIe Host bridge.
* vscsi and vnet devices, compatible with the same devices available on a
PowerVM hypervisor with VIOS managing LPARs.
diff --git a/docs/system/riscv/microblaze-v-generic.rst b/docs/system/riscv/microblaze-v-generic.rst
new file mode 100644
index 0000000..5606f88
--- /dev/null
+++ b/docs/system/riscv/microblaze-v-generic.rst
@@ -0,0 +1,42 @@
+Microblaze-V generic board (``amd-microblaze-v-generic``)
+=========================================================
+The AMD MicroBlaze™ V processor is a soft-core RISC-V processor IP for AMD
+adaptive SoCs and FPGAs. The MicroBlaze™ V processor is based on the 32-bit (or
+64-bit) RISC-V instruction set architecture (ISA) and contains interfaces
+compatible with the classic MicroBlaze™ processor (i.e. it is a drop-in
+replacement for the classic MicroBlaze™ processor in existing RTL designs).
+More information can be found in the document below.
+
+https://docs.amd.com/r/en-US/ug1629-microblaze-v-user-guide/MicroBlaze-V-Architecture
+
+The MicroBlaze™ V generic board in QEMU has the following supported devices:
+
+ - timer
+ - uartlite
+ - uart16550
+ - emaclite
+ - timer2
+ - axi emac
+ - axi dma
+
+The MicroBlaze™ V core in QEMU has the following configuration:
+
+ - RV32I base integer instruction set
+ - "Zicsr" Control and Status register instructions
+ - "Zifencei" instruction-fetch
+ - Extensions: m, a, f, c
+
+Running
+"""""""
+Below is an example command line for launching mainline U-Boot
+(xilinx_mbv32_defconfig) on the Microblaze-V generic board.
+
+.. code-block:: bash
+
+ $ qemu-system-riscv32 -M amd-microblaze-v-generic \
+ -display none \
+ -device loader,addr=0x80000000,file=u-boot-spl.bin,cpu-num=0 \
+ -device loader,addr=0x80200000,file=u-boot.img \
+ -serial mon:stdio \
+ -device loader,addr=0x83000000,file=system.dtb \
+ -m 2g
diff --git a/docs/system/riscv/microchip-icicle-kit.rst b/docs/system/riscv/microchip-icicle-kit.rst
index 40798b1..9809e94 100644
--- a/docs/system/riscv/microchip-icicle-kit.rst
+++ b/docs/system/riscv/microchip-icicle-kit.rst
@@ -5,10 +5,10 @@ Microchip PolarFire SoC Icicle Kit integrates a PolarFire SoC, with one
SiFive's E51 plus four U54 cores and many on-chip peripherals and an FPGA.
For more details about Microchip PolarFire SoC, please see:
-https://www.microsemi.com/product-directory/soc-fpgas/5498-polarfire-soc-fpga
+https://www.microchip.com/en-us/products/fpgas-and-plds/system-on-chip-fpgas/polarfire-soc-fpgas
The Icicle Kit board information can be found here:
-https://www.microsemi.com/existing-parts/parts/152514
+https://www.microchip.com/en-us/development-tool/mpfs-icicle-kit-es
Supported devices
-----------------
@@ -26,95 +26,48 @@ The ``microchip-icicle-kit`` machine supports the following devices:
* 2 GEM Ethernet controllers
* 1 SDHC storage controller
+The memory is set to 1537 MiB by default. A sanity check on the RAM size is
+performed in the machine init routine to prompt the user to increase the RAM
+size to more than 1537 MiB when less than 1537 MiB of RAM is detected.
+
Boot options
------------
-The ``microchip-icicle-kit`` machine can start using the standard -bios
-functionality for loading its BIOS image, aka Hart Software Services (HSS_).
-HSS loads the second stage bootloader U-Boot from an SD card. Then a kernel
-can be loaded from U-Boot. It also supports direct kernel booting via the
--kernel option along with the device tree blob via -dtb. When direct kernel
-boot is used, the OpenSBI fw_dynamic BIOS image is used to boot a payload
-like U-Boot or OS kernel directly.
-
-The user provided DTB should have the following requirements:
-
-* The /cpus node should contain at least one subnode for E51 and the number
- of subnodes should match QEMU's ``-smp`` option
-* The /memory reg size should match QEMU’s selected ram_size via ``-m``
-* Should contain a node for the CLINT device with a compatible string
- "riscv,clint0"
-
-QEMU follows below truth table to select which payload to execute:
-
-===== ========== ========== =======
--bios -kernel -dtb payload
-===== ========== ========== =======
- N N don't care HSS
- Y don't care don't care HSS
- N Y Y kernel
-===== ========== ========== =======
-
-The memory is set to 1537 MiB by default which is the minimum required high
-memory size by HSS. A sanity check on ram size is performed in the machine
-init routine to prompt user to increase the RAM size to > 1537 MiB when less
-than 1537 MiB ram is detected.
-
-Running HSS
------------
-
-HSS 2020.12 release is tested at the time of writing. To build an HSS image
-that can be booted by the ``microchip-icicle-kit`` machine, type the following
-in the HSS source tree:
-
-.. code-block:: bash
-
- $ export CROSS_COMPILE=riscv64-linux-
- $ cp boards/mpfs-icicle-kit-es/def_config .config
- $ make BOARD=mpfs-icicle-kit-es
-
-Download the official SD card image released by Microchip and prepare it for
-QEMU usage:
-
-.. code-block:: bash
-
- $ wget ftp://ftpsoc.microsemi.com/outgoing/core-image-minimal-dev-icicle-kit-es-sd-20201009141623.rootfs.wic.gz
- $ gunzip core-image-minimal-dev-icicle-kit-es-sd-20201009141623.rootfs.wic.gz
- $ qemu-img resize core-image-minimal-dev-icicle-kit-es-sd-20201009141623.rootfs.wic 4G
-
-Then we can boot the machine by:
-
-.. code-block:: bash
-
- $ qemu-system-riscv64 -M microchip-icicle-kit -smp 5 \
- -bios path/to/hss.bin -sd path/to/sdcard.img \
- -nic user,model=cadence_gem \
- -nic tap,ifname=tap,model=cadence_gem,script=no \
- -display none -serial stdio \
- -chardev socket,id=serial1,path=serial1.sock,server=on,wait=on \
- -serial chardev:serial1
+The ``microchip-icicle-kit`` machine provides some options to run a firmware
+(BIOS) or a kernel image. QEMU follows the truth table below to select the
+firmware:
-With above command line, current terminal session will be used for the first
-serial port. Open another terminal window, and use ``minicom`` to connect the
-second serial port.
+============= =========== ======================================
+-bios -kernel firmware
+============= =========== ======================================
+none N this is an error
+none Y the kernel image
+NULL, default N hss.bin
+NULL, default Y opensbi-riscv64-generic-fw_dynamic.bin
+other don't care the BIOS image
+============= =========== ======================================
-.. code-block:: bash
+Direct Kernel Boot
+------------------
- $ minicom -D unix\#serial1.sock
+Use the ``-kernel`` option to directly run a kernel image. When a direct
+kernel boot is requested, a device tree blob may be specified via the ``-dtb``
+option. Unlike other QEMU machines, this machine does not generate a device
+tree for the kernel; it must be provided by the user. The user-provided DTB
+should meet the following requirements:
-HSS output is on the first serial port (stdio) and U-Boot outputs on the
-second serial port. U-Boot will automatically load the Linux kernel from
-the SD card image.
+* The ``/cpus`` node should contain at least one subnode for E51 and the number
+ of subnodes should match QEMU's ``-smp`` option.
-Direct Kernel Boot
-------------------
+* The ``/memory`` reg size should match QEMU’s selected RAM size via the ``-m``
+ option.
-Sometimes we just want to test booting a new kernel, and transforming the
-kernel image to the format required by the HSS bootflow is tedious. We can
-use '-kernel' for direct kernel booting just like other RISC-V machines do.
+* It should contain a node for the CLINT device with a compatible string
+ "riscv,clint0".
-In this mode, the OpenSBI fw_dynamic BIOS image for 'generic' platform is
-used to boot an S-mode payload like U-Boot or OS kernel directly.
+When ``-bios`` is not specified or set to ``default``, the OpenSBI
+``fw_dynamic`` BIOS image for the ``generic`` platform is used to boot an
+S-mode payload like U-Boot or OS kernel directly.
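+
+An invocation for a direct kernel boot might then look like the following
+sketch (the kernel and DTB file names are placeholders; the memory size is
+chosen to satisfy the 1537 MiB minimum)::
+
+    qemu-system-riscv64 -M microchip-icicle-kit -smp 5 -m 2G \
+        -kernel vmlinux.bin -dtb mpfs-icicle-kit.dtb \
+        -display none -serial stdio
+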
For example, the following commands show building a U-Boot image from U-Boot
mainline v2021.07 for the Microchip Icicle Kit board:
@@ -146,4 +99,13 @@ CAVEATS:
``u-boot.bin`` has to be used which does contain one. To use the ELF image,
we need to change to CONFIG_OF_EMBED or CONFIG_OF_PRIOR_STAGE.
+Running HSS
+-----------
+
+The ``microchip-icicle-kit`` machine used to run the Hart Software Services
+(HSS_); however, HSS development has progressed and the QEMU machine
+implementation lags behind. Currently, running the HSS no longer works, as
+support is missing in the clock and memory controller devices. In particular,
+reading from the SD card does not work.
+
.. _HSS: https://github.com/polarfire-soc/hart-software-services
diff --git a/docs/system/riscv/virt.rst b/docs/system/riscv/virt.rst
index 9a06f95..6085097 100644
--- a/docs/system/riscv/virt.rst
+++ b/docs/system/riscv/virt.rst
@@ -84,6 +84,25 @@ none``, as in
Firmware images used for pflash must be exactly 32 MiB in size.
+riscv-iommu support
+-------------------
+
+The board supports the riscv-iommu-pci device, which can be added with the
+following command line:
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 -M virt -device riscv-iommu-pci (...)
+
+It also has support for the riscv-iommu-sys platform device:
+
+.. code-block:: bash
+
+ $ qemu-system-riscv64 -M virt,iommu-sys=on (...)
+
+Refer to :ref:`riscv-iommu` for more information on how the RISC-V IOMMU support
+works.
+
Machine-specific options
------------------------
@@ -110,12 +129,23 @@ The following machine-specific options are supported:
MSIs. When not specified, this option is assumed to be "none" which selects
SiFive PLIC to handle wired interrupts.
+ This option also interacts with '-accel kvm'. When using "aia=aplic-imsic"
+ with KVM, it is possible to set the use of the kernel irqchip in split mode
+ by using "-accel kvm,kernel-irqchip=split". In this case the ``virt`` machine
+ will emulate the APLIC controller instead of using the APLIC controller from
+ the irqchip. See :ref:`riscv-aia` for more details on all available AIA
+ modes.
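+
+  For example, a sketch combining these options (other arguments elided):
+
+  .. code-block:: bash
+
+     $ qemu-system-riscv64 -M virt,aia=aplic-imsic -accel kvm,kernel-irqchip=split (...)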
+
- aia-guests=nnn
The number of per-HART VS-level AIA IMSIC pages to be emulated for a guest
having AIA IMSIC (i.e. "aia=aplic-imsic" selected). When not specified,
the default number of per-HART VS-level AIA IMSIC pages is 0.
+- iommu-sys=[on|off]
+
+ Enables the riscv-iommu-sys platform device. Defaults to 'off'.
+
Running Linux kernel
--------------------
diff --git a/docs/system/s390x/bootdevices.rst b/docs/system/s390x/bootdevices.rst
index 1a7a18b..97b3914 100644
--- a/docs/system/s390x/bootdevices.rst
+++ b/docs/system/s390x/bootdevices.rst
@@ -6,9 +6,7 @@ Booting with bootindex parameter
For classical mainframe guests (i.e. LPAR or z/VM installations), you always
have to explicitly specify the disk where you want to boot from (or "IPL" from,
-in s390x-speak -- IPL means "Initial Program Load"). In particular, there can
-also be only one boot device according to the architecture specification, thus
-specifying multiple boot devices is not possible (yet).
+in s390x-speak -- IPL means "Initial Program Load").
So for booting an s390x guest in QEMU, you should always mark the
device where you want to boot from with the ``bootindex`` property, for
@@ -17,6 +15,11 @@ example::
qemu-system-s390x -drive if=none,id=dr1,file=guest.qcow2 \
-device virtio-blk,drive=dr1,bootindex=1
+Multiple devices may have a bootindex. The device with the lowest bootindex is
+IPLed first. If IPL fails for that device, the device with the next lowest
+bootindex will be tried, and so on, until IPL succeeds or there are no
+remaining boot devices to try.
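+
+For example, with two bootable disks (the image names are placeholders)::
+
+  qemu-system-s390x -drive if=none,id=dr1,file=disk1.qcow2 \
+                    -device virtio-blk,drive=dr1,bootindex=1 \
+                    -drive if=none,id=dr2,file=disk2.qcow2 \
+                    -device virtio-blk,drive=dr2,bootindex=2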
+
For booting from a CD-ROM ISO image (which needs to include El-Torito boot
information in order to be bootable), it is recommended to specify a ``scsi-cd``
device, for example like this::
@@ -76,29 +79,45 @@ The second way to use this parameter is to use a number in the range from 0
to 31. The numbers that can be used here correspond to the numbers that are
shown when using the ``PROMPT`` option, and the s390-ccw bios will then try
to automatically boot the kernel that is associated with the given number.
-Note that ``0`` can be used to boot the default entry.
+Note that ``0`` can be used to boot the default entry. If the machine
+``loadparm`` is not assigned a value, then the default entry is used.
+
+By default, the machine ``loadparm`` applies to all boot devices. If multiple
+devices are assigned a ``bootindex`` and the ``loadparm`` is to be different
+between them, an independent ``loadparm`` may be assigned on a per-device basis.
+
+An example guest using per-device ``loadparm``::
+
+ qemu-system-s390x -drive if=none,id=dr1,file=primary.qcow2 \
+ -device virtio-blk,drive=dr1,bootindex=1 \
+ -drive if=none,id=dr2,file=secondary.qcow2 \
+ -device virtio-blk,drive=dr2,bootindex=2,loadparm=3
+
+In this case, the primary boot device will attempt to IPL using the default
+entry (because no ``loadparm`` is specified for this device or for the
+machine). If that device fails to boot, the secondary device will attempt to
+IPL using entry number 3.
+
+If a ``loadparm`` is specified on both the machine and a device, the per-device
+value will supersede the machine value. Per-device ``loadparm`` values are
+only used for devices with an assigned ``bootindex``. The machine ``loadparm``
+is used when attempting to boot without a ``bootindex``.
Booting from a network device
-----------------------------
-Beside the normal guest firmware (which is loaded from the file ``s390-ccw.img``
-in the data directory of QEMU, or via the ``-bios`` option), QEMU ships with
-a small TFTP network bootloader firmware for virtio-net-ccw devices, too. This
-firmware is loaded from a file called ``s390-netboot.img`` in the QEMU data
-directory. In case you want to load it from a different filename instead,
-you can specify it via the ``-global s390-ipl.netboot_fw=filename``
-command line option.
-
-The ``bootindex`` property is especially important for booting via the network.
-If you don't specify the ``bootindex`` property here, the network bootloader
-firmware code won't get loaded into the guest memory so that the network boot
-will fail. For a successful network boot, try something like this::
+The firmware that ships with QEMU includes a small TFTP network bootloader
+for virtio-net-ccw devices. The ``bootindex`` property is especially
+important for booting via the network. If you don't specify the ``bootindex``
+property here, the network bootloader won't be taken into consideration and
+the network boot will fail. For a successful network boot, try something
+like this::
qemu-system-s390x -netdev user,id=n1,tftp=...,bootfile=... \
-device virtio-net-ccw,netdev=n1,bootindex=1
-The network bootloader firmware also has basic support for pxelinux.cfg-style
+The network bootloader also has basic support for pxelinux.cfg-style
configuration files. See the `PXELINUX Configuration page
<https://wiki.syslinux.org/wiki/index.php?title=PXELINUX#Configuration>`__
for details how to set up the configuration file on your TFTP server.
diff --git a/docs/system/sriov.rst b/docs/system/sriov.rst
new file mode 100644
index 0000000..d12178f
--- /dev/null
+++ b/docs/system/sriov.rst
@@ -0,0 +1,37 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Composable SR-IOV device
+========================
+
+SR-IOV (Single Root I/O Virtualization) is an optional extended capability of a
+PCI Express device. It allows a single physical function (PF) to appear as
+multiple virtual functions (VFs) for the main purpose of eliminating software
+overhead in I/O from virtual machines.
+
+There are devices with predefined SR-IOV configurations, but it is also possible
+to compose an SR-IOV device yourself. Composing an SR-IOV device is currently
+only supported by virtio-net-pci.
+
+Users can configure an SR-IOV-capable virtio-net device by adding
+virtio-net-pci functions to a bus. Below is a command line example:
+
+.. code-block:: shell
+
+ -netdev user,id=n -netdev user,id=o
+ -netdev user,id=p -netdev user,id=q
+ -device pcie-root-port,id=b
+ -device virtio-net-pci,bus=b,addr=0x0.0x3,netdev=q,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x2,netdev=p,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x1,netdev=o,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x0,netdev=n,id=f
+
+The VFs specify the paired PF with the ``sriov-pf`` property. The PF must be
+added after all VFs. It is the user's responsibility to ensure that the VFs
+have function numbers larger than that of the PF, and that the function numbers
+have a consistent stride. Both the PF and VFs are ARI-capable, so you can have
+at most 255 VFs.
+
+You may also need to perform additional steps to activate the SR-IOV feature on
+your guest. For Linux, refer to [1]_.
+
+.. [1] https://docs.kernel.org/PCI/pci-iov-howto.html
diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst
index 7b99272..b96a05a 100644
--- a/docs/system/target-arm.rst
+++ b/docs/system/target-arm.rst
@@ -63,10 +63,6 @@ large amounts of RAM. It also supports 64-bit CPUs.
Board-specific documentation
============================
-Unfortunately many of the Arm boards QEMU supports are currently
-undocumented; you can get a complete list by running
-``qemu-system-aarch64 --machine help``.
-
..
This table of contents should be kept sorted alphabetically
by the title text of each file, which isn't the same ordering
@@ -90,26 +86,28 @@ undocumented; you can get a complete list by running
arm/digic
arm/cubieboard
arm/emcraft-sf2
+ arm/exynos
+ arm/fby35
arm/musicpal
- arm/gumstix
- arm/mainstone
arm/kzm
- arm/nseries
arm/nrf
arm/nuvoton
arm/imx25-pdk
+ arm/mcimx6ul-evk
+ arm/mcimx7d-sabre
+ arm/imx8mp-evk
arm/orangepi
- arm/palm
arm/raspi
- arm/xscale
arm/collie
arm/sx1
arm/stellaris
arm/stm32
arm/virt
+ arm/vmapple
arm/xenpvh
arm/xlnx-versal-virt
arm/xlnx-zynq
+ arm/xlnx-zcu102
Emulated CPU architecture support
=================================
diff --git a/docs/system/target-i386.rst b/docs/system/target-i386.rst
index 1b8a1f2..43b09c7 100644
--- a/docs/system/target-i386.rst
+++ b/docs/system/target-i386.rst
@@ -14,8 +14,9 @@ Board-specific documentation
.. toctree::
:maxdepth: 1
- i386/microvm
i386/pc
+ i386/microvm
+ i386/nitro-enclave
Architectural features
~~~~~~~~~~~~~~~~~~~~~~
@@ -26,9 +27,11 @@ Architectural features
i386/cpu
i386/hyperv
i386/xen
+ i386/xenpvh
i386/kvm-pv
i386/sgx
i386/amd-memory-encryption
+ i386/tdx
OS requirements
~~~~~~~~~~~~~~~
diff --git a/docs/system/target-loongarch.rst b/docs/system/target-loongarch.rst
new file mode 100644
index 0000000..316c604
--- /dev/null
+++ b/docs/system/target-loongarch.rst
@@ -0,0 +1,19 @@
+.. _LoongArch-System-emulator:
+
+LoongArch System emulator
+-------------------------
+
+QEMU can emulate 64-bit LoongArch systems via the
+``qemu-system-loongarch64`` binary. Only one machine type, ``virt``, is
+supported.
+
+When using KVM as the accelerator, QEMU can use the la464 CPU model. When
+using the default CPU model with TCG as the accelerator, QEMU emulates a
+subset of the la464 CPU features that should be enough to run distributions
+built for the la464.
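+
+A minimal TCG invocation might look like the following sketch (the firmware,
+kernel and disk image file names are placeholders)::
+
+    qemu-system-loongarch64 -M virt -cpu la464 -m 4G -smp 4 \
+        -bios QEMU_EFI.fd -kernel vmlinuz -append "root=/dev/vda ro" \
+        -drive file=rootfs.img,format=raw,if=virtio -nographic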
+
+Board-specific documentation
+============================
+
+.. toctree::
+ loongarch/virt
diff --git a/docs/system/target-mips.rst b/docs/system/target-mips.rst
index 83239fb..9028c3b 100644
--- a/docs/system/target-mips.rst
+++ b/docs/system/target-mips.rst
@@ -112,5 +112,5 @@ https://mipsdistros.mips.com/LinuxDistro/nanomips/kernels/v4.15.18-432-gb2eb9a8b
Start system emulation of Malta board with nanoMIPS I7200 CPU::
qemu-system-mipsel -cpu I7200 -kernel <kernel_image_file> \
- -M malta -serial stdio -m <memory_size> -hda <disk_image_file> \
+ -M malta -serial stdio -m <memory_size> -drive file=<disk_image_file>,format=raw \
-append "mem=256m@0x0 rw console=ttyS0 vga=cirrus vesa=0x111 root=/dev/sda"
diff --git a/docs/system/target-riscv.rst b/docs/system/target-riscv.rst
index ba195f1..95457af 100644
--- a/docs/system/target-riscv.rst
+++ b/docs/system/target-riscv.rst
@@ -66,6 +66,7 @@ undocumented; you can get a complete list by running
.. toctree::
:maxdepth: 1
+ riscv/microblaze-v-generic
riscv/microchip-icicle-kit
riscv/shakti-c
riscv/sifive_u
diff --git a/docs/system/targets.rst b/docs/system/targets.rst
index 224fada..38e2418 100644
--- a/docs/system/targets.rst
+++ b/docs/system/targets.rst
@@ -18,6 +18,7 @@ Contents:
target-arm
target-avr
+ target-loongarch
target-m68k
target-mips
target-ppc
diff --git a/docs/tools/index.rst b/docs/tools/index.rst
index 33ad438..1e88ae4 100644
--- a/docs/tools/index.rst
+++ b/docs/tools/index.rst
@@ -15,5 +15,4 @@ command line utilities and other standalone programs.
qemu-nbd
qemu-pr-helper
qemu-trace-stap
- virtfs-proxy-helper
qemu-vmsr-helper
diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
index 329f44d..f82ea5f 100644
--- a/docs/tools/qemu-nbd.rst
+++ b/docs/tools/qemu-nbd.rst
@@ -1,3 +1,5 @@
+.. _qemu-nbd:
+
=====================================
QEMU Disk Network Block Device Server
=====================================
@@ -154,6 +156,11 @@ driver options if :option:`--image-opts` is specified.
Set the NBD volume export description, as a human-readable
string.
+.. option:: --handshake-limit=N
+
+ Set the timeout for a client to successfully complete its handshake
+ to N seconds (default 10), or 0 for no limit.
+
.. option:: -L, --list
Connect as a client and list all details about the exports exposed by
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index ea00149..35ab2d7 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -1,3 +1,5 @@
+.. _storage-daemon:
+
===================
QEMU Storage Daemon
===================
diff --git a/docs/tools/qemu-vmsr-helper.rst b/docs/tools/qemu-vmsr-helper.rst
index 6ec87b4..9ce10b9 100644
--- a/docs/tools/qemu-vmsr-helper.rst
+++ b/docs/tools/qemu-vmsr-helper.rst
@@ -17,8 +17,8 @@ driver to advertise and monitor the power consumption or accumulated energy
consumption of different power domains, such as CPU packages, DRAM, and other
components when available.
-However those register are accesible under priviliged access (CAP_SYS_RAWIO).
-QEMU can use an external helper to access those priviliged register.
+However those registers are accessible under privileged access (CAP_SYS_RAWIO).
+QEMU can use an external helper to access those privileged registers.
:program:`qemu-vmsr-helper` is that external helper; it creates a listener
socket which will accept incoming connections for communication with QEMU.
diff --git a/docs/tools/virtfs-proxy-helper.rst b/docs/tools/virtfs-proxy-helper.rst
deleted file mode 100644
index bd310eb..0000000
--- a/docs/tools/virtfs-proxy-helper.rst
+++ /dev/null
@@ -1,75 +0,0 @@
-QEMU 9p virtfs proxy filesystem helper
-======================================
-
-Synopsis
---------
-
-**virtfs-proxy-helper** [*OPTIONS*]
-
-Description
------------
-
-NOTE: The 9p 'proxy' backend is deprecated (since QEMU 8.1) and will be
-removed, along with this daemon, in a future version of QEMU!
-
-Pass-through security model in QEMU 9p server needs root privilege to do
-few file operations (like chown, chmod to any mode/uid:gid). There are two
-issues in pass-through security model:
-
-- TOCTTOU vulnerability: Following symbolic links in the server could
- provide access to files beyond 9p export path.
-
-- Running QEMU with root privilege could be a security issue.
-
-To overcome above issues, following approach is used: A new filesystem
-type 'proxy' is introduced. Proxy FS uses chroot + socket combination
-for securing the vulnerability known with following symbolic links.
-Intention of adding a new filesystem type is to allow qemu to run
-in non-root mode, but doing privileged operations using socket IO.
-
-Proxy helper (a stand alone binary part of qemu) is invoked with
-root privileges. Proxy helper chroots into 9p export path and creates
-a socket pair or a named socket based on the command line parameter.
-QEMU and proxy helper communicate using this socket. QEMU proxy fs
-driver sends filesystem request to proxy helper and receives the
-response from it.
-
-The proxy helper is designed so that it can drop root privileges except
-for the capabilities needed for doing filesystem operations.
-
-Options
--------
-
-The following options are supported:
-
-.. program:: virtfs-proxy-helper
-
-.. option:: -h
-
- Display help and exit
-
-.. option:: -p, --path PATH
-
- Path to export for proxy filesystem driver
-
-.. option:: -f, --fd SOCKET_ID
-
- Use given file descriptor as socket descriptor for communicating with
- qemu proxy fs drier. Usually a helper like libvirt will create
- socketpair and pass one of the fds as parameter to this option.
-
-.. option:: -s, --socket SOCKET_FILE
-
- Creates named socket file for communicating with qemu proxy fs driver
-
-.. option:: -u, --uid UID
-
- uid to give access to named socket file; used in combination with -g.
-
-.. option:: -g, --gid GID
-
- gid to give access to named socket file; used in combination with -u.
-
-.. option:: -n, --nodaemon
-
- Run as a normal program. By default program will run in daemon mode
diff --git a/docs/user/main.rst b/docs/user/main.rst
index e04bc2c..9a1c604 100644
--- a/docs/user/main.rst
+++ b/docs/user/main.rst
@@ -1,3 +1,5 @@
+.. _user-mode:
+
QEMU User space emulator
========================
@@ -42,6 +44,8 @@ QEMU was conceived so that ultimately it can emulate itself. Although it
is not very useful, it is an important test to show the power of the
emulator.
+.. _linux-user-mode:
+
Linux User space emulator
-------------------------
@@ -50,7 +54,7 @@ Command line options
::
- qemu-i386 [-h] [-d] [-L path] [-s size] [-cpu model] [-g port] [-B offset] [-R size] program [arguments...]
+ qemu-i386 [-h] [-d] [-L path] [-s size] [-cpu model] [-g endpoint] [-B offset] [-R size] program [arguments...]
``-h``
Print the help
@@ -87,8 +91,18 @@ Debug options:
Activate logging of the specified items (use '-d help' for a list of
log items)
-``-g port``
- Wait gdb connection to port
+``-g endpoint``
+ Wait for a gdb connection on a TCP port (e.g., ``1234``) or a unix socket
+ (e.g., ``/tmp/qemu.sock``).
+
+ If a unix socket path contains a single ``%d`` placeholder (e.g.,
+ ``/tmp/qemu-%d.sock``), it is replaced by the emulator PID, which is useful
+ when passing this option via the ``QEMU_GDB`` environment variable to a
+ multi-process application.
+
+ If the endpoint address is followed by ``,suspend=n`` (e.g.,
+ ``1234,suspend=n``), then the emulated program starts without waiting for a
+ connection, which can be established at any later point in time.
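+
+ For example (``./prog`` is a placeholder for the emulated program)::
+
+   QEMU_GDB=/tmp/qemu-%d.sock qemu-i386 ./prog
+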
``-one-insn-per-tb``
Run the emulation with one guest instruction per translation block.
@@ -130,10 +144,6 @@ Other binaries
The binary format is detected automatically.
-- user mode (Cris)
-
- * ``qemu-cris`` TODO.
-
- user mode (i386)
* ``qemu-i386`` TODO.
@@ -179,6 +189,8 @@ Other binaries
* ``qemu-sparc64`` can execute some Sparc64 (Sparc64 CPU, 64 bit ABI) and
SPARC32PLUS binaries (Sparc64 CPU, 32 bit ABI).
+.. _bsd-user-mode:
+
BSD User space emulator
-----------------------