aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2025-04-27 12:47:02 -0400
committerStefan Hajnoczi <stefanha@redhat.com>2025-04-27 12:47:02 -0400
commit090c9641882da217e40936c98742749e4cc94130 (patch)
tree42dc4f324e2e8ad6adc571aa5fb497ace6febbed
parent019fbfa4bcd2d3a835c241295e22ab2b5b56129b (diff)
parenta9d270f6b85bab40d388fe5244f02d145759cc08 (diff)
downloadqemu-090c9641882da217e40936c98742749e4cc94130.zip
qemu-090c9641882da217e40936c98742749e4cc94130.tar.gz
qemu-090c9641882da217e40936c98742749e4cc94130.tar.bz2
Merge tag 'pull-vfio-20250425' of https://github.com/legoater/qemu into staging
vfio queue: * Updated IGD passthrough documentation * Fixed L2 crash on pseries machines * Reorganized code and renamed services * Moved HostIOMMUDevice realize after device attachement to help adding support for nested IOMMU * Fixed CPR registration with IOMMUFD backend * Refactored vfio-pci code to prepare ground for vfio-user # -----BEGIN PGP SIGNATURE----- # # iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmgLS9IACgkQUaNDx8/7 # 7KG4pxAAt5M4hDy6J1itsHhb50ORqEejdLwxKgJbSUPhRznzJBLqDoKFjtf0iiu3 # sSqGzcBAOPIrUmLiS7F31v7KxG7WkKGeWoRa3xaltnQ/6Z9v5FgWPvDKfUsI5C52 # eGg0gmXQeFQeuN5N+pJ/oDhyvzH9UKvesEYvbZrSRhau+7k06zFfrT2vGMjM4+0h # A6Rd56dh99jT2kxbULr9eHuuYZxwse/wv0XWLponutBWeroqUzeEwwjYctVRMKR4 # HMVzrz+05EoIlFpY1EELE3SIZ9IaYViI6K3CHV2wE1TqrUWvipvY1oh2eJalFG78 # 5pyUY1Kb/iFR5AXPowzJkPc6il/1duXEghxYhPs6qxIZus3wQag1UwHoT8bw2dzV # gLV6yk4w277VzzrYSTI7kgrCM0FNs/rdXmDAk8F0C63W9BV6lFt8t7i3DA27kEGe # 58Ee71OPKCswzuqyu+t7zJKXEGvcR7/g2sXHkiHWKcrCfVSWG4Va/hnXpMGxM7Iw # 35wzToYHYBxtdn5W92g9DtFgYAlQKivriylXSsJzhz9fjnQAOmLGKacjsJTCEpBR # CJrIEsz1Zg+jmeBK8AO2OJUetW4wLtKsHW9uNRJVgjQ+eY8FKnzA6lNwGrEwrCNB # rql77fw08jNtMNbU7M3G7a8OIYJoSVyH6wwar0qZA8IZjLkHY00= # =eSsm # -----END PGP SIGNATURE----- # gpg: Signature made Fri 25 Apr 2025 04:46:10 EDT # gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1 # gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full] # gpg: aka "Cédric Le Goater <clg@kaod.org>" [full] # Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1 * tag 'pull-vfio-20250425' of https://github.com/legoater/qemu: (50 commits) vfio: refactor out vfio_pci_config_setup() vfio: refactor out vfio_interrupt_setup() vfio: Register/unregister container for CPR only once for each container vfio: Remove hiod_typename property vfio: Cleanup host IOMMU device creation vfio/container: Move realize() after attachment vfio/iommufd: Move realize() after attachment vfio/iommufd: Make a separate call to get IOMMU capabilities MAINTAINERS: Add a maintainer for util/vfio-helpers.c vfio: Rename VFIOContainer related services vfio: Rename VFIODevice related services vfio: Rename vfio-common.h to vfio-device.h vfio: Introduce vfio_listener_un/register() routines vfio: Rename RAM discard related services vfio: Introduce new files for VFIO MemoryListener vfio: Rename vfio_get_dirty_bitmap() vfio: Rename vfio_devices_all_device_dirty_tracking() vfio: Rename vfio_devices_all_dirty_tracking_started() vfio: Make vfio_container_query_dirty_bitmap() static vfio: Make vfio_devices_query_dirty_bitmap() static ... Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r--MAINTAINERS1
-rw-r--r--backends/iommufd.c2
-rw-r--r--docs/igd-assign.txt265
-rw-r--r--hw/core/sysbus-fdt.c1
-rw-r--r--hw/ppc/spapr_pci_vfio.c6
-rw-r--r--hw/s390x/s390-pci-vfio.c3
-rw-r--r--hw/vfio/ap.c14
-rw-r--r--hw/vfio/ccw.c30
-rw-r--r--hw/vfio/container-base.c192
-rw-r--r--hw/vfio/container.c144
-rw-r--r--hw/vfio/cpr.c3
-rw-r--r--hw/vfio/device.c400
-rw-r--r--hw/vfio/display.c10
-rw-r--r--hw/vfio/helpers.c702
-rw-r--r--hw/vfio/igd.c10
-rw-r--r--hw/vfio/iommufd.c55
-rw-r--r--hw/vfio/listener.c (renamed from hw/vfio/common.c)510
-rw-r--r--hw/vfio/meson.build10
-rw-r--r--hw/vfio/migration-multifd.c7
-rw-r--r--hw/vfio/migration-multifd.h2
-rw-r--r--hw/vfio/migration.c108
-rw-r--r--hw/vfio/pci.c283
-rw-r--r--hw/vfio/pci.h4
-rw-r--r--hw/vfio/platform.c15
-rw-r--r--hw/vfio/region.c395
-rw-r--r--hw/vfio/spapr.c79
-rw-r--r--hw/vfio/trace-events36
-rw-r--r--hw/vfio/vfio-cpr.h15
-rw-r--r--hw/vfio/vfio-display.h42
-rw-r--r--hw/vfio/vfio-helpers.h35
-rw-r--r--hw/vfio/vfio-iommufd.h34
-rw-r--r--hw/vfio/vfio-listener.h15
-rw-r--r--hw/vfio/vfio-migration-internal.h74
-rw-r--r--include/hw/s390x/vfio-ccw.h2
-rw-r--r--include/hw/vfio/vfio-common.h346
-rw-r--r--include/hw/vfio/vfio-container-base.h15
-rw-r--r--include/hw/vfio/vfio-container.h36
-rw-r--r--include/hw/vfio/vfio-device.h150
-rw-r--r--include/hw/vfio/vfio-migration.h16
-rw-r--r--include/hw/vfio/vfio-platform.h4
-rw-r--r--include/hw/vfio/vfio-region.h47
-rw-r--r--migration/target.c8
42 files changed, 2290 insertions, 1836 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 661a47d..a316907 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2197,6 +2197,7 @@ M: Alex Williamson <alex.williamson@redhat.com>
M: Cédric Le Goater <clg@redhat.com>
S: Supported
F: hw/vfio/*
+F: util/vfio-helpers.c
F: include/hw/vfio/
F: docs/devel/migration/vfio.rst
F: qapi/vfio.json
diff --git a/backends/iommufd.c b/backends/iommufd.c
index d57da44..9587e4d 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -18,7 +18,7 @@
#include "qemu/error-report.h"
#include "monitor/monitor.h"
#include "trace.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include <sys/ioctl.h>
#include <linux/iommufd.h>
diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt
index e17bb50..3aed795 100644
--- a/docs/igd-assign.txt
+++ b/docs/igd-assign.txt
@@ -1,44 +1,69 @@
Intel Graphics Device (IGD) assignment with vfio-pci
====================================================
-IGD has two different modes for assignment using vfio-pci:
-
-1) Universal Pass-Through (UPT) mode:
-
- In this mode the IGD device is added as a *secondary* (ie. non-primary)
- graphics device in combination with an emulated primary graphics device.
- This mode *requires* guest driver support to remove the external
- dependencies generally associated with IGD (see below). Those guest
- drivers only support this mode for Broadwell and newer IGD, according to
- Intel. Additionally, this mode by default, and as officially supported
- by Intel, does not support direct video output. The intention is to use
- this mode either to provide hardware acceleration to the emulated graphics
- or to use this mode in combination with guest-based remote access software,
- for example VNC (see below for optional output support). This mode
- theoretically has no device specific handling dependencies on vfio-pci or
- the VM firmware.
-
-2) "Legacy" mode:
-
- In this mode the IGD device is intended to be the primary and exclusive
- graphics device in the VM[1], as such QEMU does not facilitate any sort
- of remote graphics to the VM in this mode. A connected physical monitor
- is the intended output device for IGD. This mode includes several
- requirements and restrictions:
-
- * IGD must be given address 02.0 on the PCI root bus in the VM
- * The host kernel must support vfio extensions for IGD (v4.6)
- * vfio VGA support very likely needs to be enabled in the host kernel
- * The VM firmware must support specific fw_cfg enablers for IGD
- * The VM machine type must support a PCI host bridge at 00.0 (standard)
- * The VM machine type must provide or allow to be created a special
- ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
- PCI address 1f.0.
- * The IGD device must have a VGA ROM, either provided via the romfile
- option or loaded automatically through vfio (standard). rombar=0
- will disable legacy mode support.
- * Hotplug of the IGD device is not supported.
- * The IGD device must be a SandyBridge or newer model device.
+Using vfio-pci, we can passthrough Intel Graphics Device (IGD) to guest, either
+serve as primary and exclusive graphics adapter, or used in combination with an
+emulated primary graphics device, depending on the config and guest driver
+support. However, IGD devices are not "clean" PCI devices, they use extra
+memory regions other than BARs. Special handling is required to make them work
+properly, including:
+
+* OpRegion for accessing Virtual BIOS Table (VBT) that contains display output
+ information.
+* Data Stolen Memory (DSM) region used as VRAM at early stage (BIOS/UEFI)
+
+Certain guest software also depends on following conditions to work:
+(*-Required by)
+
+| Condition | Linux | Windows | VBIOS | EFI GOP |
+|---------------------------------------------|-------|---------|-------|---------|
+| #1 IGD has a valid OpRegion containing VBT | * ^1 | * | * | * |
+| #2 VID/DID of LPC bridge at 00:1f.0 matches | | | * | * |
+| #3 IGD is assigned to BDF 00:02.0 | | | * | * |
+| #4 IGD has VGA controller device class | | | * | * |
+| #5 Host's VGA ranges are mapped to IGD | | | * | |
+| #6 Guest has valid VBIOS or UEFI Option ROM | | | * | * |
+
+^1 Though i915 driver is able to mock a OpRegion, it is still recommended to
+ use the VBT copied from host OpRegion to prevent incorrect configuration.
+
+For #1, the "x-igd-opregion=on" option exposes a copy of host IGD OpRegion to
+guest via fw_cfg, where guest firmware can set up guest OpRegion with it.
+
+For #2, "x-igd-lpc=on" option copies the IDs of host LPC bridge and host bridge
+to guest. Currently this is only supported on i440fx machines as there is
+already an ICH9 LPC bridge present on q35 machines, overwriting its IDs may
+lead to unexpected behavior.
+
+For #3, "addr=2.0" assigns IGD to 00:02.0.
+
+For #4, the primary display must be set to IGD in host BIOS.
+
+For #5, "x-vga=on" enables guest access to standard VGA IO/MMIO ranges.
+
+For #6, ROM either provided via the ROM BAR or romfile= option is needed, this
+Intel document [1] shows how to dump VBIOS to file. For UEFI Option ROM, see
+"Guest firmware" section.
+
+QEMU also provides a "Legacy" mode that implicitly enables full functionality
+on IGD, it is automatically enabled when
+* Machine type is i440fx
+* IGD is assigned to guest BDF 00:02.0
+* ROM BAR or romfile is present
+
+In "Legacy" mode, QEMU will automatically setup OpRegion, LPC bridge IDs and
+VGA range access, which is equivalent to:
+ x-igd-opregion=on,x-igd-lpc=on,x-vga=on
+
+By default, "Legacy" mode won't fail, it continues on error. User can set
+"x-igd-legacy-mode=on" to force enabling legacy mode, this also checks if the
+conditions above for legacy mode is met, and if any error occurs, QEMU will
+fail immediately. Users can also set "x-igd-legacy-mode=off" to disable legacy
+mode.
+
+In legacy mode, as the guest VGA ranges are assigned to IGD device, all other
+graphics devices should be removed, this can be done using "-nographic" or
+"-vga none" or "-nodefaults", along with adding the device using vfio-pci.
For either mode, depending on the host kernel, the i915 driver in the host
may generate faults and errors upon re-binding to an IGD device after it
@@ -73,31 +98,39 @@ DVI, or DisplayPort) may be unsupported in some use cases. In the author's
experience, even DP to VGA adapters can be troublesome while adapters between
digital formats work well.
-Usage
-=====
-The intention is for IGD assignment to be transparent for users and thus for
-management tools like libvirt. To make use of legacy mode, simply remove all
-other graphics options and use "-nographic" and either "-vga none" or
-"-nodefaults", along with adding the device using vfio-pci:
- -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+Options
+=======
+* x-igd-opregion=[on|*off*]
+ Copy host IGD OpRegion and expose it to guest with fw_cfg
+
+* x-igd-lpc=[on|*off*]
+ Creates a dummy LPC bridge at 00:1f:0 with host VID/DID (i440fx only)
+
+* x-igd-legacy-mode=[on|off|*auto*]
+ Enable/Disable legacy mode
+
+* x-igd-gms=[hex, default 0]
+ Overriding DSM region size in GGC register, 0 means uses host value.
+ Use this only when the DSM size cannot be changed through the
+ 'DVMT Pre-Allocated' option in host BIOS.
-For UPT mode, retain the default emulated graphics and simply add the vfio-pci
-device making use of any other bus address other than 02.0. libvirt will
-default to assigning the device a UPT compatible address while legacy mode
-users will need to manually edit the XML if using a tool like virt-manager
-where the VM device address is not expressly specified.
-An experimental vfio-pci option also exists to enable OpRegion, and thus
-external monitor support, for UPT mode. This can be enabled by adding
-"x-igd-opregion=on" to the vfio-pci device options for the IGD device. As
-with legacy mode, this requires the host to support features introduced in
-the v4.6 kernel. If Intel chooses to embrace this support, the option may
-be made non-experimental in the future, opening it to libvirt support.
+Examples
+========
+* Adding IGD with automatically legacy mode support
+ -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0
-Developer ABI
-=============
-Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+* Adding IGD with OpRegion and LPC ID hack, but without VGA ranges
+ (For UEFI guests)
+ -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0,x-igd-legacy-mode=off,x-igd-opregion=on,x-igd-lpc=on,romfile=efi_oprom.rom
+
+
+Guest firmware
+==============
+Guest firmware is responsible for setting up OpRegion and Base of Data Stolen
+Memory (BDSM) in guest address space. IGD passthrough support imposes two
+fw_cfg requirements on the VM firmware:
1) "etc/igd-opregion"
@@ -117,17 +150,111 @@ Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
Firmware must allocate a reserved memory below 4GB with required 1MB
alignment equal to this size. Additionally the base address of this
reserved region must be written to the dword BDSM register in PCI config
- space of the IGD device at offset 0x5C. As this support is related to
- running the IGD ROM, which has other dependencies on the device appearing
- at guest address 00:02.0, it's expected that this fw_cfg file is only
- relevant to a single PCI class VGA device with Intel vendor ID, appearing
- at PCI bus address 00:02.0.
+ space of the IGD device at offset 0x5C (or 0xC0 for Gen 11+ devices using
+ 64-bit BDSM). As this support is related to running the IGD ROM, which
+ has other dependencies on the device appearing at guest address 00:02.0,
+ it's expected that this fw_cfg file is only relevant to a single PCI
+ class VGA device with Intel vendor ID, appearing at PCI bus address 00:02.0.
+
+Upstream Seabios has OpRegion and BDSM (pre-Gen11 device only) support.
+However, the support is not accepted by upstream EDK2/OVMF. A recommended
+solution is to create a virtual OpRom with following DXE drivers:
+
+* IgdAssignmentDxe: Set up OpRegion and BDSM according to fw_cfg (must)
+* IntelGopDriver: Closed-source Intel GOP driver
+* PlatformGopPolicy: Protocol required by IntelGopDriver
+
+IntelGopDriver and PlatformGopPolicy is only required when enabling GOP on IGD.
+
+The original IgdAssignmentDxe can be found at [3]. A Intel maintained version
+with PlatformGopPolicy for industrial computing is at [4]. There is also an
+unofficially maintained version with newer Gen11+ device support at [5].
+You need to build them with EDK2.
+
+For the IntelGopDriver, Intel never released it to public. You may contact
+Intel support to get one as [4] said, if you are an Intel Premier Support
+customer, or you can try extracting it from your host firmware using
+"UEFI BIOS Updater"[6].
+
+Once you got all the required DXE drivers, a Option ROM can be generated with
+EfiRom utility in EDK2, using
+ EfiRom -f 0x8086 -i <Device ID of your IGD> -o output.rom \
+ -e IgdAssignmentDxe.efi PlatformGOPPolicy.efi IntelGopDriver.efi
+
+
+Known issues
+============
+When using OVMF as guest firmware, you may encounter the following warning:
+warning: vfio_container_dma_map(0x55fab36ce610, 0x380010000000, 0x108000, 0x7fd336000000) = -22 (Invalid argument)
+
+Solution:
+Set the host physical address bits to IOMMU address width using
+ -cpu host,host-phys-bits-limit=<IOMMU address width>
+Or in libvirt XML with
+ <cpu>
+ <maxphysaddr mode='passthrough' limit='<IOMMU address width>'/>
+ </cpu>
+The IOMMU address width can be determined with
+ echo $(( ((0x$(cat /sys/devices/virtual/iommu/dmar0/intel-iommu/cap) & 0x3F0000) >> 16) + 1 ))
+Refer https://edk2.groups.io/g/devel/topic/patch_v1/102359124 for more details
+
+
+Memory View
+===========
+IGD has it own address space. To use system RAM as VRAM, a single-level page
+table named Global Graphics Translation Table (GTT) is used for the address
+translation. Each page table entry points a 4KB page. Illustration below shows
+the translation flow on IGD with 64-bit GTT PTEs.
+
+(PTE_SIZE == 8) +-------------+---+
+ | Address | V | V: Valid Bit
+ +-------------+---+
+ | ... | |
+IGD:0x01ae9010 0xd740| 0x70ffc000 | 1 | Mem:0x42ba3e010^
+-----------------------> 0xd748| 0x42ba3e000 | 1 +------------------>
+(addr >> 12) * PTE_SIZE 0xd750| 0x42ba3f000 | 1 |
+ | ... | |
+ +-------------+---+
+^ The address may be remapped by IOMMU
+
+The memory region store GTT is called GTT Stolen Memory (GSM) it is located
+right below the Data Stolen Memory (DSM). Accessing this region directly is
+not allowed, any access will immediately freeze the whole system. The only way
+to access it is through the second half of MMIO BAR0.
+
+The Data Stolen Memory is reserved by firmware, and acts as the VRAM in pre-OS
+environments. In QEMU, guest firmware (Seabios/OVMF) is responsible for
+reserving a continuous region and program its base address to BDSM register,
+then let VBIOS/GOP driver initializing this region. Illustration below shows
+how DSM is mapped.
+
+ IGD Addr Space Host Addr Space Guest Addr Space
+ +-------------+ +-------------+ +-------------+
+ | | | | | |
+ | | | | | |
+ | | +-------------+ +-------------+
+ | | | Data Stolen | | Data Stolen |
+ | | | (Guest) | | (Guest) |
+ | | +------------>+-------------+<------->+-------------+<--Guest BDSM
+ | | | Passthrough | | EPT | | Emulated by QEMU
+DSMSIZE+-------------+ | with IOMMU | | Mapping | | Programmed by guest FW
+ | | | | | | |
+ | | | | | | |
+ 0+-------------+--+ | | | |
+ | +-------------+ | |
+ | | Data Stolen | +-------------+
+ | | (Host) |
+ +------------>+-------------+<--Host BDSM
+ Non- | | "real" one in HW
+ Passthrough | | Programmed by host FW
+ +-------------+
Footnotes
=========
-[1] Nothing precludes adding additional emulated or assigned graphics devices
- as non-primary, other than the combination typically not working. I only
- intend to set user expectations, others are welcome to find working
- combinations or fix whatever issues prevent this from working in the common
- case.
+[1] https://www.intel.com/content/www/us/en/docs/graphics-for-linux/developer-reference/1-0/dump-video-bios.html
[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
+[3] https://web.archive.org/web/20240827012422/https://bugzilla.tianocore.org/show_bug.cgi?id=935
+ Tianocore bugzilla was down since Jan 2025 :(
+[4] https://eci.intel.com/docs/3.3/components/kvm-hypervisor.html, Patch 0001-0004
+[5] https://github.com/tomitamoeko/VfioIgdPkg
+[6] https://winraid.level1techs.com/t/tool-guide-news-uefi-bios-updater-ubu/30357
diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c
index e85066b..c339a27 100644
--- a/hw/core/sysbus-fdt.c
+++ b/hw/core/sysbus-fdt.c
@@ -35,6 +35,7 @@
#include "hw/vfio/vfio-platform.h"
#include "hw/vfio/vfio-calxeda-xgmac.h"
#include "hw/vfio/vfio-amd-xgbe.h"
+#include "hw/vfio/vfio-region.h"
#include "hw/display/ramfb.h"
#include "hw/uefi/var-service-api.h"
#include "hw/arm/fdt.h"
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index 76b2a34..e318d0d 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -24,7 +24,7 @@
#include "hw/pci-host/spapr.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_device.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-container.h"
#include "qemu/error-report.h"
#include CONFIG_DEVICES /* CONFIG_VFIO_PCI */
@@ -85,7 +85,7 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
- VFIOAddressSpace *space = vfio_get_address_space(as);
+ VFIOAddressSpace *space = vfio_address_space_get(as);
VFIOContainerBase *bcontainer = NULL;
if (QLIST_EMPTY(&space->containers)) {
@@ -105,7 +105,7 @@ static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
}
out:
- vfio_put_address_space(space);
+ vfio_address_space_put(space);
return container_of(bcontainer, VFIOContainer, bcontainer);
}
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index db152a6..aaf9131 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -20,7 +20,8 @@
#include "hw/s390x/s390-pci-clp.h"
#include "hw/s390x/s390-pci-vfio.h"
#include "hw/vfio/pci.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-container.h"
+#include "hw/vfio/vfio-helpers.h"
/*
* Get the current DMA available count from vfio. Returns true if vfio is
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index d6575d7..be2e292 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -15,7 +15,7 @@
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "system/iommufd.h"
#include "hw/s390x/ap-device.h"
#include "qemu/error-report.h"
@@ -117,8 +117,8 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
fd = event_notifier_get_fd(notifier);
qemu_set_fd_handler(fd, fd_read, NULL, vapdev);
- if (!vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
- errp)) {
+ if (!vfio_device_irq_set_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
+ errp)) {
qemu_set_fd_handler(fd, NULL, NULL, vapdev);
event_notifier_cleanup(notifier);
}
@@ -141,8 +141,8 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
return;
}
- if (!vfio_set_irq_signaling(&vapdev->vdev, irq, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
+ if (!vfio_device_irq_set_signaling(&vapdev->vdev, irq, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
warn_reportf_err(err, VFIO_MSG_PREFIX, vapdev->vdev.name);
}
@@ -162,7 +162,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
return;
}
- if (!vfio_attach_device(vbasedev->name, vbasedev,
+ if (!vfio_device_attach(vbasedev->name, vbasedev,
&address_space_memory, errp)) {
goto error;
}
@@ -187,7 +187,7 @@ static void vfio_ap_unrealize(DeviceState *dev)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX);
- vfio_detach_device(&vapdev->vdev);
+ vfio_device_detach(&vapdev->vdev);
g_free(vapdev->vdev.name);
}
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 29e804e..3a91efc 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -21,7 +21,7 @@
#include <sys/ioctl.h>
#include "qapi/error.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "system/iommufd.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/vfio-ccw.h"
@@ -426,8 +426,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev,
fd = event_notifier_get_fd(notifier);
qemu_set_fd_handler(fd, fd_read, NULL, vcdev);
- if (!vfio_set_irq_signaling(vdev, irq, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
+ if (!vfio_device_irq_set_signaling(vdev, irq, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
qemu_set_fd_handler(fd, NULL, NULL, vcdev);
event_notifier_cleanup(notifier);
}
@@ -456,8 +456,8 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev,
return;
}
- if (!vfio_set_irq_signaling(&vcdev->vdev, irq, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
+ if (!vfio_device_irq_set_signaling(&vcdev->vdev, irq, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name);
}
@@ -488,7 +488,7 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
return false;
}
- ret = vfio_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info);
+ ret = vfio_device_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info);
if (ret) {
error_setg_errno(errp, -ret, "vfio: Error getting config info");
return false;
@@ -505,8 +505,8 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
g_free(info);
/* check for the optional async command region */
- ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
- VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info);
+ ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
+ VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info);
if (!ret) {
vcdev->async_cmd_region_size = info->size;
if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) {
@@ -518,8 +518,8 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
g_free(info);
}
- ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
- VFIO_REGION_SUBTYPE_CCW_SCHIB, &info);
+ ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
+ VFIO_REGION_SUBTYPE_CCW_SCHIB, &info);
if (!ret) {
vcdev->schib_region_size = info->size;
if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) {
@@ -531,8 +531,8 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
g_free(info);
}
- ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
- VFIO_REGION_SUBTYPE_CCW_CRW, &info);
+ ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
+ VFIO_REGION_SUBTYPE_CCW_CRW, &info);
if (!ret) {
vcdev->crw_region_size = info->size;
@@ -583,7 +583,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
goto out_unrealize;
}
- if (!vfio_attach_device(cdev->mdevid, vbasedev,
+ if (!vfio_device_attach(cdev->mdevid, vbasedev,
&address_space_memory, errp)) {
goto out_attach_dev_err;
}
@@ -620,7 +620,7 @@ out_irq_notifier_err:
out_io_notifier_err:
vfio_ccw_put_region(vcdev);
out_region_err:
- vfio_detach_device(vbasedev);
+ vfio_device_detach(vbasedev);
out_attach_dev_err:
g_free(vbasedev->name);
out_unrealize:
@@ -639,7 +639,7 @@ static void vfio_ccw_unrealize(DeviceState *dev)
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX);
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX);
vfio_ccw_put_region(vcdev);
- vfio_detach_device(&vcdev->vdev);
+ vfio_device_detach(&vcdev->vdev);
g_free(vcdev->vdev.name);
if (cdc->unrealize) {
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 749a3fd..09340fd 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -10,10 +10,68 @@
* SPDX-License-Identifier: GPL-2.0-or-later
*/
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+
#include "qemu/osdep.h"
+#include "system/tcg.h"
+#include "system/ram_addr.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "hw/vfio/vfio-container-base.h"
+#include "hw/vfio/vfio-device.h" /* vfio_device_reset_handler */
+#include "system/reset.h"
+#include "vfio-helpers.h"
+
+#include "trace.h"
+
+static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
+ QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+
+VFIOAddressSpace *vfio_address_space_get(AddressSpace *as)
+{
+ VFIOAddressSpace *space;
+
+ QLIST_FOREACH(space, &vfio_address_spaces, list) {
+ if (space->as == as) {
+ return space;
+ }
+ }
+
+ /* No suitable VFIOAddressSpace, create a new one */
+ space = g_malloc0(sizeof(*space));
+ space->as = as;
+ QLIST_INIT(&space->containers);
+
+ if (QLIST_EMPTY(&vfio_address_spaces)) {
+ qemu_register_reset(vfio_device_reset_handler, NULL);
+ }
+
+ QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
+
+ return space;
+}
+
+void vfio_address_space_put(VFIOAddressSpace *space)
+{
+ if (!QLIST_EMPTY(&space->containers)) {
+ return;
+ }
+
+ QLIST_REMOVE(space, list);
+ g_free(space);
+
+ if (QLIST_EMPTY(&vfio_address_spaces)) {
+ qemu_unregister_reset(vfio_device_reset_handler, NULL);
+ }
+}
+
+void vfio_address_space_insert(VFIOAddressSpace *space,
+ VFIOContainerBase *bcontainer)
+{
+ QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
+ bcontainer->space = space;
+}
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
@@ -83,7 +141,71 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
return ret;
}
-int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
+static bool vfio_container_devices_dirty_tracking_is_started(
+ const VFIOContainerBase *bcontainer)
+{
+ VFIODevice *vbasedev;
+
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+ if (!vbasedev->dirty_tracking) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool vfio_container_dirty_tracking_is_started(
+ const VFIOContainerBase *bcontainer)
+{
+ return vfio_container_devices_dirty_tracking_is_started(bcontainer) ||
+ bcontainer->dirty_pages_started;
+}
+
+bool vfio_container_devices_dirty_tracking_is_supported(
+ const VFIOContainerBase *bcontainer)
+{
+ VFIODevice *vbasedev;
+
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+ if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
+ return false;
+ }
+ if (!vbasedev->dirty_pages_supported) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
+ hwaddr size, void *bitmap)
+{
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+ sizeof(struct vfio_device_feature_dma_logging_report),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ struct vfio_device_feature_dma_logging_report *report =
+ (struct vfio_device_feature_dma_logging_report *)feature->data;
+
+ report->iova = iova;
+ report->length = size;
+ report->page_size = qemu_real_host_page_size();
+ report->bitmap = (uintptr_t)bitmap;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_GET |
+ VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
+
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
@@ -93,6 +215,74 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
errp);
}
+static int vfio_container_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
+{
+ VFIODevice *vbasedev;
+ int ret;
+
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+ ret = vfio_device_dma_logging_report(vbasedev, iova, size,
+ vbmap->bitmap);
+ if (ret) {
+ error_setg_errno(errp, -ret,
+ "%s: Failed to get DMA logging report, iova: "
+ "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
+ vbasedev->name, iova, size);
+
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
+ uint64_t size, ram_addr_t ram_addr, Error **errp)
+{
+ bool all_device_dirty_tracking =
+ vfio_container_devices_dirty_tracking_is_supported(bcontainer);
+ uint64_t dirty_pages;
+ VFIOBitmap vbmap;
+ int ret;
+
+ if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
+ cpu_physical_memory_set_dirty_range(ram_addr, size,
+ tcg_enabled() ? DIRTY_CLIENTS_ALL :
+ DIRTY_CLIENTS_NOCODE);
+ return 0;
+ }
+
+ ret = vfio_bitmap_alloc(&vbmap, size);
+ if (ret) {
+ error_setg_errno(errp, -ret,
+ "Failed to allocate dirty tracking bitmap");
+ return ret;
+ }
+
+ if (all_device_dirty_tracking) {
+ ret = vfio_container_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
+ errp);
+ } else {
+ ret = vfio_container_iommu_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
+ errp);
+ }
+
+ if (ret) {
+ goto out;
+ }
+
+ dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
+ vbmap.pages);
+
+ trace_vfio_container_query_dirty_bitmap(iova, size, vbmap.size, ram_addr,
+ dirty_pages);
+out:
+ g_free(vbmap.bitmap);
+
+ return ret;
+}
+
static gpointer copy_iova_range(gconstpointer src, gpointer data)
{
Range *source = (Range *)src;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 812d5ed..29170f0 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -22,7 +22,7 @@
#include <sys/ioctl.h>
#include <linux/vfio.h>
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "system/address-spaces.h"
#include "system/memory.h"
#include "system/ram_addr.h"
@@ -32,8 +32,15 @@
#include "trace.h"
#include "qapi/error.h"
#include "pci.h"
+#include "hw/vfio/vfio-container.h"
+#include "vfio-helpers.h"
+#include "vfio-cpr.h"
+#include "vfio-listener.h"
-VFIOGroupList vfio_group_list =
+#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"
+
+typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
+static VFIOGroupList vfio_group_list =
QLIST_HEAD_INITIALIZER(vfio_group_list);
static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
@@ -131,8 +138,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
int ret;
Error *local_err = NULL;
- if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) {
- if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
+ if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
+ if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
bcontainer->dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@@ -163,7 +170,7 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
}
if (need_dirty_sync) {
- ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
+ ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size,
iotlb->translated_addr, &local_err);
if (ret) {
error_report_err(local_err);
@@ -273,37 +280,6 @@ static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
return ret;
}
-static struct vfio_info_cap_header *
-vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
-{
- if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
- return NULL;
- }
-
- return vfio_get_cap((void *)info, info->cap_offset, id);
-}
-
-bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
- unsigned int *avail)
-{
- struct vfio_info_cap_header *hdr;
- struct vfio_iommu_type1_info_dma_avail *cap;
-
- /* If the capability cannot be found, assume no DMA limiting */
- hdr = vfio_get_iommu_type1_info_cap(info,
- VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
- if (!hdr) {
- return false;
- }
-
- if (avail != NULL) {
- cap = (void *) hdr;
- *avail = cap->avail;
- }
-
- return true;
-}
-
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
VFIOContainerBase *bcontainer)
{
@@ -330,7 +306,7 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
return true;
}
-static void vfio_kvm_device_add_group(VFIOGroup *group)
+static void vfio_group_add_kvm_device(VFIOGroup *group)
{
Error *err = NULL;
@@ -339,7 +315,7 @@ static void vfio_kvm_device_add_group(VFIOGroup *group)
}
}
-static void vfio_kvm_device_del_group(VFIOGroup *group)
+static void vfio_group_del_kvm_device(VFIOGroup *group)
{
Error *err = NULL;
@@ -535,7 +511,7 @@ static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
return true;
}
-static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
VFIOContainer *container;
@@ -544,7 +520,7 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
VFIOAddressSpace *space;
VFIOIOMMUClass *vioc;
- space = vfio_get_address_space(as);
+ space = vfio_address_space_get(as);
/*
* VFIO is currently incompatible with discarding of RAM insofar as the
@@ -593,7 +569,7 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
}
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
- vfio_kvm_device_add_group(group);
+ vfio_group_add_kvm_device(group);
return true;
}
}
@@ -633,19 +609,14 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
goto enable_discards_exit;
}
- vfio_kvm_device_add_group(group);
+ vfio_group_add_kvm_device(group);
vfio_address_space_insert(space, bcontainer);
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
- bcontainer->listener = vfio_memory_listener;
- memory_listener_register(&bcontainer->listener, bcontainer->space->as);
-
- if (bcontainer->error) {
- error_propagate_prepend(errp, bcontainer->error,
- "memory listener initialization failed: ");
+ if (!vfio_listener_register(bcontainer, errp)) {
goto listener_release_exit;
}
@@ -654,8 +625,8 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
return true;
listener_release_exit:
QLIST_REMOVE(group, container_next);
- vfio_kvm_device_del_group(group);
- memory_listener_unregister(&bcontainer->listener);
+ vfio_group_del_kvm_device(group);
+ vfio_listener_unregister(bcontainer);
if (vioc->release) {
vioc->release(bcontainer);
}
@@ -673,12 +644,12 @@ close_fd_exit:
close(fd);
put_space_exit:
- vfio_put_address_space(space);
+ vfio_address_space_put(space);
return false;
}
-static void vfio_disconnect_container(VFIOGroup *group)
+static void vfio_container_disconnect(VFIOGroup *group)
{
VFIOContainer *container = group->container;
VFIOContainerBase *bcontainer = &container->bcontainer;
@@ -693,7 +664,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
* group.
*/
if (QLIST_EMPTY(&container->group_list)) {
- memory_listener_unregister(&bcontainer->listener);
+ vfio_listener_unregister(bcontainer);
if (vioc->release) {
vioc->release(bcontainer);
}
@@ -707,16 +678,16 @@ static void vfio_disconnect_container(VFIOGroup *group)
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = bcontainer->space;
- trace_vfio_disconnect_container(container->fd);
+ trace_vfio_container_disconnect(container->fd);
vfio_cpr_unregister_container(bcontainer);
close(container->fd);
object_unref(container);
- vfio_put_address_space(space);
+ vfio_address_space_put(space);
}
}
-static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
{
ERRP_GUARD();
VFIOGroup *group;
@@ -760,7 +731,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
group->groupid = groupid;
QLIST_INIT(&group->device_list);
- if (!vfio_connect_container(group, as, errp)) {
+ if (!vfio_container_connect(group, as, errp)) {
error_prepend(errp, "failed to setup container for group %d: ",
groupid);
goto close_fd_exit;
@@ -779,7 +750,7 @@ free_group_exit:
return NULL;
}
-static void vfio_put_group(VFIOGroup *group)
+static void vfio_group_put(VFIOGroup *group)
{
if (!group || !QLIST_EMPTY(&group->device_list)) {
return;
@@ -788,15 +759,15 @@ static void vfio_put_group(VFIOGroup *group)
if (!group->ram_block_discard_allowed) {
vfio_ram_block_discard_disable(group->container, false);
}
- vfio_kvm_device_del_group(group);
- vfio_disconnect_container(group);
+ vfio_group_del_kvm_device(group);
+ vfio_container_disconnect(group);
QLIST_REMOVE(group, next);
- trace_vfio_put_group(group->fd);
+ trace_vfio_group_put(group->fd);
close(group->fd);
g_free(group);
}
-static bool vfio_get_device(VFIOGroup *group, const char *name,
+static bool vfio_device_get(VFIOGroup *group, const char *name,
VFIODevice *vbasedev, Error **errp)
{
g_autofree struct vfio_device_info *info = NULL;
@@ -848,25 +819,25 @@ static bool vfio_get_device(VFIOGroup *group, const char *name,
vbasedev->num_regions = info->num_regions;
vbasedev->flags = info->flags;
- trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
+ trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);
vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
return true;
}
-static void vfio_put_base_device(VFIODevice *vbasedev)
+static void vfio_device_put(VFIODevice *vbasedev)
{
if (!vbasedev->group) {
return;
}
QLIST_REMOVE(vbasedev, next);
vbasedev->group = NULL;
- trace_vfio_put_base_device(vbasedev->fd);
+ trace_vfio_device_put(vbasedev->fd);
close(vbasedev->fd);
}
-static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
+static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp)
{
char *tmp, group_path[PATH_MAX];
g_autofree char *group_name = NULL;
@@ -894,14 +865,14 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
}
/*
- * vfio_attach_device: attach a device to a security context
+ * vfio_device_attach: attach a device to a security context
* @name and @vbasedev->name are likely to be different depending
* on the type of the device, hence the need for passing @name
*/
static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
- int groupid = vfio_device_groupid(vbasedev, errp);
+ int groupid = vfio_device_get_groupid(vbasedev, errp);
VFIODevice *vbasedev_iter;
VFIOGroup *group;
VFIOContainerBase *bcontainer;
@@ -910,13 +881,9 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
return false;
}
- trace_vfio_attach_device(vbasedev->name, groupid);
-
- if (!vfio_device_hiod_realize(vbasedev, errp)) {
- return false;
- }
+ trace_vfio_device_attach(vbasedev->name, groupid);
- group = vfio_get_group(groupid, as, errp);
+ group = vfio_group_get(groupid, as, errp);
if (!group) {
return false;
}
@@ -924,13 +891,17 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
error_setg(errp, "device is already attached");
- vfio_put_group(group);
- return false;
+ goto group_put_exit;
}
}
- if (!vfio_get_device(group, name, vbasedev, errp)) {
- vfio_put_group(group);
- return false;
+ if (!vfio_device_get(group, name, vbasedev, errp)) {
+ goto group_put_exit;
+ }
+
+ if (!vfio_device_hiod_create_and_realize(vbasedev,
+ TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
+ errp)) {
+ goto device_put_exit;
}
bcontainer = &group->container->bcontainer;
@@ -939,6 +910,12 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
return true;
+
+device_put_exit:
+ vfio_device_put(vbasedev);
+group_put_exit:
+ vfio_group_put(group);
+ return false;
}
static void vfio_legacy_detach_device(VFIODevice *vbasedev)
@@ -948,9 +925,10 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
QLIST_REMOVE(vbasedev, global_next);
QLIST_REMOVE(vbasedev, container_next);
vbasedev->bcontainer = NULL;
- trace_vfio_detach_device(vbasedev->name, group->groupid);
- vfio_put_base_device(vbasedev);
- vfio_put_group(group);
+ trace_vfio_device_detach(vbasedev->name, group->groupid);
+ object_unref(vbasedev->hiod);
+ vfio_device_put(vbasedev);
+ vfio_group_put(group);
}
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
@@ -1125,8 +1103,6 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
- vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;
-
vioc->setup = vfio_legacy_setup;
vioc->dma_map = vfio_legacy_dma_map;
vioc->dma_unmap = vfio_legacy_dma_unmap;
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 3d1c8d2..3214184 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -6,10 +6,11 @@
*/
#include "qemu/osdep.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "system/runstate.h"
+#include "vfio-cpr.h"
static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
MigrationEvent *e, Error **errp)
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
new file mode 100644
index 0000000..d625a7c
--- /dev/null
+++ b/hw/vfio/device.c
@@ -0,0 +1,400 @@
+/*
+ * VFIO device
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ * Adapted for KVM by Qumranet.
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+
+#include "hw/vfio/vfio-device.h"
+#include "hw/vfio/pci.h"
+#include "hw/hw.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/units.h"
+#include "monitor/monitor.h"
+#include "vfio-helpers.h"
+
+VFIODeviceList vfio_device_list =
+ QLIST_HEAD_INITIALIZER(vfio_device_list);
+
+/*
+ * We want to differentiate hot reset of multiple in-use devices vs
+ * hot reset of a single in-use device. VFIO_DEVICE_RESET will already
+ * handle the case of doing hot resets when there is only a single
+ * device per bus. The in-use here refers to how many VFIODevices are
+ * affected. A hot reset that affects multiple devices, but only a
+ * single in-use device, means that we can call it from our bus
+ * ->reset() callback since the extent is effectively a single
+ * device. This allows us to make use of it in the hotplug path. When
+ * there are multiple in-use devices, we can only trigger the hot
+ * reset during a system reset and thus from our reset handler. We
+ * separate _one vs _multi here so that we don't overlap and do a
+ * double reset on the system reset path where both our reset handler
+ * and ->reset() callback are used. Calling _one() will only do a hot
+ * reset for the one in-use devices case, calling _multi() will do
+ * nothing if a _one() would have been sufficient.
+ */
+void vfio_device_reset_handler(void *opaque)
+{
+ VFIODevice *vbasedev;
+
+ trace_vfio_device_reset_handler();
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
+ if (vbasedev->dev->realized) {
+ vbasedev->ops->vfio_compute_needs_reset(vbasedev);
+ }
+ }
+
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
+ if (vbasedev->dev->realized && vbasedev->needs_reset) {
+ vbasedev->ops->vfio_hot_reset_multi(vbasedev);
+ }
+ }
+}
+
+/*
+ * Common VFIO interrupt disable
+ */
+void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
+{
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = index,
+ .start = 0,
+ .count = 0,
+ };
+
+ ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
+{
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+ .index = index,
+ .start = 0,
+ .count = 1,
+ };
+
+ ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
+{
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+ .index = index,
+ .start = 0,
+ .count = 1,
+ };
+
+ ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+static inline const char *action_to_str(int action)
+{
+ switch (action) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ return "MASK";
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ return "UNMASK";
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ return "TRIGGER";
+ default:
+ return "UNKNOWN ACTION";
+ }
+}
+
+static const char *index_to_str(VFIODevice *vbasedev, int index)
+{
+ if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+ return NULL;
+ }
+
+ switch (index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ return "INTX";
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ return "MSI";
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ return "MSIX";
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ return "ERR";
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ return "REQ";
+ default:
+ return NULL;
+ }
+}
+
+bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
+ int action, int fd, Error **errp)
+{
+ ERRP_GUARD();
+ g_autofree struct vfio_irq_set *irq_set = NULL;
+ int argsz;
+ const char *name;
+ int32_t *pfd;
+
+ argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+ irq_set = g_malloc0(argsz);
+ irq_set->argsz = argsz;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
+ irq_set->index = index;
+ irq_set->start = subindex;
+ irq_set->count = 1;
+ pfd = (int32_t *)&irq_set->data;
+ *pfd = fd;
+
+ if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
+ return true;
+ }
+
+ error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
+
+ name = index_to_str(vbasedev, index);
+ if (name) {
+ error_prepend(errp, "%s-%d: ", name, subindex);
+ } else {
+ error_prepend(errp, "index %d-%d: ", index, subindex);
+ }
+ error_prepend(errp,
+ "Failed to %s %s eventfd signaling for interrupt ",
+ fd < 0 ? "tear down" : "set up", action_to_str(action));
+ return false;
+}
+
+int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
+ struct vfio_region_info **info)
+{
+ size_t argsz = sizeof(struct vfio_region_info);
+
+ *info = g_malloc0(argsz);
+
+ (*info)->index = index;
+retry:
+ (*info)->argsz = argsz;
+
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
+ g_free(*info);
+ *info = NULL;
+ return -errno;
+ }
+
+ if ((*info)->argsz > argsz) {
+ argsz = (*info)->argsz;
+ *info = g_realloc(*info, argsz);
+
+ goto retry;
+ }
+
+ return 0;
+}
+
+int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
+ uint32_t subtype, struct vfio_region_info **info)
+{
+ int i;
+
+ for (i = 0; i < vbasedev->num_regions; i++) {
+ struct vfio_info_cap_header *hdr;
+ struct vfio_region_info_cap_type *cap_type;
+
+ if (vfio_device_get_region_info(vbasedev, i, info)) {
+ continue;
+ }
+
+ hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
+ if (!hdr) {
+ g_free(*info);
+ continue;
+ }
+
+ cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
+
+ trace_vfio_device_get_region_info_type(vbasedev->name, i,
+ cap_type->type, cap_type->subtype);
+
+ if (cap_type->type == type && cap_type->subtype == subtype) {
+ return 0;
+ }
+
+ g_free(*info);
+ }
+
+ *info = NULL;
+ return -ENODEV;
+}
+
+bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
+{
+ g_autofree struct vfio_region_info *info = NULL;
+ bool ret = false;
+
+ if (!vfio_device_get_region_info(vbasedev, region, &info)) {
+ if (vfio_get_region_info_cap(info, cap_type)) {
+ ret = true;
+ }
+ }
+
+ return ret;
+}
+
+bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
+{
+ ERRP_GUARD();
+ struct stat st;
+
+ if (vbasedev->fd < 0) {
+ if (stat(vbasedev->sysfsdev, &st) < 0) {
+ error_setg_errno(errp, errno, "no such host device");
+ error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
+ return false;
+ }
+ /* User may specify a name, e.g: VFIO platform device */
+ if (!vbasedev->name) {
+ vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
+ }
+ } else {
+ if (!vbasedev->iommufd) {
+ error_setg(errp, "Use FD passing only with iommufd backend");
+ return false;
+ }
+ /*
+ * Give a name with fd so any function printing out vbasedev->name
+ * will not break.
+ */
+ if (!vbasedev->name) {
+ vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
+ }
+ }
+
+ return true;
+}
+
+void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
+{
+ ERRP_GUARD();
+ int fd = monitor_fd_param(monitor_cur(), str, errp);
+
+ if (fd < 0) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+ vbasedev->fd = fd;
+}
+
+void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
+ DeviceState *dev, bool ram_discard)
+{
+ vbasedev->type = type;
+ vbasedev->ops = ops;
+ vbasedev->dev = dev;
+ vbasedev->fd = -1;
+
+ vbasedev->ram_block_discard_allowed = ram_discard;
+}
+
+int vfio_device_get_aw_bits(VFIODevice *vdev)
+{
+ /*
+ * iova_ranges is a sorted list. For old kernels that support
+ * VFIO but not support query of iova ranges, iova_ranges is NULL,
+ * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
+ */
+ GList *l = g_list_last(vdev->bcontainer->iova_ranges);
+
+ if (l) {
+ Range *range = l->data;
+ return range_get_last_bit(range) + 1;
+ }
+
+ return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
+}
+
+bool vfio_device_is_mdev(VFIODevice *vbasedev)
+{
+ g_autofree char *subsys = NULL;
+ g_autofree char *tmp = NULL;
+
+ if (!vbasedev->sysfsdev) {
+ return false;
+ }
+
+ tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
+ subsys = realpath(tmp, NULL);
+ return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
+}
+
+bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev,
+ const char *typename, Error **errp)
+{
+ HostIOMMUDevice *hiod;
+
+ if (vbasedev->mdev) {
+ return true;
+ }
+
+ hiod = HOST_IOMMU_DEVICE(object_new(typename));
+
+ if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
+ object_unref(hiod);
+ return false;
+ }
+
+ vbasedev->hiod = hiod;
+ return true;
+}
+
+VFIODevice *vfio_get_vfio_device(Object *obj)
+{
+ if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
+ return &VFIO_PCI(obj)->vbasedev;
+ } else {
+ return NULL;
+ }
+}
+
+bool vfio_device_attach(char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp)
+{
+ const VFIOIOMMUClass *ops =
+ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
+
+ if (vbasedev->iommufd) {
+ ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
+ }
+
+ assert(ops);
+
+ return ops->attach_device(name, vbasedev, as, errp);
+}
+
+void vfio_device_detach(VFIODevice *vbasedev)
+{
+ if (!vbasedev->bcontainer) {
+ return;
+ }
+ VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
+}
diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 4fdcef5..f3e6581 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -16,9 +16,9 @@
#include "qemu/error-report.h"
#include "hw/display/edid.h"
-#include "ui/console.h"
#include "qapi/error.h"
#include "pci.h"
+#include "vfio-display.h"
#include "trace.h"
#ifndef DRM_PLANE_TYPE_PRIMARY
@@ -129,10 +129,10 @@ static bool vfio_display_edid_init(VFIOPCIDevice *vdev, Error **errp)
int fd = vdev->vbasedev.fd;
int ret;
- ret = vfio_get_dev_region_info(&vdev->vbasedev,
- VFIO_REGION_TYPE_GFX,
- VFIO_REGION_SUBTYPE_GFX_EDID,
- &dpy->edid_info);
+ ret = vfio_device_get_region_info_type(&vdev->vbasedev,
+ VFIO_REGION_TYPE_GFX,
+ VFIO_REGION_SUBTYPE_GFX_EDID,
+ &dpy->edid_info);
if (ret) {
/* Failed to get GFX edid info, allow to go through without edid. */
return true;
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 4b255d4..d0dbab1 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -22,242 +22,11 @@
#include "qemu/osdep.h"
#include <sys/ioctl.h>
-#include "hw/vfio/vfio-common.h"
-#include "hw/vfio/pci.h"
+#include "system/kvm.h"
+#include "hw/vfio/vfio-device.h"
#include "hw/hw.h"
-#include "trace.h"
#include "qapi/error.h"
-#include "qemu/error-report.h"
-#include "qemu/units.h"
-#include "monitor/monitor.h"
-
-/*
- * Common VFIO interrupt disable
- */
-void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
-{
- struct vfio_irq_set irq_set = {
- .argsz = sizeof(irq_set),
- .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
- .index = index,
- .start = 0,
- .count = 0,
- };
-
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-
-void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
-{
- struct vfio_irq_set irq_set = {
- .argsz = sizeof(irq_set),
- .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
- .index = index,
- .start = 0,
- .count = 1,
- };
-
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-
-void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
-{
- struct vfio_irq_set irq_set = {
- .argsz = sizeof(irq_set),
- .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
- .index = index,
- .start = 0,
- .count = 1,
- };
-
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-
-static inline const char *action_to_str(int action)
-{
- switch (action) {
- case VFIO_IRQ_SET_ACTION_MASK:
- return "MASK";
- case VFIO_IRQ_SET_ACTION_UNMASK:
- return "UNMASK";
- case VFIO_IRQ_SET_ACTION_TRIGGER:
- return "TRIGGER";
- default:
- return "UNKNOWN ACTION";
- }
-}
-
-static const char *index_to_str(VFIODevice *vbasedev, int index)
-{
- if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
- return NULL;
- }
-
- switch (index) {
- case VFIO_PCI_INTX_IRQ_INDEX:
- return "INTX";
- case VFIO_PCI_MSI_IRQ_INDEX:
- return "MSI";
- case VFIO_PCI_MSIX_IRQ_INDEX:
- return "MSIX";
- case VFIO_PCI_ERR_IRQ_INDEX:
- return "ERR";
- case VFIO_PCI_REQ_IRQ_INDEX:
- return "REQ";
- default:
- return NULL;
- }
-}
-
-bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
- int action, int fd, Error **errp)
-{
- ERRP_GUARD();
- g_autofree struct vfio_irq_set *irq_set = NULL;
- int argsz;
- const char *name;
- int32_t *pfd;
-
- argsz = sizeof(*irq_set) + sizeof(*pfd);
-
- irq_set = g_malloc0(argsz);
- irq_set->argsz = argsz;
- irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
- irq_set->index = index;
- irq_set->start = subindex;
- irq_set->count = 1;
- pfd = (int32_t *)&irq_set->data;
- *pfd = fd;
-
- if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
- return true;
- }
-
- error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
-
- name = index_to_str(vbasedev, index);
- if (name) {
- error_prepend(errp, "%s-%d: ", name, subindex);
- } else {
- error_prepend(errp, "index %d-%d: ", index, subindex);
- }
- error_prepend(errp,
- "Failed to %s %s eventfd signaling for interrupt ",
- fd < 0 ? "tear down" : "set up", action_to_str(action));
- return false;
-}
-
-/*
- * IO Port/MMIO - Beware of the endians, VFIO is always little endian
- */
-void vfio_region_write(void *opaque, hwaddr addr,
- uint64_t data, unsigned size)
-{
- VFIORegion *region = opaque;
- VFIODevice *vbasedev = region->vbasedev;
- union {
- uint8_t byte;
- uint16_t word;
- uint32_t dword;
- uint64_t qword;
- } buf;
-
- switch (size) {
- case 1:
- buf.byte = data;
- break;
- case 2:
- buf.word = cpu_to_le16(data);
- break;
- case 4:
- buf.dword = cpu_to_le32(data);
- break;
- case 8:
- buf.qword = cpu_to_le64(data);
- break;
- default:
- hw_error("vfio: unsupported write size, %u bytes", size);
- break;
- }
-
- if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
- error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
- ",%d) failed: %m",
- __func__, vbasedev->name, region->nr,
- addr, data, size);
- }
-
- trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
-
- /*
- * A read or write to a BAR always signals an INTx EOI. This will
- * do nothing if not pending (including not in INTx mode). We assume
- * that a BAR access is in response to an interrupt and that BAR
- * accesses will service the interrupt. Unfortunately, we don't know
- * which access will service the interrupt, so we're potentially
- * getting quite a few host interrupts per guest interrupt.
- */
- vbasedev->ops->vfio_eoi(vbasedev);
-}
-
-uint64_t vfio_region_read(void *opaque,
- hwaddr addr, unsigned size)
-{
- VFIORegion *region = opaque;
- VFIODevice *vbasedev = region->vbasedev;
- union {
- uint8_t byte;
- uint16_t word;
- uint32_t dword;
- uint64_t qword;
- } buf;
- uint64_t data = 0;
-
- if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
- error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
- __func__, vbasedev->name, region->nr,
- addr, size);
- return (uint64_t)-1;
- }
- switch (size) {
- case 1:
- data = buf.byte;
- break;
- case 2:
- data = le16_to_cpu(buf.word);
- break;
- case 4:
- data = le32_to_cpu(buf.dword);
- break;
- case 8:
- data = le64_to_cpu(buf.qword);
- break;
- default:
- hw_error("vfio: unsupported read size, %u bytes", size);
- break;
- }
-
- trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
-
- /* Same as write above */
- vbasedev->ops->vfio_eoi(vbasedev);
-
- return data;
-}
-
-const MemoryRegionOps vfio_region_ops = {
- .read = vfio_region_read,
- .write = vfio_region_write,
- .endianness = DEVICE_LITTLE_ENDIAN,
- .valid = {
- .min_access_size = 1,
- .max_access_size = 8,
- },
- .impl = {
- .min_access_size = 1,
- .max_access_size = 8,
- },
-};
+#include "vfio-helpers.h"
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
@@ -306,435 +75,126 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
return vfio_get_cap((void *)info, info->cap_offset, id);
}
-static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
- struct vfio_region_info *info)
+struct vfio_info_cap_header *
+vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
- struct vfio_info_cap_header *hdr;
- struct vfio_region_info_cap_sparse_mmap *sparse;
- int i, j;
-
- hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
- if (!hdr) {
- return -ENODEV;
- }
-
- sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
-
- trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
- region->nr, sparse->nr_areas);
-
- region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
-
- for (i = 0, j = 0; i < sparse->nr_areas; i++) {
- if (sparse->areas[i].size) {
- trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
- sparse->areas[i].offset +
- sparse->areas[i].size - 1);
- region->mmaps[j].offset = sparse->areas[i].offset;
- region->mmaps[j].size = sparse->areas[i].size;
- j++;
- }
+ if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
+ return NULL;
}
- region->nr_mmaps = j;
- region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
-
- return 0;
+ return vfio_get_cap((void *)info, info->cap_offset, id);
}
-int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
- int index, const char *name)
+bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
+ unsigned int *avail)
{
- g_autofree struct vfio_region_info *info = NULL;
- int ret;
+ struct vfio_info_cap_header *hdr;
+ struct vfio_iommu_type1_info_dma_avail *cap;
- ret = vfio_get_region_info(vbasedev, index, &info);
- if (ret) {
- return ret;
+ /* If the capability cannot be found, assume no DMA limiting */
+ hdr = vfio_get_iommu_type1_info_cap(info,
+ VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
+ if (!hdr) {
+ return false;
}
- region->vbasedev = vbasedev;
- region->flags = info->flags;
- region->size = info->size;
- region->fd_offset = info->offset;
- region->nr = index;
-
- if (region->size) {
- region->mem = g_new0(MemoryRegion, 1);
- memory_region_init_io(region->mem, obj, &vfio_region_ops,
- region, name, region->size);
-
- if (!vbasedev->no_mmap &&
- region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
-
- ret = vfio_setup_region_sparse_mmaps(region, info);
-
- if (ret) {
- region->nr_mmaps = 1;
- region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
- region->mmaps[0].offset = 0;
- region->mmaps[0].size = region->size;
- }
- }
+ if (avail != NULL) {
+ cap = (void *) hdr;
+ *avail = cap->avail;
}
- trace_vfio_region_setup(vbasedev->name, index, name,
- region->flags, region->fd_offset, region->size);
- return 0;
+ return true;
}
-static void vfio_subregion_unmap(VFIORegion *region, int index)
-{
- trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
- region->mmaps[index].offset,
- region->mmaps[index].offset +
- region->mmaps[index].size - 1);
- memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
- munmap(region->mmaps[index].mmap, region->mmaps[index].size);
- object_unparent(OBJECT(&region->mmaps[index].mem));
- region->mmaps[index].mmap = NULL;
-}
+#ifdef CONFIG_KVM
+/*
+ * We have a single VFIO pseudo device per KVM VM. Once created it lives
+ * for the life of the VM. Closing the file descriptor only drops our
+ * reference to it and the device's reference to kvm. Therefore once
+ * initialized, this file descriptor is only released on QEMU exit and
+ * we'll re-use it should another vfio device be attached before then.
+ */
+int vfio_kvm_device_fd = -1;
+#endif
-int vfio_region_mmap(VFIORegion *region)
+int vfio_kvm_device_add_fd(int fd, Error **errp)
{
- int i, ret, prot = 0;
- char *name;
+#ifdef CONFIG_KVM
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_FILE,
+ .attr = KVM_DEV_VFIO_FILE_ADD,
+ .addr = (uint64_t)(unsigned long)&fd,
+ };
- if (!region->mem) {
+ if (!kvm_enabled()) {
return 0;
}
- prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
- prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
-
- for (i = 0; i < region->nr_mmaps; i++) {
- size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
- void *map_base, *map_align;
-
- /*
- * Align the mmap for more efficient mapping in the kernel. Ideally
- * we'd know the PMD and PUD mapping sizes to use as discrete alignment
- * intervals, but we don't. As of Linux v6.12, the largest PUD size
- * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
- * on x86_64). Align by power-of-two size, capped at 1GiB.
- *
- * NB. qemu_memalign() and friends actually allocate memory, whereas
- * the region size here can exceed host memory, therefore we manually
- * create an oversized anonymous mapping and clean it up for alignment.
- */
- map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (map_base == MAP_FAILED) {
- ret = -errno;
- goto no_mmap;
- }
+ if (vfio_kvm_device_fd < 0) {
+ struct kvm_create_device cd = {
+ .type = KVM_DEV_TYPE_VFIO,
+ };
- map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
- munmap(map_base, map_align - map_base);
- munmap(map_align + region->mmaps[i].size,
- align - (map_align - map_base));
-
- region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
- MAP_SHARED | MAP_FIXED,
- region->vbasedev->fd,
- region->fd_offset +
- region->mmaps[i].offset);
- if (region->mmaps[i].mmap == MAP_FAILED) {
- ret = -errno;
- goto no_mmap;
+ if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
+ error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
+ return -errno;
}
- name = g_strdup_printf("%s mmaps[%d]",
- memory_region_name(region->mem), i);
- memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
- memory_region_owner(region->mem),
- name, region->mmaps[i].size,
- region->mmaps[i].mmap);
- g_free(name);
- memory_region_add_subregion(region->mem, region->mmaps[i].offset,
- &region->mmaps[i].mem);
-
- trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
- region->mmaps[i].offset,
- region->mmaps[i].offset +
- region->mmaps[i].size - 1);
- }
-
- return 0;
-
-no_mmap:
- trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
- region->fd_offset + region->mmaps[i].offset,
- region->fd_offset + region->mmaps[i].offset +
- region->mmaps[i].size - 1, ret);
-
- region->mmaps[i].mmap = NULL;
-
- for (i--; i >= 0; i--) {
- vfio_subregion_unmap(region, i);
- }
-
- return ret;
-}
-
-void vfio_region_unmap(VFIORegion *region)
-{
- int i;
-
- if (!region->mem) {
- return;
+ vfio_kvm_device_fd = cd.fd;
}
- for (i = 0; i < region->nr_mmaps; i++) {
- if (region->mmaps[i].mmap) {
- vfio_subregion_unmap(region, i);
- }
- }
-}
-
-void vfio_region_exit(VFIORegion *region)
-{
- int i;
-
- if (!region->mem) {
- return;
- }
-
- for (i = 0; i < region->nr_mmaps; i++) {
- if (region->mmaps[i].mmap) {
- memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
- }
- }
-
- trace_vfio_region_exit(region->vbasedev->name, region->nr);
-}
-
-void vfio_region_finalize(VFIORegion *region)
-{
- int i;
-
- if (!region->mem) {
- return;
- }
-
- for (i = 0; i < region->nr_mmaps; i++) {
- if (region->mmaps[i].mmap) {
- munmap(region->mmaps[i].mmap, region->mmaps[i].size);
- object_unparent(OBJECT(&region->mmaps[i].mem));
- }
- }
-
- object_unparent(OBJECT(region->mem));
-
- g_free(region->mem);
- g_free(region->mmaps);
-
- trace_vfio_region_finalize(region->vbasedev->name, region->nr);
-
- region->mem = NULL;
- region->mmaps = NULL;
- region->nr_mmaps = 0;
- region->size = 0;
- region->flags = 0;
- region->nr = 0;
-}
-
-void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
-{
- int i;
-
- if (!region->mem) {
- return;
- }
-
- for (i = 0; i < region->nr_mmaps; i++) {
- if (region->mmaps[i].mmap) {
- memory_region_set_enabled(&region->mmaps[i].mem, enabled);
- }
- }
-
- trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
- enabled);
-}
-
-int vfio_get_region_info(VFIODevice *vbasedev, int index,
- struct vfio_region_info **info)
-{
- size_t argsz = sizeof(struct vfio_region_info);
-
- *info = g_malloc0(argsz);
-
- (*info)->index = index;
-retry:
- (*info)->argsz = argsz;
-
- if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
- g_free(*info);
- *info = NULL;
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
+ fd);
return -errno;
}
-
- if ((*info)->argsz > argsz) {
- argsz = (*info)->argsz;
- *info = g_realloc(*info, argsz);
-
- goto retry;
- }
-
+#endif
return 0;
}
-int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
- uint32_t subtype, struct vfio_region_info **info)
-{
- int i;
-
- for (i = 0; i < vbasedev->num_regions; i++) {
- struct vfio_info_cap_header *hdr;
- struct vfio_region_info_cap_type *cap_type;
-
- if (vfio_get_region_info(vbasedev, i, info)) {
- continue;
- }
-
- hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
- if (!hdr) {
- g_free(*info);
- continue;
- }
-
- cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
-
- trace_vfio_get_dev_region(vbasedev->name, i,
- cap_type->type, cap_type->subtype);
-
- if (cap_type->type == type && cap_type->subtype == subtype) {
- return 0;
- }
-
- g_free(*info);
- }
-
- *info = NULL;
- return -ENODEV;
-}
-
-bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
+int vfio_kvm_device_del_fd(int fd, Error **errp)
{
- g_autofree struct vfio_region_info *info = NULL;
- bool ret = false;
-
- if (!vfio_get_region_info(vbasedev, region, &info)) {
- if (vfio_get_region_info_cap(info, cap_type)) {
- ret = true;
- }
- }
-
- return ret;
-}
+#ifdef CONFIG_KVM
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_FILE,
+ .attr = KVM_DEV_VFIO_FILE_DEL,
+ .addr = (uint64_t)(unsigned long)&fd,
+ };
-bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
-{
- ERRP_GUARD();
- struct stat st;
-
- if (vbasedev->fd < 0) {
- if (stat(vbasedev->sysfsdev, &st) < 0) {
- error_setg_errno(errp, errno, "no such host device");
- error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
- return false;
- }
- /* User may specify a name, e.g: VFIO platform device */
- if (!vbasedev->name) {
- vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
- }
- } else {
- if (!vbasedev->iommufd) {
- error_setg(errp, "Use FD passing only with iommufd backend");
- return false;
- }
- /*
- * Give a name with fd so any function printing out vbasedev->name
- * will not break.
- */
- if (!vbasedev->name) {
- vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
- }
+ if (vfio_kvm_device_fd < 0) {
+ error_setg(errp, "KVM VFIO device isn't created yet");
+ return -EINVAL;
}
- return true;
-}
-
-void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
-{
- ERRP_GUARD();
- int fd = monitor_fd_param(monitor_cur(), str, errp);
-
- if (fd < 0) {
- error_prepend(errp, "Could not parse remote object fd %s:", str);
- return;
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_setg_errno(errp, errno,
+ "Failed to remove fd %d from KVM VFIO device", fd);
+ return -errno;
}
- vbasedev->fd = fd;
-}
-
-void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
- DeviceState *dev, bool ram_discard)
-{
- vbasedev->type = type;
- vbasedev->ops = ops;
- vbasedev->dev = dev;
- vbasedev->fd = -1;
-
- vbasedev->ram_block_discard_allowed = ram_discard;
+#endif
+ return 0;
}
-int vfio_device_get_aw_bits(VFIODevice *vdev)
+struct vfio_device_info *vfio_get_device_info(int fd)
{
- /*
- * iova_ranges is a sorted list. For old kernels that support
- * VFIO but not support query of iova ranges, iova_ranges is NULL,
- * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
- */
- GList *l = g_list_last(vdev->bcontainer->iova_ranges);
-
- if (l) {
- Range *range = l->data;
- return range_get_last_bit(range) + 1;
- }
+ struct vfio_device_info *info;
+ uint32_t argsz = sizeof(*info);
- return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
-}
+ info = g_malloc0(argsz);
-bool vfio_device_is_mdev(VFIODevice *vbasedev)
-{
- g_autofree char *subsys = NULL;
- g_autofree char *tmp = NULL;
+retry:
+ info->argsz = argsz;
- if (!vbasedev->sysfsdev) {
- return false;
+ if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
+ g_free(info);
+ return NULL;
}
- tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
- subsys = realpath(tmp, NULL);
- return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
-}
-
-bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp)
-{
- HostIOMMUDevice *hiod = vbasedev->hiod;
-
- if (!hiod) {
- return true;
+ if (info->argsz > argsz) {
+ argsz = info->argsz;
+ info = g_realloc(info, argsz);
+ goto retry;
}
- return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp);
-}
-
-VFIODevice *vfio_get_vfio_device(Object *obj)
-{
- if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
- return &VFIO_PCI(obj)->vbasedev;
- } else {
- return NULL;
- }
+ return info;
}
diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
index 265fffc..6678e0e 100644
--- a/hw/vfio/igd.c
+++ b/hw/vfio/igd.c
@@ -200,7 +200,7 @@ static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp)
return false;
}
- ret = vfio_get_dev_region_info(&vdev->vbasedev,
+ ret = vfio_device_get_region_info_type(&vdev->vbasedev,
VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
if (ret) {
@@ -385,7 +385,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp)
* Check whether we have all the vfio device specific regions to
* support LPC quirk (added in Linux v4.6).
*/
- ret = vfio_get_dev_region_info(&vdev->vbasedev,
+ ret = vfio_device_get_region_info_type(&vdev->vbasedev,
VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
if (ret) {
@@ -393,7 +393,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp)
return false;
}
- ret = vfio_get_dev_region_info(&vdev->vbasedev,
+ ret = vfio_device_get_region_info_type(&vdev->vbasedev,
VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
if (ret) {
@@ -542,8 +542,8 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
* there's no ROM, there's no point in setting up this quirk.
* NB. We only seem to get BIOS ROMs, so UEFI VM would need CSM support.
*/
- ret = vfio_get_region_info(&vdev->vbasedev,
- VFIO_PCI_ROM_REGION_INDEX, &rom);
+ ret = vfio_device_get_region_info(&vdev->vbasedev,
+ VFIO_PCI_ROM_REGION_INDEX, &rom);
if ((ret || !rom->size) && !vdev->pdev.romfile) {
error_setg(&err, "Device has no ROM");
goto error;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 42c8412..b2f72dc 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -15,7 +15,7 @@
#include <linux/vfio.h>
#include <linux/iommufd.h>
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "qapi/error.h"
@@ -25,6 +25,13 @@
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
#include "pci.h"
+#include "vfio-iommufd.h"
+#include "vfio-helpers.h"
+#include "vfio-cpr.h"
+#include "vfio-listener.h"
+
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
+ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
@@ -280,7 +287,8 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
{
ERRP_GUARD();
IOMMUFDBackend *iommufd = vbasedev->iommufd;
- uint32_t flags = 0;
+ uint32_t type, flags = 0;
+ uint64_t hw_caps;
VFIOIOASHwpt *hwpt;
uint32_t hwpt_id;
int ret;
@@ -317,7 +325,12 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
* vfio_migration_realize() may decide to use VF dirty tracking
* instead.
*/
- if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
+ if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
+ &type, NULL, 0, &hw_caps, errp)) {
+ return false;
+ }
+
+ if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
}
@@ -403,7 +416,8 @@ static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container)
if (!QLIST_EMPTY(&bcontainer->device_list)) {
return;
}
- memory_listener_unregister(&bcontainer->listener);
+ vfio_cpr_unregister_container(bcontainer);
+ vfio_listener_unregister(bcontainer);
iommufd_backend_free_id(container->be, container->ioas_id);
object_unref(container);
}
@@ -485,18 +499,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
goto err_connect_bind;
}
- space = vfio_get_address_space(as);
-
- /*
- * The HostIOMMUDevice data from legacy backend is static and doesn't need
- * any information from the (type1-iommu) backend to be initialized. In
- * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd
- * FD to be connected and having a devid to be able to successfully call
- * iommufd_backend_get_device_info().
- */
- if (!vfio_device_hiod_realize(vbasedev, errp)) {
- goto err_alloc_ioas;
- }
+ space = vfio_address_space_get(as);
/* try to attach to an existing container in this space */
QLIST_FOREACH(bcontainer, &space->containers, next) {
@@ -555,12 +558,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
bcontainer->pgsizes = qemu_real_host_page_size();
}
- bcontainer->listener = vfio_memory_listener;
- memory_listener_register(&bcontainer->listener, bcontainer->space->as);
+ if (!vfio_listener_register(bcontainer, errp)) {
+ goto err_listener_register;
+ }
- if (bcontainer->error) {
- error_propagate_prepend(errp, bcontainer->error,
- "memory listener initialization failed: ");
+ if (!vfio_cpr_register_container(bcontainer, errp)) {
goto err_listener_register;
}
@@ -573,7 +575,8 @@ found_container:
goto err_listener_register;
}
- if (!vfio_cpr_register_container(bcontainer, errp)) {
+ if (!vfio_device_hiod_create_and_realize(vbasedev,
+ TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) {
goto err_listener_register;
}
@@ -605,7 +608,7 @@ err_discard_disable:
err_attach_container:
iommufd_cdev_container_destroy(container);
err_alloc_ioas:
- vfio_put_address_space(space);
+ vfio_address_space_put(space);
iommufd_cdev_unbind_and_disconnect(vbasedev);
err_connect_bind:
close(vbasedev->fd);
@@ -627,10 +630,10 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
iommufd_cdev_ram_block_discard_disable(false);
}
- vfio_cpr_unregister_container(bcontainer);
+ object_unref(vbasedev->hiod);
iommufd_cdev_detach_container(vbasedev, container);
iommufd_cdev_container_destroy(container);
- vfio_put_address_space(space);
+ vfio_address_space_put(space);
iommufd_cdev_unbind_and_disconnect(vbasedev);
close(vbasedev->fd);
@@ -790,8 +793,6 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
- vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO;
-
vioc->dma_map = iommufd_cdev_map;
vioc->dma_unmap = iommufd_cdev_unmap;
vioc->attach_device = iommufd_cdev_attach;
diff --git a/hw/vfio/common.c b/hw/vfio/listener.c
index 9b49345..6f77e18 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/listener.c
@@ -25,7 +25,7 @@
#endif
#include <linux/vfio.h>
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "hw/vfio/pci.h"
#include "system/address-spaces.h"
#include "system/memory.h"
@@ -40,160 +40,23 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
-#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "system/tcg.h"
#include "system/tpm.h"
-
-VFIODeviceList vfio_device_list =
- QLIST_HEAD_INITIALIZER(vfio_device_list);
-static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
- QLIST_HEAD_INITIALIZER(vfio_address_spaces);
-
-#ifdef CONFIG_KVM
-/*
- * We have a single VFIO pseudo device per KVM VM. Once created it lives
- * for the life of the VM. Closing the file descriptor only drops our
- * reference to it and the device's reference to kvm. Therefore once
- * initialized, this file descriptor is only released on QEMU exit and
- * we'll re-use it should another vfio device be attached before then.
- */
-int vfio_kvm_device_fd = -1;
-#endif
+#include "vfio-migration-internal.h"
+#include "vfio-helpers.h"
+#include "vfio-listener.h"
/*
* Device state interfaces
*/
-bool vfio_mig_active(void)
-{
- VFIODevice *vbasedev;
-
- if (QLIST_EMPTY(&vfio_device_list)) {
- return false;
- }
-
- QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
- if (vbasedev->migration_blocker) {
- return false;
- }
- }
- return true;
-}
-
-static Error *multiple_devices_migration_blocker;
-
-/*
- * Multiple devices migration is allowed only if all devices support P2P
- * migration. Single device migration is allowed regardless of P2P migration
- * support.
- */
-static bool vfio_multiple_devices_migration_is_supported(void)
-{
- VFIODevice *vbasedev;
- unsigned int device_num = 0;
- bool all_support_p2p = true;
-
- QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
- if (vbasedev->migration) {
- device_num++;
-
- if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
- all_support_p2p = false;
- }
- }
- }
-
- return all_support_p2p || device_num <= 1;
-}
-
-int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
-{
- if (vfio_multiple_devices_migration_is_supported()) {
- return 0;
- }
-
- if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
- error_setg(errp, "Multiple VFIO devices migration is supported only if "
- "all of them support P2P migration");
- return -EINVAL;
- }
-
- if (multiple_devices_migration_blocker) {
- return 0;
- }
-
- error_setg(&multiple_devices_migration_blocker,
- "Multiple VFIO devices migration is supported only if all of "
- "them support P2P migration");
- return migrate_add_blocker_normal(&multiple_devices_migration_blocker,
- errp);
-}
-
-void vfio_unblock_multiple_devices_migration(void)
-{
- if (!multiple_devices_migration_blocker ||
- !vfio_multiple_devices_migration_is_supported()) {
- return;
- }
-
- migrate_del_blocker(&multiple_devices_migration_blocker);
-}
-
-bool vfio_viommu_preset(VFIODevice *vbasedev)
-{
- return vbasedev->bcontainer->space->as != &address_space_memory;
-}
-
-static void vfio_set_migration_error(int ret)
-{
- if (migration_is_running()) {
- migration_file_set_error(ret, NULL);
- }
-}
-
-bool vfio_device_state_is_running(VFIODevice *vbasedev)
-{
- VFIOMigration *migration = vbasedev->migration;
-
- return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
- migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
-}
-
-bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
-{
- VFIOMigration *migration = vbasedev->migration;
-
- return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
- migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
-}
-
-static bool vfio_devices_all_device_dirty_tracking_started(
- const VFIOContainerBase *bcontainer)
-{
- VFIODevice *vbasedev;
-
- QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
- if (!vbasedev->dirty_tracking) {
- return false;
- }
- }
-
- return true;
-}
-
-bool vfio_devices_all_dirty_tracking_started(
- const VFIOContainerBase *bcontainer)
-{
- return vfio_devices_all_device_dirty_tracking_started(bcontainer) ||
- bcontainer->dirty_pages_started;
-}
static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
- if (!vfio_devices_all_dirty_tracking_started(bcontainer)) {
+ if (!vfio_container_dirty_tracking_is_started(bcontainer)) {
return false;
}
@@ -213,22 +76,6 @@ static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
return true;
}
-bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
-{
- VFIODevice *vbasedev;
-
- QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
- if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
- return false;
- }
- if (!vbasedev->dirty_pages_supported) {
- return false;
- }
- }
-
- return true;
-}
-
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
@@ -287,9 +134,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
iova, iova + iotlb->addr_mask);
if (iotlb->target_as != &address_space_memory) {
- error_report("Wrong target AS \"%s\", only system memory is allowed",
- iotlb->target_as->name ? iotlb->target_as->name : "none");
- vfio_set_migration_error(-EINVAL);
+ error_setg(&local_err,
+ "Wrong target AS \"%s\", only system memory is allowed",
+ iotlb->target_as->name ? iotlb->target_as->name : "none");
+ if (migration_is_running()) {
+ migration_file_set_error(-EINVAL, local_err);
+ } else {
+ error_report_err(local_err);
+ }
return;
}
@@ -322,11 +174,16 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
ret = vfio_container_dma_unmap(bcontainer, iova,
iotlb->addr_mask + 1, iotlb);
if (ret) {
- error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%s)",
- bcontainer, iova,
- iotlb->addr_mask + 1, ret, strerror(-ret));
- vfio_set_migration_error(ret);
+ error_setg(&local_err,
+ "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ "0x%"HWADDR_PRIx") = %d (%s)",
+ bcontainer, iova,
+ iotlb->addr_mask + 1, ret, strerror(-ret));
+ if (migration_is_running()) {
+ migration_file_set_error(ret, local_err);
+ } else {
+ error_report_err(local_err);
+ }
}
}
out:
@@ -386,7 +243,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
return 0;
}
-static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
+static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
@@ -461,7 +318,7 @@ static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
}
}
-static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
+static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
@@ -648,7 +505,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
* about changes.
*/
if (memory_region_has_ram_discard_manager(section->mr)) {
- vfio_register_ram_discard_listener(bcontainer, section);
+ vfio_ram_discard_register_listener(bcontainer, section);
return;
}
@@ -771,7 +628,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
- vfio_unregister_ram_discard_listener(bcontainer, section);
+ vfio_ram_discard_unregister_listener(bcontainer, section);
/* Unregistering will trigger an unmap. */
try_unmap = false;
}
@@ -1079,7 +936,7 @@ static bool vfio_listener_log_global_start(MemoryListener *listener,
listener);
bool ret;
- if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
+ if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
ret = vfio_devices_dma_logging_start(bcontainer, errp);
} else {
ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
@@ -1098,7 +955,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
Error *local_err = NULL;
int ret = 0;
- if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
+ if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
vfio_devices_dma_logging_stop(bcontainer);
} else {
ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
@@ -1108,102 +965,12 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
if (ret) {
error_prepend(&local_err,
"vfio: Could not stop dirty page tracking - ");
- error_report_err(local_err);
- vfio_set_migration_error(ret);
- }
-}
-
-static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
- hwaddr size, void *bitmap)
-{
- uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
- sizeof(struct vfio_device_feature_dma_logging_report),
- sizeof(uint64_t))] = {};
- struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
- struct vfio_device_feature_dma_logging_report *report =
- (struct vfio_device_feature_dma_logging_report *)feature->data;
-
- report->iova = iova;
- report->length = size;
- report->page_size = qemu_real_host_page_size();
- report->bitmap = (uintptr_t)bitmap;
-
- feature->argsz = sizeof(buf);
- feature->flags = VFIO_DEVICE_FEATURE_GET |
- VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
-
- if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
- return -errno;
- }
-
- return 0;
-}
-
-int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
- VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
-{
- VFIODevice *vbasedev;
- int ret;
-
- QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
- ret = vfio_device_dma_logging_report(vbasedev, iova, size,
- vbmap->bitmap);
- if (ret) {
- error_setg_errno(errp, -ret,
- "%s: Failed to get DMA logging report, iova: "
- "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
- vbasedev->name, iova, size);
-
- return ret;
+ if (migration_is_running()) {
+ migration_file_set_error(ret, local_err);
+ } else {
+ error_report_err(local_err);
}
}
-
- return 0;
-}
-
-int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
- uint64_t size, ram_addr_t ram_addr, Error **errp)
-{
- bool all_device_dirty_tracking =
- vfio_devices_all_device_dirty_tracking(bcontainer);
- uint64_t dirty_pages;
- VFIOBitmap vbmap;
- int ret;
-
- if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
- cpu_physical_memory_set_dirty_range(ram_addr, size,
- tcg_enabled() ? DIRTY_CLIENTS_ALL :
- DIRTY_CLIENTS_NOCODE);
- return 0;
- }
-
- ret = vfio_bitmap_alloc(&vbmap, size);
- if (ret) {
- error_setg_errno(errp, -ret,
- "Failed to allocate dirty tracking bitmap");
- return ret;
- }
-
- if (all_device_dirty_tracking) {
- ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
- errp);
- } else {
- ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
- errp);
- }
-
- if (ret) {
- goto out;
- }
-
- dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
- vbmap.pages);
-
- trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
-out:
- g_free(vbmap.bitmap);
-
- return ret;
}
typedef struct {
@@ -1225,25 +992,24 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
if (iotlb->target_as != &address_space_memory) {
- error_report("Wrong target AS \"%s\", only system memory is allowed",
- iotlb->target_as->name ? iotlb->target_as->name : "none");
+ error_setg(&local_err,
+ "Wrong target AS \"%s\", only system memory is allowed",
+ iotlb->target_as->name ? iotlb->target_as->name : "none");
goto out;
}
rcu_read_lock();
if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
- error_report_err(local_err);
goto out_unlock;
}
- ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
+ ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
translated_addr, &local_err);
if (ret) {
error_prepend(&local_err,
"vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
iotlb->addr_mask + 1);
- error_report_err(local_err);
}
out_unlock:
@@ -1251,11 +1017,15 @@ out_unlock:
out:
if (ret) {
- vfio_set_migration_error(ret);
+ if (migration_is_running()) {
+ migration_file_set_error(ret, local_err);
+ } else {
+ error_report_err(local_err);
+ }
}
}
-static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
+static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section,
void *opaque)
{
const hwaddr size = int128_get64(section->size);
@@ -1270,7 +1040,7 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
* Sync the whole mapped region (spanning multiple individual mappings)
* in one go.
*/
- ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
+ ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
&local_err);
if (ret) {
error_report_err(local_err);
@@ -1302,7 +1072,7 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
* which correspond to populated parts. Replay all populated parts.
*/
return ram_discard_manager_replay_populated(rdm, section,
- vfio_ram_discard_get_dirty_bitmap,
+ vfio_ram_discard_query_dirty_bitmap,
&vrdl);
}
@@ -1364,7 +1134,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
ram_addr = memory_region_get_ram_addr(section->mr) +
section->offset_within_region;
- return vfio_get_dirty_bitmap(bcontainer,
+ return vfio_container_query_dirty_bitmap(bcontainer,
REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
int128_get64(section->size), ram_addr, errp);
}
@@ -1384,13 +1154,16 @@ static void vfio_listener_log_sync(MemoryListener *listener,
if (vfio_log_sync_needed(bcontainer)) {
ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
if (ret) {
- error_report_err(local_err);
- vfio_set_migration_error(ret);
+ if (migration_is_running()) {
+ migration_file_set_error(ret, local_err);
+ } else {
+ error_report_err(local_err);
+ }
}
}
}
-const MemoryListener vfio_memory_listener = {
+static const MemoryListener vfio_memory_listener = {
.name = "vfio",
.region_add = vfio_listener_region_add,
.region_del = vfio_listener_region_del,
@@ -1399,184 +1172,21 @@ const MemoryListener vfio_memory_listener = {
.log_sync = vfio_listener_log_sync,
};
-void vfio_reset_handler(void *opaque)
+bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp)
{
- VFIODevice *vbasedev;
-
- trace_vfio_reset_handler();
- QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
- if (vbasedev->dev->realized) {
- vbasedev->ops->vfio_compute_needs_reset(vbasedev);
- }
- }
-
- QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
- if (vbasedev->dev->realized && vbasedev->needs_reset) {
- vbasedev->ops->vfio_hot_reset_multi(vbasedev);
- }
- }
-}
-
-int vfio_kvm_device_add_fd(int fd, Error **errp)
-{
-#ifdef CONFIG_KVM
- struct kvm_device_attr attr = {
- .group = KVM_DEV_VFIO_FILE,
- .attr = KVM_DEV_VFIO_FILE_ADD,
- .addr = (uint64_t)(unsigned long)&fd,
- };
-
- if (!kvm_enabled()) {
- return 0;
- }
-
- if (vfio_kvm_device_fd < 0) {
- struct kvm_create_device cd = {
- .type = KVM_DEV_TYPE_VFIO,
- };
-
- if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
- error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
- return -errno;
- }
-
- vfio_kvm_device_fd = cd.fd;
- }
+ bcontainer->listener = vfio_memory_listener;
+ memory_listener_register(&bcontainer->listener, bcontainer->space->as);
- if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
- error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
- fd);
- return -errno;
- }
-#endif
- return 0;
-}
-
-int vfio_kvm_device_del_fd(int fd, Error **errp)
-{
-#ifdef CONFIG_KVM
- struct kvm_device_attr attr = {
- .group = KVM_DEV_VFIO_FILE,
- .attr = KVM_DEV_VFIO_FILE_DEL,
- .addr = (uint64_t)(unsigned long)&fd,
- };
-
- if (vfio_kvm_device_fd < 0) {
- error_setg(errp, "KVM VFIO device isn't created yet");
- return -EINVAL;
- }
-
- if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
- error_setg_errno(errp, errno,
- "Failed to remove fd %d from KVM VFIO device", fd);
- return -errno;
- }
-#endif
- return 0;
-}
-
-VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
-{
- VFIOAddressSpace *space;
-
- QLIST_FOREACH(space, &vfio_address_spaces, list) {
- if (space->as == as) {
- return space;
- }
- }
-
- /* No suitable VFIOAddressSpace, create a new one */
- space = g_malloc0(sizeof(*space));
- space->as = as;
- QLIST_INIT(&space->containers);
-
- if (QLIST_EMPTY(&vfio_address_spaces)) {
- qemu_register_reset(vfio_reset_handler, NULL);
- }
-
- QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
-
- return space;
-}
-
-void vfio_put_address_space(VFIOAddressSpace *space)
-{
- if (!QLIST_EMPTY(&space->containers)) {
- return;
- }
-
- QLIST_REMOVE(space, list);
- g_free(space);
-
- if (QLIST_EMPTY(&vfio_address_spaces)) {
- qemu_unregister_reset(vfio_reset_handler, NULL);
- }
-}
-
-void vfio_address_space_insert(VFIOAddressSpace *space,
- VFIOContainerBase *bcontainer)
-{
- QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
- bcontainer->space = space;
-}
-
-struct vfio_device_info *vfio_get_device_info(int fd)
-{
- struct vfio_device_info *info;
- uint32_t argsz = sizeof(*info);
-
- info = g_malloc0(argsz);
-
-retry:
- info->argsz = argsz;
-
- if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
- g_free(info);
- return NULL;
- }
-
- if (info->argsz > argsz) {
- argsz = info->argsz;
- info = g_realloc(info, argsz);
- goto retry;
- }
-
- return info;
-}
-
-bool vfio_attach_device(char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp)
-{
- const VFIOIOMMUClass *ops =
- VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
- HostIOMMUDevice *hiod = NULL;
-
- if (vbasedev->iommufd) {
- ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
- }
-
- assert(ops);
-
-
- if (!vbasedev->mdev) {
- hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
- vbasedev->hiod = hiod;
- }
-
- if (!ops->attach_device(name, vbasedev, as, errp)) {
- object_unref(hiod);
- vbasedev->hiod = NULL;
+ if (bcontainer->error) {
+ error_propagate_prepend(errp, bcontainer->error,
+ "memory listener initialization failed: ");
return false;
}
return true;
}
-void vfio_detach_device(VFIODevice *vbasedev)
+void vfio_listener_unregister(VFIOContainerBase *bcontainer)
{
- if (!vbasedev->bcontainer) {
- return;
- }
- object_unref(vbasedev->hiod);
- VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
+ memory_listener_unregister(&bcontainer->listener);
}
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index a8939c8..bccb050 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -1,7 +1,9 @@
vfio_ss = ss.source_set()
vfio_ss.add(files(
- 'common.c',
+ 'listener.c',
+ 'container-base.c',
'container.c',
+ 'helpers.c',
))
vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c'))
vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
@@ -18,11 +20,11 @@ specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss)
system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c'))
system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c'))
system_ss.add(when: 'CONFIG_VFIO', if_true: files(
- 'helpers.c',
- 'container-base.c',
+ 'cpr.c',
+ 'device.c',
'migration.c',
'migration-multifd.c',
- 'cpr.c',
+ 'region.c',
))
system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files(
'iommufd.c',
diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
index 378f6f3..850a319 100644
--- a/hw/vfio/migration-multifd.c
+++ b/hw/vfio/migration-multifd.c
@@ -10,7 +10,7 @@
*/
#include "qemu/osdep.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "qemu/bswap.h"
@@ -21,6 +21,7 @@
#include "io/channel-buffer.h"
#include "migration/qemu-file.h"
#include "migration-multifd.h"
+#include "vfio-migration-internal.h"
#include "trace.h"
#define VFIO_DEVICE_STATE_CONFIG_STATE (1)
@@ -575,7 +576,7 @@ vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
return false;
}
- vfio_mig_add_bytes_transferred(packet_len);
+ vfio_migration_add_bytes_transferred(packet_len);
return true;
}
@@ -645,7 +646,7 @@ vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
goto thread_exit;
}
- vfio_mig_add_bytes_transferred(packet_size);
+ vfio_migration_add_bytes_transferred(packet_size);
}
if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
index a664051..0bab632 100644
--- a/hw/vfio/migration-multifd.h
+++ b/hw/vfio/migration-multifd.h
@@ -12,7 +12,7 @@
#ifndef HW_VFIO_MIGRATION_MULTIFD_H
#define HW_VFIO_MIGRATION_MULTIFD_H
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp);
void vfio_multifd_cleanup(VFIODevice *vbasedev);
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index fbff46c..1dceab1 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -16,7 +16,8 @@
#include <sys/ioctl.h>
#include "system/runstate.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
+#include "hw/vfio/vfio-migration.h"
#include "migration/misc.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
@@ -30,6 +31,7 @@
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"
+#include "vfio-migration-internal.h"
/*
* This is an arbitrary size based on migration of mlx5 devices, where typically
@@ -373,7 +375,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
qemu_put_be64(f, data_size);
qemu_put_buffer(f, migration->data_buffer, data_size);
- vfio_mig_add_bytes_transferred(data_size);
+ vfio_migration_add_bytes_transferred(data_size);
trace_vfio_save_block(migration->vbasedev->name, data_size);
@@ -1021,6 +1023,65 @@ static int vfio_migration_init(VFIODevice *vbasedev)
return 0;
}
+static Error *multiple_devices_migration_blocker;
+
+/*
+ * Multiple devices migration is allowed only if all devices support P2P
+ * migration. Single device migration is allowed regardless of P2P migration
+ * support.
+ */
+static bool vfio_multiple_devices_migration_is_supported(void)
+{
+ VFIODevice *vbasedev;
+ unsigned int device_num = 0;
+ bool all_support_p2p = true;
+
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
+ if (vbasedev->migration) {
+ device_num++;
+
+ if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
+ all_support_p2p = false;
+ }
+ }
+ }
+
+ return all_support_p2p || device_num <= 1;
+}
+
+static int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
+{
+ if (vfio_multiple_devices_migration_is_supported()) {
+ return 0;
+ }
+
+ if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+ error_setg(errp, "Multiple VFIO devices migration is supported only if "
+ "all of them support P2P migration");
+ return -EINVAL;
+ }
+
+ if (multiple_devices_migration_blocker) {
+ return 0;
+ }
+
+ error_setg(&multiple_devices_migration_blocker,
+ "Multiple VFIO devices migration is supported only if all of "
+ "them support P2P migration");
+ return migrate_add_blocker_normal(&multiple_devices_migration_blocker,
+ errp);
+}
+
+static void vfio_unblock_multiple_devices_migration(void)
+{
+ if (!multiple_devices_migration_blocker ||
+ !vfio_multiple_devices_migration_is_supported()) {
+ return;
+ }
+
+ migrate_del_blocker(&multiple_devices_migration_blocker);
+}
+
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
VFIOMigration *migration = vbasedev->migration;
@@ -1047,21 +1108,42 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
/* ---------------------------------------------------------------------- */
-int64_t vfio_mig_bytes_transferred(void)
+int64_t vfio_migration_bytes_transferred(void)
{
return MIN(qatomic_read(&bytes_transferred), INT64_MAX);
}
-void vfio_reset_bytes_transferred(void)
+void vfio_migration_reset_bytes_transferred(void)
{
qatomic_set(&bytes_transferred, 0);
}
-void vfio_mig_add_bytes_transferred(unsigned long val)
+void vfio_migration_add_bytes_transferred(unsigned long val)
{
qatomic_add(&bytes_transferred, val);
}
+bool vfio_migration_active(void)
+{
+ VFIODevice *vbasedev;
+
+ if (QLIST_EMPTY(&vfio_device_list)) {
+ return false;
+ }
+
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
+ if (vbasedev->migration_blocker) {
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool vfio_viommu_preset(VFIODevice *vbasedev)
+{
+ return vbasedev->bcontainer->space->as != &address_space_memory;
+}
+
/*
* Return true when either migration initialized or blocker registered.
* Currently only return false when adding blocker fails which will
@@ -1138,3 +1220,19 @@ void vfio_migration_exit(VFIODevice *vbasedev)
migrate_del_blocker(&vbasedev->migration_blocker);
}
+
+bool vfio_device_state_is_running(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+
+ return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+ migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
+}
+
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+
+ return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
+ migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f87f3cc..4a9484d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -44,6 +44,8 @@
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "system/iommufd.h"
+#include "vfio-migration-internal.h"
+#include "vfio-helpers.h"
#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
@@ -113,7 +115,7 @@ static void vfio_intx_eoi(VFIODevice *vbasedev)
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
- vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}
static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
@@ -129,7 +131,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
/* Get to a known interrupt state */
qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
- vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
@@ -147,15 +149,15 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
goto fail_irqfd;
}
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
- VFIO_IRQ_SET_ACTION_UNMASK,
- event_notifier_get_fd(&vdev->intx.unmask),
- errp)) {
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
+ VFIO_IRQ_SET_ACTION_UNMASK,
+ event_notifier_get_fd(&vdev->intx.unmask),
+ errp)) {
goto fail_vfio;
}
/* Let'em rip */
- vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.kvm_accel = true;
@@ -170,7 +172,7 @@ fail_irqfd:
event_notifier_cleanup(&vdev->intx.unmask);
fail:
qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
- vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
return false;
#else
return true;
@@ -188,7 +190,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
* Get to a known state, hardware masked, QEMU ready to accept new
* interrupts, QEMU IRQ de-asserted.
*/
- vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
@@ -208,7 +210,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
vdev->intx.kvm_accel = false;
/* If we've missed an event, let it re-fire through QEMU */
- vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
@@ -297,7 +299,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
fd = event_notifier_get_fd(&vdev->intx.interrupt);
qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
qemu_set_fd_handler(fd, NULL, NULL, vdev);
event_notifier_cleanup(&vdev->intx.interrupt);
@@ -320,7 +322,7 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev)
timer_del(vdev->intx.mmap_timer);
vfio_intx_disable_kvm(vdev);
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+ vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
vfio_mmap_set_enabled(vdev, true);
@@ -576,7 +578,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
if (!vdev->defer_kvm_irq_routing) {
if (vdev->msix->noresize && resizing) {
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+ vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
ret = vfio_enable_vectors(vdev, true);
if (ret) {
error_report("vfio: failed to enable vectors, %d", ret);
@@ -591,7 +593,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
fd = event_notifier_get_fd(&vector->interrupt);
}
- if (!vfio_set_irq_signaling(&vdev->vbasedev,
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev,
VFIO_PCI_MSIX_IRQ_INDEX, nr,
VFIO_IRQ_SET_ACTION_TRIGGER, fd,
&err)) {
@@ -636,7 +638,7 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
int32_t fd = event_notifier_get_fd(&vector->interrupt);
Error *err = NULL;
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
&err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
@@ -833,7 +835,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
* Always clear MSI-X IRQ index. A PF device could have enabled
* MSI-X with no vectors. See vfio_msix_enable().
*/
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+ vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
vfio_msi_disable_common(vdev);
if (!vfio_intx_enable(vdev, &err)) {
@@ -850,7 +852,7 @@ static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
Error *err = NULL;
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
+ vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
vfio_msi_disable_common(vdev);
vfio_intx_enable(vdev, &err);
if (err) {
@@ -884,8 +886,8 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
off_t off = 0;
ssize_t bytes;
- if (vfio_get_region_info(&vdev->vbasedev,
- VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
+ if (vfio_device_get_region_info(&vdev->vbasedev,
+ VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
error_report("vfio: Error getting ROM info: %m");
return;
}
@@ -1378,8 +1380,8 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
* If the host driver allows mapping of a MSIX data, we are going to
* do map the entire BAR and emulate MSIX table on top of that.
*/
- if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
- VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
+ if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr,
+ VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
return;
}
@@ -2671,7 +2673,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
g_autofree struct vfio_region_info *reg_info = NULL;
int ret;
- ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
+ ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
if (ret) {
error_setg_errno(errp, -ret,
"failed getting region info for VGA region index %d",
@@ -2769,8 +2771,8 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
QLIST_INIT(&vdev->bars[i].quirks);
}
- ret = vfio_get_region_info(vbasedev,
- VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
+ ret = vfio_device_get_region_info(vbasedev,
+ VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
if (ret) {
error_setg_errno(errp, -ret, "failed to get config info");
return false;
@@ -2814,7 +2816,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
static void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
- vfio_detach_device(&vdev->vbasedev);
+ vfio_device_detach(&vdev->vbasedev);
g_free(vdev->vbasedev.name);
g_free(vdev->msix);
@@ -2866,8 +2868,8 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
fd = event_notifier_get_fd(&vdev->err_notifier);
qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
qemu_set_fd_handler(fd, NULL, NULL, vdev);
event_notifier_cleanup(&vdev->err_notifier);
@@ -2883,8 +2885,8 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
return;
}
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
@@ -2931,8 +2933,8 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
fd = event_notifier_get_fd(&vdev->req_notifier);
qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
qemu_set_fd_handler(fd, NULL, NULL, vdev);
event_notifier_cleanup(&vdev->req_notifier);
@@ -2949,8 +2951,8 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
return;
}
- if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
+ if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
@@ -2960,77 +2962,10 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
vdev->req_enabled = false;
}
-static void vfio_realize(PCIDevice *pdev, Error **errp)
+static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
{
- ERRP_GUARD();
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ PCIDevice *pdev = &vdev->pdev;
VFIODevice *vbasedev = &vdev->vbasedev;
- int i, ret;
- char uuid[UUID_STR_LEN];
- g_autofree char *name = NULL;
-
- if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
- if (!(~vdev->host.domain || ~vdev->host.bus ||
- ~vdev->host.slot || ~vdev->host.function)) {
- error_setg(errp, "No provided host device");
- error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
-#ifdef CONFIG_IOMMUFD
- "or -device vfio-pci,fd=DEVICE_FD "
-#endif
- "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
- return;
- }
- vbasedev->sysfsdev =
- g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
- vdev->host.domain, vdev->host.bus,
- vdev->host.slot, vdev->host.function);
- }
-
- if (!vfio_device_get_name(vbasedev, errp)) {
- return;
- }
-
- /*
- * Mediated devices *might* operate compatibly with discarding of RAM, but
- * we cannot know for certain, it depends on whether the mdev vendor driver
- * stays in sync with the active working set of the guest driver. Prevent
- * the x-balloon-allowed option unless this is minimally an mdev device.
- */
- vbasedev->mdev = vfio_device_is_mdev(vbasedev);
-
- trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
-
- if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
- error_setg(errp, "x-balloon-allowed only potentially compatible "
- "with mdev devices");
- goto error;
- }
-
- if (!qemu_uuid_is_null(&vdev->vf_token)) {
- qemu_uuid_unparse(&vdev->vf_token, uuid);
- name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
- } else {
- name = g_strdup(vbasedev->name);
- }
-
- if (!vfio_attach_device(name, vbasedev,
- pci_device_iommu_address_space(pdev), errp)) {
- goto error;
- }
-
- if (!vfio_populate_device(vdev, errp)) {
- goto error;
- }
-
- /* Get a copy of config space */
- ret = pread(vbasedev->fd, vdev->pdev.config,
- MIN(pci_config_size(&vdev->pdev), vdev->config_size),
- vdev->config_offset);
- if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
- ret = ret < 0 ? -errno : -EFAULT;
- error_setg_errno(errp, -ret, "failed to read device config space");
- goto error;
- }
/* vfio emulates a lot for us, but some bits need extra love */
vdev->emulated_config_bits = g_malloc0(vdev->config_size);
@@ -3048,7 +2983,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vdev->vendor_id != PCI_ANY_ID) {
if (vdev->vendor_id >= 0xffff) {
error_setg(errp, "invalid PCI vendor ID provided");
- goto error;
+ return false;
}
vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
@@ -3059,7 +2994,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vdev->device_id != PCI_ANY_ID) {
if (vdev->device_id > 0xffff) {
error_setg(errp, "invalid PCI device ID provided");
- goto error;
+ return false;
}
vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
@@ -3070,7 +3005,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vdev->sub_vendor_id != PCI_ANY_ID) {
if (vdev->sub_vendor_id > 0xffff) {
error_setg(errp, "invalid PCI subsystem vendor ID provided");
- goto error;
+ return false;
}
vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
vdev->sub_vendor_id, ~0);
@@ -3081,7 +3016,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vdev->sub_device_id != PCI_ANY_ID) {
if (vdev->sub_device_id > 0xffff) {
error_setg(errp, "invalid PCI subsystem device ID provided");
- goto error;
+ return false;
}
vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
@@ -3112,11 +3047,122 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vfio_bars_prepare(vdev);
if (!vfio_msix_early_setup(vdev, errp)) {
- goto error;
+ return false;
}
vfio_bars_register(vdev);
+ return true;
+}
+
+static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+ PCIDevice *pdev = &vdev->pdev;
+
+ /* QEMU emulates all of MSI & MSIX */
+ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
+ memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
+ MSIX_CAP_LENGTH);
+ }
+
+ if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
+ memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
+ vdev->msi_cap_size);
+ }
+
+ if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+ vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+ vfio_intx_mmap_enable, vdev);
+ pci_device_set_intx_routing_notifier(&vdev->pdev,
+ vfio_intx_routing_notifier);
+ vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
+ kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
+ if (!vfio_intx_enable(vdev, errp)) {
+ timer_free(vdev->intx.mmap_timer);
+ pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+ return false;
+ }
+ }
+ return true;
+}
+
+static void vfio_realize(PCIDevice *pdev, Error **errp)
+{
+ ERRP_GUARD();
+ VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIODevice *vbasedev = &vdev->vbasedev;
+ int i, ret;
+ char uuid[UUID_STR_LEN];
+ g_autofree char *name = NULL;
+
+ if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
+ if (!(~vdev->host.domain || ~vdev->host.bus ||
+ ~vdev->host.slot || ~vdev->host.function)) {
+ error_setg(errp, "No provided host device");
+ error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
+#ifdef CONFIG_IOMMUFD
+ "or -device vfio-pci,fd=DEVICE_FD "
+#endif
+ "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
+ return;
+ }
+ vbasedev->sysfsdev =
+ g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
+ vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function);
+ }
+
+ if (!vfio_device_get_name(vbasedev, errp)) {
+ return;
+ }
+
+ /*
+ * Mediated devices *might* operate compatibly with discarding of RAM, but
+ * we cannot know for certain, it depends on whether the mdev vendor driver
+ * stays in sync with the active working set of the guest driver. Prevent
+ * the x-balloon-allowed option unless this is minimally an mdev device.
+ */
+ vbasedev->mdev = vfio_device_is_mdev(vbasedev);
+
+ trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
+
+ if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
+ error_setg(errp, "x-balloon-allowed only potentially compatible "
+ "with mdev devices");
+ goto error;
+ }
+
+ if (!qemu_uuid_is_null(&vdev->vf_token)) {
+ qemu_uuid_unparse(&vdev->vf_token, uuid);
+ name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
+ } else {
+ name = g_strdup(vbasedev->name);
+ }
+
+ if (!vfio_device_attach(name, vbasedev,
+ pci_device_iommu_address_space(pdev), errp)) {
+ goto error;
+ }
+
+ if (!vfio_populate_device(vdev, errp)) {
+ goto error;
+ }
+
+ /* Get a copy of config space */
+ ret = pread(vbasedev->fd, vdev->pdev.config,
+ MIN(pci_config_size(&vdev->pdev), vdev->config_size),
+ vdev->config_offset);
+ if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
+ ret = ret < 0 ? -errno : -EFAULT;
+ error_setg_errno(errp, -ret, "failed to read device config space");
+ goto error;
+ }
+
+ if (!vfio_pci_config_setup(vdev, errp)) {
+ goto error;
+ }
+
if (!vbasedev->mdev &&
!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
error_prepend(errp, "Failed to set vIOMMU: ");
@@ -3139,27 +3185,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vfio_bar_quirk_setup(vdev, i);
}
- /* QEMU emulates all of MSI & MSIX */
- if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
- memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
- MSIX_CAP_LENGTH);
- }
-
- if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
- memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
- vdev->msi_cap_size);
- }
-
- if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
- vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
- vfio_intx_mmap_enable, vdev);
- pci_device_set_intx_routing_notifier(&vdev->pdev,
- vfio_intx_routing_notifier);
- vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
- kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
- if (!vfio_intx_enable(vdev, errp)) {
- goto out_deregister;
- }
+ if (!vfio_interrupt_setup(vdev, errp)) {
+ goto out_unset_idev;
}
if (vdev->display != ON_OFF_AUTO_OFF) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 6c59300..f835b1d 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -14,12 +14,14 @@
#include "system/memory.h"
#include "hw/pci/pci_device.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
+#include "hw/vfio/vfio-region.h"
#include "qemu/event_notifier.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "qom/object.h"
#include "system/kvm.h"
+#include "vfio-display.h"
#define PCI_ANY_ID (~0)
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index f273ae9..49cd981 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -37,6 +37,7 @@
#include "hw/platform-bus.h"
#include "hw/qdev-properties.h"
#include "system/kvm.h"
+#include "hw/vfio/vfio-region.h"
/*
* Functions used whatever the injection method
@@ -118,8 +119,8 @@ static int vfio_set_trigger_eventfd(VFIOINTp *intp,
qemu_set_fd_handler(fd, (IOHandler *)handler, NULL, intp);
- if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+ if (!vfio_device_irq_set_signaling(vbasedev, intp->pin, 0,
+ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
qemu_set_fd_handler(fd, NULL, NULL, NULL);
return -EINVAL;
@@ -300,7 +301,7 @@ static void vfio_platform_eoi(VFIODevice *vbasedev)
if (vfio_irq_is_automasked(intp)) {
/* unmasks the physical level-sensitive IRQ */
- vfio_unmask_single_irqindex(vbasedev, intp->pin);
+ vfio_device_irq_unmask(vbasedev, intp->pin);
}
/* a single IRQ can be active at a time */
@@ -356,8 +357,8 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp)
Error *err = NULL;
qemu_set_fd_handler(fd, NULL, NULL, NULL);
- if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0,
- VFIO_IRQ_SET_ACTION_UNMASK, fd, &err)) {
+ if (!vfio_device_irq_set_signaling(vbasedev, intp->pin, 0,
+ VFIO_IRQ_SET_ACTION_UNMASK, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
return -EINVAL;
}
@@ -545,7 +546,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
return false;
}
- if (!vfio_attach_device(vbasedev->name, vbasedev,
+ if (!vfio_device_attach(vbasedev->name, vbasedev,
&address_space_memory, errp)) {
return false;
}
@@ -554,7 +555,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
return true;
}
- vfio_detach_device(vbasedev);
+ vfio_device_detach(vbasedev);
return false;
}
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
new file mode 100644
index 0000000..04bf9eb
--- /dev/null
+++ b/hw/vfio/region.c
@@ -0,0 +1,395 @@
+/*
+ * VFIO regions
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ * Adapted for KVM by Qumranet.
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+
+#include "hw/vfio/vfio-region.h"
+#include "hw/vfio/vfio-device.h"
+#include "hw/hw.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/units.h"
+#include "monitor/monitor.h"
+#include "vfio-helpers.h"
+
+/*
+ * IO Port/MMIO - Beware of the endians, VFIO is always little endian
+ */
+void vfio_region_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ VFIORegion *region = opaque;
+ VFIODevice *vbasedev = region->vbasedev;
+ union {
+ uint8_t byte;
+ uint16_t word;
+ uint32_t dword;
+ uint64_t qword;
+ } buf;
+
+ switch (size) {
+ case 1:
+ buf.byte = data;
+ break;
+ case 2:
+ buf.word = cpu_to_le16(data);
+ break;
+ case 4:
+ buf.dword = cpu_to_le32(data);
+ break;
+ case 8:
+ buf.qword = cpu_to_le64(data);
+ break;
+ default:
+ hw_error("vfio: unsupported write size, %u bytes", size);
+ break;
+ }
+
+ if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+ error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
+ ",%d) failed: %m",
+ __func__, vbasedev->name, region->nr,
+ addr, data, size);
+ }
+
+ trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
+
+ /*
+ * A read or write to a BAR always signals an INTx EOI. This will
+ * do nothing if not pending (including not in INTx mode). We assume
+ * that a BAR access is in response to an interrupt and that BAR
+ * accesses will service the interrupt. Unfortunately, we don't know
+ * which access will service the interrupt, so we're potentially
+ * getting quite a few host interrupts per guest interrupt.
+ */
+ vbasedev->ops->vfio_eoi(vbasedev);
+}
+
+uint64_t vfio_region_read(void *opaque,
+ hwaddr addr, unsigned size)
+{
+ VFIORegion *region = opaque;
+ VFIODevice *vbasedev = region->vbasedev;
+ union {
+ uint8_t byte;
+ uint16_t word;
+ uint32_t dword;
+ uint64_t qword;
+ } buf;
+ uint64_t data = 0;
+
+ if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+ error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+ __func__, vbasedev->name, region->nr,
+ addr, size);
+ return (uint64_t)-1;
+ }
+ switch (size) {
+ case 1:
+ data = buf.byte;
+ break;
+ case 2:
+ data = le16_to_cpu(buf.word);
+ break;
+ case 4:
+ data = le32_to_cpu(buf.dword);
+ break;
+ case 8:
+ data = le64_to_cpu(buf.qword);
+ break;
+ default:
+ hw_error("vfio: unsupported read size, %u bytes", size);
+ break;
+ }
+
+ trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
+
+ /* Same as write above */
+ vbasedev->ops->vfio_eoi(vbasedev);
+
+ return data;
+}
+
+static const MemoryRegionOps vfio_region_ops = {
+ .read = vfio_region_read,
+ .write = vfio_region_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .valid = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+};
+
+static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
+ struct vfio_region_info *info)
+{
+ struct vfio_info_cap_header *hdr;
+ struct vfio_region_info_cap_sparse_mmap *sparse;
+ int i, j;
+
+ hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+ if (!hdr) {
+ return -ENODEV;
+ }
+
+ sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
+
+ trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
+ region->nr, sparse->nr_areas);
+
+ region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
+
+ for (i = 0, j = 0; i < sparse->nr_areas; i++) {
+ if (sparse->areas[i].size) {
+ trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
+ sparse->areas[i].offset +
+ sparse->areas[i].size - 1);
+ region->mmaps[j].offset = sparse->areas[i].offset;
+ region->mmaps[j].size = sparse->areas[i].size;
+ j++;
+ }
+ }
+
+ region->nr_mmaps = j;
+ region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
+
+ return 0;
+}
+
+int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
+ int index, const char *name)
+{
+ g_autofree struct vfio_region_info *info = NULL;
+ int ret;
+
+ ret = vfio_device_get_region_info(vbasedev, index, &info);
+ if (ret) {
+ return ret;
+ }
+
+ region->vbasedev = vbasedev;
+ region->flags = info->flags;
+ region->size = info->size;
+ region->fd_offset = info->offset;
+ region->nr = index;
+
+ if (region->size) {
+ region->mem = g_new0(MemoryRegion, 1);
+ memory_region_init_io(region->mem, obj, &vfio_region_ops,
+ region, name, region->size);
+
+ if (!vbasedev->no_mmap &&
+ region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
+
+ ret = vfio_setup_region_sparse_mmaps(region, info);
+
+ if (ret) {
+ region->nr_mmaps = 1;
+ region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+ region->mmaps[0].offset = 0;
+ region->mmaps[0].size = region->size;
+ }
+ }
+ }
+
+ trace_vfio_region_setup(vbasedev->name, index, name,
+ region->flags, region->fd_offset, region->size);
+ return 0;
+}
+
+static void vfio_subregion_unmap(VFIORegion *region, int index)
+{
+ trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
+ region->mmaps[index].offset,
+ region->mmaps[index].offset +
+ region->mmaps[index].size - 1);
+ memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
+ munmap(region->mmaps[index].mmap, region->mmaps[index].size);
+ object_unparent(OBJECT(&region->mmaps[index].mem));
+ region->mmaps[index].mmap = NULL;
+}
+
+int vfio_region_mmap(VFIORegion *region)
+{
+ int i, ret, prot = 0;
+ char *name;
+
+ if (!region->mem) {
+ return 0;
+ }
+
+ prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
+ prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
+
+ for (i = 0; i < region->nr_mmaps; i++) {
+ size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
+ void *map_base, *map_align;
+
+ /*
+ * Align the mmap for more efficient mapping in the kernel. Ideally
+ * we'd know the PMD and PUD mapping sizes to use as discrete alignment
+ * intervals, but we don't. As of Linux v6.12, the largest PUD size
+ * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
+ * on x86_64). Align by power-of-two size, capped at 1GiB.
+ *
+ * NB. qemu_memalign() and friends actually allocate memory, whereas
+ * the region size here can exceed host memory, therefore we manually
+ * create an oversized anonymous mapping and clean it up for alignment.
+ */
+ map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map_base == MAP_FAILED) {
+ ret = -errno;
+ goto no_mmap;
+ }
+
+ map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
+ munmap(map_base, map_align - map_base);
+ munmap(map_align + region->mmaps[i].size,
+ align - (map_align - map_base));
+
+ region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
+ MAP_SHARED | MAP_FIXED,
+ region->vbasedev->fd,
+ region->fd_offset +
+ region->mmaps[i].offset);
+ if (region->mmaps[i].mmap == MAP_FAILED) {
+ ret = -errno;
+ goto no_mmap;
+ }
+
+ name = g_strdup_printf("%s mmaps[%d]",
+ memory_region_name(region->mem), i);
+ memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
+ memory_region_owner(region->mem),
+ name, region->mmaps[i].size,
+ region->mmaps[i].mmap);
+ g_free(name);
+ memory_region_add_subregion(region->mem, region->mmaps[i].offset,
+ &region->mmaps[i].mem);
+
+ trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
+ region->mmaps[i].offset,
+ region->mmaps[i].offset +
+ region->mmaps[i].size - 1);
+ }
+
+ return 0;
+
+no_mmap:
+ trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
+ region->fd_offset + region->mmaps[i].offset,
+ region->fd_offset + region->mmaps[i].offset +
+ region->mmaps[i].size - 1, ret);
+
+ region->mmaps[i].mmap = NULL;
+
+ for (i--; i >= 0; i--) {
+ vfio_subregion_unmap(region, i);
+ }
+
+ return ret;
+}
+
+void vfio_region_unmap(VFIORegion *region)
+{
+ int i;
+
+ if (!region->mem) {
+ return;
+ }
+
+ for (i = 0; i < region->nr_mmaps; i++) {
+ if (region->mmaps[i].mmap) {
+ vfio_subregion_unmap(region, i);
+ }
+ }
+}
+
+void vfio_region_exit(VFIORegion *region)
+{
+ int i;
+
+ if (!region->mem) {
+ return;
+ }
+
+ for (i = 0; i < region->nr_mmaps; i++) {
+ if (region->mmaps[i].mmap) {
+ memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
+ }
+ }
+
+ trace_vfio_region_exit(region->vbasedev->name, region->nr);
+}
+
+void vfio_region_finalize(VFIORegion *region)
+{
+ int i;
+
+ if (!region->mem) {
+ return;
+ }
+
+ for (i = 0; i < region->nr_mmaps; i++) {
+ if (region->mmaps[i].mmap) {
+ munmap(region->mmaps[i].mmap, region->mmaps[i].size);
+ object_unparent(OBJECT(&region->mmaps[i].mem));
+ }
+ }
+
+ object_unparent(OBJECT(region->mem));
+
+ g_free(region->mem);
+ g_free(region->mmaps);
+
+ trace_vfio_region_finalize(region->vbasedev->name, region->nr);
+
+ region->mem = NULL;
+ region->mmaps = NULL;
+ region->nr_mmaps = 0;
+ region->size = 0;
+ region->flags = 0;
+ region->nr = 0;
+}
+
+void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
+{
+ int i;
+
+ if (!region->mem) {
+ return;
+ }
+
+ for (i = 0; i < region->nr_mmaps; i++) {
+ if (region->mmaps[i].mmap) {
+ memory_region_set_enabled(&region->mmaps[i].mem, enabled);
+ }
+ }
+
+ trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
+ enabled);
+}
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 66a2d2b..95ccbad 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -15,17 +15,26 @@
#include "system/hostmem.h"
#include "system/address-spaces.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-container.h"
#include "hw/hw.h"
#include "system/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"
+#include "vfio-helpers.h"
+
+typedef struct VFIOHostDMAWindow {
+ hwaddr min_iova;
+ hwaddr max_iova;
+ uint64_t iova_pgsizes;
+ QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
+} VFIOHostDMAWindow;
typedef struct VFIOSpaprContainer {
VFIOContainer container;
MemoryListener prereg_listener;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+ unsigned int levels;
} VFIOSpaprContainer;
OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
@@ -230,15 +239,17 @@ static int vfio_spapr_remove_window(VFIOContainer *container,
return 0;
}
-static int vfio_spapr_create_window(VFIOContainer *container,
+static bool vfio_spapr_create_window(VFIOContainer *container,
MemoryRegionSection *section,
- hwaddr *pgsize)
+ hwaddr *pgsize, Error **errp)
{
int ret = 0;
VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
- unsigned entries, bits_total, bits_per_level, max_levels;
+ unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
long rampagesize = qemu_minrampagesize();
@@ -252,11 +263,11 @@ static int vfio_spapr_create_window(VFIOContainer *container,
pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
if (!pagesize) {
- error_report("Host doesn't support page size 0x%"PRIx64
- ", the supported mask is 0x%lx",
- memory_region_iommu_get_min_page_size(iommu_mr),
- bcontainer->pgsizes);
- return -EINVAL;
+ error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64
+ ", the supported mask is 0x%lx",
+ memory_region_iommu_get_min_page_size(iommu_mr),
+ bcontainer->pgsizes);
+ return false;
}
/*
@@ -291,28 +302,41 @@ static int vfio_spapr_create_window(VFIOContainer *container,
*/
bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
create.levels = bits_total / bits_per_level;
- if (bits_total % bits_per_level) {
- ++create.levels;
- }
- max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
- for ( ; create.levels <= max_levels; ++create.levels) {
- ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
- if (!ret) {
- break;
+
+ ddw_levels = scontainer->levels;
+ if (ddw_levels > 1) {
+ if (bits_total % bits_per_level) {
+ ++create.levels;
}
+ max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
+ for ( ; create.levels <= max_levels; ++create.levels) {
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (!ret) {
+ break;
+ }
+ }
+ } else { /* ddw_levels == 1 */
+ if (create.levels > ddw_levels) {
+ error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables"
+ ". Use larger IO page size. Supported mask is 0x%lx",
+ bcontainer->pgsizes);
+ return false;
+ }
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
}
+
if (ret) {
- error_report("Failed to create a window, ret = %d (%m)", ret);
- return -errno;
+ error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret);
+ return false;
}
if (create.start_addr != section->offset_within_address_space) {
vfio_spapr_remove_window(container, create.start_addr);
- error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
- section->offset_within_address_space,
- (uint64_t)create.start_addr);
- return -EINVAL;
+ error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx
+ ", must be %"PRIx64, section->offset_within_address_space,
+ (uint64_t)create.start_addr);
+ return false;
}
trace_vfio_spapr_create_window(create.page_shift,
create.levels,
@@ -320,7 +344,7 @@ static int vfio_spapr_create_window(VFIOContainer *container,
create.start_addr);
*pgsize = pagesize;
- return 0;
+ return true;
}
static bool
@@ -377,9 +401,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
}
}
- ret = vfio_spapr_create_window(container, section, &pgsize);
- if (ret) {
- error_setg_errno(errp, -ret, "Failed to create SPAPR window");
+ ret = vfio_spapr_create_window(container, section, &pgsize, errp);
+ if (!ret) {
return false;
}
@@ -502,6 +525,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
goto listener_unregister_exit;
}
+ scontainer->levels = info.ddw.levels;
+
if (v2) {
bcontainer->pgsizes = info.ddw.pgsizes;
/*
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 9347e3a..e90ec9b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -37,8 +37,6 @@ vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int
vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s"
vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
-vfio_attach_device(const char *name, int group_id) " (%s) group %d"
-vfio_detach_device(const char *name, int group_id) " (%s) group %d"
vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
vfio_pci_reset(const char *name) " (%s)"
@@ -89,9 +87,7 @@ vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
vfio_pci_igd_host_bridge_enabled(const char *name) "%s"
vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
-# common.c
-vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
-vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
+# listener.c
vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64
vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
@@ -103,10 +99,21 @@ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t si
vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
-vfio_disconnect_container(int fd) "close container->fd=%d"
-vfio_put_group(int fd) "close group->fd=%d"
-vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
-vfio_put_base_device(int fd) "close vdev->fd=%d"
+vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+
+# container-base.c
+vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
+
+# container.c
+vfio_container_disconnect(int fd) "close container->fd=%d"
+vfio_group_put(int fd) "close group->fd=%d"
+vfio_device_get(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
+vfio_device_put(int fd) "close vdev->fd=%d"
+vfio_legacy_dma_unmap_overflow_workaround(void) ""
+
+# region.c
+vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
+vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
@@ -116,11 +123,6 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e
vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
-vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
-vfio_legacy_dma_unmap_overflow_workaround(void) ""
-vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
-vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
-vfio_reset_handler(void) ""
# platform.c
vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
@@ -192,3 +194,9 @@ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s"
iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d"
iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d"
+
+# device.c
+vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
+vfio_device_reset_handler(void) ""
+vfio_device_attach(const char *name, int group_id) " (%s) group %d"
+vfio_device_detach(const char *name, int group_id) " (%s) group %d"
diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h
new file mode 100644
index 0000000..134b83a
--- /dev/null
+++ b/hw/vfio/vfio-cpr.h
@@ -0,0 +1,15 @@
+/*
+ * VFIO CPR
+ *
+ * Copyright (c) 2025 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_CPR_H
+#define HW_VFIO_CPR_H
+
+bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
+void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
+
+#endif /* HW_VFIO_CPR_H */
diff --git a/hw/vfio/vfio-display.h b/hw/vfio/vfio-display.h
new file mode 100644
index 0000000..2606c34
--- /dev/null
+++ b/hw/vfio/vfio-display.h
@@ -0,0 +1,42 @@
+/*
+ * VFIO display
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_DISPLAY_H
+#define HW_VFIO_VFIO_DISPLAY_H
+
+#include "ui/console.h"
+#include "hw/display/ramfb.h"
+#include "hw/vfio/vfio-region.h"
+
+typedef struct VFIODMABuf {
+ QemuDmaBuf *buf;
+ uint32_t pos_x, pos_y, pos_updates;
+ uint32_t hot_x, hot_y, hot_updates;
+ int dmabuf_id;
+ QTAILQ_ENTRY(VFIODMABuf) next;
+} VFIODMABuf;
+
+typedef struct VFIODisplay {
+ QemuConsole *con;
+ RAMFBState *ramfb;
+ struct vfio_region_info *edid_info;
+ struct vfio_region_gfx_edid *edid_regs;
+ uint8_t *edid_blob;
+ QEMUTimer *edid_link_timer;
+ struct {
+ VFIORegion buffer;
+ DisplaySurface *surface;
+ } region;
+ struct {
+ QTAILQ_HEAD(, VFIODMABuf) bufs;
+ VFIODMABuf *primary;
+ VFIODMABuf *cursor;
+ } dmabuf;
+} VFIODisplay;
+
+#endif /* HW_VFIO_VFIO_DISPLAY_H */
diff --git a/hw/vfio/vfio-helpers.h b/hw/vfio/vfio-helpers.h
new file mode 100644
index 0000000..54a327f
--- /dev/null
+++ b/hw/vfio/vfio-helpers.h
@@ -0,0 +1,35 @@
+/*
+ * VFIO helpers
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_HELPERS_H
+#define HW_VFIO_VFIO_HELPERS_H
+
+#ifdef CONFIG_LINUX
+#include <linux/vfio.h>
+
+extern int vfio_kvm_device_fd;
+
+struct vfio_info_cap_header *
+vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id);
+struct vfio_info_cap_header *
+vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id);
+struct vfio_info_cap_header *
+vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id);
+struct vfio_info_cap_header *
+vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id);
+bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
+ unsigned int *avail);
+#endif
+
+int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
+struct vfio_device_info *vfio_get_device_info(int fd);
+
+int vfio_kvm_device_add_fd(int fd, Error **errp);
+int vfio_kvm_device_del_fd(int fd, Error **errp);
+
+#endif /* HW_VFIO_VFIO_HELPERS_H */
diff --git a/hw/vfio/vfio-iommufd.h b/hw/vfio/vfio-iommufd.h
new file mode 100644
index 0000000..07ea0f4
--- /dev/null
+++ b/hw/vfio/vfio-iommufd.h
@@ -0,0 +1,34 @@
+/*
+ * VFIO iommufd
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_IOMMUFD_H
+#define HW_VFIO_VFIO_IOMMUFD_H
+
+#include "hw/vfio/vfio-container-base.h"
+
+typedef struct VFIODevice VFIODevice;
+
+typedef struct VFIOIOASHwpt {
+ uint32_t hwpt_id;
+ uint32_t hwpt_flags;
+ QLIST_HEAD(, VFIODevice) device_list;
+ QLIST_ENTRY(VFIOIOASHwpt) next;
+} VFIOIOASHwpt;
+
+typedef struct IOMMUFDBackend IOMMUFDBackend;
+
+typedef struct VFIOIOMMUFDContainer {
+ VFIOContainerBase bcontainer;
+ IOMMUFDBackend *be;
+ uint32_t ioas_id;
+ QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
+} VFIOIOMMUFDContainer;
+
+OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD);
+
+#endif /* HW_VFIO_VFIO_IOMMUFD_H */
diff --git a/hw/vfio/vfio-listener.h b/hw/vfio/vfio-listener.h
new file mode 100644
index 0000000..eb69ddd
--- /dev/null
+++ b/hw/vfio/vfio-listener.h
@@ -0,0 +1,15 @@
+/*
+ * VFIO MemoryListener services
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_LISTENER_H
+#define HW_VFIO_VFIO_LISTENER_H
+
+bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp);
+void vfio_listener_unregister(VFIOContainerBase *bcontainer);
+
+#endif /* HW_VFIO_VFIO_LISTENER_H */
diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h
new file mode 100644
index 0000000..a8b456b
--- /dev/null
+++ b/hw/vfio/vfio-migration-internal.h
@@ -0,0 +1,74 @@
+/*
+ * VFIO migration
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_MIGRATION_INTERNAL_H
+#define HW_VFIO_VFIO_MIGRATION_INTERNAL_H
+
+#ifdef CONFIG_LINUX
+#include <linux/vfio.h>
+#endif
+
+#include "qemu/typedefs.h"
+#include "qemu/notify.h"
+
+/*
+ * Flags to be used as unique delimiters for VFIO devices in the migration
+ * stream. These flags are composed as:
+ * 0xffffffff => MSB 32-bit all 1s
+ * 0xef10 => Magic ID, represents emulated (virtual) function IO
+ * 0x0000 => 16-bits reserved for flags
+ *
+ * The beginning of state information is marked by _DEV_CONFIG_STATE,
+ * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
+ * certain state information is marked by _END_OF_STATE.
+ */
+#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL)
+#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL)
+#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL)
+#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
+
+typedef struct VFIODevice VFIODevice;
+typedef struct VFIOMultifd VFIOMultifd;
+
+typedef struct VFIOMigration {
+ struct VFIODevice *vbasedev;
+ VMChangeStateEntry *vm_state;
+ NotifierWithReturn migration_state;
+ uint32_t device_state;
+ int data_fd;
+ void *data_buffer;
+ size_t data_buffer_size;
+ uint64_t mig_flags;
+ uint64_t precopy_init_size;
+ uint64_t precopy_dirty_size;
+ bool multifd_transfer;
+ VFIOMultifd *multifd;
+ bool initial_data_sent;
+
+ bool event_save_iterate_started;
+ bool event_precopy_empty_hit;
+} VFIOMigration;
+
+bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
+void vfio_migration_exit(VFIODevice *vbasedev);
+bool vfio_device_state_is_running(VFIODevice *vbasedev);
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev);
+int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp);
+int vfio_load_device_config_state(QEMUFile *f, void *opaque);
+
+#ifdef CONFIG_LINUX
+int vfio_migration_set_state(VFIODevice *vbasedev,
+ enum vfio_device_mig_state new_state,
+ enum vfio_device_mig_state recover_state,
+ Error **errp);
+#endif
+
+void vfio_migration_add_bytes_transferred(unsigned long val);
+
+#endif /* HW_VFIO_VFIO_MIGRATION_INTERNAL_H */
diff --git a/include/hw/s390x/vfio-ccw.h b/include/hw/s390x/vfio-ccw.h
index 4209d27..1e0922d 100644
--- a/include/hw/s390x/vfio-ccw.h
+++ b/include/hw/s390x/vfio-ccw.h
@@ -14,7 +14,7 @@
#ifndef HW_VFIO_CCW_H
#define HW_VFIO_CCW_H
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/ccw-device.h"
#include "qom/object.h"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
deleted file mode 100644
index f5b3f45..0000000
--- a/include/hw/vfio/vfio-common.h
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * common header for vfio based device assignment support
- *
- * Copyright Red Hat, Inc. 2012
- *
- * Authors:
- * Alex Williamson <alex.williamson@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * Based on qemu-kvm device-assignment:
- * Adapted for KVM by Qumranet.
- * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
- * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
- * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
- * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
- * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
- */
-
-#ifndef HW_VFIO_VFIO_COMMON_H
-#define HW_VFIO_VFIO_COMMON_H
-
-#include "system/memory.h"
-#include "qemu/queue.h"
-#include "qemu/notify.h"
-#include "ui/console.h"
-#include "hw/display/ramfb.h"
-#ifdef CONFIG_LINUX
-#include <linux/vfio.h>
-#endif
-#include "system/system.h"
-#include "hw/vfio/vfio-container-base.h"
-#include "system/host_iommu_device.h"
-#include "system/iommufd.h"
-
-#define VFIO_MSG_PREFIX "vfio %s: "
-
-/*
- * Flags to be used as unique delimiters for VFIO devices in the migration
- * stream. These flags are composed as:
- * 0xffffffff => MSB 32-bit all 1s
- * 0xef10 => Magic ID, represents emulated (virtual) function IO
- * 0x0000 => 16-bits reserved for flags
- *
- * The beginning of state information is marked by _DEV_CONFIG_STATE,
- * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
- * certain state information is marked by _END_OF_STATE.
- */
-#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL)
-#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL)
-#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL)
-#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL)
-#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
-
-enum {
- VFIO_DEVICE_TYPE_PCI = 0,
- VFIO_DEVICE_TYPE_PLATFORM = 1,
- VFIO_DEVICE_TYPE_CCW = 2,
- VFIO_DEVICE_TYPE_AP = 3,
-};
-
-typedef struct VFIOMmap {
- MemoryRegion mem;
- void *mmap;
- off_t offset;
- size_t size;
-} VFIOMmap;
-
-typedef struct VFIORegion {
- struct VFIODevice *vbasedev;
- off_t fd_offset; /* offset of region within device fd */
- MemoryRegion *mem; /* slow, read/write access */
- size_t size;
- uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
- uint32_t nr_mmaps;
- VFIOMmap *mmaps;
- uint8_t nr; /* cache the region number for debug */
-} VFIORegion;
-
-typedef struct VFIOMultifd VFIOMultifd;
-
-typedef struct VFIOMigration {
- struct VFIODevice *vbasedev;
- VMChangeStateEntry *vm_state;
- NotifierWithReturn migration_state;
- uint32_t device_state;
- int data_fd;
- void *data_buffer;
- size_t data_buffer_size;
- uint64_t mig_flags;
- uint64_t precopy_init_size;
- uint64_t precopy_dirty_size;
- bool multifd_transfer;
- VFIOMultifd *multifd;
- bool initial_data_sent;
-
- bool event_save_iterate_started;
- bool event_precopy_empty_hit;
-} VFIOMigration;
-
-struct VFIOGroup;
-
-typedef struct VFIOContainer {
- VFIOContainerBase bcontainer;
- int fd; /* /dev/vfio/vfio, empowered by the attached groups */
- unsigned iommu_type;
- QLIST_HEAD(, VFIOGroup) group_list;
-} VFIOContainer;
-
-OBJECT_DECLARE_SIMPLE_TYPE(VFIOContainer, VFIO_IOMMU_LEGACY);
-
-typedef struct VFIOHostDMAWindow {
- hwaddr min_iova;
- hwaddr max_iova;
- uint64_t iova_pgsizes;
- QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
-} VFIOHostDMAWindow;
-
-typedef struct IOMMUFDBackend IOMMUFDBackend;
-
-typedef struct VFIOIOASHwpt {
- uint32_t hwpt_id;
- uint32_t hwpt_flags;
- QLIST_HEAD(, VFIODevice) device_list;
- QLIST_ENTRY(VFIOIOASHwpt) next;
-} VFIOIOASHwpt;
-
-typedef struct VFIOIOMMUFDContainer {
- VFIOContainerBase bcontainer;
- IOMMUFDBackend *be;
- uint32_t ioas_id;
- QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
-} VFIOIOMMUFDContainer;
-
-OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD);
-
-typedef struct VFIODeviceOps VFIODeviceOps;
-
-typedef struct VFIODevice {
- QLIST_ENTRY(VFIODevice) next;
- QLIST_ENTRY(VFIODevice) container_next;
- QLIST_ENTRY(VFIODevice) global_next;
- struct VFIOGroup *group;
- VFIOContainerBase *bcontainer;
- char *sysfsdev;
- char *name;
- DeviceState *dev;
- int fd;
- int type;
- bool mdev;
- bool reset_works;
- bool needs_reset;
- bool no_mmap;
- bool ram_block_discard_allowed;
- OnOffAuto enable_migration;
- OnOffAuto migration_multifd_transfer;
- bool migration_events;
- VFIODeviceOps *ops;
- unsigned int num_irqs;
- unsigned int num_regions;
- unsigned int flags;
- VFIOMigration *migration;
- Error *migration_blocker;
- OnOffAuto pre_copy_dirty_page_tracking;
- OnOffAuto device_dirty_page_tracking;
- bool dirty_pages_supported;
- bool dirty_tracking; /* Protected by BQL */
- bool iommu_dirty_tracking;
- HostIOMMUDevice *hiod;
- int devid;
- IOMMUFDBackend *iommufd;
- VFIOIOASHwpt *hwpt;
- QLIST_ENTRY(VFIODevice) hwpt_next;
-} VFIODevice;
-
-struct VFIODeviceOps {
- void (*vfio_compute_needs_reset)(VFIODevice *vdev);
- int (*vfio_hot_reset_multi)(VFIODevice *vdev);
- void (*vfio_eoi)(VFIODevice *vdev);
- Object *(*vfio_get_object)(VFIODevice *vdev);
-
- /**
- * @vfio_save_config
- *
- * Save device config state
- *
- * @vdev: #VFIODevice for which to save the config
- * @f: #QEMUFile where to send the data
- * @errp: pointer to Error*, to store an error if it happens.
- *
- * Returns zero to indicate success and negative for error
- */
- int (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f, Error **errp);
-
- /**
- * @vfio_load_config
- *
- * Load device config state
- *
- * @vdev: #VFIODevice for which to load the config
- * @f: #QEMUFile where to get the data
- *
- * Returns zero to indicate success and negative for error
- */
- int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
-};
-
-typedef struct VFIOGroup {
- int fd;
- int groupid;
- VFIOContainer *container;
- QLIST_HEAD(, VFIODevice) device_list;
- QLIST_ENTRY(VFIOGroup) next;
- QLIST_ENTRY(VFIOGroup) container_next;
- bool ram_block_discard_allowed;
-} VFIOGroup;
-
-#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"
-#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
- TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
-
-typedef struct VFIODMABuf {
- QemuDmaBuf *buf;
- uint32_t pos_x, pos_y, pos_updates;
- uint32_t hot_x, hot_y, hot_updates;
- int dmabuf_id;
- QTAILQ_ENTRY(VFIODMABuf) next;
-} VFIODMABuf;
-
-typedef struct VFIODisplay {
- QemuConsole *con;
- RAMFBState *ramfb;
- struct vfio_region_info *edid_info;
- struct vfio_region_gfx_edid *edid_regs;
- uint8_t *edid_blob;
- QEMUTimer *edid_link_timer;
- struct {
- VFIORegion buffer;
- DisplaySurface *surface;
- } region;
- struct {
- QTAILQ_HEAD(, VFIODMABuf) bufs;
- VFIODMABuf *primary;
- VFIODMABuf *cursor;
- } dmabuf;
-} VFIODisplay;
-
-VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
-void vfio_put_address_space(VFIOAddressSpace *space);
-void vfio_address_space_insert(VFIOAddressSpace *space,
- VFIOContainerBase *bcontainer);
-
-void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
-void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
-void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
-bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
- int action, int fd, Error **errp);
-void vfio_region_write(void *opaque, hwaddr addr,
- uint64_t data, unsigned size);
-uint64_t vfio_region_read(void *opaque,
- hwaddr addr, unsigned size);
-int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
- int index, const char *name);
-int vfio_region_mmap(VFIORegion *region);
-void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
-void vfio_region_unmap(VFIORegion *region);
-void vfio_region_exit(VFIORegion *region);
-void vfio_region_finalize(VFIORegion *region);
-void vfio_reset_handler(void *opaque);
-struct vfio_device_info *vfio_get_device_info(int fd);
-bool vfio_device_is_mdev(VFIODevice *vbasedev);
-bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp);
-bool vfio_attach_device(char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
-void vfio_detach_device(VFIODevice *vbasedev);
-VFIODevice *vfio_get_vfio_device(Object *obj);
-
-int vfio_kvm_device_add_fd(int fd, Error **errp);
-int vfio_kvm_device_del_fd(int fd, Error **errp);
-
-bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
-void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
-
-extern const MemoryRegionOps vfio_region_ops;
-typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
-typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
-extern VFIOGroupList vfio_group_list;
-extern VFIODeviceList vfio_device_list;
-extern const MemoryListener vfio_memory_listener;
-extern int vfio_kvm_device_fd;
-
-bool vfio_mig_active(void);
-int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
-void vfio_unblock_multiple_devices_migration(void);
-bool vfio_viommu_preset(VFIODevice *vbasedev);
-int64_t vfio_mig_bytes_transferred(void);
-void vfio_reset_bytes_transferred(void);
-void vfio_mig_add_bytes_transferred(unsigned long val);
-bool vfio_device_state_is_running(VFIODevice *vbasedev);
-bool vfio_device_state_is_precopy(VFIODevice *vbasedev);
-
-int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp);
-int vfio_load_device_config_state(QEMUFile *f, void *opaque);
-
-#ifdef CONFIG_LINUX
-int vfio_get_region_info(VFIODevice *vbasedev, int index,
- struct vfio_region_info **info);
-int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
- uint32_t subtype, struct vfio_region_info **info);
-bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type);
-struct vfio_info_cap_header *
-vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id);
-bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
- unsigned int *avail);
-struct vfio_info_cap_header *
-vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id);
-struct vfio_info_cap_header *
-vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id);
-
-int vfio_migration_set_state(VFIODevice *vbasedev,
- enum vfio_device_mig_state new_state,
- enum vfio_device_mig_state recover_state,
- Error **errp);
-#endif
-
-bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
-void vfio_migration_exit(VFIODevice *vbasedev);
-
-int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
-bool vfio_devices_all_dirty_tracking_started(
- const VFIOContainerBase *bcontainer);
-bool
-vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer);
-int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
- VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp);
-int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
- uint64_t size, ram_addr_t ram_addr, Error **errp);
-
-/* Returns 0 on success, or a negative errno. */
-bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
-void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
-void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
- DeviceState *dev, bool ram_discard);
-int vfio_device_get_aw_bits(VFIODevice *vdev);
-#endif /* HW_VFIO_VFIO_COMMON_H */
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 6aca02f..5527e02 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -71,6 +71,11 @@ typedef struct VFIORamDiscardListener {
QLIST_ENTRY(VFIORamDiscardListener) next;
} VFIORamDiscardListener;
+VFIOAddressSpace *vfio_address_space_get(AddressSpace *as);
+void vfio_address_space_put(VFIOAddressSpace *space);
+void vfio_address_space_insert(VFIOAddressSpace *space,
+ VFIOContainerBase *bcontainer);
+
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
@@ -84,8 +89,12 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start, Error **errp);
+bool vfio_container_dirty_tracking_is_started(
+ const VFIOContainerBase *bcontainer);
+bool vfio_container_devices_dirty_tracking_is_supported(
+ const VFIOContainerBase *bcontainer);
int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
- VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp);
+ uint64_t iova, uint64_t size, ram_addr_t ram_addr, Error **errp);
GList *vfio_container_get_iova_ranges(const VFIOContainerBase *bcontainer);
@@ -106,9 +115,6 @@ OBJECT_DECLARE_TYPE(VFIOContainerBase, VFIOIOMMUClass, VFIO_IOMMU)
struct VFIOIOMMUClass {
ObjectClass parent_class;
- /* Properties */
- const char *hiod_typename;
-
/* basic feature */
bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
int (*dma_map)(const VFIOContainerBase *bcontainer,
@@ -163,4 +169,5 @@ struct VFIOIOMMUClass {
MemoryRegionSection *section);
void (*release)(VFIOContainerBase *bcontainer);
};
+
#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */
diff --git a/include/hw/vfio/vfio-container.h b/include/hw/vfio/vfio-container.h
new file mode 100644
index 0000000..afc498d
--- /dev/null
+++ b/include/hw/vfio/vfio-container.h
@@ -0,0 +1,36 @@
+/*
+ * VFIO container
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_CONTAINER_H
+#define HW_VFIO_CONTAINER_H
+
+#include "hw/vfio/vfio-container-base.h"
+
+typedef struct VFIOContainer VFIOContainer;
+typedef struct VFIODevice VFIODevice;
+
+typedef struct VFIOGroup {
+ int fd;
+ int groupid;
+ VFIOContainer *container;
+ QLIST_HEAD(, VFIODevice) device_list;
+ QLIST_ENTRY(VFIOGroup) next;
+ QLIST_ENTRY(VFIOGroup) container_next;
+ bool ram_block_discard_allowed;
+} VFIOGroup;
+
+typedef struct VFIOContainer {
+ VFIOContainerBase bcontainer;
+ int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+ unsigned iommu_type;
+ QLIST_HEAD(, VFIOGroup) group_list;
+} VFIOContainer;
+
+OBJECT_DECLARE_SIMPLE_TYPE(VFIOContainer, VFIO_IOMMU_LEGACY);
+
+#endif /* HW_VFIO_CONTAINER_H */
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
new file mode 100644
index 0000000..81c95bb
--- /dev/null
+++ b/include/hw/vfio/vfio-device.h
@@ -0,0 +1,150 @@
+/*
+ * VFIO Device interface
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ * Adapted for KVM by Qumranet.
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#ifndef HW_VFIO_VFIO_COMMON_H
+#define HW_VFIO_VFIO_COMMON_H
+
+#include "system/memory.h"
+#include "qemu/queue.h"
+#ifdef CONFIG_LINUX
+#include <linux/vfio.h>
+#endif
+#include "system/system.h"
+#include "hw/vfio/vfio-container-base.h"
+#include "system/host_iommu_device.h"
+#include "system/iommufd.h"
+
+#define VFIO_MSG_PREFIX "vfio %s: "
+
+enum {
+ VFIO_DEVICE_TYPE_PCI = 0,
+ VFIO_DEVICE_TYPE_PLATFORM = 1,
+ VFIO_DEVICE_TYPE_CCW = 2,
+ VFIO_DEVICE_TYPE_AP = 3,
+};
+
+typedef struct VFIODeviceOps VFIODeviceOps;
+typedef struct VFIOMigration VFIOMigration;
+
+typedef struct IOMMUFDBackend IOMMUFDBackend;
+typedef struct VFIOIOASHwpt VFIOIOASHwpt;
+
+typedef struct VFIODevice {
+ QLIST_ENTRY(VFIODevice) next;
+ QLIST_ENTRY(VFIODevice) container_next;
+ QLIST_ENTRY(VFIODevice) global_next;
+ struct VFIOGroup *group;
+ VFIOContainerBase *bcontainer;
+ char *sysfsdev;
+ char *name;
+ DeviceState *dev;
+ int fd;
+ int type;
+ bool mdev;
+ bool reset_works;
+ bool needs_reset;
+ bool no_mmap;
+ bool ram_block_discard_allowed;
+ OnOffAuto enable_migration;
+ OnOffAuto migration_multifd_transfer;
+ bool migration_events;
+ VFIODeviceOps *ops;
+ unsigned int num_irqs;
+ unsigned int num_regions;
+ unsigned int flags;
+ VFIOMigration *migration;
+ Error *migration_blocker;
+ OnOffAuto pre_copy_dirty_page_tracking;
+ OnOffAuto device_dirty_page_tracking;
+ bool dirty_pages_supported;
+ bool dirty_tracking; /* Protected by BQL */
+ bool iommu_dirty_tracking;
+ HostIOMMUDevice *hiod;
+ int devid;
+ IOMMUFDBackend *iommufd;
+ VFIOIOASHwpt *hwpt;
+ QLIST_ENTRY(VFIODevice) hwpt_next;
+} VFIODevice;
+
+struct VFIODeviceOps {
+ void (*vfio_compute_needs_reset)(VFIODevice *vdev);
+ int (*vfio_hot_reset_multi)(VFIODevice *vdev);
+ void (*vfio_eoi)(VFIODevice *vdev);
+ Object *(*vfio_get_object)(VFIODevice *vdev);
+
+ /**
+ * @vfio_save_config
+ *
+ * Save device config state
+ *
+ * @vdev: #VFIODevice for which to save the config
+ * @f: #QEMUFile where to send the data
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
+ */
+ int (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f, Error **errp);
+
+ /**
+ * @vfio_load_config
+ *
+ * Load device config state
+ *
+ * @vdev: #VFIODevice for which to load the config
+ * @f: #QEMUFile where to get the data
+ *
+ * Returns zero to indicate success and negative for error
+ */
+ int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
+};
+
+void vfio_device_irq_disable(VFIODevice *vbasedev, int index);
+void vfio_device_irq_unmask(VFIODevice *vbasedev, int index);
+void vfio_device_irq_mask(VFIODevice *vbasedev, int index);
+bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
+ int action, int fd, Error **errp);
+
+void vfio_device_reset_handler(void *opaque);
+bool vfio_device_is_mdev(VFIODevice *vbasedev);
+bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev,
+ const char *typename, Error **errp);
+bool vfio_device_attach(char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp);
+void vfio_device_detach(VFIODevice *vbasedev);
+VFIODevice *vfio_get_vfio_device(Object *obj);
+
+typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
+extern VFIODeviceList vfio_device_list;
+
+#ifdef CONFIG_LINUX
+int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
+ struct vfio_region_info **info);
+int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
+ uint32_t subtype, struct vfio_region_info **info);
+bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type);
+#endif
+
+/* Returns 0 on success, or a negative errno. */
+bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
+void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
+void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
+ DeviceState *dev, bool ram_discard);
+int vfio_device_get_aw_bits(VFIODevice *vdev);
+#endif /* HW_VFIO_VFIO_COMMON_H */
diff --git a/include/hw/vfio/vfio-migration.h b/include/hw/vfio/vfio-migration.h
new file mode 100644
index 0000000..0d4ecd3
--- /dev/null
+++ b/include/hw/vfio/vfio-migration.h
@@ -0,0 +1,16 @@
+/*
+ * VFIO migration interface
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_MIGRATION_H
+#define HW_VFIO_VFIO_MIGRATION_H
+
+bool vfio_migration_active(void);
+int64_t vfio_migration_bytes_transferred(void);
+void vfio_migration_reset_bytes_transferred(void);
+
+#endif /* HW_VFIO_VFIO_MIGRATION_H */
diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
index c414c3d..256d850 100644
--- a/include/hw/vfio/vfio-platform.h
+++ b/include/hw/vfio/vfio-platform.h
@@ -17,7 +17,7 @@
#define HW_VFIO_VFIO_PLATFORM_H
#include "hw/sysbus.h"
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-device.h"
#include "qemu/event_notifier.h"
#include "qemu/queue.h"
#include "qom/object.h"
@@ -47,6 +47,8 @@ typedef struct VFIOINTp {
/* function type for user side eventfd handler */
typedef void (*eventfd_user_side_handler_t)(VFIOINTp *intp);
+typedef struct VFIORegion VFIORegion;
+
struct VFIOPlatformDevice {
SysBusDevice sbdev;
VFIODevice vbasedev; /* not a QOM object */
diff --git a/include/hw/vfio/vfio-region.h b/include/hw/vfio/vfio-region.h
new file mode 100644
index 0000000..cbffb26
--- /dev/null
+++ b/include/hw/vfio/vfio-region.h
@@ -0,0 +1,47 @@
+/*
+ * VFIO region
+ *
+ * Copyright Red Hat, Inc. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_REGION_H
+#define HW_VFIO_REGION_H
+
+#include "system/memory.h"
+
+typedef struct VFIOMmap {
+ MemoryRegion mem;
+ void *mmap;
+ off_t offset;
+ size_t size;
+} VFIOMmap;
+
+typedef struct VFIODevice VFIODevice;
+
+typedef struct VFIORegion {
+ struct VFIODevice *vbasedev;
+ off_t fd_offset; /* offset of region within device fd */
+ MemoryRegion *mem; /* slow, read/write access */
+ size_t size;
+ uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
+ uint32_t nr_mmaps;
+ VFIOMmap *mmaps;
+ uint8_t nr; /* cache the region number for debug */
+} VFIORegion;
+
+
+void vfio_region_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size);
+uint64_t vfio_region_read(void *opaque,
+ hwaddr addr, unsigned size);
+int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
+ int index, const char *name);
+int vfio_region_mmap(VFIORegion *region);
+void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
+void vfio_region_unmap(VFIORegion *region);
+void vfio_region_exit(VFIORegion *region);
+void vfio_region_finalize(VFIORegion *region);
+
+#endif /* HW_VFIO_REGION_H */
diff --git a/migration/target.c b/migration/target.c
index a6ffa9a..12fd399 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -11,21 +11,21 @@
#include CONFIG_DEVICES
#ifdef CONFIG_VFIO
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-migration.h"
#endif
#ifdef CONFIG_VFIO
void migration_populate_vfio_info(MigrationInfo *info)
{
- if (vfio_mig_active()) {
+ if (vfio_migration_active()) {
info->vfio = g_malloc0(sizeof(*info->vfio));
- info->vfio->transferred = vfio_mig_bytes_transferred();
+ info->vfio->transferred = vfio_migration_bytes_transferred();
}
}
void migration_reset_vfio_bytes_transferred(void)
{
- vfio_reset_bytes_transferred();
+ vfio_migration_reset_bytes_transferred();
}
#else
void migration_populate_vfio_info(MigrationInfo *info)