diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2025-06-05 11:00:29 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2025-06-05 11:00:29 -0400 |
commit | 2a53c4f5c534a1ab825ba03e0d3ec45a7c2b90d8 (patch) | |
tree | bbfa4d3b788e2c596035800e18d64b5a4ee1fb12 | |
parent | 83c2201fc47bd0dfa656bde7202bd0e2539d54a0 (diff) | |
parent | 3ed34463a2d8ab8aabfa1d612f12b56600c87983 (diff) | |
download | qemu-2a53c4f5c534a1ab825ba03e0d3ec45a7c2b90d8.zip qemu-2a53c4f5c534a1ab825ba03e0d3ec45a7c2b90d8.tar.gz qemu-2a53c4f5c534a1ab825ba03e0d3ec45a7c2b90d8.tar.bz2 |
Merge tag 'pull-vfio-20250605' of https://github.com/legoater/qemu into staging
vfio queue:
* Fixed OpRegion detection in IGD
* Added prerequisite rework for IOMMU nesting support
* Added prerequisite rework for vfio-user
* Added prerequisite rework for VFIO live update
* Modified memory_get_xlat_addr() to return a MemoryRegion
# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmhBWBgACgkQUaNDx8/7
# 7KEg8RAAyNuFzKs1dUHc24QeApjnd56PF3DhmjT++19hh2VH/CpKeJkWnNuWQupo
# 7yqQmYuxYMrHrh0ncdv5S+sIU2fHGMuxsL4d129H3/BPaAr92Zgtk2ID5deFTG9c
# Ns5sC/1Z6UyRqgh5PRxDmkfMVxyJ73dofTWyAQGNwwt5ASV876JEApMSO4smGpyy
# cu0tpya6WVaYp/Ry2MjpK1N6utr1pJgzIVWQ2ww595OtaqQMa9OD5Sepafp5kf+y
# ZqihINpMY9eGuu4olDQYcaUKThH0DAWR4Eb6ndgG9gOSh0M2YI0YygvG9q9giQzA
# WXlmM2e9ZVAULl2Y8Eb4PVybyk3U9eDK3MzI9PzKBLNdROjJNwNK9ahjtFgPWN9H
# cIYnBEnTP2d1e4BOtJIoQRXdDFOQHqzzEPwFhqMLEnzu1beVRnnt8SiYPKV/pEO0
# ZEAzWka7WN27DDoqgSNzc8ptIzbM6yO66dvLwOhXyr+WyqVaiehxhvfZiEbpeIWa
# 6LuCnyJkgEcAX7I7BaqZxAVvBqwR0z0TElfxadAj6YXgjVEUTahaBV+6M7bBDoid
# BlXTFBrdhlTOjrzV0LkZe9ac9VbxPc9fW/uGoYntD0cRsuWqpDpgNoDlmHDjVudz
# b4TCVksIsfrVkNqQclXfYuSNMZV0KwBADD1wVqZ42nyx1KcgqMQ=
# =tHwb
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 05 Jun 2025 04:40:56 EDT
# gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full]
# gpg: aka "Cédric Le Goater <clg@kaod.org>" [full]
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1
* tag 'pull-vfio-20250605' of https://github.com/legoater/qemu:
vfio: move vfio-cpr.h
vfio: vfio_find_ram_discard_listener
MAINTAINERS: Add reviewer for CPR
vfio/iommufd: Save vendor specific device info
vfio/iommufd: Implement [at|de]tach_hwpt handlers
vfio/iommufd: Add properties and handlers to TYPE_HOST_IOMMU_DEVICE_IOMMUFD
backends/iommufd: Add a helper to invalidate user-managed HWPT
vfio/container: pass MemoryRegion to DMA operations
vfio: return mr from vfio_get_xlat_addr
vfio/igd: Fix incorrect error propagation in vfio_pci_igd_opregion_detect()
vfio/iommufd: Add comment emphasizing no movement of hiod->realize() call
vfio: refactor out IRQ signalling setup
vfio: move config space read into vfio_pci_config_setup()
vfio: move more cleanup into vfio_pci_put_device()
vfio: add more VFIOIOMMUClass docs
vfio/igd: OpRegion not found fix error typo
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r-- | MAINTAINERS | 10 | ||||
-rw-r--r-- | backends/iommufd.c | 58 | ||||
-rw-r--r-- | backends/trace-events | 1 | ||||
-rw-r--r-- | hw/vfio/container-base.c | 4 | ||||
-rw-r--r-- | hw/vfio/container.c | 5 | ||||
-rw-r--r-- | hw/vfio/cpr.c | 2 | ||||
-rw-r--r-- | hw/vfio/igd.c | 22 | ||||
-rw-r--r-- | hw/vfio/iommufd.c | 45 | ||||
-rw-r--r-- | hw/vfio/listener.c | 74 | ||||
-rw-r--r-- | hw/vfio/pci.c | 89 | ||||
-rw-r--r-- | hw/vfio/vfio-cpr.h | 15 | ||||
-rw-r--r-- | hw/virtio/vhost-vdpa.c | 9 | ||||
-rw-r--r-- | include/hw/vfio/vfio-container-base.h | 83 | ||||
-rw-r--r-- | include/hw/vfio/vfio-cpr.h | 18 | ||||
-rw-r--r-- | include/system/host_iommu_device.h | 15 | ||||
-rw-r--r-- | include/system/iommufd.h | 54 | ||||
-rw-r--r-- | include/system/memory.h | 19 | ||||
-rw-r--r-- | system/memory.c | 32 |
18 files changed, 406 insertions, 149 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 16af379..aa67630 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3032,6 +3032,16 @@ F: include/qemu/co-shared-resource.h T: git https://gitlab.com/jsnow/qemu.git jobs T: git https://gitlab.com/vsementsov/qemu.git block +CheckPoint and Restart (CPR) +R: Steve Sistare <steven.sistare@oracle.com> +S: Supported +F: hw/vfio/cpr* +F: include/hw/vfio/vfio-cpr.h +F: include/migration/cpr.h +F: migration/cpr* +F: tests/qtest/migration/cpr* +F: docs/devel/migration/CPR.rst + Compute Express Link M: Jonathan Cameron <jonathan.cameron@huawei.com> R: Fan Ni <fan.ni@samsung.com> diff --git a/backends/iommufd.c b/backends/iommufd.c index b73f75c..c2c47ab 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -311,6 +311,62 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp) +{ + int ret, fd = be->fd; + uint32_t total_entries = *entry_num; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = total_entries, + .data_uptr = (uintptr_t)data, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + trace_iommufd_backend_invalidate_cache(fd, id, data_type, entry_len, + total_entries, cache.entry_num, + (uintptr_t)data, ret ? errno : 0); + *entry_num = cache.entry_num; + + if (ret) { + error_setg_errno(errp, errno, "IOMMU_HWPT_INVALIDATE failed:" + " total %d entries, processed %d entries", + total_entries, cache.entry_num); + } else if (total_entries != cache.entry_num) { + error_setg(errp, "IOMMU_HWPT_INVALIDATE succeed but with unprocessed" + " entries: total %d entries, processed %d entries." + " Kernel BUG?!", total_entries, cache.entry_num); + return false; + } + + return !ret; +} + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; @@ -349,6 +405,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), .class_init = hiod_iommufd_class_init, .abstract = true, } diff --git a/backends/trace-events b/backends/trace-events index 40811a3..7278214 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1c6ca94..d834bd4 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index a9f0dba..0f948d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -33,8 +33,8 @@ #include "qapi/error.h" #include "pci.h" #include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-cpr.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" @@ -207,7 +207,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3214184..0210e76 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -8,9 +8,9 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" #include "migration/misc.h" +#include "hw/vfio/vfio-cpr.h" #include "qapi/error.h" #include "system/runstate.h" -#include "vfio-cpr.h" static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e, Error **errp) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index e7952d1..e7a9d1f 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -187,23 +187,21 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, - struct vfio_region_info **opregion, - Error **errp) + struct vfio_region_info **opregion) { int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); + return false; + } + + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -524,7 +522,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) } /* IGD device always comes with OpRegion */ - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { return true; } info_report("OpRegion detected on Intel display %x.", vdev->device_id); @@ -695,7 +693,7 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) return true; } - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { /* Should never reach here, KVMGT always emulates OpRegion */ return false; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index af1c7ab..d3efef7 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -21,20 +21,21 @@ #include "qapi/error.h" #include "system/iommufd.h" #include "hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" #include "vfio-iommufd.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -592,6 +593,10 @@ found_container: goto err_listener_register; } + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. + */ if (!vfio_device_hiod_create_and_realize(vbasedev, TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; @@ -810,21 +815,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -833,6 +855,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -858,10 +885,14 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index bfacb3d..203ed03 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -90,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -118,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) @@ -126,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -150,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be @@ -163,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -233,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -449,6 +456,26 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -557,7 +584,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -1010,6 +1037,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ram_addr_t translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); @@ -1021,9 +1050,11 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); @@ -1075,19 +1106,8 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a1bfdfe..b1250d8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -511,6 +511,25 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -583,21 +602,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_device_irq_set_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -2854,6 +2859,18 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) static void vfio_pci_put_device(VFIOPCIDevice *vdev) { + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + vfio_device_detach(&vdev->vbasedev); g_free(vdev->vbasedev.name); @@ -3005,6 +3022,19 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; + uint32_t config_space_size; + int ret; + + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); + + /* Get a copy of config space */ + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; + } /* vfio emulates a lot for us, but some bits need extra love */ vdev->emulated_config_bits = g_malloc0(vdev->config_size); @@ -3126,15 +3156,14 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; + int i; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - uint32_t config_space_size; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -3189,17 +3218,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); - - /* Get a copy of config space */ - ret = vfio_pci_config_space_read(vdev, 0, config_space_size, - vdev->pdev.config); - if (ret < (int)config_space_size) { - ret = ret < 0 ? -ret : EFAULT; - error_setg_errno(errp, ret, "failed to read device config space"); - goto error; - } - if (!vfio_pci_config_setup(vdev, errp)) { goto error; } @@ -3302,17 +3320,6 @@ static void vfio_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } @@ -3514,7 +3521,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif dc->desc = "VFIO-based PCI device assignment"; - pdc->realize = vfio_realize; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h deleted file mode 100644 index 134b83a..0000000 --- a/hw/vfio/vfio-cpr.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * VFIO CPR - * - * Copyright (c) 2025 Oracle and/or its affiliates. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#ifndef HW_VFIO_CPR_H -#define HW_VFIO_CPR_H - -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp); -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer); - -#endif /* HW_VFIO_CPR_H */ diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index e20da95..7061b6e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -209,6 +209,8 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) int ret; Int128 llend; Error *local_err = NULL; + MemoryRegion *mr; + hwaddr xlat; if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", @@ -228,11 +230,14 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, - &local_err)) { + mr = memory_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); return; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, iotlb->addr_mask + 1, vaddr, read_only); if (ret) { diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 3d392b0..9d37f86 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -78,7 +78,7 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mr); int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all); @@ -115,13 +115,57 @@ OBJECT_DECLARE_TYPE(VFIOContainerBase, VFIOIOMMUClass, VFIO_IOMMU) struct VFIOIOMMUClass { ObjectClass parent_class; - /* basic feature */ + /** + * @setup + * + * Perform basic setup of the container, including configuring IOMMU + * capabilities, IOVA ranges, supported page sizes, etc. + * + * @bcontainer: #VFIOContainerBase + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); + + /** + * @listener_begin + * + * Called at the beginning of an address space update transaction. + * See #MemoryListener. + * + * @bcontainer: #VFIOContainerBase + */ void (*listener_begin)(VFIOContainerBase *bcontainer); + + /** + * @listener_commit + * + * Called at the end of an address space update transaction, + * See #MemoryListener. + * + * @bcontainer: #VFIOContainerBase + */ void (*listener_commit)(VFIOContainerBase *bcontainer); + + /** + * @dma_map + * + * Map an address range into the container. Note that the memory region is + * referenced within an RCU read lock region across this call. + * + * @bcontainer: #VFIOContainerBase to use + * @iova: start address to map + * @size: size of the range to map + * @vaddr: process virtual address of mapping + * @readonly: true if mapping should be readonly + * @mr: the memory region for this mapping + * + * Returns 0 to indicate success and -errno otherwise. + */ int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mr); /** * @dma_unmap * @@ -132,12 +176,38 @@ struct VFIOIOMMUClass { * @size: size of the range to unmap * @iotlb: The IOMMU TLB mapping entry (or NULL) * @unmap_all: if set, unmap the entire address space + * + * Returns 0 to indicate success and -errno otherwise. */ int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all); + + + /** + * @attach_device + * + * Associate the given device with a container and do some related + * initialization of the device context. + * + * @name: name of the device + * @vbasedev: the device + * @as: address space to use + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ bool (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); + + /* + * @detach_device + * + * Detach the given device from its container and clean up any necessary + * state. + * + * @vbasedev: the device to disassociate + */ void (*detach_device)(VFIODevice *vbasedev); /* migration feature */ @@ -152,7 +222,7 @@ struct VFIOIOMMUClass { * @start: indicates whether to start or stop dirty pages tracking * @errp: pointer to Error*, to store an error if it happens. * - * Returns zero to indicate success and negative for error + * Returns zero to indicate success and negative for error. */ int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, bool start, Error **errp); @@ -167,7 +237,7 @@ struct VFIOIOMMUClass { * @size: size of iova range * @errp: pointer to Error*, to store an error if it happens. * - * Returns zero to indicate success and negative for error + * Returns zero to indicate success and negative for error. */ int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp); @@ -183,4 +253,7 @@ struct VFIOIOMMUClass { void (*release)(VFIOContainerBase *bcontainer); }; +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section); + #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h new file mode 100644 index 0000000..750ea5b --- /dev/null +++ b/include/hw/vfio/vfio-cpr.h @@ -0,0 +1,18 @@ +/* + * VFIO CPR + * + * Copyright (c) 2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CPR_H +#define HW_VFIO_VFIO_CPR_H + +struct VFIOContainerBase; + +bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, + Error **errp); +void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); + +#endif /* HW_VFIO_VFIO_CPR_H */ diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h index 809cced..ab849a4 100644 --- a/include/system/host_iommu_device.h +++ b/include/system/host_iommu_device.h @@ -14,6 +14,13 @@ #include "qom/object.h" #include "qapi/error.h" +#ifdef CONFIG_LINUX +#include "linux/iommufd.h" + +typedef union VendorCaps { + struct iommu_hw_info_vtd vtd; + struct iommu_hw_info_arm_smmuv3 smmuv3; +} VendorCaps; /** * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. @@ -22,11 +29,17 @@ * * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + * + * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on + * IOMMUFD this represents a user-space buffer filled by kernel + * with host IOMMU @type specific hardware information data) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; uint64_t hw_caps; + VendorCaps vendor_caps; } HostIOMMUDeviceCaps; +#endif #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) @@ -38,7 +51,9 @@ struct HostIOMMUDevice { void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ PCIBus *aliased_bus; int aliased_devfn; +#ifdef CONFIG_LINUX HostIOMMUDeviceCaps caps; +#endif }; /** diff --git a/include/system/iommufd.h b/include/system/iommufd.h index cbab75b..283861b 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -61,6 +61,60 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, uint64_t iova, ram_addr_t size, uint64_t page_size, uint64_t *data, Error **errp); +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Overload of the host IOMMU device for the iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t hwpt_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); #endif diff --git a/include/system/memory.h b/include/system/memory.h index fc35a0d..0848690 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -739,21 +739,20 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, RamDiscardListener *rdl); /** - * memory_get_xlat_addr: Extract addresses from a TLB entry + * memory_translate_iotlb: Extract addresses from a TLB entry. + * Called with rcu_read_lock held. * * @iotlb: pointer to an #IOMMUTLBEntry - * @vaddr: virtual address - * @ram_addr: RAM address - * @read_only: indicates if writes are allowed - * @mr_has_discard_manager: indicates memory is controlled by a - * RamDiscardManager + * @xlat_p: return the offset of the entry from the start of the returned + * MemoryRegion. * @errp: pointer to Error*, to store an error if it happens. * - * Return: true on success, else false setting @errp with error. + * Return: On success, return the MemoryRegion containing the @iotlb translated + * addr. The MemoryRegion must not be accessed after rcu_read_unlock. + * On failure, return NULL, setting @errp with error. */ -bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp); +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp); typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; diff --git a/system/memory.c b/system/memory.c index 63b983e..306e9ff 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2174,18 +2174,14 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, } /* Called with rcu_read_lock held. */ -bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp) +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { MemoryRegion *mr; hwaddr xlat; hwaddr len = iotlb->addr_mask + 1; bool writable = iotlb->perm & IOMMU_WO; - if (mr_has_discard_manager) { - *mr_has_discard_manager = false; - } /* * The IOMMU TLB entry we have just covers translation through * this IOMMU to its immediate target. We need to translate @@ -2195,7 +2191,7 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, &xlat, &len, writable, MEMTXATTRS_UNSPECIFIED); if (!memory_region_is_ram(mr)) { error_setg(errp, "iommu map to non memory area %" HWADDR_PRIx "", xlat); - return false; + return NULL; } else if (memory_region_has_ram_discard_manager(mr)) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); MemoryRegionSection tmp = { @@ -2203,9 +2199,6 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, .offset_within_region = xlat, .size = int128_make64(len), }; - if (mr_has_discard_manager) { - *mr_has_discard_manager = true; - } /* * Malicious VMs can map memory into the IOMMU, which is expected * to remain discarded. vfio will pin all pages, populating memory. @@ -2216,7 +2209,7 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, error_setg(errp, "iommu map to discarded memory (e.g., unplugged" " via virtio-mem): %" HWADDR_PRIx "", iotlb->translated_addr); - return false; + return NULL; } } @@ -2226,22 +2219,11 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, */ if (len & iotlb->addr_mask) { error_setg(errp, "iommu has granularity incompatible with target AS"); - return false; - } - - if (vaddr) { - *vaddr = memory_region_get_ram_ptr(mr) + xlat; - } - - if (ram_addr) { - *ram_addr = memory_region_get_ram_addr(mr) + xlat; - } - - if (read_only) { - *read_only = !writable || mr->readonly; + return NULL; } - return true; + *xlat_p = xlat; + return mr; } void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) |