From 6b62a90c24eb766b331343f9ad80104a7f16e7a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 26 Mar 2025 08:51:17 +0100 Subject: vfio: Introduce new files for VFIO MemoryListener MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File "common.c" has been emptied of most of its definitions by the previous changes and the only definitions left are related to the VFIO MemoryListener handlers. Rename it to "listener.c" and introduce its associated "vfio-listener.h" header file for the declarations. Cleanup a little the includes while at it. Reviewed-by: Joao Martins Reviewed-by: John Levon Reviewed-by: Avihai Horon Link: https://lore.kernel.org/qemu-devel/20250326075122.1299361-33-clg@redhat.com Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 1172 ----------------------------------------- hw/vfio/container.c | 1 + hw/vfio/iommufd.c | 1 + hw/vfio/listener.c | 1172 +++++++++++++++++++++++++++++++++++++++++ hw/vfio/meson.build | 2 +- hw/vfio/trace-events | 2 +- hw/vfio/vfio-listener.h | 14 + include/hw/vfio/vfio-common.h | 1 - 8 files changed, 1190 insertions(+), 1175 deletions(-) delete mode 100644 hw/vfio/common.c create mode 100644 hw/vfio/listener.c create mode 100644 hw/vfio/vfio-listener.h diff --git a/hw/vfio/common.c b/hw/vfio/common.c deleted file mode 100644 index bde1bb3..0000000 --- a/hw/vfio/common.c +++ /dev/null @@ -1,1172 +0,0 @@ -/* - * generic functions used by VFIO devices - * - * Copyright Red Hat, Inc. 2012 - * - * Authors: - * Alex Williamson - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Based on qemu-kvm device-assignment: - * Adapted for KVM by Qumranet. - * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) - * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) - * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) - * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) - * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) - */ - -#include "qemu/osdep.h" -#include -#ifdef CONFIG_KVM -#include -#endif -#include - -#include "hw/vfio/vfio-common.h" -#include "hw/vfio/pci.h" -#include "system/address-spaces.h" -#include "system/memory.h" -#include "system/ram_addr.h" -#include "hw/hw.h" -#include "qemu/error-report.h" -#include "qemu/main-loop.h" -#include "qemu/range.h" -#include "system/kvm.h" -#include "system/reset.h" -#include "system/runstate.h" -#include "trace.h" -#include "qapi/error.h" -#include "migration/misc.h" -#include "migration/qemu-file.h" -#include "system/tcg.h" -#include "system/tpm.h" -#include "vfio-migration-internal.h" -#include "vfio-helpers.h" - -/* - * Device state interfaces - */ - - -static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - if (!vfio_container_dirty_tracking_is_started(bcontainer)) { - return false; - } - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - VFIOMigration *migration = vbasedev->migration; - - if (!migration) { - return false; - } - - if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && - (vfio_device_state_is_running(vbasedev) || - vfio_device_state_is_precopy(vbasedev))) { - return false; - } - } - return true; -} - -static bool vfio_listener_skipped_section(MemoryRegionSection *section) -{ - return (!memory_region_is_ram(section->mr) && - !memory_region_is_iommu(section->mr)) || - memory_region_is_protected(section->mr) || - /* - * Sizing an enabled 64-bit BAR can cause spurious mappings to - * addresses in the upper part of the 64-bit address space. These - * are never accessed by the CPU and beyond the address width of - * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. - */ - section->offset_within_address_space & (1ULL << 63); -} - -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) -{ - bool ret, mr_has_discard_manager; - - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { - /* - * Malicious VMs might trigger discarding of IOMMU-mapped memory. The - * pages will remain pinned inside vfio until unmapped, resulting in a - * higher memory consumption than expected. If memory would get - * populated again later, there would be an inconsistency between pages - * pinned by vfio and pages seen by QEMU. This is the case until - * unmapped from the IOMMU (e.g., during device reset). - * - * With malicious guests, we really only care about pinning more memory - * than expected. RLIMIT_MEMLOCK set for the user/process can never be - * exceeded and can be used to mitigate this problem. - */ - warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" - " RAM (e.g., virtio-mem) works, however, malicious" - " guests can trigger pinning of more memory than" - " intended via an IOMMU. It's possible to mitigate " - " by setting/adjusting RLIMIT_MEMLOCK."); - } - return ret; -} - -static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) -{ - VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainerBase *bcontainer = giommu->bcontainer; - hwaddr iova = iotlb->iova + giommu->iommu_offset; - void *vaddr; - int ret; - Error *local_err = NULL; - - trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", - iova, iova + iotlb->addr_mask); - - if (iotlb->target_as != &address_space_memory) { - error_setg(&local_err, - "Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); - if (migration_is_running()) { - migration_file_set_error(-EINVAL, local_err); - } else { - error_report_err(local_err); - } - return; - } - - rcu_read_lock(); - - if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - bool read_only; - - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { - error_report_err(local_err); - goto out; - } - /* - * vaddr is only valid until rcu_read_unlock(). But after - * vfio_dma_map has set up the mapping the pages will be - * pinned by the kernel. This makes sure that the RAM backend - * of vaddr will always be there, even if the memory object is - * destroyed and its backing memory munmap-ed. - */ - ret = vfio_container_dma_map(bcontainer, iova, - iotlb->addr_mask + 1, vaddr, - read_only); - if (ret) { - error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%s)", - bcontainer, iova, - iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); - } - } else { - ret = vfio_container_dma_unmap(bcontainer, iova, - iotlb->addr_mask + 1, iotlb); - if (ret) { - error_setg(&local_err, - "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, - iotlb->addr_mask + 1, ret, strerror(-ret)); - if (migration_is_running()) { - migration_file_set_error(ret, local_err); - } else { - error_report_err(local_err); - } - } - } -out: - rcu_read_unlock(); -} - -static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, - MemoryRegionSection *section) -{ - VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, - listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; - const hwaddr size = int128_get64(section->size); - const hwaddr iova = section->offset_within_address_space; - int ret; - - /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); - if (ret) { - error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, - strerror(-ret)); - } -} - -static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, - MemoryRegionSection *section) -{ - VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, - listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; - const hwaddr end = section->offset_within_region + - int128_get64(section->size); - hwaddr start, next, iova; - void *vaddr; - int ret; - - /* - * Map in (aligned within memory region) minimum granularity, so we can - * unmap in minimum granularity later. - */ - for (start = section->offset_within_region; start < end; start = next) { - next = ROUND_UP(start + 1, vrdl->granularity); - next = MIN(next, end); - - iova = start - section->offset_within_region + - section->offset_within_address_space; - vaddr = memory_region_get_ram_ptr(section->mr) + start; - - ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); - if (ret) { - /* Rollback */ - vfio_ram_discard_notify_discard(rdl, section); - return ret; - } - } - return 0; -} - -static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - int target_page_size = qemu_target_page_size(); - VFIORamDiscardListener *vrdl; - - /* Ignore some corner cases not relevant in practice. */ - g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size)); - g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, - target_page_size)); - g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size)); - - vrdl = g_new0(VFIORamDiscardListener, 1); - vrdl->bcontainer = bcontainer; - vrdl->mr = section->mr; - vrdl->offset_within_address_space = section->offset_within_address_space; - vrdl->size = int128_get64(section->size); - vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, - section->mr); - - g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); - g_assert(bcontainer->pgsizes && - vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); - - ram_discard_listener_init(&vrdl->listener, - vfio_ram_discard_notify_populate, - vfio_ram_discard_notify_discard, true); - ram_discard_manager_register_listener(rdm, &vrdl->listener, section); - QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); - - /* - * Sanity-check if we have a theoretically problematic setup where we could - * exceed the maximum number of possible DMA mappings over time. We assume - * that each mapped section in the same address space as a RamDiscardManager - * section consumes exactly one DMA mapping, with the exception of - * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections - * in the same address space as RamDiscardManager sections. - * - * We assume that each section in the address space consumes one memslot. - * We take the number of KVM memory slots as a best guess for the maximum - * number of sections in the address space we could have over time, - * also consuming DMA mappings. - */ - if (bcontainer->dma_max_mappings) { - unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; - -#ifdef CONFIG_KVM - if (kvm_enabled()) { - max_memslots = kvm_get_max_memslots(); - } -#endif - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - hwaddr start, end; - - start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, - vrdl->granularity); - end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, - vrdl->granularity); - vrdl_mappings += (end - start) / vrdl->granularity; - vrdl_count++; - } - - if (vrdl_mappings + max_memslots - vrdl_count > - bcontainer->dma_max_mappings) { - warn_report("%s: possibly running out of DMA mappings. E.g., try" - " increasing the 'block-size' of virtio-mem devies." - " Maximum possible DMA mappings: %d, Maximum possible" - " memslots: %d", __func__, bcontainer->dma_max_mappings, - max_memslots); - } - } -} - -static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to unregister missing RAM discard listener"); - } - - ram_discard_manager_unregister_listener(rdm, &vrdl->listener); - QLIST_REMOVE(vrdl, next); - g_free(vrdl); -} - -static bool vfio_known_safe_misalignment(MemoryRegionSection *section) -{ - MemoryRegion *mr = section->mr; - - if (!TPM_IS_CRB(mr->owner)) { - return false; - } - - /* this is a known safe misaligned region, just trace for debug purpose */ - trace_vfio_known_safe_misalignment(memory_region_name(mr), - section->offset_within_address_space, - section->offset_within_region, - qemu_real_host_page_size()); - return true; -} - -static bool vfio_listener_valid_section(MemoryRegionSection *section, - const char *name) -{ - if (vfio_listener_skipped_section(section)) { - trace_vfio_listener_region_skip(name, - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(int128_sub(section->size, int128_one()))); - return false; - } - - if (unlikely((section->offset_within_address_space & - ~qemu_real_host_page_mask()) != - (section->offset_within_region & ~qemu_real_host_page_mask()))) { - if (!vfio_known_safe_misalignment(section)) { - error_report("%s received unaligned region %s iova=0x%"PRIx64 - " offset_within_region=0x%"PRIx64 - " qemu_real_host_page_size=0x%"PRIxPTR, - __func__, memory_region_name(section->mr), - section->offset_within_address_space, - section->offset_within_region, - qemu_real_host_page_size()); - } - return false; - } - - return true; -} - -static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, - hwaddr *out_iova, hwaddr *out_end, - Int128 *out_llend) -{ - Int128 llend; - hwaddr iova; - - iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); - llend = int128_make64(section->offset_within_address_space); - llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); - - if (int128_ge(int128_make64(iova), llend)) { - return false; - } - - *out_iova = iova; - *out_end = int128_get64(int128_sub(llend, int128_one())); - if (out_llend) { - *out_llend = llend; - } - return true; -} - -static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) -{ - /* - * MMIO region mapping failures are not fatal but in this case PCI - * peer-to-peer transactions are broken. - */ - if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) { - error_append_hint(errp, "%s: PCI peer-to-peer transactions " - "on BARs are not supported.\n", vbasedev->name); - } -} - -static void vfio_listener_region_add(MemoryListener *listener, - MemoryRegionSection *section) -{ - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - hwaddr iova, end; - Int128 llend, llsize; - void *vaddr; - int ret; - Error *err = NULL; - - if (!vfio_listener_valid_section(section, "region_add")) { - return; - } - - if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, - &llend)) { - if (memory_region_is_ram_device(section->mr)) { - trace_vfio_listener_region_add_no_dma_map( - memory_region_name(section->mr), - section->offset_within_address_space, - int128_getlo(section->size), - qemu_real_host_page_size()); - } - return; - } - - /* PPC64/pseries machine only */ - if (!vfio_container_add_section_window(bcontainer, section, &err)) { - goto mmio_dma_error; - } - - memory_region_ref(section->mr); - - if (memory_region_is_iommu(section->mr)) { - VFIOGuestIOMMU *giommu; - IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); - int iommu_idx; - - trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); - /* - * FIXME: For VFIO iommu types which have KVM acceleration to - * avoid bouncing all map/unmaps through qemu this way, this - * would be the right place to wire that up (tell the KVM - * device emulation the VFIO iommu handles to use). - */ - giommu = g_malloc0(sizeof(*giommu)); - giommu->iommu_mr = iommu_mr; - giommu->iommu_offset = section->offset_within_address_space - - section->offset_within_region; - giommu->bcontainer = bcontainer; - llend = int128_add(int128_make64(section->offset_within_region), - section->size); - llend = int128_sub(llend, int128_one()); - iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, - MEMTXATTRS_UNSPECIFIED); - iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, - IOMMU_NOTIFIER_IOTLB_EVENTS, - section->offset_within_region, - int128_get64(llend), - iommu_idx); - - ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, - &err); - if (ret) { - g_free(giommu); - goto fail; - } - QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); - memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); - - return; - } - - /* Here we assume that memory_region_is_ram(section->mr)==true */ - - /* - * For RAM memory regions with a RamDiscardManager, we only want to map the - * actually populated parts - and update the mapping whenever we're notified - * about changes. - */ - if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(bcontainer, section); - return; - } - - vaddr = memory_region_get_ram_ptr(section->mr) + - section->offset_within_region + - (iova - section->offset_within_address_space); - - trace_vfio_listener_region_add_ram(iova, end, vaddr); - - llsize = int128_sub(llend, int128_make64(iova)); - - if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; - - if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { - trace_vfio_listener_region_add_no_dma_map( - memory_region_name(section->mr), - section->offset_within_address_space, - int128_getlo(section->size), - pgmask + 1); - return; - } - } - - ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); - if (ret) { - error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%s)", - bcontainer, iova, int128_get64(llsize), vaddr, ret, - strerror(-ret)); - mmio_dma_error: - if (memory_region_is_ram_device(section->mr)) { - /* Allow unexpected mappings not to be fatal for RAM devices */ - VFIODevice *vbasedev = - vfio_get_vfio_device(memory_region_owner(section->mr)); - vfio_device_error_append(vbasedev, &err); - warn_report_err_once(err); - return; - } - goto fail; - } - - return; - -fail: - if (!bcontainer->initialized) { - /* - * At machine init time or when the device is attached to the - * VM, store the first error in the container so we can - * gracefully fail the device realize routine. - */ - if (!bcontainer->error) { - error_propagate_prepend(&bcontainer->error, err, - "Region %s: ", - memory_region_name(section->mr)); - } else { - error_free(err); - } - } else { - /* - * At runtime, there's not much we can do other than throw a - * hardware error. - */ - error_report_err(err); - hw_error("vfio: DMA mapping failed, unable to continue"); - } -} - -static void vfio_listener_region_del(MemoryListener *listener, - MemoryRegionSection *section) -{ - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - hwaddr iova, end; - Int128 llend, llsize; - int ret; - bool try_unmap = true; - - if (!vfio_listener_valid_section(section, "region_del")) { - return; - } - - if (memory_region_is_iommu(section->mr)) { - VFIOGuestIOMMU *giommu; - - trace_vfio_listener_region_del_iommu(section->mr->name); - QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { - if (MEMORY_REGION(giommu->iommu_mr) == section->mr && - giommu->n.start == section->offset_within_region) { - memory_region_unregister_iommu_notifier(section->mr, - &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - break; - } - } - - /* - * FIXME: We assume the one big unmap below is adequate to - * remove any individual page mappings in the IOMMU which - * might have been copied into VFIO. This works for a page table - * based IOMMU where a big unmap flattens a large range of IO-PTEs. - * That may not be true for all IOMMU types. - */ - } - - if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, - &llend)) { - return; - } - - llsize = int128_sub(llend, int128_make64(iova)); - - trace_vfio_listener_region_del(iova, end); - - if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask; - - pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; - try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); - } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(bcontainer, section); - /* Unregistering will trigger an unmap. */ - try_unmap = false; - } - - if (try_unmap) { - if (int128_eq(llsize, int128_2_64())) { - /* The unmap ioctl doesn't accept a full 64-bit span. */ - llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - iova += int128_get64(llsize); - } - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - } - - memory_region_unref(section->mr); - - /* PPC64/pseries machine only */ - vfio_container_del_section_window(bcontainer, section); -} - -typedef struct VFIODirtyRanges { - hwaddr min32; - hwaddr max32; - hwaddr min64; - hwaddr max64; - hwaddr minpci64; - hwaddr maxpci64; -} VFIODirtyRanges; - -typedef struct VFIODirtyRangesListener { - VFIOContainerBase *bcontainer; - VFIODirtyRanges ranges; - MemoryListener listener; -} VFIODirtyRangesListener; - -static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainerBase *bcontainer) -{ - VFIOPCIDevice *pcidev; - VFIODevice *vbasedev; - Object *owner; - - owner = memory_region_owner(section->mr); - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - if (OBJECT(pcidev) == owner) { - return true; - } - } - - return false; -} - -static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range, - hwaddr iova, hwaddr end, - bool update_pci) -{ - hwaddr *min, *max; - - /* - * The address space passed to the dirty tracker is reduced to three ranges: - * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the - * PCI 64-bit hole. - * - * The underlying reports of dirty will query a sub-interval of each of - * these ranges. - * - * The purpose of the three range handling is to handle known cases of big - * holes in the address space, like the x86 AMD 1T hole, and firmware (like - * OVMF) which may relocate the pci-hole64 to the end of the address space. - * The latter would otherwise generate large ranges for tracking, stressing - * the limits of supported hardware. The pci-hole32 will always be below 4G - * (overlapping or not) so it doesn't need special handling and is part of - * the 32-bit range. - * - * The alternative would be an IOVATree but that has a much bigger runtime - * overhead and unnecessary complexity. - */ - if (update_pci && iova >= UINT32_MAX) { - min = &range->minpci64; - max = &range->maxpci64; - } else { - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; - } - if (*min > iova) { - *min = iova; - } - if (*max < end) { - *max = end; - } - - trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); -} - -static void vfio_dirty_tracking_update(MemoryListener *listener, - MemoryRegionSection *section) -{ - VFIODirtyRangesListener *dirty = - container_of(listener, VFIODirtyRangesListener, listener); - hwaddr iova, end; - - if (!vfio_listener_valid_section(section, "tracking_update") || - !vfio_get_section_iova_range(dirty->bcontainer, section, - &iova, &end, NULL)) { - return; - } - - vfio_dirty_tracking_update_range(&dirty->ranges, iova, end, - vfio_section_is_vfio_pci(section, dirty->bcontainer)); -} - -static const MemoryListener vfio_dirty_tracking_listener = { - .name = "vfio-tracking", - .region_add = vfio_dirty_tracking_update, -}; - -static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, - VFIODirtyRanges *ranges) -{ - VFIODirtyRangesListener dirty; - - memset(&dirty, 0, sizeof(dirty)); - dirty.ranges.min32 = UINT32_MAX; - dirty.ranges.min64 = UINT64_MAX; - dirty.ranges.minpci64 = UINT64_MAX; - dirty.listener = vfio_dirty_tracking_listener; - dirty.bcontainer = bcontainer; - - memory_listener_register(&dirty.listener, - bcontainer->space->as); - - *ranges = dirty.ranges; - - /* - * The memory listener is synchronous, and used to calculate the range - * to dirty tracking. Unregister it after we are done as we are not - * interested in any follow-up updates. - */ - memory_listener_unregister(&dirty.listener); -} - -static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) -{ - uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), - sizeof(uint64_t))] = {}; - struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - VFIODevice *vbasedev; - - feature->argsz = sizeof(buf); - feature->flags = VFIO_DEVICE_FEATURE_SET | - VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (!vbasedev->dirty_tracking) { - continue; - } - - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { - warn_report("%s: Failed to stop DMA logging, err %d (%s)", - vbasedev->name, -errno, strerror(errno)); - } - vbasedev->dirty_tracking = false; - } -} - -static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, - VFIODirtyRanges *tracking) -{ - struct vfio_device_feature *feature; - size_t feature_size; - struct vfio_device_feature_dma_logging_control *control; - struct vfio_device_feature_dma_logging_range *ranges; - - feature_size = sizeof(struct vfio_device_feature) + - sizeof(struct vfio_device_feature_dma_logging_control); - feature = g_try_malloc0(feature_size); - if (!feature) { - errno = ENOMEM; - return NULL; - } - feature->argsz = feature_size; - feature->flags = VFIO_DEVICE_FEATURE_SET | - VFIO_DEVICE_FEATURE_DMA_LOGGING_START; - - control = (struct vfio_device_feature_dma_logging_control *)feature->data; - control->page_size = qemu_real_host_page_size(); - - /* - * DMA logging uAPI guarantees to support at least a number of ranges that - * fits into a single host kernel base page. - */ - control->num_ranges = !!tracking->max32 + !!tracking->max64 + - !!tracking->maxpci64; - ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, - control->num_ranges); - if (!ranges) { - g_free(feature); - errno = ENOMEM; - - return NULL; - } - - control->ranges = (uintptr_t)ranges; - if (tracking->max32) { - ranges->iova = tracking->min32; - ranges->length = (tracking->max32 - tracking->min32) + 1; - ranges++; - } - if (tracking->max64) { - ranges->iova = tracking->min64; - ranges->length = (tracking->max64 - tracking->min64) + 1; - ranges++; - } - if (tracking->maxpci64) { - ranges->iova = tracking->minpci64; - ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; - } - - trace_vfio_device_dirty_tracking_start(control->num_ranges, - tracking->min32, tracking->max32, - tracking->min64, tracking->max64, - tracking->minpci64, tracking->maxpci64); - - return feature; -} - -static void vfio_device_feature_dma_logging_start_destroy( - struct vfio_device_feature *feature) -{ - struct vfio_device_feature_dma_logging_control *control = - (struct vfio_device_feature_dma_logging_control *)feature->data; - struct vfio_device_feature_dma_logging_range *ranges = - (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; - - g_free(ranges); - g_free(feature); -} - -static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, - Error **errp) -{ - struct vfio_device_feature *feature; - VFIODirtyRanges ranges; - VFIODevice *vbasedev; - int ret = 0; - - vfio_dirty_tracking_init(bcontainer, &ranges); - feature = vfio_device_feature_dma_logging_start_create(bcontainer, - &ranges); - if (!feature) { - error_setg_errno(errp, errno, "Failed to prepare DMA logging"); - return false; - } - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->dirty_tracking) { - continue; - } - - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); - if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "%s: Failed to start DMA logging", - vbasedev->name); - goto out; - } - vbasedev->dirty_tracking = true; - } - -out: - if (ret) { - vfio_devices_dma_logging_stop(bcontainer); - } - - vfio_device_feature_dma_logging_start_destroy(feature); - - return ret == 0; -} - -static bool vfio_listener_log_global_start(MemoryListener *listener, - Error **errp) -{ - ERRP_GUARD(); - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - bool ret; - - if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { - ret = vfio_devices_dma_logging_start(bcontainer, errp); - } else { - ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; - } - - if (!ret) { - error_prepend(errp, "vfio: Could not start dirty page tracking - "); - } - return ret; -} - -static void vfio_listener_log_global_stop(MemoryListener *listener) -{ - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - Error *local_err = NULL; - int ret = 0; - - if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { - vfio_devices_dma_logging_stop(bcontainer); - } else { - ret = vfio_container_set_dirty_page_tracking(bcontainer, false, - &local_err); - } - - if (ret) { - error_prepend(&local_err, - "vfio: Could not stop dirty page tracking - "); - if (migration_is_running()) { - migration_file_set_error(ret, local_err); - } else { - error_report_err(local_err); - } - } -} - -typedef struct { - IOMMUNotifier n; - VFIOGuestIOMMU *giommu; -} vfio_giommu_dirty_notifier; - -static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) -{ - vfio_giommu_dirty_notifier *gdn = container_of(n, - vfio_giommu_dirty_notifier, n); - VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainerBase *bcontainer = giommu->bcontainer; - hwaddr iova = iotlb->iova + giommu->iommu_offset; - ram_addr_t translated_addr; - Error *local_err = NULL; - int ret = -EINVAL; - - trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); - - if (iotlb->target_as != &address_space_memory) { - error_setg(&local_err, - "Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); - goto out; - } - - rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { - goto out_unlock; - } - - ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, - translated_addr, &local_err); - if (ret) { - error_prepend(&local_err, - "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, - iotlb->addr_mask + 1); - } - -out_unlock: - rcu_read_unlock(); - -out: - if (ret) { - if (migration_is_running()) { - migration_file_set_error(ret, local_err); - } else { - error_report_err(local_err); - } - } -} - -static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, - void *opaque) -{ - const hwaddr size = int128_get64(section->size); - const hwaddr iova = section->offset_within_address_space; - const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + - section->offset_within_region; - VFIORamDiscardListener *vrdl = opaque; - Error *local_err = NULL; - int ret; - - /* - * Sync the whole mapped region (spanning multiple individual mappings) - * in one go. - */ - ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, - &local_err); - if (ret) { - error_report_err(local_err); - } - return ret; -} - -static int -vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } - - /* - * We only want/can synchronize the bitmap for actually mapped parts - - * which correspond to populated parts. Replay all populated parts. - */ - return ram_discard_manager_replay_populated(rdm, section, - vfio_ram_discard_get_dirty_bitmap, - &vrdl); -} - -static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - VFIOGuestIOMMU *giommu; - bool found = false; - Int128 llend; - vfio_giommu_dirty_notifier gdn; - int idx; - - QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { - if (MEMORY_REGION(giommu->iommu_mr) == section->mr && - giommu->n.start == section->offset_within_region) { - found = true; - break; - } - } - - if (!found) { - return 0; - } - - gdn.giommu = giommu; - idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, - MEMTXATTRS_UNSPECIFIED); - - llend = int128_add(int128_make64(section->offset_within_region), - section->size); - llend = int128_sub(llend, int128_one()); - - iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP, - section->offset_within_region, int128_get64(llend), - idx); - memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); - - return 0; -} - -static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, Error **errp) -{ - ram_addr_t ram_addr; - - if (memory_region_is_iommu(section->mr)) { - return vfio_sync_iommu_dirty_bitmap(bcontainer, section); - } else if (memory_region_has_ram_discard_manager(section->mr)) { - int ret; - - ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); - if (ret) { - error_setg(errp, - "Failed to sync dirty bitmap with RAM discard listener"); - } - return ret; - } - - ram_addr = memory_region_get_ram_addr(section->mr) + - section->offset_within_region; - - return vfio_container_query_dirty_bitmap(bcontainer, - REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), - int128_get64(section->size), ram_addr, errp); -} - -static void vfio_listener_log_sync(MemoryListener *listener, - MemoryRegionSection *section) -{ - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - int ret; - Error *local_err = NULL; - - if (vfio_listener_skipped_section(section)) { - return; - } - - if (vfio_log_sync_needed(bcontainer)) { - ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); - if (ret) { - if (migration_is_running()) { - migration_file_set_error(ret, local_err); - } else { - error_report_err(local_err); - } - } - } -} - -const MemoryListener vfio_memory_listener = { - .name = "vfio", - .region_add = vfio_listener_region_add, - .region_del = vfio_listener_region_del, - .log_global_start = vfio_listener_log_global_start, - .log_global_stop = vfio_listener_log_global_stop, - .log_sync = vfio_listener_log_sync, -}; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index bb3990a..ff540e1 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -35,6 +35,7 @@ #include "hw/vfio/vfio-container.h" #include "vfio-helpers.h" #include "vfio-cpr.h" +#include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index a5bd189..7488d21 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -28,6 +28,7 @@ #include "vfio-iommufd.h" #include "vfio-helpers.h" #include "vfio-cpr.h" +#include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c new file mode 100644 index 0000000..bde1bb3 --- /dev/null +++ b/hw/vfio/listener.c @@ -0,0 +1,1172 @@ +/* + * generic functions used by VFIO devices + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include +#ifdef CONFIG_KVM +#include +#endif +#include + +#include "hw/vfio/vfio-common.h" +#include "hw/vfio/pci.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/ram_addr.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/range.h" +#include "system/kvm.h" +#include "system/reset.h" +#include "system/runstate.h" +#include "trace.h" +#include "qapi/error.h" +#include "migration/misc.h" +#include "migration/qemu-file.h" +#include "system/tcg.h" +#include "system/tpm.h" +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" + +/* + * Device state interfaces + */ + + +static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) +{ + VFIODevice *vbasedev; + + if (!vfio_container_dirty_tracking_is_started(bcontainer)) { + return false; + } + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && + (vfio_device_state_is_running(vbasedev) || + vfio_device_state_is_precopy(vbasedev))) { + return false; + } + } + return true; +} + +static bool vfio_listener_skipped_section(MemoryRegionSection *section) +{ + return (!memory_region_is_ram(section->mr) && + !memory_region_is_iommu(section->mr)) || + memory_region_is_protected(section->mr) || + /* + * Sizing an enabled 64-bit BAR can cause spurious mappings to + * addresses in the upper part of the 64-bit address space. These + * are never accessed by the CPU and beyond the address width of + * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. + */ + section->offset_within_address_space & (1ULL << 63); +} + +/* Called with rcu_read_lock held. */ +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only, + Error **errp) +{ + bool ret, mr_has_discard_manager; + + ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, + &mr_has_discard_manager, errp); + if (ret && mr_has_discard_manager) { + /* + * Malicious VMs might trigger discarding of IOMMU-mapped memory. The + * pages will remain pinned inside vfio until unmapped, resulting in a + * higher memory consumption than expected. If memory would get + * populated again later, there would be an inconsistency between pages + * pinned by vfio and pages seen by QEMU. This is the case until + * unmapped from the IOMMU (e.g., during device reset). + * + * With malicious guests, we really only care about pinning more memory + * than expected. RLIMIT_MEMLOCK set for the user/process can never be + * exceeded and can be used to mitigate this problem. + */ + warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" + " RAM (e.g., virtio-mem) works, however, malicious" + " guests can trigger pinning of more memory than" + " intended via an IOMMU. It's possible to mitigate " + " by setting/adjusting RLIMIT_MEMLOCK."); + } + return ret; +} + +static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); + VFIOContainerBase *bcontainer = giommu->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + void *vaddr; + int ret; + Error *local_err = NULL; + + trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", + iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + if (migration_is_running()) { + migration_file_set_error(-EINVAL, local_err); + } else { + error_report_err(local_err); + } + return; + } + + rcu_read_lock(); + + if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { + bool read_only; + + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + error_report_err(local_err); + goto out; + } + /* + * vaddr is only valid until rcu_read_unlock(). But after + * vfio_dma_map has set up the mapping the pages will be + * pinned by the kernel. This makes sure that the RAM backend + * of vaddr will always be there, even if the memory object is + * destroyed and its backing memory munmap-ed. + */ + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); + if (ret) { + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); + } + } else { + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); + if (ret) { + error_setg(&local_err, + "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, ret, strerror(-ret)); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } + } +out: + rcu_read_unlock(); +} + +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. */ + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + if (ret) { + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } +} + +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; + void *vaddr; + int ret; + + /* + * Map in (aligned within memory region) minimum granularity, so we can + * unmap in minimum granularity later. + */ + for (start = section->offset_within_region; start < end; start = next) { + next = ROUND_UP(start + 1, vrdl->granularity); + next = MIN(next, end); + + iova = start - section->offset_within_region + + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); + return ret; + } + } + return 0; +} + +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + int target_page_size = qemu_target_page_size(); + VFIORamDiscardListener *vrdl; + + /* Ignore some corner cases not relevant in practice. */ + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, + target_page_size)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size)); + + vrdl = g_new0(VFIORamDiscardListener, 1); + vrdl->bcontainer = bcontainer; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); + vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); + + ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could + * exceed the maximum number of possible DMA mappings over time. We assume + * that each mapped section in the same address space as a RamDiscardManager + * section consumes exactly one DMA mapping, with the exception of + * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections + * in the same address space as RamDiscardManager sections. + * + * We assume that each section in the address space consumes one memslot. + * We take the number of KVM memory slots as a best guess for the maximum + * number of sections in the address space we could have over time, + * also consuming DMA mappings. + */ + if (bcontainer->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + +#ifdef CONFIG_KVM + if (kvm_enabled()) { + max_memslots = kvm_get_max_memslots(); + } +#endif + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, + vrdl->granularity); + end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, + vrdl->granularity); + vrdl_mappings += (end - start) / vrdl->granularity; + vrdl_count++; + } + + if (vrdl_mappings + max_memslots - vrdl_count > + bcontainer->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." + " Maximum possible DMA mappings: %d, Maximum possible" + " memslots: %d", __func__, bcontainer->dma_max_mappings, + max_memslots); + } + } +} + +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to unregister missing RAM discard listener"); + } + + ram_discard_manager_unregister_listener(rdm, &vrdl->listener); + QLIST_REMOVE(vrdl, next); + g_free(vrdl); +} + +static bool vfio_known_safe_misalignment(MemoryRegionSection *section) +{ + MemoryRegion *mr = section->mr; + + if (!TPM_IS_CRB(mr->owner)) { + return false; + } + + /* this is a known safe misaligned region, just trace for debug purpose */ + trace_vfio_known_safe_misalignment(memory_region_name(mr), + section->offset_within_address_space, + section->offset_within_region, + qemu_real_host_page_size()); + return true; +} + +static bool vfio_listener_valid_section(MemoryRegionSection *section, + const char *name) +{ + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_skip(name, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return false; + } + + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask()) != + (section->offset_within_region & ~qemu_real_host_page_mask()))) { + if (!vfio_known_safe_misalignment(section)) { + error_report("%s received unaligned region %s iova=0x%"PRIx64 + " offset_within_region=0x%"PRIx64 + " qemu_real_host_page_size=0x%"PRIxPTR, + __func__, memory_region_name(section->mr), + section->offset_within_address_space, + section->offset_within_region, + qemu_real_host_page_size()); + } + return false; + } + + return true; +} + +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + hwaddr *out_iova, hwaddr *out_end, + Int128 *out_llend) +{ + Int128 llend; + hwaddr iova; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return false; + } + + *out_iova = iova; + *out_end = int128_get64(int128_sub(llend, int128_one())); + if (out_llend) { + *out_llend = llend; + } + return true; +} + +static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) +{ + /* + * MMIO region mapping failures are not fatal but in this case PCI + * peer-to-peer transactions are broken. + */ + if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) { + error_append_hint(errp, "%s: PCI peer-to-peer transactions " + "on BARs are not supported.\n", vbasedev->name); + } +} + +static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; + int ret; + Error *err = NULL; + + if (!vfio_listener_valid_section(section, "region_add")) { + return; + } + + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { + if (memory_region_is_ram_device(section->mr)) { + trace_vfio_listener_region_add_no_dma_map( + memory_region_name(section->mr), + section->offset_within_address_space, + int128_getlo(section->size), + qemu_real_host_page_size()); + } + return; + } + + /* PPC64/pseries machine only */ + if (!vfio_container_add_section_window(bcontainer, section, &err)) { + goto mmio_dma_error; + } + + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + int iommu_idx; + + trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + giommu = g_malloc0(sizeof(*giommu)); + giommu->iommu_mr = iommu_mr; + giommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; + giommu->bcontainer = bcontainer; + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); + iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, + IOMMU_NOTIFIER_IOTLB_EVENTS, + section->offset_within_region, + int128_get64(llend), + iommu_idx); + + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, + &err); + if (ret) { + g_free(giommu); + goto fail; + } + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); + + return; + } + + /* Here we assume that memory_region_is_ram(section->mr)==true */ + + /* + * For RAM memory regions with a RamDiscardManager, we only want to map the + * actually populated parts - and update the mapping whenever we're notified + * about changes. + */ + if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_register_ram_discard_listener(bcontainer, section); + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + trace_vfio_listener_region_add_ram(iova, end, vaddr); + + llsize = int128_sub(llend, int128_make64(iova)); + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + + if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { + trace_vfio_listener_region_add_no_dma_map( + memory_region_name(section->mr), + section->offset_within_address_space, + int128_getlo(section->size), + pgmask + 1); + return; + } + } + + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", + bcontainer, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); + mmio_dma_error: + if (memory_region_is_ram_device(section->mr)) { + /* Allow unexpected mappings not to be fatal for RAM devices */ + VFIODevice *vbasedev = + vfio_get_vfio_device(memory_region_owner(section->mr)); + vfio_device_error_append(vbasedev, &err); + warn_report_err_once(err); + return; + } + goto fail; + } + + return; + +fail: + if (!bcontainer->initialized) { + /* + * At machine init time or when the device is attached to the + * VM, store the first error in the container so we can + * gracefully fail the device realize routine. + */ + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, + "Region %s: ", + memory_region_name(section->mr)); + } else { + error_free(err); + } + } else { + /* + * At runtime, there's not much we can do other than throw a + * hardware error. + */ + error_report_err(err); + hw_error("vfio: DMA mapping failed, unable to continue"); + } +} + +static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + hwaddr iova, end; + Int128 llend, llsize; + int ret; + bool try_unmap = true; + + if (!vfio_listener_valid_section(section, "region_del")) { + return; + } + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + trace_vfio_listener_region_del_iommu(section->mr->name); + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + memory_region_unregister_iommu_notifier(section->mr, + &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + break; + } + } + + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which + * might have been copied into VFIO. This works for a page table + * based IOMMU where a big unmap flattens a large range of IO-PTEs. + * That may not be true for all IOMMU types. + */ + } + + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + trace_vfio_listener_region_del(iova, end); + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; + + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(bcontainer, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; + } + + if (try_unmap) { + if (int128_eq(llsize, int128_2_64())) { + /* The unmap ioctl doesn't accept a full 64-bit span. */ + llsize = int128_rshift(llsize, 1); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + iova += int128_get64(llsize); + } + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + } + + memory_region_unref(section->mr); + + /* PPC64/pseries machine only */ + vfio_container_del_section_window(bcontainer, section); +} + +typedef struct VFIODirtyRanges { + hwaddr min32; + hwaddr max32; + hwaddr min64; + hwaddr max64; + hwaddr minpci64; + hwaddr maxpci64; +} VFIODirtyRanges; + +typedef struct VFIODirtyRangesListener { + VFIOContainerBase *bcontainer; + VFIODirtyRanges ranges; + MemoryListener listener; +} VFIODirtyRangesListener; + +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, + VFIOContainerBase *bcontainer) +{ + VFIOPCIDevice *pcidev; + VFIODevice *vbasedev; + Object *owner; + + owner = memory_region_owner(section->mr); + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + if (OBJECT(pcidev) == owner) { + return true; + } + } + + return false; +} + +static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range, + hwaddr iova, hwaddr end, + bool update_pci) +{ + hwaddr *min, *max; + + /* + * The address space passed to the dirty tracker is reduced to three ranges: + * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the + * PCI 64-bit hole. + * + * The underlying reports of dirty will query a sub-interval of each of + * these ranges. + * + * The purpose of the three range handling is to handle known cases of big + * holes in the address space, like the x86 AMD 1T hole, and firmware (like + * OVMF) which may relocate the pci-hole64 to the end of the address space. + * The latter would otherwise generate large ranges for tracking, stressing + * the limits of supported hardware. The pci-hole32 will always be below 4G + * (overlapping or not) so it doesn't need special handling and is part of + * the 32-bit range. + * + * The alternative would be an IOVATree but that has a much bigger runtime + * overhead and unnecessary complexity. + */ + if (update_pci && iova >= UINT32_MAX) { + min = &range->minpci64; + max = &range->maxpci64; + } else { + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; + } + if (*min > iova) { + *min = iova; + } + if (*max < end) { + *max = end; + } + + trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); +} + +static void vfio_dirty_tracking_update(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIODirtyRangesListener *dirty = + container_of(listener, VFIODirtyRangesListener, listener); + hwaddr iova, end; + + if (!vfio_listener_valid_section(section, "tracking_update") || + !vfio_get_section_iova_range(dirty->bcontainer, section, + &iova, &end, NULL)) { + return; + } + + vfio_dirty_tracking_update_range(&dirty->ranges, iova, end, + vfio_section_is_vfio_pci(section, dirty->bcontainer)); +} + +static const MemoryListener vfio_dirty_tracking_listener = { + .name = "vfio-tracking", + .region_add = vfio_dirty_tracking_update, +}; + +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, + VFIODirtyRanges *ranges) +{ + VFIODirtyRangesListener dirty; + + memset(&dirty, 0, sizeof(dirty)); + dirty.ranges.min32 = UINT32_MAX; + dirty.ranges.min64 = UINT64_MAX; + dirty.ranges.minpci64 = UINT64_MAX; + dirty.listener = vfio_dirty_tracking_listener; + dirty.bcontainer = bcontainer; + + memory_listener_register(&dirty.listener, + bcontainer->space->as); + + *ranges = dirty.ranges; + + /* + * The memory listener is synchronous, and used to calculate the range + * to dirty tracking. Unregister it after we are done as we are not + * interested in any follow-up updates. + */ + memory_listener_unregister(&dirty.listener); +} + +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIODevice *vbasedev; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + continue; + } + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + warn_report("%s: Failed to stop DMA logging, err %d (%s)", + vbasedev->name, -errno, strerror(errno)); + } + vbasedev->dirty_tracking = false; + } +} + +static struct vfio_device_feature * +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, + VFIODirtyRanges *tracking) +{ + struct vfio_device_feature *feature; + size_t feature_size; + struct vfio_device_feature_dma_logging_control *control; + struct vfio_device_feature_dma_logging_range *ranges; + + feature_size = sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_control); + feature = g_try_malloc0(feature_size); + if (!feature) { + errno = ENOMEM; + return NULL; + } + feature->argsz = feature_size; + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + control = (struct vfio_device_feature_dma_logging_control *)feature->data; + control->page_size = qemu_real_host_page_size(); + + /* + * DMA logging uAPI guarantees to support at least a number of ranges that + * fits into a single host kernel base page. + */ + control->num_ranges = !!tracking->max32 + !!tracking->max64 + + !!tracking->maxpci64; + ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, + control->num_ranges); + if (!ranges) { + g_free(feature); + errno = ENOMEM; + + return NULL; + } + + control->ranges = (uintptr_t)ranges; + if (tracking->max32) { + ranges->iova = tracking->min32; + ranges->length = (tracking->max32 - tracking->min32) + 1; + ranges++; + } + if (tracking->max64) { + ranges->iova = tracking->min64; + ranges->length = (tracking->max64 - tracking->min64) + 1; + ranges++; + } + if (tracking->maxpci64) { + ranges->iova = tracking->minpci64; + ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; + } + + trace_vfio_device_dirty_tracking_start(control->num_ranges, + tracking->min32, tracking->max32, + tracking->min64, tracking->max64, + tracking->minpci64, tracking->maxpci64); + + return feature; +} + +static void vfio_device_feature_dma_logging_start_destroy( + struct vfio_device_feature *feature) +{ + struct vfio_device_feature_dma_logging_control *control = + (struct vfio_device_feature_dma_logging_control *)feature->data; + struct vfio_device_feature_dma_logging_range *ranges = + (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; + + g_free(ranges); + g_free(feature); +} + +static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, + Error **errp) +{ + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; + VFIODevice *vbasedev; + int ret = 0; + + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, + &ranges); + if (!feature) { + error_setg_errno(errp, errno, "Failed to prepare DMA logging"); + return false; + } + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->dirty_tracking) { + continue; + } + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, "%s: Failed to start DMA logging", + vbasedev->name); + goto out; + } + vbasedev->dirty_tracking = true; + } + +out: + if (ret) { + vfio_devices_dma_logging_stop(bcontainer); + } + + vfio_device_feature_dma_logging_start_destroy(feature); + + return ret == 0; +} + +static bool vfio_listener_log_global_start(MemoryListener *listener, + Error **errp) +{ + ERRP_GUARD(); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + bool ret; + + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer, errp); + } else { + ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; + } + + if (!ret) { + error_prepend(errp, "vfio: Could not start dirty page tracking - "); + } + return ret; +} + +static void vfio_listener_log_global_stop(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + Error *local_err = NULL; + int ret = 0; + + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); + } else { + ret = vfio_container_set_dirty_page_tracking(bcontainer, false, + &local_err); + } + + if (ret) { + error_prepend(&local_err, + "vfio: Could not stop dirty page tracking - "); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } +} + +typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +} vfio_giommu_dirty_notifier; + +static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainerBase *bcontainer = giommu->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + Error *local_err = NULL; + int ret = -EINVAL; + + trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + goto out; + } + + rcu_read_lock(); + if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + goto out_unlock; + } + + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + translated_addr, &local_err); + if (ret) { + error_prepend(&local_err, + "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, + iotlb->addr_mask + 1); + } + +out_unlock: + rcu_read_unlock(); + +out: + if (ret) { + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } +} + +static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + VFIORamDiscardListener *vrdl = opaque; + Error *local_err = NULL; + int ret; + + /* + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. + */ + ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, + &local_err); + if (ret) { + error_report_err(local_err); + } + return ret; +} + +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + } + + /* + * We only want/can synchronize the bitmap for actually mapped parts - + * which correspond to populated parts. Replay all populated parts. + */ + return ram_discard_manager_replay_populated(rdm, section, + vfio_ram_discard_get_dirty_bitmap, + &vrdl); +} + +static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu; + bool found = false; + Int128 llend; + vfio_giommu_dirty_notifier gdn; + int idx; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + found = true; + break; + } + } + + if (!found) { + return 0; + } + + gdn.giommu = giommu; + idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, + MEMTXATTRS_UNSPECIFIED); + + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + + iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP, + section->offset_within_region, int128_get64(llend), + idx); + memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); + + return 0; +} + +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, Error **errp) +{ + ram_addr_t ram_addr; + + if (memory_region_is_iommu(section->mr)) { + return vfio_sync_iommu_dirty_bitmap(bcontainer, section); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + int ret; + + ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); + if (ret) { + error_setg(errp, + "Failed to sync dirty bitmap with RAM discard listener"); + } + return ret; + } + + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + + return vfio_container_query_dirty_bitmap(bcontainer, + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr, errp); +} + +static void vfio_listener_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + int ret; + Error *local_err = NULL; + + if (vfio_listener_skipped_section(section)) { + return; + } + + if (vfio_log_sync_needed(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); + if (ret) { + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } + } +} + +const MemoryListener vfio_memory_listener = { + .name = "vfio", + .region_add = vfio_listener_region_add, + .region_del = vfio_listener_region_del, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 9c8a989..bccb050 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -1,6 +1,6 @@ vfio_ss = ss.source_set() vfio_ss.add(files( - 'common.c', + 'listener.c', 'container-base.c', 'container.c', 'helpers.c', diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index aa0ba69..ddb1bcc 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -89,7 +89,7 @@ vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB" vfio_pci_igd_host_bridge_enabled(const char *name) "%s" vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" -# common.c +# listener.c vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" diff --git a/hw/vfio/vfio-listener.h b/hw/vfio/vfio-listener.h new file mode 100644 index 0000000..93af674 --- /dev/null +++ b/hw/vfio/vfio-listener.h @@ -0,0 +1,14 @@ +/* + * VFIO MemoryListener services + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_LISTENER_H +#define HW_VFIO_VFIO_LISTENER_H + +extern const MemoryListener vfio_memory_listener; + +#endif /* HW_VFIO_VFIO_LISTENER_H */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0dfae24..92381c6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -131,7 +131,6 @@ VFIODevice *vfio_get_vfio_device(Object *obj); typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIODeviceList vfio_device_list; -extern const MemoryListener vfio_memory_listener; #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, -- cgit v1.1