diff options
-rw-r--r-- | hw/vfio/common.c | 140 | ||||
-rw-r--r-- | hw/vfio/platform.c | 116 | ||||
-rw-r--r-- | include/exec/memory.h | 13 | ||||
-rw-r--r-- | include/hw/vfio/vfio-common.h | 23 | ||||
-rw-r--r-- | include/hw/vfio/vfio-platform.h | 4 | ||||
-rw-r--r-- | memory.c | 20 | ||||
-rw-r--r-- | trace-events | 4 |
7 files changed, 204 insertions, 116 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 0d341a3..6797208 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -312,11 +312,15 @@ out: rcu_read_unlock(); } +static hwaddr vfio_container_granularity(VFIOContainer *container) +{ + return (hwaddr)1 << ctz64(container->iova_pgsizes); +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - iommu_data.type1.listener); + VFIOContainer *container = container_of(listener, VFIOContainer, listener); hwaddr iova, end; Int128 llend; void *vaddr; @@ -344,14 +348,22 @@ static void vfio_listener_region_add(MemoryListener *listener, if (int128_ge(int128_make64(iova), llend)) { return; } + end = int128_get64(llend); + + if ((iova < container->min_iova) || ((end - 1) > container->max_iova)) { + error_report("vfio: IOMMU container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, + container, iova, end - 1); + ret = -EFAULT; + goto fail; + } memory_region_ref(section->mr); if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - trace_vfio_listener_region_add_iommu(iova, - int128_get64(int128_sub(llend, int128_one()))); + trace_vfio_listener_region_add_iommu(iova, end - 1); /* * FIXME: We should do some checking to see if the * capabilities of the host VFIO IOMMU are adequate to model @@ -362,33 +374,22 @@ static void vfio_listener_region_add(MemoryListener *listener, * would be the right place to wire that up (tell the KVM * device emulation the VFIO iommu handles to use). */ - /* - * This assumes that the guest IOMMU is empty of - * mappings at this point. - * - * One way of doing this is: - * 1. Avoid sharing IOMMUs between emulated devices or different - * IOMMU groups. - * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if - * there are some mappings in IOMMU. - * - * VFIO on SPAPR does that. Other IOMMU models may do that different, - * they must make sure there are no existing mappings or - * loop through existing mappings to map them into VFIO. - */ giommu = g_malloc0(sizeof(*giommu)); giommu->iommu = section->mr; giommu->container = container; giommu->n.notify = vfio_iommu_map_notify; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); + memory_region_iommu_replay(giommu->iommu, &giommu->n, + vfio_container_granularity(container), + false); return; } /* Here we assume that memory_region_is_ram(section->mr)==true */ - end = int128_get64(llend); vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); @@ -400,27 +401,30 @@ static void vfio_listener_region_add(MemoryListener *listener, error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", container, iova, end - iova, vaddr, ret); + goto fail; + } - /* - * On the initfn path, store the first error in the container so we - * can gracefully fail. Runtime, there's not much we can do other - * than throw a hardware error. - */ - if (!container->iommu_data.type1.initialized) { - if (!container->iommu_data.type1.error) { - container->iommu_data.type1.error = ret; - } - } else { - hw_error("vfio: DMA mapping failed, unable to continue"); + return; + +fail: + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. + */ + if (!container->initialized) { + if (!container->error) { + container->error = ret; } + } else { + hw_error("vfio: DMA mapping failed, unable to continue"); } } static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - iommu_data.type1.listener); + VFIOContainer *container = container_of(listener, VFIOContainer, listener); hwaddr iova, end; int ret; @@ -485,7 +489,7 @@ static const MemoryListener vfio_memory_listener = { static void vfio_listener_release(VFIOContainer *container) { - memory_listener_unregister(&container->iommu_data.type1.listener); + memory_listener_unregister(&container->listener); } int vfio_mmap_region(Object *obj, VFIORegion *region, @@ -668,6 +672,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) || ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) { bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU); + struct vfio_iommu_type1_info info; ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { @@ -684,21 +689,27 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - container->iommu_data.type1.listener = vfio_memory_listener; - container->iommu_data.release = vfio_listener_release; - - memory_listener_register(&container->iommu_data.type1.listener, - container->space->as); - - if (container->iommu_data.type1.error) { - ret = container->iommu_data.type1.error; - error_report("vfio: memory listener initialization failed for container"); - goto listener_release_exit; + /* + * FIXME: This assumes that a Type1 IOMMU can map any 64-bit + * IOVA whatsoever. That's not actually true, but the current + * kernel interface doesn't tell us what it can map, and the + * existing Type1 IOMMUs generally support any IOVA we're + * going to actually try in practice. + */ + container->min_iova = 0; + container->max_iova = (hwaddr)-1; + + /* Assume just 4K IOVA page size */ + container->iova_pgsizes = 0x1000; + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); + /* Ignore errors */ + if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + container->iova_pgsizes = info.iova_pgsizes; } - - container->iommu_data.type1.initialized = true; - } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + struct vfio_iommu_spapr_tce_info info; + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { error_report("vfio: failed to set group container: %m"); @@ -724,18 +735,41 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - container->iommu_data.type1.listener = vfio_memory_listener; - container->iommu_data.release = vfio_listener_release; - - memory_listener_register(&container->iommu_data.type1.listener, - container->space->as); + /* + * This only considers the host IOMMU's 32-bit window. At + * some point we need to add support for the optional 64-bit + * window and dynamic windows + */ + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m"); + ret = -errno; + goto free_container_exit; + } + container->min_iova = info.dma32_window_start; + container->max_iova = container->min_iova + info.dma32_window_size - 1; + /* Assume just 4K IOVA pages for now */ + container->iova_pgsizes = 0x1000; } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; goto free_container_exit; } + container->listener = vfio_memory_listener; + + memory_listener_register(&container->listener, container->space->as); + + if (container->error) { + ret = container->error; + error_report("vfio: memory listener initialization failed for container"); + goto listener_release_exit; + } + + container->initialized = true; + QLIST_INIT(&container->group_list); QLIST_INSERT_HEAD(&space->containers, container, next); @@ -774,9 +808,7 @@ static void vfio_disconnect_container(VFIOGroup *group) VFIOAddressSpace *space = container->space; VFIOGuestIOMMU *giommu, *tmp; - if (container->iommu_data.release) { - container->iommu_data.release(container); - } + vfio_listener_release(container); QLIST_REMOVE(container, next); QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index a6726cd..5c1156c 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -32,6 +32,11 @@ * Functions used whatever the injection method */ +static inline bool vfio_irq_is_automasked(VFIOINTp *intp) +{ + return intp->flags & VFIO_IRQ_INFO_AUTOMASKED; +} + /** * vfio_init_intp - allocate, initialize the IRQ struct pointer * and add it into the list of IRQs @@ -57,18 +62,25 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, sysbus_init_irq(sbdev, &intp->qemuirq); /* Get an eventfd for trigger */ - ret = event_notifier_init(&intp->interrupt, 0); + intp->interrupt = g_malloc0(sizeof(EventNotifier)); + ret = event_notifier_init(intp->interrupt, 0); if (ret) { + g_free(intp->interrupt); g_free(intp); error_report("vfio: Error: trigger event_notifier_init failed "); return NULL; } - /* Get an eventfd for resample/unmask */ - ret = event_notifier_init(&intp->unmask, 0); - if (ret) { - g_free(intp); - error_report("vfio: Error: resamplefd event_notifier_init failed"); - return NULL; + if (vfio_irq_is_automasked(intp)) { + /* Get an eventfd for resample/unmask */ + intp->unmask = g_malloc0(sizeof(EventNotifier)); + ret = event_notifier_init(intp->unmask, 0); + if (ret) { + g_free(intp->interrupt); + g_free(intp->unmask); + g_free(intp); + error_report("vfio: Error: resamplefd event_notifier_init failed"); + return NULL; + } } QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); @@ -100,7 +112,7 @@ static int vfio_set_trigger_eventfd(VFIOINTp *intp, irq_set->start = 0; irq_set->count = 1; pfd = (int32_t *)&irq_set->data; - *pfd = event_notifier_get_fd(&intp->interrupt); + *pfd = event_notifier_get_fd(intp->interrupt); qemu_set_fd_handler(*pfd, (IOHandler *)handler, NULL, intp); ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set); g_free(irq_set); @@ -182,7 +194,7 @@ static void vfio_intp_mmap_enable(void *opaque) static void vfio_intp_inject_pending_lockheld(VFIOINTp *intp) { trace_vfio_platform_intp_inject_pending_lockheld(intp->pin, - event_notifier_get_fd(&intp->interrupt)); + event_notifier_get_fd(intp->interrupt)); intp->state = VFIO_IRQ_ACTIVE; @@ -224,18 +236,18 @@ static void vfio_intp_interrupt(VFIOINTp *intp) trace_vfio_intp_interrupt_set_pending(intp->pin); QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue, intp, pqnext); - ret = event_notifier_test_and_clear(&intp->interrupt); + ret = event_notifier_test_and_clear(intp->interrupt); qemu_mutex_unlock(&vdev->intp_mutex); return; } trace_vfio_platform_intp_interrupt(intp->pin, - event_notifier_get_fd(&intp->interrupt)); + event_notifier_get_fd(intp->interrupt)); - ret = event_notifier_test_and_clear(&intp->interrupt); + ret = event_notifier_test_and_clear(intp->interrupt); if (!ret) { error_report("Error when clearing fd=%d (ret = %d)", - event_notifier_get_fd(&intp->interrupt), ret); + event_notifier_get_fd(intp->interrupt), ret); } intp->state = VFIO_IRQ_ACTIVE; @@ -283,13 +295,13 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) QLIST_FOREACH(intp, &vdev->intp_list, next) { if (intp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_eoi(intp->pin, - event_notifier_get_fd(&intp->interrupt)); + event_notifier_get_fd(intp->interrupt)); intp->state = VFIO_IRQ_INACTIVE; /* deassert the virtual IRQ */ qemu_set_irq(intp->qemuirq, 0); - if (intp->flags & VFIO_IRQ_INFO_AUTOMASKED) { + if (vfio_irq_is_automasked(intp)) { /* unmasks the physical level-sensitive IRQ */ vfio_unmask_single_irqindex(vbasedev, intp->pin); } @@ -310,18 +322,29 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) /** * vfio_start_eventfd_injection - starts the virtual IRQ injection using * user-side handled eventfds - * @intp: the IRQ struct pointer + * @sbdev: the sysbus device handle + * @irq: the qemu irq handle */ -static int vfio_start_eventfd_injection(VFIOINTp *intp) +static void vfio_start_eventfd_injection(SysBusDevice *sbdev, qemu_irq irq) { int ret; + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); + VFIOINTp *intp; + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->qemuirq == irq) { + break; + } + } + assert(intp); ret = vfio_set_trigger_eventfd(intp, vfio_intp_interrupt); if (ret) { - error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m"); + error_report("vfio: failed to start eventfd signaling for IRQ %d: %m", + intp->pin); + abort(); } - return ret; } /* @@ -349,7 +372,7 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp) irq_set->start = 0; irq_set->count = 1; pfd = (int32_t *)&irq_set->data; - *pfd = event_notifier_get_fd(&intp->unmask); + *pfd = event_notifier_get_fd(intp->unmask); qemu_set_fd_handler(*pfd, NULL, NULL, NULL); ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set); g_free(irq_set); @@ -359,6 +382,15 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp) return ret; } +/** + * vfio_start_irqfd_injection - starts the virtual IRQ injection using + * irqfd + * + * @sbdev: the sysbus device handle + * @irq: the qemu irq handle + * + * In case the irqfd setup fails, we fallback to userspace handled eventfd + */ static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); @@ -366,7 +398,7 @@ static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() || !vdev->irqfd_allowed) { - return; + goto fail_irqfd; } QLIST_FOREACH(intp, &vdev->intp_list, next) { @@ -376,39 +408,36 @@ static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) } assert(intp); - /* Get to a known interrupt state */ - qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt), - NULL, NULL, vdev); - - vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin); - qemu_set_irq(intp->qemuirq, 0); - - if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt, - &intp->unmask, irq) < 0) { + if (kvm_irqchip_add_irqfd_notifier(kvm_state, intp->interrupt, + intp->unmask, irq) < 0) { goto fail_irqfd; } if (vfio_set_trigger_eventfd(intp, NULL) < 0) { goto fail_vfio; } - if (vfio_set_resample_eventfd(intp) < 0) { - goto fail_vfio; + if (vfio_irq_is_automasked(intp)) { + if (vfio_set_resample_eventfd(intp) < 0) { + goto fail_vfio; + } + trace_vfio_platform_start_level_irqfd_injection(intp->pin, + event_notifier_get_fd(intp->interrupt), + event_notifier_get_fd(intp->unmask)); + } else { + trace_vfio_platform_start_edge_irqfd_injection(intp->pin, + event_notifier_get_fd(intp->interrupt)); } - /* Let's resume injection with irqfd setup */ - vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); - intp->kvm_accel = true; - trace_vfio_platform_start_irqfd_injection(intp->pin, - event_notifier_get_fd(&intp->interrupt), - event_notifier_get_fd(&intp->unmask)); return; fail_vfio: - kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq); + kvm_irqchip_remove_irqfd_notifier(kvm_state, intp->interrupt, irq); + error_report("vfio: failed to start eventfd signaling for IRQ %d: %m", + intp->pin); + abort(); fail_irqfd: - vfio_start_eventfd_injection(intp); - vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + vfio_start_eventfd_injection(sbdev, irq); return; } @@ -646,7 +675,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); SysBusDevice *sbdev = SYS_BUS_DEVICE(dev); VFIODevice *vbasedev = &vdev->vbasedev; - VFIOINTp *intp; int i, ret; vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; @@ -665,10 +693,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) vfio_map_region(vdev, i); sysbus_init_mmio(sbdev, &vdev->regions[i]->mem); } - - QLIST_FOREACH(intp, &vdev->intp_list, next) { - vfio_start_eventfd_injection(intp); - } } static const VMStateDescription vfio_platform_vmstate = { diff --git a/include/exec/memory.h b/include/exec/memory.h index 5baaf48..0f07159 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -583,6 +583,19 @@ void memory_region_notify_iommu(MemoryRegion *mr, void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n); /** + * memory_region_iommu_replay: replay existing IOMMU translations to + * a notifier + * + * @mr: the memory region to observe + * @n: the notifier to which to replay iommu mappings + * @granularity: Minimum page granularity to replay notifications for + * @is_write: Whether to treat the replay as a translate "write" + * through the iommu + */ +void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, + hwaddr granularity, bool is_write); + +/** * memory_region_unregister_iommu_notifier: unregister a notifier for * changes to IOMMU translation entries. * diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9b9901f..f037f3c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -59,22 +59,19 @@ typedef struct VFIOAddressSpace { struct VFIOGroup; -typedef struct VFIOType1 { - MemoryListener listener; - int error; - bool initialized; -} VFIOType1; - typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - struct { - /* enable abstraction to support various iommu backends */ - union { - VFIOType1 type1; - }; - void (*release)(struct VFIOContainer *); - } iommu_data; + MemoryListener listener; + int error; + bool initialized; + /* + * This assumes the host IOMMU can support only a single + * contiguous IOVA window. We may need to generalize that in + * future + */ + hwaddr min_iova, max_iova; + uint64_t iova_pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h index c5cf1d7..b468f80 100644 --- a/include/hw/vfio/vfio-platform.h +++ b/include/hw/vfio/vfio-platform.h @@ -34,8 +34,8 @@ enum { typedef struct VFIOINTp { QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */ QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */ - EventNotifier interrupt; /* eventfd triggered on interrupt */ - EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ + EventNotifier *interrupt; /* eventfd triggered on interrupt */ + EventNotifier *unmask; /* eventfd for unmask on QEMU bypass */ qemu_irq qemuirq; struct VFIOPlatformDevice *vdev; /* back pointer to device */ int state; /* inactive, pending, active */ @@ -1403,6 +1403,26 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) notifier_list_add(&mr->iommu_notify, n); } +void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, + hwaddr granularity, bool is_write) +{ + hwaddr addr; + IOMMUTLBEntry iotlb; + + for (addr = 0; addr < memory_region_size(mr); addr += granularity) { + iotlb = mr->iommu_ops->translate(mr, addr, is_write); + if (iotlb.perm != IOMMU_NONE) { + n->notify(n, &iotlb); + } + + /* if (2^64 - MR size) < granularity, it's possible to get an + * infinite loop here. This should catch such a wraparound */ + if ((addr + granularity) < addr) { + break; + } + } +} + void memory_region_unregister_iommu_notifier(Notifier *n) { notifier_remove(n); diff --git a/trace-events b/trace-events index 36db793..6790292 100644 --- a/trace-events +++ b/trace-events @@ -1624,7 +1624,9 @@ vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)" vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)" vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x" vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" -vfio_platform_start_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" +vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" +vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d" + #hw/acpi/memory_hotplug.c mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32 |