diff options
Diffstat (limited to 'hw/vfio/pci.c')
-rw-r--r-- | hw/vfio/pci.c | 1024 |
1 files changed, 652 insertions, 372 deletions
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e03d9f3..fa25bde 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -30,20 +30,23 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" -#include "qapi/qmp/qdict.h" +#include "migration/cpr.h" +#include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/range.h" #include "qemu/units.h" -#include "sysemu/kvm.h" -#include "sysemu/runstate.h" +#include "system/kvm.h" +#include "system/runstate.h" #include "pci.h" #include "trace.h" #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" -#include "sysemu/iommufd.h" +#include "system/iommufd.h" +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -54,6 +57,23 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) +{ + int ret = event_notifier_init(e, 0); + + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + } + return !ret; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. 
We try to get the best of both worlds by @@ -101,7 +121,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -109,11 +129,11 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); - vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); } static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) @@ -129,13 +149,12 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) /* Get to a known interrupt state */ qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -147,15 +166,15 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) goto fail_irqfd; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_UNMASK, - event_notifier_get_fd(&vdev->intx.unmask), - errp)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_UNMASK, + event_notifier_get_fd(&vdev->intx.unmask), + errp)) { goto fail_vfio; } /* Let'em rip */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.kvm_accel = true; @@ -167,10 +186,10 @@ fail_vfio: 
kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); return false; #else return true; @@ -188,7 +207,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) * Get to a known state, hardware masked, QEMU ready to accept new * interrupts, QEMU IRQ de-asserted. */ - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -199,7 +218,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. 
*/ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -208,7 +227,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) vdev->intx.kvm_accel = false; /* If we've missed an event, let it re-fire through QEMU */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); trace_vfio_intx_disable_kvm(vdev->vbasedev.name); #endif @@ -234,12 +253,12 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) } /* Re-enable the interrupt in cased we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { @@ -266,7 +285,6 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { @@ -289,18 +307,17 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -320,20 +337,25 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) timer_del(vdev->intx.mmap_timer); 
vfio_intx_disable_kvm(vdev); - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); vfio_mmap_set_enabled(vdev, true); fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + /* * MSI/X */ @@ -379,7 +401,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; - int ret = 0, argsz; + int argsz; int32_t *fd; argsz = sizeof(*irq_set) + sizeof(*fd); @@ -394,9 +416,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - - return ret; + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -453,15 +473,15 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; @@ -471,13 +491,16 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector_n, &vdev->pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector 
*vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto fail_notifier; } @@ -489,19 +512,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -511,10 +535,47 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + Error *local_err = NULL; + + vector->vdev = vdev; + vector->virq = -1; + if 
(!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -524,13 +585,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -542,19 +597,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -576,27 +631,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, if (!vdev->defer_kvm_irq_routing) { if (vdev->msix->noresize && resizing) { - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); ret = 
vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_set_irq_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -619,7 +661,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -636,7 +678,7 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) int32_t fd = event_notifier_get_fd(&vector->interrupt); Error *err = NULL; - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -644,14 +686,14 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -661,7 +703,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) 
kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } @@ -681,19 +723,20 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { /* @@ -710,7 +753,8 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) */ ret = vfio_enable_msix_no_vec(vdev); if (ret) { - error_report("vfio: failed to enable MSI-X, %d", ret); + error_report("vfio: failed to enable MSI-X, %s", + strerror(-ret)); } } @@ -730,19 +774,21 @@ retry: * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. 
*/ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -752,10 +798,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -763,7 +809,8 @@ retry: ret = vfio_enable_vectors(vdev, false); if (ret) { if (ret < 0) { - error_report("vfio: Error: Failed to setup MSI fds: %m"); + error_report("vfio: Error: Failed to setup MSI fds: %s", + strerror(-ret)); } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); @@ -797,11 +844,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -833,7 +880,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) * Always clear MSI-X IRQ index. A PF device could have enabled * MSI-X with no vectors. 
See vfio_msix_enable(). */ - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); vfio_msi_disable_common(vdev); if (!vfio_intx_enable(vdev, &err)) { @@ -850,7 +897,7 @@ static void vfio_msi_disable(VFIOPCIDevice *vdev) { Error *err = NULL; - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); vfio_msi_disable_common(vdev); vfio_intx_enable(vdev, &err); if (err) { @@ -879,18 +926,22 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - g_autofree struct vfio_region_info *reg_info = NULL; + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; + int ret; + + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, + &reg_info); - if (vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) { - error_report("vfio: Error getting ROM info: %m"); + if (ret != 0) { + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); return; } - trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size, + trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -899,8 +950,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) if (!vdev->rom_size) { vdev->rom_read_failed = true; - error_report("vfio-pci: Cannot read device rom at " - "%s", vdev->vbasedev.name); + error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name); error_printf("Device option ROM contents are probably invalid " "(check dmesg).\nSkip option ROM probe with rombar=0, " "or load from file with romfile=\n"); @@ -911,18 +961,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, -
size, vdev->rom_offset + off); + bytes = vbasedev->io_ops->region_read(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); + if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strreaderror(bytes)); + break; } } @@ -958,6 +1012,24 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) } } +/* "Raw" read of underlying config space. */ +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); +} + +/* "Raw" write of underlying config space. */ +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data, false); +} + static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) { VFIOPCIDevice *vdev = opaque; @@ -1010,11 +1082,9 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; - DeviceState *dev = DEVICE(vdev); char *name; - int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ @@ -1031,11 +1101,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. 
*/ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } @@ -1046,12 +1117,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) } if (vfio_opt_rom_in_denylist(vdev)) { - if (dev->opts && qdict_haskey(dev->opts, "rombar")) { + if (vdev->pdev.rom_bar > 0) { warn_report("Device at %s is known to cause system instability" " issues during option rom execution", vdev->vbasedev.name); error_printf("Proceeding anyway since user specified" - " non zero value for rombar\n"); + " positive value for rombar\n"); } else { warn_report("Rom loading for device at %s has been disabled" " due to system instability issues", @@ -1168,7 +1239,7 @@ static const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1214,7 +1285,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1227,12 +1299,12 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) if 
(~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - vdev->config_offset + addr); + ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, + strreaderror(ret)); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1247,16 +1319,19 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le); + if (ret != len) { + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, + strwriteerror(ret)); } /* MSI/MSI-X Enabling/Disabling */ @@ -1344,9 +1419,11 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", + strreaderror(ret)); return false; } ctrl = le16_to_cpu(ctrl); @@ -1379,8 +1456,8 @@ static 
void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) * If the host driver allows mapping of a MSIX data, we are going to * do map the entire BAR and emulate MSIX table on top of that. */ - if (vfio_has_region_cap(&vdev->vbasedev, region->nr, - VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { + if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr, + VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { return; } @@ -1452,7 +1529,7 @@ static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) int target_bar = -1; size_t msix_sz; - if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) { + if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) { return true; } @@ -1464,7 +1541,7 @@ static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) /* PCI BARs must be a power of 2 */ msix_sz = pow2ceil(msix_sz); - if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) { + if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) { /* * TODO: Lookup table for known devices. * @@ -1479,7 +1556,7 @@ static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) return false; } } else { - target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0); + target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0); } /* I/O port BARs cannot host MSI-X structures */ @@ -1553,31 +1630,35 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int ret, fd = vdev->vbasedev.fd; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_MSIX_IRQ_INDEX }; + struct vfio_irq_info irq_info; VFIOMSIXInfo *msix; + int ret; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if (!pos) { return true; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed to read 
PCI MSIX FLAGS: %s", + strreaderror(ret)); return false; } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE, + sizeof(table), &table); + if (ret != sizeof(table)) { + error_setg(errp, "failed to read PCI MSIX TABLE: %s", + strreaderror(ret)); return false; } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA, + sizeof(pba), &pba); + if (ret != sizeof(pba)) { + error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret)); return false; } @@ -1592,7 +1673,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); g_free(msix); @@ -1624,7 +1706,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU, PCI_DEVICE_ID_KUNLUN_VF)) { msix->pba_offset = 0xb400; - } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) { + } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) { error_setg(errp, "hardware reports invalid configuration, " "MSIX PBA outside of specified BAR"); g_free(msix); @@ -1699,7 +1781,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1736,10 +1818,10 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine 
what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret)); return; } @@ -1749,6 +1831,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) @@ -1795,7 +1880,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -2216,8 +2301,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) break; case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); - vdev->pm_cap = pos; - ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0; + ret = pci_pm_init(pdev, pos, errp) >= 0; + /* + * PCI-core config space emulation needs write access to the power + * state enabled for tracking BAR mapping relative to PM state. + */ + pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); break; case PCI_CAP_ID_AF: vfio_check_af_flr(vdev, pos); @@ -2380,10 +2469,9 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } g_free(config); - return; } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2407,18 +2495,27 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_disable_interrupts(vdev); + /* + * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. 
+ * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + /* Make sure the device is in D0 */ - if (vdev->pm_cap) { + if (pdev->pm_cap) { uint16_t pmcsr; uint8_t state; - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); /* vfio handles the necessary delay here */ - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { error_report("vfio: Unable to power on device, stuck in D%d", @@ -2426,34 +2523,27 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) } } } - - /* - * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. - * Also put INTx Disable in known state. 
- */ - cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_INTX_DISABLE); - vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; if (!vfio_intx_enable(vdev, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = vfio_pci_config_space_write(vdev, addr, len, &val); + if (ret != len) { + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, strwriteerror(ret)); } } @@ -2657,7 +2747,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2666,10 +2756,10 @@ static VFIODeviceOps vfio_pci_ops = { bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; int ret; - ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); if (ret) { error_setg_errno(errp, -ret, "failed getting region info for VGA region index %d", @@ -2728,11 +2818,11 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error 
**errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + struct vfio_region_info *reg_info = NULL; + struct vfio_irq_info irq_info; int i, ret = -1; /* Sanity check device */ @@ -2767,14 +2857,14 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) QLIST_INIT(&vdev->bars[i].quirks); } - ret = vfio_get_region_info(vbasedev, - VFIO_PCI_CONFIG_REGION_INDEX, ®_info); + ret = vfio_device_get_region_info(vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, ®_info); if (ret) { error_setg_errno(errp, -ret, "failed to get config info"); return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -2793,12 +2883,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); + trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2810,9 +2898,21 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { - vfio_detach_device(&vdev->vbasedev); + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion 
is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + + vfio_device_detach(&vdev->vbasedev); g_free(vdev->vbasedev.name); g_free(vdev->msix); @@ -2846,7 +2946,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2855,8 +2955,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2864,11 +2965,11 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2881,13 +2982,13 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, 
vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2905,35 +3006,37 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_REQ_IRQ_INDEX }; + struct vfio_irq_info irq_info; Error *err = NULL; int32_t fd; + int ret; if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, + &irq_info); + if (ret < 0 || irq_info.count < 1) { return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { + error_report_err(err); return; } fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2947,93 +3050,33 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - 
VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { - ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; - char *subsys; - int i, ret; - bool is_mdev; - char uuid[UUID_STR_LEN]; - g_autofree char *name = NULL; - g_autofree char *tmp = NULL; - - if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { - if (!(~vdev->host.domain || ~vdev->host.bus || - ~vdev->host.slot || ~vdev->host.function)) { - error_setg(errp, "No provided host device"); - error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " -#ifdef CONFIG_IOMMUFD - "or -device vfio-pci,fd=DEVICE_FD " -#endif - "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); - return; - } - vbasedev->sysfsdev = - g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", - vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); - } - - if (!vfio_device_get_name(vbasedev, errp)) { - return; - } - - /* - * Mediated devices *might* operate compatibly with discarding of RAM, but - * we cannot know for certain, it depends on whether the mdev vendor driver - * stays in sync with the active working set of the guest driver. Prevent - * the x-balloon-allowed option unless this is minimally an mdev device. 
- */ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); - - trace_vfio_mdev(vbasedev->name, is_mdev); - - if (vbasedev->ram_block_discard_allowed && !is_mdev) { - error_setg(errp, "x-balloon-allowed only potentially compatible " - "with mdev devices"); - goto error; - } - - if (!qemu_uuid_is_null(&vdev->vf_token)) { - qemu_uuid_unparse(&vdev->vf_token, uuid); - name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); - } else { - name = g_strdup(vbasedev->name); - } - - if (!vfio_attach_device(name, vbasedev, - pci_device_iommu_address_space(pdev), errp)) { - goto error; - } + uint32_t config_space_size; + int ret; - if (!vfio_populate_device(vdev, errp)) { - goto error; - } + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); /* Get a copy of config space */ - ret = pread(vbasedev->fd, vdev->pdev.config, - MIN(pci_config_size(&vdev->pdev), vdev->config_size), - vdev->config_offset); - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { - ret = ret < 0 ? -errno : -EFAULT; - error_setg_errno(errp, -ret, "failed to read device config space"); - goto error; + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? 
-ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; } /* vfio emulates a lot for us, but some bits need extra love */ @@ -3052,7 +3095,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->vendor_id != PCI_ANY_ID) { if (vdev->vendor_id >= 0xffff) { error_setg(errp, "invalid PCI vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id); @@ -3063,7 +3106,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->device_id != PCI_ANY_ID) { if (vdev->device_id > 0xffff) { error_setg(errp, "invalid PCI device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); @@ -3074,7 +3117,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_vendor_id != PCI_ANY_ID) { if (vdev->sub_vendor_id > 0xffff) { error_setg(errp, "invalid PCI subsystem vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, vdev->sub_vendor_id, ~0); @@ -3085,7 +3128,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_device_id != PCI_ANY_ID) { if (vdev->sub_device_id > 0xffff) { error_setg(errp, "invalid PCI subsystem device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); trace_vfio_pci_emulated_sub_device_id(vbasedev->name, @@ -3116,52 +3159,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_prepare(vdev); if (!vfio_msix_early_setup(vdev, errp)) { - goto error; + return false; } vfio_bars_register(vdev); - if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { - error_prepend(errp, "Failed to set iommu_device: "); - goto out_teardown; - } - - if (!vfio_add_capabilities(vdev, errp)) { 
- goto out_unset_idev; - } - - if (vdev->vga) { - vfio_vga_quirk_setup(vdev); - } - - for (i = 0; i < PCI_ROM_SLOT; i++) { - vfio_bar_quirk_setup(vdev, i); - } - - if (!vdev->igd_opregion && - vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { - g_autofree struct vfio_region_info *opregion = NULL; - - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, - "cannot support IGD OpRegion feature on hotplugged " - "device"); - goto out_unset_idev; - } - - ret = vfio_get_dev_region_info(vbasedev, - VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); - if (ret) { - error_setg_errno(errp, -ret, - "does not support requested IGD OpRegion feature"); - goto out_unset_idev; - } + return true; +} - if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { - goto out_unset_idev; - } - } +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = &vdev->pdev; /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { @@ -3176,14 +3184,111 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, - vfio_intx_mmap_enable, vdev); + vfio_intx_mmap_enable, vdev); pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); if (!vfio_intx_enable(vdev, errp)) { - goto out_deregister; + timer_free(vdev->intx.mmap_timer); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + return false; + } + } + return true; +} + +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) +{ + ERRP_GUARD(); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; + int i; + char uuid[UUID_STR_LEN]; + g_autofree char *name = 
NULL; + + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { + error_setg(errp, "No provided host device"); + error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif + "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); + return; } + vbasedev->sysfsdev = + g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", + vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); + } + + if (!vfio_device_get_name(vbasedev, errp)) { + return; + } + + /* + * Mediated devices *might* operate compatibly with discarding of RAM, but + * we cannot know for certain, it depends on whether the mdev vendor driver + * stays in sync with the active working set of the guest driver. Prevent + * the x-balloon-allowed option unless this is minimally an mdev device. + */ + vbasedev->mdev = vfio_device_is_mdev(vbasedev); + + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); + + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { + error_setg(errp, "x-balloon-allowed only potentially compatible " + "with mdev devices"); + goto error; + } + + if (!qemu_uuid_is_null(&vdev->vf_token)) { + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { + name = g_strdup(vbasedev->name); + } + + if (!vfio_device_attach(name, vbasedev, + pci_device_iommu_address_space(pdev), errp)) { + goto error; + } + + if (!vfio_pci_populate_device(vdev, errp)) { + goto error; + } + + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; + } + + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set vIOMMU: "); + goto out_teardown; + } + + if (!vfio_pci_add_capabilities(vdev, errp)) { + goto out_unset_idev; + } + + if (!vfio_config_quirk_setup(vdev, errp)) { + goto out_unset_idev; + } + + if 
(vdev->vga) { + vfio_vga_quirk_setup(vdev); + } + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_bar_quirk_setup(vdev, i); + } + + if (!vfio_pci_interrupt_setup(vdev, errp)) { + goto out_unset_idev; } if (vdev->display != ON_OFF_AUTO_OFF) { @@ -3226,8 +3331,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); return; @@ -3244,35 +3349,26 @@ out_deregister: timer_free(vdev->intx.mmap_timer); } out_unset_idev: - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } static void vfio_instance_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. 
- * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); @@ -3285,16 +3381,23 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } } static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI(dev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } trace_vfio_pci_reset(vdev->vbasedev.name); @@ -3334,7 +3437,7 @@ post_reset: static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3353,15 +3456,52 @@ static void vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. 
+ */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; } -static Property vfio_pci_dev_properties[] = { +static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO PCI base device"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; +} + +static const TypeInfo vfio_pci_base_dev_info = { + .name = TYPE_VFIO_PCI_BASE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VFIOPCIDevice), + .abstract = true, + .class_init = vfio_pci_base_dev_class_init, + .interfaces = (const InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, +}; + +static PropertyInfo vfio_pci_migration_multifd_transfer_prop; + +static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3373,9 +3513,17 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true), + DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false), + DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", 
VFIOPCIDevice, + igd_legacy_mode, ON_OFF_AUTO_AUTO), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), + DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice, + vbasedev.migration_multifd_transfer, + vfio_pci_migration_multifd_transfer_prop, OnOffAuto, + .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3401,67 +3549,187 @@ static Property vfio_pci_dev_properties[] = { nv_gpudirect_clique, qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, - OFF_AUTOPCIBAR_OFF), + OFF_AUTO_PCIBAR_OFF), #ifdef CONFIG_IOMMUFD DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), #endif DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true), - DEFINE_PROP_END_OF_LIST(), }; #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif -static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); - dc->reset = vfio_pci_reset; + device_class_set_legacy_reset(dc, vfio_pci_reset); device_class_set_props(dc, vfio_pci_dev_properties); #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); - pdc->realize = vfio_realize; - pdc->exit = vfio_exitfn; - pdc->config_read = vfio_pci_read_config; - pdc->config_write = 
vfio_pci_write_config; + pdc->realize = vfio_pci_realize; + + object_class_property_set_description(klass, /* 1.3 */ + "host", + "Host PCI address [domain:]<bus:slot.function> of assigned device"); + object_class_property_set_description(klass, /* 1.3 */ + "x-intx-mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after INTx (DEBUG)"); + object_class_property_set_description(klass, /* 1.5 */ + "x-vga", + "Expose VGA address spaces for device"); + object_class_property_set_description(klass, /* 2.3 */ + "x-req", + "Disable device request notification support (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-intx", + "Disable direct VFIO->KVM INTx injection. Allows to " + "trace INTx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msi", + "Disable direct VFIO->KVM MSI injection. Allows to " + "trace MSI interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msix", + "Disable direct VFIO->KVM MSIx injection. 
Allows to " + "trace MSIx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-vendor-id", + "Override PCI Vendor ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-device-id", + "Override PCI device ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-vendor-id", + "Override PCI Subsystem Vendor ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-device-id", + "Override PCI Subsystem Device ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 2.7 */ + "x-igd-opregion", + "Expose host IGD OpRegion to guest"); + object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */ + "x-igd-gms", + "Override IGD data stolen memory size (32MiB units)"); + object_class_property_set_description(klass, /* 2.11 */ + "x-nv-gpudirect-clique", + "Add NVIDIA GPUDirect capability indicating P2P DMA " + "clique for device [0-15]"); + object_class_property_set_description(klass, /* 2.12 */ + "x-no-geforce-quirks", + "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). " + "Improves performance"); + object_class_property_set_description(klass, /* 2.12 */ + "display", + "Enable display support for device, ex. 
vGPU"); + object_class_property_set_description(klass, /* 2.12 */ + "x-msix-relocation", + "Specify MSI-X MMIO relocation to the end of specified " + "existing BAR or new BAR to avoid virtualization overhead " + "due to adjacent device registers"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-kvm-ioeventfd", + "Disable registration of ioeventfds with KVM (DEBUG)"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-vfio-ioeventfd", + "Disable linking of KVM ioeventfds to VFIO ioeventfds " + "(DEBUG)"); + object_class_property_set_description(klass, /* 3.1 */ + "x-balloon-allowed", + "Override allowing ballooning with device (DEBUG, DANGER)"); + object_class_property_set_description(klass, /* 3.2 */ + "xres", + "Set X display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 3.2 */ + "yres", + "Set Y display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 5.2 */ + "x-pre-copy-dirty-page-tracking", + "Disable dirty pages tracking during iterative phase " + "(DEBUG)"); + object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */ + "enable-migration", + "Enale device migration. Also requires a host VFIO PCI " + "variant or mdev driver with migration support enabled"); + object_class_property_set_description(klass, /* 8.1 */ + "vf-token", + "Specify UUID VF token. Required for VF when PF is owned " + "by another VFIO driver"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.1 */ + "x-device-dirty-page-tracking", + "Disable device dirty page tracking and use " + "container-based dirty page tracking"); + object_class_property_set_description(klass, /* 9.1 */ + "migration-events", + "Emit VFIO migration QAPI event when a VFIO device " + "changes its migration state. 
For management applications"); + object_class_property_set_description(klass, /* 9.1 */ + "skip-vsc-check", + "Skip config space check for Vendor Specific Capability. " + "Setting to false will enforce strict checking of VSC content " + "(DEBUG)"); + object_class_property_set_description(klass, /* 10.0 */ + "x-migration-multifd-transfer", + "Transfer this device state via " + "multifd channels when live migrating it"); } static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(VFIOPCIDevice), + .parent = TYPE_VFIO_PCI_BASE, .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, - .interfaces = (InterfaceInfo[]) { - { INTERFACE_PCIE_DEVICE }, - { INTERFACE_CONVENTIONAL_PCI_DEVICE }, - { } - }, }; -static Property vfio_pci_dev_nohotplug_properties[] = { +static const Property vfio_pci_dev_nohotplug_properties[] = { DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate, ON_OFF_AUTO_AUTO), - DEFINE_PROP_END_OF_LIST(), }; -static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); dc->hotpluggable = false; + + object_class_property_set_description(klass, /* 3.1 */ + "ramfb", + "Enable ramfb to provide pre-boot graphics for devices " + "enabling display option"); + object_class_property_set_description(klass, /* 8.2 */ + "x-ramfb-migrate", + "Override default migration support for ramfb support " + "(DEBUG)"); } static const TypeInfo vfio_pci_nohotplug_dev_info = { @@ -3473,6 +3741,18 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { static void register_vfio_pci_dev_type(void) { + /* + * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but 
source VM can + * run for a long time before being migrated so it is desirable to have a + * fallback mechanism to the old way of transferring VFIO device state if + * it turns to be necessary. + * The following makes this type of property have the same mutability level + * as ordinary migration parameters. + */ + vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; + vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + + type_register_static(&vfio_pci_base_dev_info); type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } |