diff options
Diffstat (limited to 'hw/vfio')
31 files changed, 3275 insertions, 1910 deletions
diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba05..91d9023 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + config VFIO bool depends on LINUX diff --git a/hw/vfio/amd-xgbe.c b/hw/vfio/amd-xgbe.c index 5927503..58f590e 100644 --- a/hw/vfio/amd-xgbe.c +++ b/hw/vfio/amd-xgbe.c @@ -34,7 +34,7 @@ static const VMStateDescription vfio_platform_amd_xgbe_vmstate = { .unmigratable = 1, }; -static void vfio_amd_xgbe_class_init(ObjectClass *klass, void *data) +static void vfio_amd_xgbe_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VFIOAmdXgbeDeviceClass *vcxc = diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index c7ab4ff..1df4438 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -10,16 +10,19 @@ * directory. */ +#include <stdbool.h> #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include <linux/vfio.h> #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/ap-device.h" +#include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" +#include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/option.h" @@ -28,7 +31,7 @@ #include "migration/vmstate.h" #include "hw/qdev-properties.h" #include "hw/s390x/ap-bridge.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "qom/object.h" #define TYPE_VFIO_AP_DEVICE "vfio-ap" @@ -37,8 +40,23 @@ struct VFIOAPDevice { APDevice apdev; VFIODevice vdev; EventNotifier req_notifier; + EventNotifier cfg_notifier; }; +typedef struct APConfigChgEvent { + QTAILQ_ENTRY(APConfigChgEvent) next; +} APConfigChgEvent; + +static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events = + QTAILQ_HEAD_INITIALIZER(cfg_chg_events); + +static QemuMutex cfg_chg_events_lock; + +static void __attribute__((constructor)) vfio_ap_global_init(void) +{ 
+ qemu_mutex_init(&cfg_chg_events_lock); +} + OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) static void vfio_ap_compute_needs_reset(VFIODevice *vdev) @@ -70,14 +88,65 @@ static void vfio_ap_req_notifier_handler(void *opaque) } } +static void vfio_ap_cfg_chg_notifier_handler(void *opaque) +{ + APConfigChgEvent *cfg_chg_event; + VFIOAPDevice *vapdev = opaque; + + if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { + return; + } + + cfg_chg_event = g_new0(APConfigChgEvent, 1); + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next); + } + + css_generate_css_crws(0); + +} + +int ap_chsc_sei_nt0_get_event(void *res) +{ + ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res; + APConfigChgEvent *cfg_chg_event; + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + if (QTAILQ_EMPTY(&cfg_chg_events)) { + return EVENT_INFORMATION_NOT_STORED; + } + + cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events); + QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next); + } + + memset(nt0_res, 0, sizeof(*nt0_res)); + g_free(cfg_chg_event); + nt0_res->flags |= PENDING_EVENT_INFO_BITMASK; + nt0_res->length = sizeof(ChscSeiNt0Res); + nt0_res->code = NT0_RES_RESPONSE_CODE; + nt0_res->nt = NT0_RES_NT_DEFAULT; + nt0_res->rs = NT0_RES_RS_AP_CHANGE; + nt0_res->cc = NT0_RES_CC_AP_CHANGE; + + return EVENT_INFORMATION_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + QEMU_LOCK_GUARD(&cfg_chg_events_lock); + return !QTAILQ_EMPTY(&cfg_chg_events); +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { int fd; - size_t argsz; + int ret; IOHandler *fd_read; EventNotifier *notifier; - g_autofree struct vfio_irq_info *irq_info = NULL; + struct vfio_irq_info irq_info; VFIODevice *vdev = &vapdev->vdev; switch (irq) { @@ -85,6 +154,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, notifier = &vapdev->req_notifier; fd_read = vfio_ap_req_notifier_handler; break; + case 
VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + fd_read = vfio_ap_cfg_chg_notifier_handler; + break; default: error_setg(errp, "vfio: Unsupported device irq(%d)", irq); return false; @@ -96,14 +169,15 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - irq_info->argsz = argsz; + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } @@ -117,8 +191,8 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, fd = event_notifier_get_fd(notifier); qemu_set_fd_handler(fd, fd_read, NULL, vapdev); - if (!vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, - errp)) { + if (!vfio_device_irq_set_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, + errp)) { qemu_set_fd_handler(fd, NULL, NULL, vapdev); event_notifier_cleanup(notifier); } @@ -136,13 +210,16 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, case VFIO_AP_REQ_IRQ_INDEX: notifier = &vapdev->req_notifier; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + break; default: error_report("vfio: Unsupported device irq(%d)", irq); return; } - if (!vfio_set_irq_signaling(&vapdev->vdev, irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vapdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { warn_reportf_err(err, VFIO_MSG_PREFIX, vapdev->vdev.name); } @@ -162,7 +239,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } - if (!vfio_attach_device(vbasedev->name, vbasedev, + if 
(!vfio_device_attach(vbasedev->name, vbasedev, &address_space_memory, errp)) { goto error; } @@ -175,6 +252,15 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) warn_report_err(err); } + if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err)) + { + /* + * Report this error, but do not make it a failing condition. + * Lack of this IRQ in the host does not prevent normal operation. + */ + warn_report_err(err); + } + return; error: @@ -187,7 +273,8 @@ static void vfio_ap_unrealize(DeviceState *dev) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); - vfio_detach_device(&vapdev->vdev); + vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); + vfio_device_detach(&vapdev->vdev); g_free(vapdev->vdev.name); } @@ -241,7 +328,7 @@ static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) } #endif -static void vfio_ap_class_init(ObjectClass *klass, void *data) +static void vfio_ap_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); diff --git a/hw/vfio/calxeda-xgmac.c b/hw/vfio/calxeda-xgmac.c index a5ef262..03f2ff5 100644 --- a/hw/vfio/calxeda-xgmac.c +++ b/hw/vfio/calxeda-xgmac.c @@ -34,7 +34,7 @@ static const VMStateDescription vfio_platform_calxeda_xgmac_vmstate = { .unmigratable = 1, }; -static void vfio_calxeda_xgmac_class_init(ObjectClass *klass, void *data) +static void vfio_calxeda_xgmac_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VFIOCalxedaXgmacDeviceClass *vcxc = diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index e5e0d9e..cea9d6e 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -21,13 +21,13 @@ #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" #include "hw/s390x/ccw-device.h" 
-#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" @@ -376,8 +376,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, Error **errp) { VFIODevice *vdev = &vcdev->vdev; - g_autofree struct vfio_irq_info *irq_info = NULL; - size_t argsz; + struct vfio_irq_info irq_info; + int ret; int fd; EventNotifier *notifier; IOHandler *fd_read; @@ -406,13 +406,15 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - irq_info->argsz = argsz; - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); + + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } @@ -426,8 +428,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, fd = event_notifier_get_fd(notifier); qemu_set_fd_handler(fd, fd_read, NULL, vcdev); - if (!vfio_set_irq_signaling(vdev, irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { + if (!vfio_device_irq_set_signaling(vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vcdev); event_notifier_cleanup(notifier); } @@ -456,8 +458,8 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, return; } - if (!vfio_set_irq_signaling(&vcdev->vdev, irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vcdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); } @@ -488,7 +490,7 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) return false; } - ret = 
vfio_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info); + ret = vfio_device_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info); if (ret) { error_setg_errno(errp, -ret, "vfio: Error getting config info"); return false; @@ -502,11 +504,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->io_region_offset = info->offset; vcdev->io_region = g_malloc0(info->size); - g_free(info); /* check for the optional async command region */ - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info); if (!ret) { vcdev->async_cmd_region_size = info->size; if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) { @@ -515,11 +516,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->async_cmd_region_offset = info->offset; vcdev->async_cmd_region = g_malloc0(info->size); - g_free(info); } - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); if (!ret) { vcdev->schib_region_size = info->size; if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) { @@ -528,11 +528,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->schib_region_offset = info->offset; vcdev->schib_region = g_malloc(info->size); - g_free(info); } - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_CRW, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_CRW, &info); if (!ret) { vcdev->crw_region_size = info->size; @@ -542,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->crw_region_offset = info->offset; vcdev->crw_region = g_malloc(info->size); - g_free(info); } 
return true; @@ -552,7 +550,6 @@ out_err: g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); - g_free(info); return false; } @@ -583,7 +580,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) goto out_unrealize; } - if (!vfio_attach_device(cdev->mdevid, vbasedev, + if (!vfio_device_attach(cdev->mdevid, vbasedev, &address_space_memory, errp)) { goto out_attach_dev_err; } @@ -620,7 +617,7 @@ out_irq_notifier_err: out_io_notifier_err: vfio_ccw_put_region(vcdev); out_region_err: - vfio_detach_device(vbasedev); + vfio_device_detach(vbasedev); out_attach_dev_err: g_free(vbasedev->name); out_unrealize: @@ -639,7 +636,7 @@ static void vfio_ccw_unrealize(DeviceState *dev) vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); - vfio_detach_device(&vcdev->vdev); + vfio_device_detach(&vcdev->vdev); g_free(vcdev->vdev.name); if (cdc->unrealize) { @@ -689,7 +686,7 @@ static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) } #endif -static void vfio_ccw_class_init(ObjectClass *klass, void *data) +static void vfio_ccw_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 749a3fd..d834bd4 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -10,29 +10,87 @@ * SPDX-License-Identifier: GPL-2.0-or-later */ +#include <sys/ioctl.h> +#include <linux/vfio.h> + #include "qemu/osdep.h" +#include "system/tcg.h" +#include "system/ram_addr.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-device.h" /* vfio_device_reset_handler */ +#include "system/reset.h" +#include "vfio-helpers.h" + +#include "trace.h" + +static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = + 
QLIST_HEAD_INITIALIZER(vfio_address_spaces); + +VFIOAddressSpace *vfio_address_space_get(AddressSpace *as) +{ + VFIOAddressSpace *space; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == as) { + return space; + } + } + + /* No suitable VFIOAddressSpace, create a new one */ + space = g_malloc0(sizeof(*space)); + space->as = as; + QLIST_INIT(&space->containers); + + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_register_reset(vfio_device_reset_handler, NULL); + } + + QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); + + return space; +} + +void vfio_address_space_put(VFIOAddressSpace *space) +{ + if (!QLIST_EMPTY(&space->containers)) { + return; + } + + QLIST_REMOVE(space, list); + g_free(space); + + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_unregister_reset(vfio_device_reset_handler, NULL); + } +} + +void vfio_address_space_insert(VFIOAddressSpace *space, + VFIOContainerBase *bcontainer) +{ + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + bcontainer->space = space; +} int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, bool unmap_all) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_unmap); - return vioc->dma_unmap(bcontainer, iova, size, iotlb); + return vioc->dma_unmap(bcontainer, iova, size, iotlb, unmap_all); } bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, @@ -83,7 +141,67 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, return ret; } -int 
vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, +static bool vfio_container_devices_dirty_tracking_is_started( + const VFIOContainerBase *bcontainer) +{ + VFIODevice *vbasedev; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + return false; + } + } + + return true; +} + +bool vfio_container_dirty_tracking_is_started( + const VFIOContainerBase *bcontainer) +{ + return vfio_container_devices_dirty_tracking_is_started(bcontainer) || + bcontainer->dirty_pages_started; +} + +bool vfio_container_devices_dirty_tracking_is_supported( + const VFIOContainerBase *bcontainer) +{ + VFIODevice *vbasedev; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } + if (!vbasedev->dirty_pages_supported) { + return false; + } + } + + return true; +} + +static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + hwaddr size, void *bitmap) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_report), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_dma_logging_report *report = + (struct vfio_device_feature_dma_logging_report *)feature->data; + + report->iova = iova; + report->length = size; + report->page_size = qemu_real_host_page_size(); + report->bitmap = (uintptr_t)bitmap; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; + + return vbasedev->io_ops->device_feature(vbasedev, feature); +} + +static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); @@ -93,6 +211,74 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase 
*bcontainer, errp); } +static int vfio_container_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) +{ + VFIODevice *vbasedev; + int ret; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { + error_setg_errno(errp, -ret, + "%s: Failed to get DMA logging report, iova: " + "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, + vbasedev->name, iova, size); + + return ret; + } + } + + return 0; +} + +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr, Error **errp) +{ + bool all_device_dirty_tracking = + vfio_container_devices_dirty_tracking_is_supported(bcontainer); + uint64_t dirty_pages; + VFIOBitmap vbmap; + int ret; + + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? 
DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; + } + + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + error_setg_errno(errp, -ret, + "Failed to allocate dirty tracking bitmap"); + return ret; + } + + if (all_device_dirty_tracking) { + ret = vfio_container_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); + } else { + ret = vfio_container_iommu_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); + } + + if (ret) { + goto out; + } + + dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, + vbmap.pages); + + trace_vfio_container_query_dirty_bitmap(iova, size, vbmap.size, ram_addr, + dirty_pages); +out: + g_free(vbmap.bitmap); + + return ret; +} + static gpointer copy_iova_range(gconstpointer src, gpointer data) { Range *source = (Range *)src; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 7c57bdd2..3e13fea 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -22,18 +22,26 @@ #include <sys/ioctl.h> #include <linux/vfio.h> -#include "hw/vfio/vfio-common.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" -#include "exec/ram_addr.h" +#include "hw/vfio/vfio-device.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/ram_addr.h" #include "qemu/error-report.h" #include "qemu/range.h" #include "system/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "pci.h" +#include "hw/vfio/vfio-container.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" -VFIOGroupList vfio_group_list = +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" + +typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; +static VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) @@ -112,12 +120,9 @@ unmap_exit: return ret; } -/* - * DMA - Mapping and unmapping for the 
"type1" IOMMU interface used on x86 - */ -static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -131,8 +136,10 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, int ret; Error *local_err = NULL; - if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) { - if (!vfio_devices_all_device_dirty_tracking(bcontainer) && + g_assert(!cpr_is_incoming()); + + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { + if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -163,7 +170,7 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } if (need_dirty_sync) { - ret = vfio_get_dirty_bitmap(bcontainer, iova, size, + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr, &local_err); if (ret) { error_report_err(local_err); @@ -174,8 +181,37 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, return 0; } +/* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) +{ + int ret; + + if (unmap_all) { + /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ + Int128 llsize = int128_rshift(int128_2_64(), 1); + + ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize), + iotlb); + + if (ret == 0) { + ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize), + int128_get64(llsize), iotlb); + } + + } else { + ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb); + } + + return ret; +} + static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -198,7 +234,7 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, */ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || (errno == EBUSY && - vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -273,37 +309,6 @@ static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, return ret; } -static struct vfio_info_cap_header * -vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) -{ - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { - return NULL; - } - - return vfio_get_cap((void *)info, info->cap_offset, id); -} - -bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, - unsigned int *avail) -{ - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_dma_avail *cap; - - /* If the capability cannot be found, assume no DMA limiting */ - hdr = vfio_get_iommu_type1_info_cap(info, - VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); - if (!hdr) { - return false; - } - - if (avail != NULL) { - cap = (void *) hdr; - *avail = cap->avail; - } - - return true; -} - static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, VFIOContainerBase *bcontainer) { @@ -330,7 +335,7 @@ static bool 
vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, return true; } -static void vfio_kvm_device_add_group(VFIOGroup *group) +static void vfio_group_add_kvm_device(VFIOGroup *group) { Error *err = NULL; @@ -339,7 +344,7 @@ static void vfio_kvm_device_add_group(VFIOGroup *group) } } -static void vfio_kvm_device_del_group(VFIOGroup *group) +static void vfio_group_del_kvm_device(VFIOGroup *group) { Error *err = NULL; @@ -424,7 +429,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, return NULL; } - if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. + */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { return NULL; } @@ -535,16 +545,10 @@ static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) return true; } -static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as, - Error **errp) +static bool vfio_container_attach_discard_disable(VFIOContainer *container, + VFIOGroup *group, Error **errp) { - VFIOContainer *container; - VFIOContainerBase *bcontainer; - int ret, fd; - VFIOAddressSpace *space; - VFIOIOMMUClass *vioc; - - space = vfio_get_address_space(as); + int ret; /* * VFIO is currently incompatible with discarding of RAM insofar as the @@ -577,108 +581,158 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. 
*/ - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOContainer, bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, - "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, - &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); - } - return false; - } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - vfio_kvm_device_add_group(group); - return true; + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); } } + return !ret; +} - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto put_space_exit; +static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, + Error **errp) +{ + if (!vfio_container_attach_discard_disable(container, group, errp)) { + return false; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. 
+ */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} + +static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) +{ + QLIST_REMOVE(group, container_next); + group->container = NULL; + vfio_group_del_kvm_device(group); + vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); +} + +static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, + Error **errp) +{ + VFIOContainer *container; + VFIOContainerBase *bcontainer; + int ret, fd = -1; + VFIOAddressSpace *space; + VFIOIOMMUClass *vioc = NULL; + bool new_container = false; + bool group_was_added = false; + + space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); + + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } + } + + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. 
+ */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } } ret = ioctl(fd, VFIO_GET_API_VERSION); if (ret != VFIO_API_VERSION) { error_setg(errp, "supported vfio version: %d, " "reported version: %d", VFIO_API_VERSION, ret); - goto close_fd_exit; + goto fail; } container = vfio_create_container(fd, group, errp); if (!container) { - goto close_fd_exit; + goto fail; } + new_container = true; bcontainer = &container->bcontainer; - if (!vfio_cpr_register_container(bcontainer, errp)) { - goto free_container_exit; - } - - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - goto unregister_container_exit; + if (!vfio_legacy_cpr_register_container(container, errp)) { + goto fail; } vioc = VFIO_IOMMU_GET_CLASS(bcontainer); assert(vioc->setup); if (!vioc->setup(bcontainer, errp)) { - goto enable_discards_exit; + goto fail; } - vfio_kvm_device_add_group(group); - vfio_address_space_insert(space, bcontainer); - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - - bcontainer->listener = vfio_memory_listener; - memory_listener_register(&bcontainer->listener, bcontainer->space->as); + if (!vfio_container_group_add(container, group, errp)) { + goto fail; + } + group_was_added = true; - if (bcontainer->error) { - error_propagate_prepend(errp, bcontainer->error, - "memory listener initialization failed: "); - goto listener_release_exit; + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. 
+ */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } } bcontainer->initialized = true; return true; -listener_release_exit: - QLIST_REMOVE(group, container_next); - vfio_kvm_device_del_group(group); - memory_listener_unregister(&bcontainer->listener); - if (vioc->release) { - vioc->release(bcontainer); - } - -enable_discards_exit: - vfio_ram_block_discard_disable(container, false); - -unregister_container_exit: - vfio_cpr_unregister_container(bcontainer); - -free_container_exit: - object_unref(container); -close_fd_exit: - close(fd); +fail: + if (new_container) { + vfio_listener_unregister(bcontainer); + } -put_space_exit: - vfio_put_address_space(space); + if (group_was_added) { + vfio_container_group_del(container, group); + } + if (vioc && vioc->release) { + vioc->release(bcontainer); + } + if (new_container) { + vfio_legacy_cpr_unregister_container(container); + object_unref(container); + } + if (fd >= 0) { + close(fd); + } + vfio_address_space_put(space); return false; } -static void vfio_disconnect_container(VFIOGroup *group) +static void vfio_container_disconnect(VFIOGroup *group) { VFIOContainer *container = group->container; VFIOContainerBase *bcontainer = &container->bcontainer; @@ -686,6 +740,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -693,7 +748,7 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. 
*/ if (QLIST_EMPTY(&container->group_list)) { - memory_listener_unregister(&bcontainer->listener); + vfio_listener_unregister(bcontainer); if (vioc->release) { vioc->release(bcontainer); } @@ -707,16 +762,16 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = bcontainer->space; - trace_vfio_disconnect_container(container->fd); - vfio_cpr_unregister_container(bcontainer); + trace_vfio_container_disconnect(container->fd); + vfio_legacy_cpr_unregister_container(container); close(container->fd); object_unref(container); - vfio_put_address_space(space); + vfio_address_space_put(space); } } -static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) +static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) { ERRP_GUARD(); VFIOGroup *group; @@ -739,7 +794,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open(path, O_RDWR, errp); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); if (group->fd < 0) { goto free_group_exit; } @@ -760,7 +815,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) group->groupid = groupid; QLIST_INIT(&group->device_list); - if (!vfio_connect_container(group, as, errp)) { + if (!vfio_container_connect(group, as, errp)) { error_prepend(errp, "failed to setup container for group %d: ", groupid); goto close_fd_exit; @@ -771,6 +826,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) return group; close_fd_exit: + cpr_delete_fd("vfio_group", groupid); close(group->fd); free_group_exit: @@ -779,7 +835,7 @@ free_group_exit: return NULL; } -static void vfio_put_group(VFIOGroup *group) +static void vfio_group_put(VFIOGroup *group) { if (!group || !QLIST_EMPTY(&group->device_list)) { return; @@ -788,21 +844,22 @@ static void 
vfio_put_group(VFIOGroup *group) if (!group->ram_block_discard_allowed) { vfio_ram_block_discard_disable(group->container, false); } - vfio_kvm_device_del_group(group); - vfio_disconnect_container(group); + vfio_group_del_kvm_device(group); + vfio_container_disconnect(group); QLIST_REMOVE(group, next); - trace_vfio_put_group(group->fd); + trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); } -static bool vfio_get_device(VFIOGroup *group, const char *name, +static bool vfio_device_get(VFIOGroup *group, const char *name, VFIODevice *vbasedev, Error **errp) { g_autofree struct vfio_device_info *info = NULL; int fd; - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + fd = vfio_cpr_group_get_device_fd(group->fd, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -815,8 +872,7 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, info = vfio_get_device_info(fd); if (!info) { error_setg_errno(errp, errno, "error getting device info"); - close(fd); - return false; + goto fail; } /* @@ -830,8 +886,7 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, if (!QLIST_EMPTY(&group->device_list)) { error_setg(errp, "Inconsistent setting of support for discarding " "RAM (e.g., balloon) within group"); - close(fd); - return false; + goto fail; } if (!group->ram_block_discard_allowed) { @@ -840,33 +895,35 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, } } + vfio_device_prepare(vbasedev, &group->container->bcontainer, info); + vbasedev->fd = fd; vbasedev->group = group; QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - vbasedev->num_irqs = info->num_irqs; - vbasedev->num_regions = info->num_regions; - vbasedev->flags = info->flags; - - trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs); - - vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + 
trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; } -static void vfio_put_base_device(VFIODevice *vbasedev) +static void vfio_device_put(VFIODevice *vbasedev) { if (!vbasedev->group) { return; } QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; - trace_vfio_put_base_device(vbasedev->fd); + trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } -static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) +static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp) { char *tmp, group_path[PATH_MAX]; g_autofree char *group_name = NULL; @@ -894,29 +951,24 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) } /* - * vfio_attach_device: attach a device to a security context + * vfio_device_attach: attach a device to a security context * @name and @vbasedev->name are likely to be different depending * on the type of the device, hence the need for passing @name */ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - int groupid = vfio_device_groupid(vbasedev, errp); + int groupid = vfio_device_get_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainerBase *bcontainer; if (groupid < 0) { return false; } - trace_vfio_attach_device(vbasedev->name, groupid); + trace_vfio_device_attach(vbasedev->name, groupid); - if (!vfio_device_hiod_realize(vbasedev, errp)) { - return false; - } - - group = vfio_get_group(groupid, as, errp); + group = vfio_group_get(groupid, as, errp); if (!group) { return false; } @@ -924,33 +976,51 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { error_setg(errp, "device is already attached"); - vfio_put_group(group); - return false; + goto 
group_put_exit; } } - if (!vfio_get_device(group, name, vbasedev, errp)) { - vfio_put_group(group); - return false; + if (!vfio_device_get(group, name, vbasedev, errp)) { + goto group_put_exit; } - bcontainer = &group->container->bcontainer; - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + if (!vfio_device_hiod_create_and_realize(vbasedev, + TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + errp)) { + goto device_put_exit; + } + + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) < 0) { + goto hiod_unref_exit; + } + } return true; + +hiod_unref_exit: + object_unref(vbasedev->hiod); +device_put_exit: + vfio_device_put(vbasedev); +group_put_exit: + vfio_group_put(group); + return false; } static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; - trace_vfio_detach_device(vbasedev->name, group->groupid); - vfio_put_base_device(vbasedev); - vfio_put_group(group); + trace_vfio_device_detach(vbasedev->name, group->groupid); + + vfio_device_unprepare(vbasedev); + + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); + object_unref(vbasedev->hiod); + vfio_device_put(vbasedev); + vfio_group_put(group); } static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) @@ -1121,12 +1191,10 @@ out_single: return ret; } -static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +static void vfio_iommu_legacy_class_init(ObjectClass *klass, const void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); - vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; - vioc->setup = vfio_legacy_setup; vioc->dma_map = 
vfio_legacy_dma_map; vioc->dma_unmap = vfio_legacy_dma_unmap; @@ -1185,7 +1253,7 @@ static void vfio_iommu_legacy_instance_init(Object *obj) QLIST_INIT(&container->group_list); } -static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) +static void hiod_legacy_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c new file mode 100644 index 0000000..a84c324 --- /dev/null +++ b/hw/vfio/cpr-legacy.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qemu/osdep.h" +#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return false; + } + container->cpr.vaddr_unmapped = true; + return true; +} + +/* + * Set the new @vaddr for any mappings registered during cpr load. + * The incoming state is cleared thereafter. 
+ */ +static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, void *vaddr, + bool readonly, MemoryRegion *mr) +{ + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_VADDR, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + g_assert(cpr_is_incoming()); + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + return -errno; + } + + return 0; +} + +static void vfio_region_remap(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + cpr.remap_listener); + vfio_container_region_add(&container->bcontainer, section, true); +} + +static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); + return false; + + } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); + return false; + + } else { + return true; + } +} + +static int vfio_container_pre_save(void *opaque) +{ + VFIOContainer *container = opaque; + Error *local_err = NULL; + + if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static int vfio_container_post_load(void *opaque, int version_id) +{ + VFIOContainer *container = opaque; + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOGroup *group; + Error *local_err = NULL; + + if (!vfio_listener_register(bcontainer, &local_err)) { + error_report_err(local_err); + return -1; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + + /* Restore original dma_map function */ + vioc->dma_map = 
container->cpr.saved_dma_map; + } + return 0; +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ + .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOContainer *container = + container_of(notifier, VFIOContainer, cpr.transfer_notifier); + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (e->type != MIG_EVENT_PRECOPY_FAILED) { + return 0; + } + + if (container->cpr.vaddr_unmapped) { + /* + * Force a call to vfio_region_remap for each mapped section by + * temporarily registering a listener, and temporarily diverting + * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. + */ + + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + vioc->dma_map = vfio_legacy_cpr_dma_map; + + container->cpr.remap_listener = (MemoryListener) { + .name = "vfio cpr recover", + .region_add = vfio_region_remap + }; + memory_listener_register(&container->cpr.remap_listener, + bcontainer->space->as); + memory_listener_unregister(&container->cpr.remap_listener); + container->cpr.vaddr_unmapped = false; + vioc->dma_map = container->cpr.saved_dma_map; + } + return 0; +} + +bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + Error **cpr_blocker = &container->cpr.blocker; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + if (!vfio_cpr_supported(container, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vmstate_register(NULL, -1, &vfio_container_vmstate, 
container); + + /* During incoming CPR, divert calls to dma_map. */ + if (cpr_is_incoming()) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + container->cpr.saved_dma_map = vioc->dma_map; + vioc->dma_map = vfio_legacy_cpr_dma_map; + } + + migration_add_notifier_mode(&container->cpr.transfer_notifier, + vfio_cpr_fail_notifier, + MIG_MODE_CPR_TRANSFER); + return true; +} + +void vfio_legacy_cpr_unregister_container(VFIOContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migrate_del_blocker(&container->cpr.blocker); + vmstate_unregister(NULL, &vfio_container_vmstate, container); + migration_remove_notifier(&container->cpr.transfer_notifier); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a giommu. + * + * The giommu already exists. Find it and replay it, which calls + * vfio_legacy_cpr_dma_map further down the stack. + */ +void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu = NULL; + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && + giommu->iommu_offset == iommu_offset) { + break; + } + } + g_assert(giommu); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a RamDiscardManager. + * + * The ram discard listener already exists. Call its populate function + * directly, which calls vfio_legacy_cpr_dma_map. 
+ */ +bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); + + g_assert(vrdl); + return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; +} + +int vfio_cpr_group_get_device_fd(int d, const char *name) +{ + const int id = 0; + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +static bool same_device(int fd1, int fd2) +{ + struct stat st1, st2; + + return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; +} + +bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, + int fd) +{ + if (container->fd == fd) { + return true; + } + if (!same_device(container->fd, fd)) { + return false; + } + /* + * Same device, different fd. This occurs when the container fd is + * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS + * produces duplicates. De-dup it. 
+ */ + cpr_delete_fd("vfio_container_for_group", group->groupid); + close(fd); + cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3d1c8d2..fdbb58e 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -6,13 +6,15 @@ */ #include "qemu/osdep.h" -#include "hw/vfio/vfio-common.h" -#include "migration/misc.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/pci.h" +#include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" -static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, - MigrationEvent *e, Error **errp) +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) { if (e->type == MIG_EVENT_PRECOPY_SETUP && !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) { @@ -37,3 +39,32 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) { migration_remove_notifier(&bcontainer->cpr_reboot_notifier); } + +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_cpr_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +const VMStateDescription vfio_cpr_pci_vmstate = { + .name = "vfio-cpr-pci", + .version_id = 0, + .minimum_version_id = 0, + .pre_load = vfio_cpr_pci_pre_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/device.c b/hw/vfio/device.c new file mode 100644 index 0000000..d91c695 --- /dev/null +++ b/hw/vfio/device.c @@ -0,0 +1,576 @@ +/* + * VFIO device + * + * Copyright Red Hat, Inc. 
2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> + +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/pci.h" +#include "hw/hw.h" +#include "trace.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/units.h" +#include "monitor/monitor.h" +#include "vfio-helpers.h" + +VFIODeviceList vfio_device_list = + QLIST_HEAD_INITIALIZER(vfio_device_list); + +/* + * We want to differentiate hot reset of multiple in-use devices vs + * hot reset of a single in-use device. VFIO_DEVICE_RESET will already + * handle the case of doing hot resets when there is only a single + * device per bus. The in-use here refers to how many VFIODevices are + * affected. A hot reset that affects multiple devices, but only a + * single in-use device, means that we can call it from our bus + * ->reset() callback since the extent is effectively a single + * device. This allows us to make use of it in the hotplug path. When + * there are multiple in-use devices, we can only trigger the hot + * reset during a system reset and thus from our reset handler. We + * separate _one vs _multi here so that we don't overlap and do a + * double reset on the system reset path where both our reset handler + * and ->reset() callback are used. Calling _one() will only do a hot + * reset for the one in-use devices case, calling _multi() will do + * nothing if a _one() would have been sufficient. 
+ */ +void vfio_device_reset_handler(void *opaque) +{ + VFIODevice *vbasedev; + + trace_vfio_device_reset_handler(); + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->dev->realized) { + vbasedev->ops->vfio_compute_needs_reset(vbasedev); + } + } + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->dev->realized && vbasedev->needs_reset) { + vbasedev->ops->vfio_hot_reset_multi(vbasedev); + } + } +} + +/* + * Common VFIO interrupt disable + */ +void vfio_device_irq_disable(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = index, + .start = 0, + .count = 0, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, + .index = index, + .start = 0, + .count = 1, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +void vfio_device_irq_mask(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, + .index = index, + .start = 0, + .count = 1, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +static inline const char *action_to_str(int action) +{ + switch (action) { + case VFIO_IRQ_SET_ACTION_MASK: + return "MASK"; + case VFIO_IRQ_SET_ACTION_UNMASK: + return "UNMASK"; + case VFIO_IRQ_SET_ACTION_TRIGGER: + return "TRIGGER"; + default: + return "UNKNOWN ACTION"; + } +} + +static const char *index_to_str(VFIODevice *vbasedev, int index) +{ + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + return NULL; + } + + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + return "INTX"; + case VFIO_PCI_MSI_IRQ_INDEX: + return "MSI"; + case VFIO_PCI_MSIX_IRQ_INDEX: + return "MSIX"; + case 
VFIO_PCI_ERR_IRQ_INDEX: + return "ERR"; + case VFIO_PCI_REQ_IRQ_INDEX: + return "REQ"; + default: + return NULL; + } +} + +bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex, + int action, int fd, Error **errp) +{ + ERRP_GUARD(); + g_autofree struct vfio_irq_set *irq_set = NULL; + int argsz; + const char *name; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; + irq_set->index = index; + irq_set->start = subindex; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = fd; + + if (!vbasedev->io_ops->set_irqs(vbasedev, irq_set)) { + return true; + } + + error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure"); + + name = index_to_str(vbasedev, index); + if (name) { + error_prepend(errp, "%s-%d: ", name, subindex); + } else { + error_prepend(errp, "index %d-%d: ", index, subindex); + } + error_prepend(errp, + "Failed to %s %s eventfd signaling for interrupt ", + fd < 0 ? 
"tear down" : "set up", action_to_str(action)); + return false; +} + +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->argsz = sizeof(*info); + info->index = index; + + return vbasedev->io_ops->get_irq_info(vbasedev, info); +} + +int vfio_device_get_region_info(VFIODevice *vbasedev, int index, + struct vfio_region_info **info) +{ + size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; + int ret; + + /* check cache */ + if (vbasedev->reginfo[index] != NULL) { + *info = vbasedev->reginfo[index]; + return 0; + } + + *info = g_malloc0(argsz); + + (*info)->index = index; +retry: + (*info)->argsz = argsz; + + ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd); + if (ret != 0) { + g_free(*info); + *info = NULL; + return ret; + } + + if ((*info)->argsz > argsz) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + + if (fd != -1) { + close(fd); + fd = -1; + } + + goto retry; + } + + /* fill cache */ + vbasedev->reginfo[index] = *info; + if (vbasedev->region_fds != NULL) { + vbasedev->region_fds[index] = fd; + } + + return 0; +} + +int vfio_device_get_region_fd(VFIODevice *vbasedev, int index) +{ + return vbasedev->region_fds ? 
+ vbasedev->region_fds[index] : + vbasedev->fd; +} + +int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_type *cap_type; + + if (vfio_device_get_region_info(vbasedev, i, info)) { + continue; + } + + hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); + if (!hdr) { + continue; + } + + cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); + + trace_vfio_device_get_region_info_type(vbasedev->name, i, + cap_type->type, cap_type->subtype); + + if (cap_type->type == type && cap_type->subtype == subtype) { + return 0; + } + } + + *info = NULL; + return -ENODEV; +} + +bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) +{ + struct vfio_region_info *info = NULL; + bool ret = false; + + if (!vfio_device_get_region_info(vbasedev, region, &info)) { + if (vfio_get_region_info_cap(info, cap_type)) { + ret = true; + } + } + + return ret; +} + +bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + ERRP_GUARD(); + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return false; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return false; + } + /* + * Give a name with fd so any function printing out vbasedev->name + * will not break. 
+ */ + if (!vbasedev->name) { + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } + } + + return true; +} + +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + ERRP_GUARD(); + int fd = monitor_fd_param(monitor_cur(), str, errp); + + if (fd < 0) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + vbasedev->fd = fd; +} + +static VFIODeviceIOOps vfio_device_io_ops_ioctl; + +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard) +{ + vbasedev->type = type; + vbasedev->ops = ops; + vbasedev->io_ops = &vfio_device_io_ops_ioctl; + vbasedev->dev = dev; + vbasedev->fd = -1; + vbasedev->use_region_fds = false; + + vbasedev->ram_block_discard_allowed = ram_discard; +} + +int vfio_device_get_aw_bits(VFIODevice *vdev) +{ + /* + * iova_ranges is a sorted list. For old kernels that support + * VFIO but not support query of iova ranges, iova_ranges is NULL, + * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. 
+ */ + GList *l = g_list_last(vdev->bcontainer->iova_ranges); + + if (l) { + Range *range = l->data; + return range_get_last_bit(range) + 1; + } + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; +} + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} + +bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev, + const char *typename, Error **errp) +{ + HostIOMMUDevice *hiod; + + if (vbasedev->mdev) { + return true; + } + + hiod = HOST_IOMMU_DEVICE(object_new(typename)); + + if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + object_unref(hiod); + return false; + } + + vbasedev->hiod = hiod; + return true; +} + +VFIODevice *vfio_get_vfio_device(Object *obj) +{ + if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) { + return &VFIO_PCI_BASE(obj)->vbasedev; + } else { + return NULL; + } +} + +bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name, + VFIODevice *vbasedev, AddressSpace *as, + Error **errp) +{ + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(iommu_type)); + + assert(ops); + + return ops->attach_device(name, vbasedev, as, errp); +} + +bool vfio_device_attach(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const char *iommu_type = vbasedev->iommufd ? 
+ TYPE_VFIO_IOMMU_IOMMUFD : + TYPE_VFIO_IOMMU_LEGACY; + + return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev, + as, errp); +} + +void vfio_device_detach(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); +} + +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, + struct vfio_device_info *info) +{ + vbasedev->num_irqs = info->num_irqs; + vbasedev->num_regions = info->num_regions; + vbasedev->flags = info->flags; + vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + vbasedev->reginfo = g_new0(struct vfio_region_info *, + vbasedev->num_regions); + if (vbasedev->use_region_fds) { + vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + } +} + +void vfio_device_unprepare(VFIODevice *vbasedev) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vbasedev->reginfo[i]); + if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { + close(vbasedev->region_fds[i]); + } + + } + + g_clear_pointer(&vbasedev->reginfo, g_free); + g_clear_pointer(&vbasedev->region_fds, g_free); + + QLIST_REMOVE(vbasedev, container_next); + QLIST_REMOVE(vbasedev, global_next); + vbasedev->bcontainer = NULL; +} + +/* + * Traditional ioctl() based io + */ + +static int vfio_device_io_device_feature(VFIODevice *vbasedev, + struct vfio_device_feature *feature) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info, + int *fd) +{ + int ret; + + *fd = -1; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + + return ret < 0 ? 
-errno : ret; +} + +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pread(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data, + bool post) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pwrite(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? 
-errno : ret; +} + +static VFIODeviceIOOps vfio_device_io_ops_ioctl = { + .device_feature = vfio_device_io_device_feature, + .get_region_info = vfio_device_io_get_region_info, + .get_irq_info = vfio_device_io_get_irq_info, + .set_irqs = vfio_device_io_set_irqs, + .region_read = vfio_device_io_region_read, + .region_write = vfio_device_io_region_write, +}; diff --git a/hw/vfio/display.c b/hw/vfio/display.c index 4fdcef5..9c6f5aa 100644 --- a/hw/vfio/display.c +++ b/hw/vfio/display.c @@ -16,9 +16,9 @@ #include "qemu/error-report.h" #include "hw/display/edid.h" -#include "ui/console.h" #include "qapi/error.h" #include "pci.h" +#include "vfio-display.h" #include "trace.h" #ifndef DRM_PLANE_TYPE_PRIMARY @@ -129,10 +129,10 @@ static bool vfio_display_edid_init(VFIOPCIDevice *vdev, Error **errp) int fd = vdev->vbasedev.fd; int ret; - ret = vfio_get_dev_region_info(&vdev->vbasedev, - VFIO_REGION_TYPE_GFX, - VFIO_REGION_SUBTYPE_GFX_EDID, - &dpy->edid_info); + ret = vfio_device_get_region_info_type(&vdev->vbasedev, + VFIO_REGION_TYPE_GFX, + VFIO_REGION_SUBTYPE_GFX_EDID, + &dpy->edid_info); if (ret) { /* Failed to get GFX edid info, allow to go through without edid. 
*/ return true; @@ -213,6 +213,7 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice *vdev, struct vfio_device_gfx_plane_info plane; VFIODMABuf *dmabuf; int fd, ret; + uint32_t offset = 0; memset(&plane, 0, sizeof(plane)); plane.argsz = sizeof(plane); @@ -245,10 +246,10 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice *vdev, dmabuf = g_new0(VFIODMABuf, 1); dmabuf->dmabuf_id = plane.dmabuf_id; - dmabuf->buf = qemu_dmabuf_new(plane.width, plane.height, - plane.stride, 0, 0, plane.width, + dmabuf->buf = qemu_dmabuf_new(plane.width, plane.height, &offset, + &plane.stride, 0, 0, plane.width, plane.height, plane.drm_format, - plane.drm_format_mod, fd, false, false); + plane.drm_format_mod, &fd, 1, false, false); if (plane_type == DRM_PLANE_TYPE_CURSOR) { vfio_display_update_cursor(dmabuf, &plane); diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 4b255d4..d0dbab1 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -22,242 +22,11 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> -#include "hw/vfio/vfio-common.h" -#include "hw/vfio/pci.h" +#include "system/kvm.h" +#include "hw/vfio/vfio-device.h" #include "hw/hw.h" -#include "trace.h" #include "qapi/error.h" -#include "qemu/error-report.h" -#include "qemu/units.h" -#include "monitor/monitor.h" - -/* - * Common VFIO interrupt disable - */ -void vfio_disable_irqindex(VFIODevice *vbasedev, int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, - .index = index, - .start = 0, - .count = 0, - }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, - .index = index, - .start = 0, - .count = 1, - }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -void vfio_mask_single_irqindex(VFIODevice *vbasedev, 
int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, - .index = index, - .start = 0, - .count = 1, - }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -static inline const char *action_to_str(int action) -{ - switch (action) { - case VFIO_IRQ_SET_ACTION_MASK: - return "MASK"; - case VFIO_IRQ_SET_ACTION_UNMASK: - return "UNMASK"; - case VFIO_IRQ_SET_ACTION_TRIGGER: - return "TRIGGER"; - default: - return "UNKNOWN ACTION"; - } -} - -static const char *index_to_str(VFIODevice *vbasedev, int index) -{ - if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { - return NULL; - } - - switch (index) { - case VFIO_PCI_INTX_IRQ_INDEX: - return "INTX"; - case VFIO_PCI_MSI_IRQ_INDEX: - return "MSI"; - case VFIO_PCI_MSIX_IRQ_INDEX: - return "MSIX"; - case VFIO_PCI_ERR_IRQ_INDEX: - return "ERR"; - case VFIO_PCI_REQ_IRQ_INDEX: - return "REQ"; - default: - return NULL; - } -} - -bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, - int action, int fd, Error **errp) -{ - ERRP_GUARD(); - g_autofree struct vfio_irq_set *irq_set = NULL; - int argsz; - const char *name; - int32_t *pfd; - - argsz = sizeof(*irq_set) + sizeof(*pfd); - - irq_set = g_malloc0(argsz); - irq_set->argsz = argsz; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; - irq_set->index = index; - irq_set->start = subindex; - irq_set->count = 1; - pfd = (int32_t *)&irq_set->data; - *pfd = fd; - - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { - return true; - } - - error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure"); - - name = index_to_str(vbasedev, index); - if (name) { - error_prepend(errp, "%s-%d: ", name, subindex); - } else { - error_prepend(errp, "index %d-%d: ", index, subindex); - } - error_prepend(errp, - "Failed to %s %s eventfd signaling for interrupt ", - fd < 0 ? 
"tear down" : "set up", action_to_str(action)); - return false; -} - -/* - * IO Port/MMIO - Beware of the endians, VFIO is always little endian - */ -void vfio_region_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIORegion *region = opaque; - VFIODevice *vbasedev = region->vbasedev; - union { - uint8_t byte; - uint16_t word; - uint32_t dword; - uint64_t qword; - } buf; - - switch (size) { - case 1: - buf.byte = data; - break; - case 2: - buf.word = cpu_to_le16(data); - break; - case 4: - buf.dword = cpu_to_le32(data); - break; - case 8: - buf.qword = cpu_to_le64(data); - break; - default: - hw_error("vfio: unsupported write size, %u bytes", size); - break; - } - - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", - __func__, vbasedev->name, region->nr, - addr, data, size); - } - - trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); - - /* - * A read or write to a BAR always signals an INTx EOI. This will - * do nothing if not pending (including not in INTx mode). We assume - * that a BAR access is in response to an interrupt and that BAR - * accesses will service the interrupt. Unfortunately, we don't know - * which access will service the interrupt, so we're potentially - * getting quite a few host interrupts per guest interrupt. 
- */ - vbasedev->ops->vfio_eoi(vbasedev); -} - -uint64_t vfio_region_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIORegion *region = opaque; - VFIODevice *vbasedev = region->vbasedev; - union { - uint8_t byte; - uint16_t word; - uint32_t dword; - uint64_t qword; - } buf; - uint64_t data = 0; - - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", - __func__, vbasedev->name, region->nr, - addr, size); - return (uint64_t)-1; - } - switch (size) { - case 1: - data = buf.byte; - break; - case 2: - data = le16_to_cpu(buf.word); - break; - case 4: - data = le32_to_cpu(buf.dword); - break; - case 8: - data = le64_to_cpu(buf.qword); - break; - default: - hw_error("vfio: unsupported read size, %u bytes", size); - break; - } - - trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); - - /* Same as write above */ - vbasedev->ops->vfio_eoi(vbasedev); - - return data; -} - -const MemoryRegionOps vfio_region_ops = { - .read = vfio_region_read, - .write = vfio_region_write, - .endianness = DEVICE_LITTLE_ENDIAN, - .valid = { - .min_access_size = 1, - .max_access_size = 8, - }, - .impl = { - .min_access_size = 1, - .max_access_size = 8, - }, -}; +#include "vfio-helpers.h" int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size) { @@ -306,435 +75,126 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } -static int vfio_setup_region_sparse_mmaps(VFIORegion *region, - struct vfio_region_info *info) +struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) { - struct vfio_info_cap_header *hdr; - struct vfio_region_info_cap_sparse_mmap *sparse; - int i, j; - - hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); - if (!hdr) { - return -ENODEV; - } - - sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); - - 
trace_vfio_region_sparse_mmap_header(region->vbasedev->name, - region->nr, sparse->nr_areas); - - region->mmaps = g_new0(VFIOMmap, sparse->nr_areas); - - for (i = 0, j = 0; i < sparse->nr_areas; i++) { - if (sparse->areas[i].size) { - trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset, - sparse->areas[i].offset + - sparse->areas[i].size - 1); - region->mmaps[j].offset = sparse->areas[i].offset; - region->mmaps[j].size = sparse->areas[i].size; - j++; - } + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; } - region->nr_mmaps = j; - region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap)); - - return 0; + return vfio_get_cap((void *)info, info->cap_offset, id); } -int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, - int index, const char *name) +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail) { - g_autofree struct vfio_region_info *info = NULL; - int ret; + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_dma_avail *cap; - ret = vfio_get_region_info(vbasedev, index, &info); - if (ret) { - return ret; + /* If the capability cannot be found, assume no DMA limiting */ + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); + if (!hdr) { + return false; } - region->vbasedev = vbasedev; - region->flags = info->flags; - region->size = info->size; - region->fd_offset = info->offset; - region->nr = index; - - if (region->size) { - region->mem = g_new0(MemoryRegion, 1); - memory_region_init_io(region->mem, obj, &vfio_region_ops, - region, name, region->size); - - if (!vbasedev->no_mmap && - region->flags & VFIO_REGION_INFO_FLAG_MMAP) { - - ret = vfio_setup_region_sparse_mmaps(region, info); - - if (ret) { - region->nr_mmaps = 1; - region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); - region->mmaps[0].offset = 0; - region->mmaps[0].size = region->size; - } - } + if (avail != NULL) { + cap = (void *) hdr; + *avail = cap->avail; } - 
trace_vfio_region_setup(vbasedev->name, index, name, - region->flags, region->fd_offset, region->size); - return 0; + return true; } -static void vfio_subregion_unmap(VFIORegion *region, int index) -{ - trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem), - region->mmaps[index].offset, - region->mmaps[index].offset + - region->mmaps[index].size - 1); - memory_region_del_subregion(region->mem, &region->mmaps[index].mem); - munmap(region->mmaps[index].mmap, region->mmaps[index].size); - object_unparent(OBJECT(&region->mmaps[index].mem)); - region->mmaps[index].mmap = NULL; -} +#ifdef CONFIG_KVM +/* + * We have a single VFIO pseudo device per KVM VM. Once created it lives + * for the life of the VM. Closing the file descriptor only drops our + * reference to it and the device's reference to kvm. Therefore once + * initialized, this file descriptor is only released on QEMU exit and + * we'll re-use it should another vfio device be attached before then. + */ +int vfio_kvm_device_fd = -1; +#endif -int vfio_region_mmap(VFIORegion *region) +int vfio_kvm_device_add_fd(int fd, Error **errp) { - int i, ret, prot = 0; - char *name; +#ifdef CONFIG_KVM + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_FILE, + .attr = KVM_DEV_VFIO_FILE_ADD, + .addr = (uint64_t)(unsigned long)&fd, + }; - if (!region->mem) { + if (!kvm_enabled()) { return 0; } - prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; - prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; - - for (i = 0; i < region->nr_mmaps; i++) { - size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); - void *map_base, *map_align; - - /* - * Align the mmap for more efficient mapping in the kernel. Ideally - * we'd know the PMD and PUD mapping sizes to use as discrete alignment - * intervals, but we don't. As of Linux v6.12, the largest PUD size - * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set - * on x86_64). 
Align by power-of-two size, capped at 1GiB. - * - * NB. qemu_memalign() and friends actually allocate memory, whereas - * the region size here can exceed host memory, therefore we manually - * create an oversized anonymous mapping and clean it up for alignment. - */ - map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (map_base == MAP_FAILED) { - ret = -errno; - goto no_mmap; - } + if (vfio_kvm_device_fd < 0) { + struct kvm_create_device cd = { + .type = KVM_DEV_TYPE_VFIO, + }; - map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); - munmap(map_base, map_align - map_base); - munmap(map_align + region->mmaps[i].size, - align - (map_align - map_base)); - - region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, - MAP_SHARED | MAP_FIXED, - region->vbasedev->fd, - region->fd_offset + - region->mmaps[i].offset); - if (region->mmaps[i].mmap == MAP_FAILED) { - ret = -errno; - goto no_mmap; + if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { + error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); + return -errno; } - name = g_strdup_printf("%s mmaps[%d]", - memory_region_name(region->mem), i); - memory_region_init_ram_device_ptr(&region->mmaps[i].mem, - memory_region_owner(region->mem), - name, region->mmaps[i].size, - region->mmaps[i].mmap); - g_free(name); - memory_region_add_subregion(region->mem, region->mmaps[i].offset, - &region->mmaps[i].mem); - - trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem), - region->mmaps[i].offset, - region->mmaps[i].offset + - region->mmaps[i].size - 1); - } - - return 0; - -no_mmap: - trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, - region->fd_offset + region->mmaps[i].offset, - region->fd_offset + region->mmaps[i].offset + - region->mmaps[i].size - 1, ret); - - region->mmaps[i].mmap = NULL; - - for (i--; i >= 0; i--) { - vfio_subregion_unmap(region, i); - } - - return ret; -} - -void vfio_region_unmap(VFIORegion 
*region) -{ - int i; - - if (!region->mem) { - return; + vfio_kvm_device_fd = cd.fd; } - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - vfio_subregion_unmap(region, i); - } - } -} - -void vfio_region_exit(VFIORegion *region) -{ - int i; - - if (!region->mem) { - return; - } - - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - memory_region_del_subregion(region->mem, &region->mmaps[i].mem); - } - } - - trace_vfio_region_exit(region->vbasedev->name, region->nr); -} - -void vfio_region_finalize(VFIORegion *region) -{ - int i; - - if (!region->mem) { - return; - } - - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - munmap(region->mmaps[i].mmap, region->mmaps[i].size); - object_unparent(OBJECT(&region->mmaps[i].mem)); - } - } - - object_unparent(OBJECT(region->mem)); - - g_free(region->mem); - g_free(region->mmaps); - - trace_vfio_region_finalize(region->vbasedev->name, region->nr); - - region->mem = NULL; - region->mmaps = NULL; - region->nr_mmaps = 0; - region->size = 0; - region->flags = 0; - region->nr = 0; -} - -void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) -{ - int i; - - if (!region->mem) { - return; - } - - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - memory_region_set_enabled(&region->mmaps[i].mem, enabled); - } - } - - trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), - enabled); -} - -int vfio_get_region_info(VFIODevice *vbasedev, int index, - struct vfio_region_info **info) -{ - size_t argsz = sizeof(struct vfio_region_info); - - *info = g_malloc0(argsz); - - (*info)->index = index; -retry: - (*info)->argsz = argsz; - - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { - g_free(*info); - *info = NULL; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", + fd); return -errno; } - - if ((*info)->argsz > argsz) { - argsz = 
(*info)->argsz; - *info = g_realloc(*info, argsz); - - goto retry; - } - +#endif return 0; } -int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, - uint32_t subtype, struct vfio_region_info **info) -{ - int i; - - for (i = 0; i < vbasedev->num_regions; i++) { - struct vfio_info_cap_header *hdr; - struct vfio_region_info_cap_type *cap_type; - - if (vfio_get_region_info(vbasedev, i, info)) { - continue; - } - - hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); - if (!hdr) { - g_free(*info); - continue; - } - - cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); - - trace_vfio_get_dev_region(vbasedev->name, i, - cap_type->type, cap_type->subtype); - - if (cap_type->type == type && cap_type->subtype == subtype) { - return 0; - } - - g_free(*info); - } - - *info = NULL; - return -ENODEV; -} - -bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) +int vfio_kvm_device_del_fd(int fd, Error **errp) { - g_autofree struct vfio_region_info *info = NULL; - bool ret = false; - - if (!vfio_get_region_info(vbasedev, region, &info)) { - if (vfio_get_region_info_cap(info, cap_type)) { - ret = true; - } - } - - return ret; -} +#ifdef CONFIG_KVM + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_FILE, + .attr = KVM_DEV_VFIO_FILE_DEL, + .addr = (uint64_t)(unsigned long)&fd, + }; -bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) -{ - ERRP_GUARD(); - struct stat st; - - if (vbasedev->fd < 0) { - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); - return false; - } - /* User may specify a name, e.g: VFIO platform device */ - if (!vbasedev->name) { - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } - } else { - if (!vbasedev->iommufd) { - error_setg(errp, "Use FD passing only with iommufd backend"); - return false; - } - /* - * Give a name with fd so any function printing out 
vbasedev->name - * will not break. - */ - if (!vbasedev->name) { - vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); - } + if (vfio_kvm_device_fd < 0) { + error_setg(errp, "KVM VFIO device isn't created yet"); + return -EINVAL; } - return true; -} - -void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) -{ - ERRP_GUARD(); - int fd = monitor_fd_param(monitor_cur(), str, errp); - - if (fd < 0) { - error_prepend(errp, "Could not parse remote object fd %s:", str); - return; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, + "Failed to remove fd %d from KVM VFIO device", fd); + return -errno; } - vbasedev->fd = fd; -} - -void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard) -{ - vbasedev->type = type; - vbasedev->ops = ops; - vbasedev->dev = dev; - vbasedev->fd = -1; - - vbasedev->ram_block_discard_allowed = ram_discard; +#endif + return 0; } -int vfio_device_get_aw_bits(VFIODevice *vdev) +struct vfio_device_info *vfio_get_device_info(int fd) { - /* - * iova_ranges is a sorted list. For old kernels that support - * VFIO but not support query of iova ranges, iova_ranges is NULL, - * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. 
- */ - GList *l = g_list_last(vdev->bcontainer->iova_ranges); - - if (l) { - Range *range = l->data; - return range_get_last_bit(range) + 1; - } + struct vfio_device_info *info; + uint32_t argsz = sizeof(*info); - return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; -} + info = g_malloc0(argsz); -bool vfio_device_is_mdev(VFIODevice *vbasedev) -{ - g_autofree char *subsys = NULL; - g_autofree char *tmp = NULL; +retry: + info->argsz = argsz; - if (!vbasedev->sysfsdev) { - return false; + if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { + g_free(info); + return NULL; } - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); -} - -bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) -{ - HostIOMMUDevice *hiod = vbasedev->hiod; - - if (!hiod) { - return true; + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; } - return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); -} - -VFIODevice *vfio_get_vfio_device(Object *obj) -{ - if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) { - return &VFIO_PCI(obj)->vbasedev; - } else { - return NULL; - } + return info; } diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 265fffc..e7a9d1f 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -103,6 +103,7 @@ static int igd_gen(VFIOPCIDevice *vdev) /* * Unfortunately, Intel changes it's specification quite often. This makes * it impossible to use a suitable default value for unknown devices. + * Return -1 for not applying any generation-specific quirks. 
*/ return -1; } @@ -182,34 +183,25 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); - pci_set_long(vdev->pdev.config + IGD_ASLS, 0); - pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); - return true; } -static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) +static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, + struct vfio_region_info **opregion) { - g_autofree struct vfio_region_info *opregion = NULL; int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); return false; } - if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -301,7 +293,8 @@ static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp) } } -static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data) +static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); @@ -317,7 +310,7 @@ static const TypeInfo vfio_pci_igd_lpc_bridge_info = { .name = "vfio-pci-igd-lpc-bridge", .parent = TYPE_PCI_DEVICE, .class_init = vfio_pci_igd_lpc_bridge_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = 
(const InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, }, @@ -354,8 +347,8 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *host = NULL; - g_autofree struct vfio_region_info *lpc = NULL; + struct vfio_region_info *host = NULL; + struct vfio_region_info *lpc = NULL; PCIDevice *lpc_bridge; int ret; @@ -385,7 +378,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) * Check whether we have all the vfio device specific regions to * support LPC quirk (added in Linux v4.6). */ - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); if (ret) { @@ -393,7 +386,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) return false; } - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); if (ret) { @@ -418,6 +411,44 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) return true; } +static bool vfio_pci_igd_override_gms(int gen, uint32_t gms, uint32_t *gmch) +{ + bool ret = false; + + if (gen == -1) { + error_report("x-igd-gms is not supported on this device"); + } else if (gen < 8) { + if (gms <= 0x10) { + *gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN6_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x10"); + } + } else if (gen == 8) { + if (gms <= 0x40) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x40"); + 
} + } else { + /* 0x0 to 0x40: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if ((gms <= 0x40) || (gms >= 0xf0 && gms <= 0xfe)) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, + "x-igd-gms", "0~0x40 or 0xf0~0xfe"); + } + } + + return ret; +} + #define IGD_GGC_MMIO_OFFSET 0x108040 #define IGD_BDSM_MMIO_OFFSET 0x1080C0 @@ -427,41 +458,35 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) VFIOConfigMirrorQuirk *ggc_mirror, *bdsm_mirror; int gen; - /* - * This must be an Intel VGA device at address 00:02.0 for us to even - * consider enabling legacy mode. Some driver have dependencies on the PCI - * bus address. - */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || !vfio_is_vga(vdev) || nr != 0) { return; } - /* - * Only on IGD devices of gen 11 and above, the BDSM register is mirrored - * into MMIO space and read from MMIO space by the Windows driver. 
- */ + /* Only on IGD Gen6-12 device needs quirks in BAR 0 */ gen = igd_gen(vdev); if (gen < 6) { return; } - ggc_quirk = vfio_quirk_alloc(1); - ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror)); - ggc_mirror->mem = ggc_quirk->mem; - ggc_mirror->vdev = vdev; - ggc_mirror->bar = nr; - ggc_mirror->offset = IGD_GGC_MMIO_OFFSET; - ggc_mirror->config_offset = IGD_GMCH; - - memory_region_init_io(ggc_mirror->mem, OBJECT(vdev), - &vfio_generic_mirror_quirk, ggc_mirror, - "vfio-igd-ggc-quirk", 2); - memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, - ggc_mirror->offset, ggc_mirror->mem, - 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next); + if (vdev->igd_gms) { + ggc_quirk = vfio_quirk_alloc(1); + ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror)); + ggc_mirror->mem = ggc_quirk->mem; + ggc_mirror->vdev = vdev; + ggc_mirror->bar = nr; + ggc_mirror->offset = IGD_GGC_MMIO_OFFSET; + ggc_mirror->config_offset = IGD_GMCH; + + memory_region_init_io(ggc_mirror->mem, OBJECT(vdev), + &vfio_generic_mirror_quirk, ggc_mirror, + "vfio-igd-ggc-quirk", 2); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + ggc_mirror->offset, ggc_mirror->mem, + 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next); + } bdsm_quirk = vfio_quirk_alloc(1); bdsm_mirror = bdsm_quirk->data = g_malloc0(sizeof(*bdsm_mirror)); @@ -483,44 +508,37 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { + struct vfio_region_info *opregion = NULL; int ret, gen; - uint64_t gms_size; + uint64_t gms_size = 0; uint64_t *bdsm_size; uint32_t gmch; bool legacy_mode_enabled = false; Error *err = NULL; - /* - * This must be an Intel VGA device at address 00:02.0 for us to even - * consider enabling legacy mode. The vBIOS has dependencies on the - * PCI bus address. 
- */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || !vfio_is_vga(vdev)) { return true; } - /* - * IGD is not a standard, they like to change their specs often. We - * only attempt to support back to SandBridge and we hope that newer - * devices maintain compatibility with generation 8. - */ - gen = igd_gen(vdev); - if (gen == -1) { - error_report("IGD device %s is unsupported in legacy mode, " - "try SandyBridge or newer", vdev->vbasedev.name); + /* IGD device always comes with OpRegion */ + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { return true; } + info_report("OpRegion detected on Intel display %x.", vdev->device_id); + gen = igd_gen(vdev); gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); /* * For backward compatibility, enable legacy mode when + * - Device generation is 6 to 9 (including both) * - Machine type is i440fx (pc_piix) * - IGD device is at guest BDF 00:02.0 * - Not manually disabled by x-igd-legacy-mode=off */ if ((vdev->igd_legacy_mode != ON_OFF_AUTO_OFF) && + (gen >= 6 && gen <= 9) && !strcmp(MACHINE_GET_CLASS(qdev_get_machine())->family, "pc_piix") && (&vdev->pdev == pci_find_device(pci_device_root_bus(&vdev->pdev), 0, PCI_DEVFN(0x2, 0)))) { @@ -531,7 +549,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * - OpRegion * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host */ - g_autofree struct vfio_region_info *rom = NULL; + struct vfio_region_info *rom = NULL; legacy_mode_enabled = true; info_report("IGD legacy mode enabled, " @@ -542,8 +560,8 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * there's no ROM, there's no point in setting up this quirk. * NB. We only seem to get BIOS ROMs, so UEFI VM would need CSM support. 
*/ - ret = vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &rom); + ret = vfio_device_get_region_info(&vdev->vbasedev, + VFIO_PCI_ROM_REGION_INDEX, &rom); if ((ret || !rom->size) && !vdev->pdev.romfile) { error_setg(&err, "Device has no ROM"); goto error; @@ -565,13 +583,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) vdev->features |= VFIO_FEATURE_ENABLE_IGD_LPC; } else if (vdev->igd_legacy_mode == ON_OFF_AUTO_ON) { error_setg(&err, - "Machine is not i440fx or assigned BDF is not 00:02.0"); + "Machine is not i440fx, assigned BDF is not 00:02.0, " + "or device %04x (gen %d) doesn't support legacy mode", + vdev->device_id, gen); goto error; } /* Setup OpRegion access */ if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && - !vfio_pci_igd_setup_opregion(vdev, errp)) { + !vfio_pci_igd_opregion_init(vdev, opregion, errp)) { goto error; } @@ -579,7 +599,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_LPC) && !vfio_pci_igd_setup_lpc_bridge(vdev, errp)) { goto error; - } + } + + /* + * ASLS (OpRegion address) is read-only, emulated + * It contains HPA, guest firmware need to reprogram it with GPA. + */ + pci_set_long(vdev->pdev.config + IGD_ASLS, 0); + pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); /* * Allow user to override dsm size using x-igd-gms option, in multiples of @@ -587,56 +615,44 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * set from DVMT Pre-Allocated option in host BIOS. 
*/ if (vdev->igd_gms) { - if (gen < 8) { - if (vdev->igd_gms <= 0x10) { - gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT); - gmch |= vdev->igd_gms << IGD_GMCH_GEN6_GMS_SHIFT; - } else { - error_report(QERR_INVALID_PARAMETER_VALUE, - "x-igd-gms", "0~0x10"); - } - } else { - if (vdev->igd_gms <= 0x40) { - gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); - gmch |= vdev->igd_gms << IGD_GMCH_GEN8_GMS_SHIFT; - } else { - error_report(QERR_INVALID_PARAMETER_VALUE, - "x-igd-gms", "0~0x40"); - } + if (!vfio_pci_igd_override_gms(gen, vdev->igd_gms, &gmch)) { + return false; } + + /* GMCH is read-only, emulated */ + pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); + pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); + pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); } - gms_size = igd_stolen_memory_size(gen, gmch); + if (gen > 0) { + gms_size = igd_stolen_memory_size(gen, gmch); + + /* BDSM is read-write, emulated. BIOS needs to be able to write it */ + if (gen < 11) { + pci_set_long(vdev->pdev.config + IGD_BDSM, 0); + pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); + } else { + pci_set_quad(vdev->pdev.config + IGD_BDSM_GEN11, 0); + pci_set_quad(vdev->pdev.wmask + IGD_BDSM_GEN11, ~0); + pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); + } + } /* * Request reserved memory for stolen memory via fw_cfg. VM firmware * must allocate a 1MB aligned reserved memory region below 4GB with - * the requested size (in bytes) for use by the Intel PCI class VGA - * device at VM address 00:02.0. The base address of this reserved - * memory region must be written to the device BDSM register at PCI - * config offset 0x5C. + * the requested size (in bytes) for use by the IGD device. The base + * address of this reserved memory region must be written to the + * device BDSM register. + * For newer device without BDSM register, this fw_cfg item is 0. 
*/ bdsm_size = g_malloc(sizeof(*bdsm_size)); *bdsm_size = cpu_to_le64(gms_size); fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", bdsm_size, sizeof(*bdsm_size)); - /* GMCH is read-only, emulated */ - pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); - pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); - pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); - - /* BDSM is read-write, emulated. The BIOS needs to be able to write it */ - if (gen < 11) { - pci_set_long(vdev->pdev.config + IGD_BDSM, 0); - pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); - } else { - pci_set_quad(vdev->pdev.config + IGD_BDSM_GEN11, 0); - pci_set_quad(vdev->pdev.wmask + IGD_BDSM_GEN11, ~0); - pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); - } - trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB)); return true; @@ -663,8 +679,27 @@ error: */ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) { + struct vfio_region_info *opregion = NULL; + int gen; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || + !vfio_is_vga(vdev)) { + return true; + } + + /* FIXME: Cherryview is Gen8, but don't support GVT-g */ + gen = igd_gen(vdev); + if (gen != 8 && gen != 9) { + return true; + } + + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { + /* Should never reach here, KVMGT always emulates OpRegion */ + return false; + } + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && - !vfio_pci_igd_setup_opregion(vdev, errp)) { + !vfio_pci_igd_opregion_init(vdev, opregion, errp)) { return false; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 42c8412..d3efef7 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -15,19 +15,27 @@ #include <linux/vfio.h> #include <linux/iommufd.h> -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "qemu/error-report.h" #include "trace.h" #include "qapi/error.h" #include "system/iommufd.h" #include 
"hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" +#include "vfio-iommufd.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -39,11 +47,28 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, bool unmap_all) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + /* unmap in halves */ + if (unmap_all) { + Int128 llsize = int128_rshift(int128_2_64(), 1); + int ret; + + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + 0, int128_get64(llsize)); + + if (ret == 0) { + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + int128_get64(llsize), + int128_get64(llsize)); + } + + return ret; + } + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ return iommufd_backend_unmap_dma(container->be, container->ioas_id, iova, size); @@ -280,7 +305,8 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, { ERRP_GUARD(); IOMMUFDBackend *iommufd = vbasedev->iommufd; - uint32_t flags = 0; + uint32_t type, flags = 0; + uint64_t hw_caps; VFIOIOASHwpt *hwpt; uint32_t hwpt_id; int ret; @@ -317,7 +343,12 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, * vfio_migration_realize() may decide to use VF dirty tracking * instead. 
*/ - if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid, + &type, NULL, 0, &hw_caps, errp)) { + return false; + } + + if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } @@ -403,7 +434,8 @@ static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) if (!QLIST_EMPTY(&bcontainer->device_list)) { return; } - memory_listener_unregister(&bcontainer->listener); + vfio_cpr_unregister_container(bcontainer); + vfio_listener_unregister(bcontainer); iommufd_backend_free_id(container->be, container->ioas_id); object_unref(container); } @@ -485,18 +517,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_connect_bind; } - space = vfio_get_address_space(as); - - /* - * The HostIOMMUDevice data from legacy backend is static and doesn't need - * any information from the (type1-iommu) backend to be initialized. In - * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd - * FD to be connected and having a devid to be able to successfully call - * iommufd_backend_get_device_info(). 
- */ - if (!vfio_device_hiod_realize(vbasedev, errp)) { - goto err_alloc_ioas; - } + space = vfio_address_space_get(as); /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { @@ -555,12 +576,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, bcontainer->pgsizes = qemu_real_host_page_size(); } - bcontainer->listener = vfio_memory_listener; - memory_listener_register(&bcontainer->listener, bcontainer->space->as); + if (!vfio_listener_register(bcontainer, errp)) { + goto err_listener_register; + } - if (bcontainer->error) { - error_propagate_prepend(errp, bcontainer->error, - "memory listener initialization failed: "); + if (!vfio_cpr_register_container(bcontainer, errp)) { goto err_listener_register; } @@ -573,7 +593,12 @@ found_container: goto err_listener_register; } - if (!vfio_cpr_register_container(bcontainer, errp)) { + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. 
+ */ + if (!vfio_device_hiod_create_and_realize(vbasedev, + TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; } @@ -585,14 +610,7 @@ found_container: iommufd_cdev_ram_block_discard_disable(false); } - vbasedev->group = 0; - vbasedev->num_irqs = dev_info.num_irqs; - vbasedev->num_regions = dev_info.num_regions; - vbasedev->flags = dev_info.flags; - vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + vfio_device_prepare(vbasedev, bcontainer, &dev_info); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); @@ -605,7 +623,7 @@ err_discard_disable: err_attach_container: iommufd_cdev_container_destroy(container); err_alloc_ioas: - vfio_put_address_space(space); + vfio_address_space_put(space); iommufd_cdev_unbind_and_disconnect(vbasedev); err_connect_bind: close(vbasedev->fd); @@ -619,18 +637,16 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; + vfio_device_unprepare(vbasedev); if (!vbasedev->ram_block_discard_allowed) { iommufd_cdev_ram_block_discard_disable(false); } - vfio_cpr_unregister_container(bcontainer); + object_unref(vbasedev->hiod); iommufd_cdev_detach_container(vbasedev, container); iommufd_cdev_container_destroy(container); - vfio_put_address_space(space); + vfio_address_space_put(space); iommufd_cdev_unbind_and_disconnect(vbasedev); close(vbasedev->fd); @@ -786,12 +802,10 @@ out_single: return ret; } -static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) { VFIOIOMMUClass 
*vioc = VFIO_IOMMU_CLASS(klass); - vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; - vioc->dma_map = iommufd_cdev_map; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; @@ -801,21 +815,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -824,6 +855,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -846,13 +882,17 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) } -static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { 
HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/common.c b/hw/vfio/listener.c index d8aad4e..f498e23 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/listener.c @@ -25,12 +25,11 @@ #endif #include <linux/vfio.h> -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "hw/vfio/pci.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" -#include "exec/ram_addr.h" -#include "exec/target_page.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -41,160 +40,23 @@ #include "trace.h" #include "qapi/error.h" #include "migration/misc.h" -#include "migration/blocker.h" #include "migration/qemu-file.h" #include "system/tcg.h" #include "system/tpm.h" - -VFIODeviceList vfio_device_list = - QLIST_HEAD_INITIALIZER(vfio_device_list); -static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = - QLIST_HEAD_INITIALIZER(vfio_address_spaces); - -#ifdef CONFIG_KVM -/* - * We have a single VFIO pseudo device per KVM VM. Once created it lives - * for the life of the VM. Closing the file descriptor only drops our - * reference to it and the device's reference to kvm. Therefore once - * initialized, this file descriptor is only released on QEMU exit and - * we'll re-use it should another vfio device be attached before then. 
- */ -int vfio_kvm_device_fd = -1; -#endif +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" /* * Device state interfaces */ -bool vfio_mig_active(void) -{ - VFIODevice *vbasedev; - - if (QLIST_EMPTY(&vfio_device_list)) { - return false; - } - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->migration_blocker) { - return false; - } - } - return true; -} - -static Error *multiple_devices_migration_blocker; - -/* - * Multiple devices migration is allowed only if all devices support P2P - * migration. Single device migration is allowed regardless of P2P migration - * support. - */ -static bool vfio_multiple_devices_migration_is_supported(void) -{ - VFIODevice *vbasedev; - unsigned int device_num = 0; - bool all_support_p2p = true; - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->migration) { - device_num++; - - if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { - all_support_p2p = false; - } - } - } - - return all_support_p2p || device_num <= 1; -} - -int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) -{ - if (vfio_multiple_devices_migration_is_supported()) { - return 0; - } - - if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { - error_setg(errp, "Multiple VFIO devices migration is supported only if " - "all of them support P2P migration"); - return -EINVAL; - } - - if (multiple_devices_migration_blocker) { - return 0; - } - - error_setg(&multiple_devices_migration_blocker, - "Multiple VFIO devices migration is supported only if all of " - "them support P2P migration"); - return migrate_add_blocker_normal(&multiple_devices_migration_blocker, - errp); -} - -void vfio_unblock_multiple_devices_migration(void) -{ - if (!multiple_devices_migration_blocker || - !vfio_multiple_devices_migration_is_supported()) { - return; - } - - migrate_del_blocker(&multiple_devices_migration_blocker); -} - -bool vfio_viommu_preset(VFIODevice *vbasedev) 
-{ - return vbasedev->bcontainer->space->as != &address_space_memory; -} - -static void vfio_set_migration_error(int ret) -{ - if (migration_is_running()) { - migration_file_set_error(ret, NULL); - } -} - -bool vfio_device_state_is_running(VFIODevice *vbasedev) -{ - VFIOMigration *migration = vbasedev->migration; - - return migration->device_state == VFIO_DEVICE_STATE_RUNNING || - migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; -} - -bool vfio_device_state_is_precopy(VFIODevice *vbasedev) -{ - VFIOMigration *migration = vbasedev->migration; - - return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || - migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; -} - -static bool vfio_devices_all_device_dirty_tracking_started( - const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (!vbasedev->dirty_tracking) { - return false; - } - } - - return true; -} - -bool vfio_devices_all_dirty_tracking_started( - const VFIOContainerBase *bcontainer) -{ - return vfio_devices_all_device_dirty_tracking_started(bcontainer) || - bcontainer->dirty_pages_started; -} static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; - if (!vfio_devices_all_dirty_tracking_started(bcontainer)) { + if (!vfio_container_dirty_tracking_is_started(bcontainer)) { return false; } @@ -214,22 +76,6 @@ static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) return true; } -bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { - return false; - } - if (!vbasedev->dirty_pages_supported) { - return false; - } - } - - return true; -} - static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -244,16 
+90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -272,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) @@ -280,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -288,9 +137,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { - error_report("Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? 
iotlb->target_as->name : "none"); - vfio_set_migration_error(-EINVAL); + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + if (migration_is_running()) { + migration_file_set_error(-EINVAL, local_err); + } else { + error_report_err(local_err); + } return; } @@ -299,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be @@ -312,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -321,13 +179,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } } else { ret = vfio_container_dma_unmap(bcontainer, iova, - iotlb->addr_mask + 1, iotlb); + iotlb->addr_mask + 1, iotlb, false); if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, - iotlb->addr_mask + 1, ret, strerror(-ret)); - vfio_set_migration_error(ret); + error_setg(&local_err, + "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, ret, strerror(-ret)); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } out: @@ -345,7 +208,7 @@ static 
void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL, false); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -377,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -387,7 +250,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, return 0; } -static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, +static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -462,7 +325,7 @@ static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, } } -static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, +static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -555,6 +418,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, return true; } +static void vfio_listener_begin(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + void (*listener_begin)(VFIOContainerBase *bcontainer); + + listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + + if (listener_begin) { + listener_begin(bcontainer); + } +} + +static void vfio_listener_commit(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, 
VFIOContainerBase, + listener); + void (*listener_commit)(VFIOContainerBase *bcontainer); + + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; + + if (listener_commit) { + listener_commit(bcontainer); + } +} + static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) { /* @@ -567,11 +456,38 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); + vfio_container_region_add(bcontainer, section, false); +} + +void vfio_container_region_add(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + bool cpr_remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -607,6 +523,11 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + + if (cpr_remap) { + vfio_cpr_giommu_remap(bcontainer, section); + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -649,7 +570,12 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. 
*/ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(bcontainer, section); + if (!cpr_remap) { + vfio_ram_discard_register_listener(bcontainer, section); + } else if (!vfio_cpr_ram_discard_register_listener(bcontainer, + section)) { + goto fail; + } return; } @@ -675,7 +601,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -772,27 +698,20 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(bcontainer, section); + vfio_ram_discard_unregister_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } if (try_unmap) { + bool unmap_all = false; + if (int128_eq(llsize, int128_2_64())) { - /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ - llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - iova += int128_get64(llsize); + unmap_all = true; + llsize = int128_zero(); } - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), + NULL, unmap_all); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -945,13 +864,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + int ret; + if (!vbasedev->dirty_tracking) { continue; } - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + ret = vbasedev->io_ops->device_feature(vbasedev, feature); + + if (ret != 0) { warn_report("%s: Failed to stop DMA logging, err %d (%s)", - vbasedev->name, -errno, strerror(errno)); + vbasedev->name, -ret, strerror(-ret)); } vbasedev->dirty_tracking = false; } @@ -1052,10 +975,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, continue; } - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + ret = vbasedev->io_ops->device_feature(vbasedev, feature); if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "%s: Failed to start DMA logging", + error_setg_errno(errp, -ret, "%s: Failed to start DMA logging", vbasedev->name); goto out; } @@ -1080,7 +1002,7 @@ static bool vfio_listener_log_global_start(MemoryListener *listener, listener); bool ret; - if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { ret = vfio_devices_dma_logging_start(bcontainer, errp); } else { ret = 
vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; @@ -1099,7 +1021,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) Error *local_err = NULL; int ret = 0; - if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { vfio_devices_dma_logging_stop(bcontainer); } else { ret = vfio_container_set_dirty_page_tracking(bcontainer, false, @@ -1109,102 +1031,12 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) if (ret) { error_prepend(&local_err, "vfio: Could not stop dirty page tracking - "); - error_report_err(local_err); - vfio_set_migration_error(ret); - } -} - -static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, - hwaddr size, void *bitmap) -{ - uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + - sizeof(struct vfio_device_feature_dma_logging_report), - sizeof(uint64_t))] = {}; - struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - struct vfio_device_feature_dma_logging_report *report = - (struct vfio_device_feature_dma_logging_report *)feature->data; - - report->iova = iova; - report->length = size; - report->page_size = qemu_real_host_page_size(); - report->bitmap = (uintptr_t)bitmap; - - feature->argsz = sizeof(buf); - feature->flags = VFIO_DEVICE_FEATURE_GET | - VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { - return -errno; - } - - return 0; -} - -int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - VFIODevice *vbasedev; - int ret; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - ret = vfio_device_dma_logging_report(vbasedev, iova, size, - vbmap->bitmap); - if (ret) { - error_setg_errno(errp, -ret, - "%s: Failed to get DMA logging report, iova: " - "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, - vbasedev->name, 
iova, size); - - return ret; + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); } } - - return 0; -} - -int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, - uint64_t size, ram_addr_t ram_addr, Error **errp) -{ - bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(bcontainer); - uint64_t dirty_pages; - VFIOBitmap vbmap; - int ret; - - if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { - cpu_physical_memory_set_dirty_range(ram_addr, size, - tcg_enabled() ? DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); - return 0; - } - - ret = vfio_bitmap_alloc(&vbmap, size); - if (ret) { - error_setg_errno(errp, -ret, - "Failed to allocate dirty tracking bitmap"); - return ret; - } - - if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } else { - ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } - - if (ret) { - goto out; - } - - dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, - vbmap.pages); - - trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); -out: - g_free(vbmap.bitmap); - - return ret; } typedef struct { @@ -1222,29 +1054,32 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ram_addr_t translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { - error_report("Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? 
iotlb->target_as->name : "none"); goto out; } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { - error_report_err(local_err); + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; - ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); if (ret) { error_prepend(&local_err, "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, iotlb->addr_mask + 1); - error_report_err(local_err); } out_unlock: @@ -1252,11 +1087,15 @@ out_unlock: out: if (ret) { - vfio_set_migration_error(ret); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } -static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, +static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section, void *opaque) { const hwaddr size = int128_get64(section->size); @@ -1271,7 +1110,7 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. 
*/ - ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, + ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, &local_err); if (ret) { error_report_err(local_err); @@ -1284,26 +1123,15 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - * which correspond to populated parts. Replay all populated parts. */ return ram_discard_manager_replay_populated(rdm, section, - vfio_ram_discard_get_dirty_bitmap, + vfio_ram_discard_query_dirty_bitmap, &vrdl); } @@ -1365,7 +1193,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(bcontainer, + return vfio_container_query_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr, errp); } @@ -1385,14 +1213,19 @@ static void vfio_listener_log_sync(MemoryListener *listener, if (vfio_log_sync_needed(bcontainer)) { ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); if (ret) { - error_report_err(local_err); - vfio_set_migration_error(ret); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } } -const MemoryListener vfio_memory_listener = { +static const MemoryListener vfio_memory_listener = { .name = "vfio", + .begin = 
vfio_listener_begin, + .commit = vfio_listener_commit, .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, .log_global_start = vfio_listener_log_global_start, @@ -1400,184 +1233,21 @@ const MemoryListener vfio_memory_listener = { .log_sync = vfio_listener_log_sync, }; -void vfio_reset_handler(void *opaque) +bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp) { - VFIODevice *vbasedev; + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); - trace_vfio_reset_handler(); - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->dev->realized) { - vbasedev->ops->vfio_compute_needs_reset(vbasedev); - } - } - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->dev->realized && vbasedev->needs_reset) { - vbasedev->ops->vfio_hot_reset_multi(vbasedev); - } - } -} - -int vfio_kvm_device_add_fd(int fd, Error **errp) -{ -#ifdef CONFIG_KVM - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_FILE, - .attr = KVM_DEV_VFIO_FILE_ADD, - .addr = (uint64_t)(unsigned long)&fd, - }; - - if (!kvm_enabled()) { - return 0; - } - - if (vfio_kvm_device_fd < 0) { - struct kvm_create_device cd = { - .type = KVM_DEV_TYPE_VFIO, - }; - - if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { - error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); - return -errno; - } - - vfio_kvm_device_fd = cd.fd; - } - - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", - fd); - return -errno; - } -#endif - return 0; -} - -int vfio_kvm_device_del_fd(int fd, Error **errp) -{ -#ifdef CONFIG_KVM - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_FILE, - .attr = KVM_DEV_VFIO_FILE_DEL, - .addr = (uint64_t)(unsigned long)&fd, - }; - - if (vfio_kvm_device_fd < 0) { - error_setg(errp, "KVM VFIO device isn't created yet"); - return -EINVAL; - } - - if 
(ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, - "Failed to remove fd %d from KVM VFIO device", fd); - return -errno; - } -#endif - return 0; -} - -VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) -{ - VFIOAddressSpace *space; - - QLIST_FOREACH(space, &vfio_address_spaces, list) { - if (space->as == as) { - return space; - } - } - - /* No suitable VFIOAddressSpace, create a new one */ - space = g_malloc0(sizeof(*space)); - space->as = as; - QLIST_INIT(&space->containers); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_register_reset(vfio_reset_handler, NULL); - } - - QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); - - return space; -} - -void vfio_put_address_space(VFIOAddressSpace *space) -{ - if (!QLIST_EMPTY(&space->containers)) { - return; - } - - QLIST_REMOVE(space, list); - g_free(space); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_unregister_reset(vfio_reset_handler, NULL); - } -} - -void vfio_address_space_insert(VFIOAddressSpace *space, - VFIOContainerBase *bcontainer) -{ - QLIST_INSERT_HEAD(&space->containers, bcontainer, next); - bcontainer->space = space; -} - -struct vfio_device_info *vfio_get_device_info(int fd) -{ - struct vfio_device_info *info; - uint32_t argsz = sizeof(*info); - - info = g_malloc0(argsz); - -retry: - info->argsz = argsz; - - if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { - g_free(info); - return NULL; - } - - if (info->argsz > argsz) { - argsz = info->argsz; - info = g_realloc(info, argsz); - goto retry; - } - - return info; -} - -bool vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) -{ - const VFIOIOMMUClass *ops = - VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); - HostIOMMUDevice *hiod = NULL; - - if (vbasedev->iommufd) { - ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); - } - - assert(ops); - - - if (!vbasedev->mdev) { - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - 
vbasedev->hiod = hiod; - } - - if (!ops->attach_device(name, vbasedev, as, errp)) { - object_unref(hiod); - vbasedev->hiod = NULL; + if (bcontainer->error) { + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); return false; } return true; } -void vfio_detach_device(VFIODevice *vbasedev) +void vfio_listener_unregister(VFIOContainerBase *bcontainer) { - if (!vbasedev->bcontainer) { - return; - } - object_unref(vbasedev->hiod); - VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); + memory_listener_unregister(&bcontainer->listener); } diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index a8939c8..63ea393 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -1,7 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + vfio_ss = ss.source_set() vfio_ss.add(files( - 'common.c', + 'listener.c', + 'container-base.c', 'container.c', + 'helpers.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( @@ -18,11 +22,12 @@ specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( - 'helpers.c', - 'container-base.c', + 'cpr.c', + 'cpr-legacy.c', + 'device.c', 'migration.c', 'migration-multifd.c', - 'cpr.c', + 'region.c', )) system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 378f6f3..850a319 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -10,7 +10,7 @@ */ #include "qemu/osdep.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "migration/misc.h" #include "qapi/error.h" #include "qemu/bswap.h" @@ -21,6 +21,7 @@ #include "io/channel-buffer.h" #include "migration/qemu-file.h" 
#include "migration-multifd.h" +#include "vfio-migration-internal.h" #include "trace.h" #define VFIO_DEVICE_STATE_CONFIG_STATE (1) @@ -575,7 +576,7 @@ vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, return false; } - vfio_mig_add_bytes_transferred(packet_len); + vfio_migration_add_bytes_transferred(packet_len); return true; } @@ -645,7 +646,7 @@ vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, goto thread_exit; } - vfio_mig_add_bytes_transferred(packet_size); + vfio_migration_add_bytes_transferred(packet_size); } if (!vfio_save_complete_precopy_thread_config_state(vbasedev, diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index a664051..0bab632 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -12,7 +12,7 @@ #ifndef HW_VFIO_MIGRATION_MULTIFD_H #define HW_VFIO_MIGRATION_MULTIFD_H -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp); void vfio_multifd_cleanup(VFIODevice *vbasedev); diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index fbff46c..b76697bd 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -16,7 +16,8 @@ #include <sys/ioctl.h> #include "system/runstate.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-migration.h" #include "migration/misc.h" #include "migration/savevm.h" #include "migration/vmstate.h" @@ -30,6 +31,7 @@ #include "pci.h" #include "trace.h" #include "hw/hw.h" +#include "vfio-migration-internal.h" /* * This is an arbitrary size based on migration of mlx5 devices, where typically @@ -373,7 +375,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - vfio_mig_add_bytes_transferred(data_size); + 
vfio_migration_add_bytes_transferred(data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -1014,13 +1016,72 @@ static int vfio_migration_init(VFIODevice *vbasedev) vfio_vmstate_change_prepare : NULL; migration->vm_state = qdev_add_vm_change_state_handler_full( - vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev); + vbasedev->dev, vfio_vmstate_change, prepare_cb, NULL, vbasedev); migration_add_notifier(&migration->migration_state, vfio_migration_state_notifier); return 0; } +static Error *multiple_devices_migration_blocker; + +/* + * Multiple devices migration is allowed only if all devices support P2P + * migration. Single device migration is allowed regardless of P2P migration + * support. + */ +static bool vfio_multiple_devices_migration_is_supported(void) +{ + VFIODevice *vbasedev; + unsigned int device_num = 0; + bool all_support_p2p = true; + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->migration) { + device_num++; + + if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { + all_support_p2p = false; + } + } + } + + return all_support_p2p || device_num <= 1; +} + +static int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) +{ + if (vfio_multiple_devices_migration_is_supported()) { + return 0; + } + + if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { + error_setg(errp, "Multiple VFIO devices migration is supported only if " + "all of them support P2P migration"); + return -EINVAL; + } + + if (multiple_devices_migration_blocker) { + return 0; + } + + error_setg(&multiple_devices_migration_blocker, + "Multiple VFIO devices migration is supported only if all of " + "them support P2P migration"); + return migrate_add_blocker_normal(&multiple_devices_migration_blocker, + errp); +} + +static void vfio_unblock_multiple_devices_migration(void) +{ + if (!multiple_devices_migration_blocker || + !vfio_multiple_devices_migration_is_supported()) { + return; + } + + 
migrate_del_blocker(&multiple_devices_migration_blocker); +} + static void vfio_migration_deinit(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -1047,21 +1108,42 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) /* ---------------------------------------------------------------------- */ -int64_t vfio_mig_bytes_transferred(void) +int64_t vfio_migration_bytes_transferred(void) { return MIN(qatomic_read(&bytes_transferred), INT64_MAX); } -void vfio_reset_bytes_transferred(void) +void vfio_migration_reset_bytes_transferred(void) { qatomic_set(&bytes_transferred, 0); } -void vfio_mig_add_bytes_transferred(unsigned long val) +void vfio_migration_add_bytes_transferred(unsigned long val) { qatomic_add(&bytes_transferred, val); } +bool vfio_migration_active(void) +{ + VFIODevice *vbasedev; + + if (QLIST_EMPTY(&vfio_device_list)) { + return false; + } + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->migration_blocker) { + return false; + } + } + return true; +} + +static bool vfio_viommu_preset(VFIODevice *vbasedev) +{ + return vbasedev->bcontainer->space->as != &address_space_memory; +} + /* * Return true when either migration initialized or blocker registered. 
* Currently only return false when adding blocker fails which will @@ -1138,3 +1220,19 @@ void vfio_migration_exit(VFIODevice *vbasedev) migrate_del_blocker(&vbasedev->migration_blocker); } + +bool vfio_device_state_is_running(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->device_state == VFIO_DEVICE_STATE_RUNNING || + migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; +} + +bool vfio_device_state_is_precopy(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || + migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f87f3cc..fa25bde 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -30,6 +30,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" +#include "migration/cpr.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -44,6 +45,8 @@ #include "migration/blocker.h" #include "migration/qemu-file.h" #include "system/iommufd.h" +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -54,6 +57,23 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) +{ + int ret = event_notifier_init(e, 0); + + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + } + return !ret; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. 
We try to get the best of both worlds by @@ -101,7 +121,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -109,11 +129,11 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); - vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); } static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) @@ -129,13 +149,12 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) /* Get to a known interrupt state */ qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -147,15 +166,15 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) goto fail_irqfd; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_UNMASK, - event_notifier_get_fd(&vdev->intx.unmask), - errp)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_UNMASK, + event_notifier_get_fd(&vdev->intx.unmask), + errp)) { goto fail_vfio; } /* Let'em rip */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.kvm_accel = true; @@ -167,10 +186,10 @@ fail_vfio: 
kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); return false; #else return true; @@ -188,7 +207,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) * Get to a known state, hardware masked, QEMU ready to accept new * interrupts, QEMU IRQ de-asserted. */ - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -199,7 +218,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. 
*/ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -208,7 +227,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) vdev->intx.kvm_accel = false; /* If we've missed an event, let it re-fire through QEMU */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); trace_vfio_intx_disable_kvm(vdev->vbasedev.name); #endif @@ -234,12 +253,12 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) } /* Re-enable the interrupt in cased we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { @@ -266,7 +285,6 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { @@ -289,18 +307,17 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -320,20 +337,25 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) timer_del(vdev->intx.mmap_timer); 
vfio_intx_disable_kvm(vdev); - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); vfio_mmap_set_enabled(vdev, true); fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + /* * MSI/X */ @@ -379,7 +401,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; - int ret = 0, argsz; + int argsz; int32_t *fd; argsz = sizeof(*irq_set) + sizeof(*fd); @@ -394,9 +416,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - - return ret; + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -453,15 +473,15 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; @@ -471,13 +491,16 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector_n, &vdev->pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector 
*vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto fail_notifier; } @@ -489,19 +512,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -511,10 +535,47 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + Error *local_err = NULL; + + vector->vdev = vdev; + vector->virq = -1; + if 
(!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -524,13 +585,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -542,19 +597,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -576,27 +631,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, if (!vdev->defer_kvm_irq_routing) { if (vdev->msix->noresize && resizing) { - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); ret = 
vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_set_irq_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -619,7 +661,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -636,7 +678,7 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) int32_t fd = event_notifier_get_fd(&vector->interrupt); Error *err = NULL; - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -644,14 +686,14 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -661,7 +703,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) 
kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } @@ -681,19 +723,20 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { /* @@ -710,7 +753,8 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) */ ret = vfio_enable_msix_no_vec(vdev); if (ret) { - error_report("vfio: failed to enable MSI-X, %d", ret); + error_report("vfio: failed to enable MSI-X, %s", + strerror(-ret)); } } @@ -730,19 +774,21 @@ retry: * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. 
*/ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -752,10 +798,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -763,7 +809,8 @@ retry: ret = vfio_enable_vectors(vdev, false); if (ret) { if (ret < 0) { - error_report("vfio: Error: Failed to setup MSI fds: %m"); + error_report("vfio: Error: Failed to setup MSI fds: %s", + strerror(-ret)); } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); @@ -797,11 +844,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -833,7 +880,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) * Always clear MSI-X IRQ index. A PF device could have enabled * MSI-X with no vectors. 
See vfio_msix_enable(). */ - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); vfio_msi_disable_common(vdev); if (!vfio_intx_enable(vdev, &err)) { @@ -850,7 +897,7 @@ static void vfio_msi_disable(VFIOPCIDevice *vdev) { Error *err = NULL; - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); vfio_msi_disable_common(vdev); vfio_intx_enable(vdev, &err); if (err) { @@ -879,18 +926,22 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - g_autofree struct vfio_region_info *reg_info = NULL; + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; + int ret; - if (vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) { - error_report("vfio: Error getting ROM info: %m"); + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, + &reg_info); + + if (ret != 0) { + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); return; } - trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size, + trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -899,8 +950,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) if (!vdev->rom_size) { vdev->rom_read_failed = true; - error_report("vfio-pci: Cannot read device rom at " - "%s", vdev->vbasedev.name); + error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name); error_printf("Device option ROM contents are probably invalid " "(check dmesg).\nSkip option ROM probe with rombar=0, " "or load from file with romfile=\n"); @@ -911,18 +961,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, - 
size, vdev->rom_offset + off); + bytes = vbasedev->io_ops->region_read(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); + if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strreaderror(bytes)); + break; } } @@ -958,6 +1012,24 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) } } +/* "Raw" read of underlying config space. */ +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); +} + +/* "Raw" write of underlying config space. */ +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data, false); +} + static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) { VFIOPCIDevice *vdev = opaque; @@ -1010,10 +1082,9 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; char *name; - int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ @@ -1030,11 +1101,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. 
*/ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } @@ -1167,7 +1239,7 @@ static const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1213,7 +1285,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1226,12 +1299,12 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - vdev->config_offset + addr); + ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, + strreaderror(ret)); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1246,16 +1319,19 @@ uint32_t vfio_pci_read_config(PCIDevice 
*pdev, uint32_t addr, int len) void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le); + if (ret != len) { + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, + strwriteerror(ret)); } /* MSI/MSI-X Enabling/Disabling */ @@ -1343,9 +1419,11 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", + strreaderror(ret)); return false; } ctrl = le16_to_cpu(ctrl); @@ -1378,8 +1456,8 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) * If the host driver allows mapping of a MSIX data, we are going to * do map the entire BAR and emulate MSIX table on top of that. 
*/ - if (vfio_has_region_cap(&vdev->vbasedev, region->nr, - VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { + if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr, + VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { return; } @@ -1552,31 +1630,35 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int ret, fd = vdev->vbasedev.fd; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_MSIX_IRQ_INDEX }; + struct vfio_irq_info irq_info; VFIOMSIXInfo *msix; + int ret; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if (!pos) { return true; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed to read PCI MSIX FLAGS: %s", + strreaderror(ret)); return false; } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE, + sizeof(table), &table); + if (ret != sizeof(table)) { + error_setg(errp, "failed to read PCI MSIX TABLE: %s", + strreaderror(ret)); return false; } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA, + sizeof(pba), &pba); + if (ret != sizeof(pba)) { + error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret)); return false; } @@ -1591,7 +1673,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; - ret = ioctl(vdev->vbasedev.fd, 
VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); g_free(msix); @@ -1698,7 +1781,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1735,10 +1818,10 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret)); return; } @@ -1748,6 +1831,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? 
~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) @@ -1794,7 +1880,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -2385,7 +2471,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) g_free(config); } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2441,21 +2527,23 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; if (!vfio_intx_enable(vdev, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = vfio_pci_config_space_write(vdev, addr, len, &val); + if (ret != len) { + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, strwriteerror(ret)); } } @@ -2659,7 +2747,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2668,10 +2756,10 @@ static VFIODeviceOps vfio_pci_ops 
= { bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; int ret; - ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info); + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info); if (ret) { error_setg_errno(errp, -ret, "failed getting region info for VGA region index %d", @@ -2730,11 +2818,11 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + struct vfio_region_info *reg_info = NULL; + struct vfio_irq_info irq_info; int i, ret = -1; /* Sanity check device */ @@ -2769,14 +2857,14 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) QLIST_INIT(&vdev->bars[i].quirks); } - ret = vfio_get_region_info(vbasedev, - VFIO_PCI_CONFIG_REGION_INDEX, &reg_info); + ret = vfio_device_get_region_info(vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, &reg_info); if (ret) { error_setg_errno(errp, -ret, "failed to get config info"); return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -2795,12 +2883,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); 
+ trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2812,9 +2898,21 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { - vfio_detach_device(&vdev->vbasedev); + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + + vfio_device_detach(&vdev->vbasedev); g_free(vdev->vbasedev.name); g_free(vdev->msix); @@ -2848,7 +2946,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2857,8 +2955,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2866,11 +2965,11 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); 
- event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2883,13 +2982,13 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2907,35 +3006,37 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_REQ_IRQ_INDEX }; + struct vfio_irq_info irq_info; Error *err = NULL; int32_t fd; + int ret; if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, + &irq_info); + if (ret < 0 || irq_info.count < 1) { return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { + error_report_err(err); return; } fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + if 
(!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2949,87 +3050,33 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { - ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; - char uuid[UUID_STR_LEN]; - g_autofree char *name = NULL; - - if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { - if (!(~vdev->host.domain || ~vdev->host.bus || - ~vdev->host.slot || ~vdev->host.function)) { - error_setg(errp, "No provided host device"); - error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " -#ifdef CONFIG_IOMMUFD - "or -device vfio-pci,fd=DEVICE_FD " -#endif - "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); - return; - } - vbasedev->sysfsdev = - g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", - vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); - } - - if (!vfio_device_get_name(vbasedev, errp)) { - return; - } - - /* - * Mediated devices *might* 
operate compatibly with discarding of RAM, but - * we cannot know for certain, it depends on whether the mdev vendor driver - * stays in sync with the active working set of the guest driver. Prevent - * the x-balloon-allowed option unless this is minimally an mdev device. - */ - vbasedev->mdev = vfio_device_is_mdev(vbasedev); - - trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - - if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { - error_setg(errp, "x-balloon-allowed only potentially compatible " - "with mdev devices"); - goto error; - } - - if (!qemu_uuid_is_null(&vdev->vf_token)) { - qemu_uuid_unparse(&vdev->vf_token, uuid); - name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); - } else { - name = g_strdup(vbasedev->name); - } - - if (!vfio_attach_device(name, vbasedev, - pci_device_iommu_address_space(pdev), errp)) { - goto error; - } + uint32_t config_space_size; + int ret; - if (!vfio_populate_device(vdev, errp)) { - goto error; - } + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); /* Get a copy of config space */ - ret = pread(vbasedev->fd, vdev->pdev.config, - MIN(pci_config_size(&vdev->pdev), vdev->config_size), - vdev->config_offset); - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { - ret = ret < 0 ? -errno : -EFAULT; - error_setg_errno(errp, -ret, "failed to read device config space"); - goto error; + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? 
-ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; } /* vfio emulates a lot for us, but some bits need extra love */ @@ -3048,7 +3095,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->vendor_id != PCI_ANY_ID) { if (vdev->vendor_id >= 0xffff) { error_setg(errp, "invalid PCI vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id); @@ -3059,7 +3106,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->device_id != PCI_ANY_ID) { if (vdev->device_id > 0xffff) { error_setg(errp, "invalid PCI device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); @@ -3070,7 +3117,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_vendor_id != PCI_ANY_ID) { if (vdev->sub_vendor_id > 0xffff) { error_setg(errp, "invalid PCI subsystem vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, vdev->sub_vendor_id, ~0); @@ -3081,7 +3128,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_device_id != PCI_ANY_ID) { if (vdev->sub_device_id > 0xffff) { error_setg(errp, "invalid PCI subsystem device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); trace_vfio_pci_emulated_sub_device_id(vbasedev->name, @@ -3112,32 +3159,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_prepare(vdev); if (!vfio_msix_early_setup(vdev, errp)) { - goto error; + return false; } vfio_bars_register(vdev); - if (!vbasedev->mdev && - !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { - error_prepend(errp, "Failed to set vIOMMU: "); - goto out_teardown; - } - - if 
(!vfio_add_capabilities(vdev, errp)) { - goto out_unset_idev; - } - - if (!vfio_config_quirk_setup(vdev, errp)) { - goto out_unset_idev; - } - - if (vdev->vga) { - vfio_vga_quirk_setup(vdev); - } + return true; +} - for (i = 0; i < PCI_ROM_SLOT; i++) { - vfio_bar_quirk_setup(vdev, i); - } +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = &vdev->pdev; /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { @@ -3152,14 +3184,111 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, - vfio_intx_mmap_enable, vdev); + vfio_intx_mmap_enable, vdev); pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); if (!vfio_intx_enable(vdev, errp)) { - goto out_deregister; + timer_free(vdev->intx.mmap_timer); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + return false; + } + } + return true; +} + +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) +{ + ERRP_GUARD(); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; + int i; + char uuid[UUID_STR_LEN]; + g_autofree char *name = NULL; + + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { + error_setg(errp, "No provided host device"); + error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif + "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); + return; } + vbasedev->sysfsdev = + g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", + vdev->host.domain, vdev->host.bus, + vdev->host.slot, 
vdev->host.function); + } + + if (!vfio_device_get_name(vbasedev, errp)) { + return; + } + + /* + * Mediated devices *might* operate compatibly with discarding of RAM, but + * we cannot know for certain, it depends on whether the mdev vendor driver + * stays in sync with the active working set of the guest driver. Prevent + * the x-balloon-allowed option unless this is minimally an mdev device. + */ + vbasedev->mdev = vfio_device_is_mdev(vbasedev); + + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); + + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { + error_setg(errp, "x-balloon-allowed only potentially compatible " + "with mdev devices"); + goto error; + } + + if (!qemu_uuid_is_null(&vdev->vf_token)) { + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { + name = g_strdup(vbasedev->name); + } + + if (!vfio_device_attach(name, vbasedev, + pci_device_iommu_address_space(pdev), errp)) { + goto error; + } + + if (!vfio_pci_populate_device(vdev, errp)) { + goto error; + } + + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; + } + + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set vIOMMU: "); + goto out_teardown; + } + + if (!vfio_pci_add_capabilities(vdev, errp)) { + goto out_unset_idev; + } + + if (!vfio_config_quirk_setup(vdev, errp)) { + goto out_unset_idev; + } + + if (vdev->vga) { + vfio_vga_quirk_setup(vdev); + } + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_bar_quirk_setup(vdev, i); + } + + if (!vfio_pci_interrupt_setup(vdev, errp)) { + goto out_unset_idev; } if (vdev->display != ON_OFF_AUTO_OFF) { @@ -3202,8 +3331,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); return; @@ -3224,33 +3353,22 @@ 
out_unset_idev: pci_device_unset_iommu_device(pdev); } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } static void vfio_instance_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); @@ -3263,9 +3381,9 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); if (!vbasedev->mdev) { pci_device_unset_iommu_device(pdev); @@ -3274,7 +3392,12 @@ static void vfio_exitfn(PCIDevice *pdev) static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI(dev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } trace_vfio_pci_reset(vdev->vbasedev.name); @@ -3314,7 +3437,7 @@ post_reset: static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3333,8 +3456,40 @@ static void 
vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. + */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; +} + +static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO PCI base device"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; } +static const TypeInfo vfio_pci_base_dev_info = { + .name = TYPE_VFIO_PCI_BASE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VFIOPCIDevice), + .abstract = true, + .class_init = vfio_pci_base_dev_class_init, + .interfaces = (const InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, +}; + static PropertyInfo vfio_pci_migration_multifd_transfer_prop; static const Property vfio_pci_dev_properties[] = { @@ -3358,7 +3513,7 @@ static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true), DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false), DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice, @@ -3405,11 +3560,12 @@ static const Property vfio_pci_dev_properties[] = { #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); + 
VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif -static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); @@ -3419,12 +3575,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); - pdc->realize = vfio_realize; - pdc->exit = vfio_exitfn; - pdc->config_read = vfio_pci_read_config; - pdc->config_write = vfio_pci_write_config; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", @@ -3549,16 +3702,10 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(VFIOPCIDevice), + .parent = TYPE_VFIO_PCI_BASE, .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, - .interfaces = (InterfaceInfo[]) { - { INTERFACE_PCIE_DEVICE }, - { INTERFACE_CONVENTIONAL_PCI_DEVICE }, - { } - }, }; static const Property vfio_pci_dev_nohotplug_properties[] = { @@ -3567,7 +3714,8 @@ static const Property vfio_pci_dev_nohotplug_properties[] = { ON_OFF_AUTO_AUTO), }; -static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3604,6 +3752,7 @@ static void register_vfio_pci_dev_type(void) vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + 
type_register_static(&vfio_pci_base_dev_info); type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index d94ecab..5ba7330 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -12,14 +12,16 @@ #ifndef HW_VFIO_VFIO_PCI_H #define HW_VFIO_VFIO_PCI_H -#include "exec/memory.h" +#include "system/memory.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-region.h" #include "qemu/event_notifier.h" #include "qemu/queue.h" #include "qemu/timer.h" #include "qom/object.h" #include "system/kvm.h" +#include "vfio-display.h" #define PCI_ANY_ID (~0) @@ -114,10 +116,19 @@ typedef struct VFIOMSIXInfo { uint32_t pba_offset; unsigned long *pending; bool noresize; + MemoryRegion *pba_region; } VFIOMSIXInfo; +/* + * TYPE_VFIO_PCI_BASE is an abstract type used to share code + * between VFIO implementations that use a kernel driver + * with those that use user sockets. + */ +#define TYPE_VFIO_PCI_BASE "vfio-pci-base" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE) + #define TYPE_VFIO_PCI "vfio-pci" -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) +/* TYPE_VFIO_PCI shares struct VFIOPCIDevice. 
*/ struct VFIOPCIDevice { PCIDevice pdev; @@ -200,6 +211,14 @@ static inline bool vfio_is_vga(VFIOPCIDevice *vdev) return class == PCI_CLASS_DISPLAY_VGA; } +/* MSI/MSI-X/INTx */ +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr); +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix); +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); + uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); @@ -238,4 +257,15 @@ void vfio_display_finalize(VFIOPCIDevice *vdev); extern const VMStateDescription vfio_display_vmstate; +void vfio_pci_bars_exit(VFIOPCIDevice *vdev); +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_eoi(VFIODevice *vbasedev); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev); +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 7b4e100..9a21f2e 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -28,8 +28,8 @@ #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/range.h" -#include "exec/memory.h" -#include "exec/address-spaces.h" +#include "system/memory.h" +#include "system/address-spaces.h" #include "qemu/queue.h" #include "hw/sysbus.h" #include "trace.h" @@ -37,6 +37,7 @@ #include "hw/platform-bus.h" #include "hw/qdev-properties.h" #include "system/kvm.h" +#include "hw/vfio/vfio-region.h" /* * Functions used 
whatever the injection method @@ -118,8 +119,8 @@ static int vfio_set_trigger_eventfd(VFIOINTp *intp, qemu_set_fd_handler(fd, (IOHandler *)handler, NULL, intp); - if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + if (!vfio_device_irq_set_signaling(vbasedev, intp->pin, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); qemu_set_fd_handler(fd, NULL, NULL, NULL); return -EINVAL; @@ -300,7 +301,7 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) if (vfio_irq_is_automasked(intp)) { /* unmasks the physical level-sensitive IRQ */ - vfio_unmask_single_irqindex(vbasedev, intp->pin); + vfio_device_irq_unmask(vbasedev, intp->pin); } /* a single IRQ can be active at a time */ @@ -356,8 +357,8 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp) Error *err = NULL; qemu_set_fd_handler(fd, NULL, NULL, NULL); - if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0, - VFIO_IRQ_SET_ACTION_UNMASK, fd, &err)) { + if (!vfio_device_irq_set_signaling(vbasedev, intp->pin, 0, + VFIO_IRQ_SET_ACTION_UNMASK, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); return -EINVAL; } @@ -473,10 +474,10 @@ static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp) QSIMPLEQ_INIT(&vdev->pending_intp_queue); for (i = 0; i < vbasedev->num_irqs; i++) { - struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + struct vfio_irq_info irq; + + ret = vfio_device_get_irq_info(vbasedev, i, &irq); - irq.index = i; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); if (ret) { error_setg_errno(errp, -ret, "failed to get device irq info"); goto irq_err; @@ -545,7 +546,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp) return false; } - if (!vfio_attach_device(vbasedev->name, vbasedev, + if (!vfio_device_attach(vbasedev->name, vbasedev, &address_space_memory, errp)) { return false; } @@ -554,7 +555,7 @@ static bool vfio_base_device_init(VFIODevice 
*vbasedev, Error **errp) return true; } - vfio_detach_device(vbasedev); + vfio_device_detach(vbasedev); return false; } @@ -658,7 +659,7 @@ static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp) } #endif -static void vfio_platform_class_init(ObjectClass *klass, void *data) +static void vfio_platform_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass); diff --git a/hw/vfio/region.c b/hw/vfio/region.c new file mode 100644 index 0000000..d04c57d --- /dev/null +++ b/hw/vfio/region.c @@ -0,0 +1,403 @@ +/* + * VFIO regions + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> + +#include "hw/vfio/vfio-region.h" +#include "hw/vfio/vfio-device.h" +#include "hw/hw.h" +#include "trace.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/units.h" +#include "monitor/monitor.h" +#include "vfio-helpers.h" + +/* + * IO Port/MMIO - Beware of the endians, VFIO is always little endian + */ +void vfio_region_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIORegion *region = opaque; + VFIODevice *vbasedev = region->vbasedev; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + int ret; + + switch (size) { + case 1: + buf.byte = data; + break; + case 2: + buf.word = 
cpu_to_le16(data); + break; + case 4: + buf.dword = cpu_to_le32(data); + break; + case 8: + buf.qword = cpu_to_le64(data); + break; + default: + hw_error("vfio: unsupported write size, %u bytes", size); + break; + } + + ret = vbasedev->io_ops->region_write(vbasedev, region->nr, + addr, size, &buf, region->post_wr); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 + ",%d) failed: %s", + __func__, vbasedev->name, region->nr, + addr, data, size, strwriteerror(ret)); + } + + trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); + + /* + * A read or write to a BAR always signals an INTx EOI. This will + * do nothing if not pending (including not in INTx mode). We assume + * that a BAR access is in response to an interrupt and that BAR + * accesses will service the interrupt. Unfortunately, we don't know + * which access will service the interrupt, so we're potentially + * getting quite a few host interrupts per guest interrupt. + */ + vbasedev->ops->vfio_eoi(vbasedev); +} + +uint64_t vfio_region_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIORegion *region = opaque; + VFIODevice *vbasedev = region->vbasedev; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + uint64_t data = 0; + int ret; + + ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", + __func__, vbasedev->name, region->nr, + addr, size, strreaderror(ret)); + return (uint64_t)-1; + } + switch (size) { + case 1: + data = buf.byte; + break; + case 2: + data = le16_to_cpu(buf.word); + break; + case 4: + data = le32_to_cpu(buf.dword); + break; + case 8: + data = le64_to_cpu(buf.qword); + break; + default: + hw_error("vfio: unsupported read size, %u bytes", size); + break; + } + + trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); + + /* Same as write above */ + 
vbasedev->ops->vfio_eoi(vbasedev); + + return data; +} + +static const MemoryRegionOps vfio_region_ops = { + .read = vfio_region_read, + .write = vfio_region_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i, j; + + hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + if (!hdr) { + return -ENODEV; + } + + sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); + + trace_vfio_region_sparse_mmap_header(region->vbasedev->name, + region->nr, sparse->nr_areas); + + region->mmaps = g_new0(VFIOMmap, sparse->nr_areas); + + for (i = 0, j = 0; i < sparse->nr_areas; i++) { + if (sparse->areas[i].size) { + trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset, + sparse->areas[i].offset + + sparse->areas[i].size - 1); + region->mmaps[j].offset = sparse->areas[i].offset; + region->mmaps[j].size = sparse->areas[i].size; + j++; + } + } + + region->nr_mmaps = j; + region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap)); + + return 0; +} + +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, + int index, const char *name) +{ + struct vfio_region_info *info = NULL; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret) { + return ret; + } + + region->vbasedev = vbasedev; + region->flags = info->flags; + region->size = info->size; + region->fd_offset = info->offset; + region->nr = index; + region->post_wr = false; + + if (region->size) { + region->mem = g_new0(MemoryRegion, 1); + memory_region_init_io(region->mem, obj, &vfio_region_ops, + region, name, region->size); + + if (!vbasedev->no_mmap && + region->flags & VFIO_REGION_INFO_FLAG_MMAP) { + + ret = 
vfio_setup_region_sparse_mmaps(region, info); + + if (ret) { + region->nr_mmaps = 1; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; + } + } + } + + trace_vfio_region_setup(vbasedev->name, index, name, + region->flags, region->fd_offset, region->size); + return 0; +} + +static void vfio_subregion_unmap(VFIORegion *region, int index) +{ + trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem), + region->mmaps[index].offset, + region->mmaps[index].offset + + region->mmaps[index].size - 1); + memory_region_del_subregion(region->mem, &region->mmaps[index].mem); + munmap(region->mmaps[index].mmap, region->mmaps[index].size); + object_unparent(OBJECT(&region->mmaps[index].mem)); + region->mmaps[index].mmap = NULL; +} + +int vfio_region_mmap(VFIORegion *region) +{ + int i, ret, prot = 0; + char *name; + int fd; + + if (!region->mem) { + return 0; + } + + prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; + + for (i = 0; i < region->nr_mmaps; i++) { + size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); + void *map_base, *map_align; + + /* + * Align the mmap for more efficient mapping in the kernel. Ideally + * we'd know the PMD and PUD mapping sizes to use as discrete alignment + * intervals, but we don't. As of Linux v6.12, the largest PUD size + * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set + * on x86_64). Align by power-of-two size, capped at 1GiB. + * + * NB. qemu_memalign() and friends actually allocate memory, whereas + * the region size here can exceed host memory, therefore we manually + * create an oversized anonymous mapping and clean it up for alignment. 
+ */ + map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map_base == MAP_FAILED) { + ret = -errno; + goto no_mmap; + } + + fd = vfio_device_get_region_fd(region->vbasedev, region->nr); + + map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); + munmap(map_base, map_align - map_base); + munmap(map_align + region->mmaps[i].size, + align - (map_align - map_base)); + + region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, + MAP_SHARED | MAP_FIXED, fd, + region->fd_offset + + region->mmaps[i].offset); + if (region->mmaps[i].mmap == MAP_FAILED) { + ret = -errno; + goto no_mmap; + } + + name = g_strdup_printf("%s mmaps[%d]", + memory_region_name(region->mem), i); + memory_region_init_ram_device_ptr(&region->mmaps[i].mem, + memory_region_owner(region->mem), + name, region->mmaps[i].size, + region->mmaps[i].mmap); + g_free(name); + memory_region_add_subregion(region->mem, region->mmaps[i].offset, + &region->mmaps[i].mem); + + trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem), + region->mmaps[i].offset, + region->mmaps[i].offset + + region->mmaps[i].size - 1); + } + + return 0; + +no_mmap: + trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, + region->fd_offset + region->mmaps[i].offset, + region->fd_offset + region->mmaps[i].offset + + region->mmaps[i].size - 1, ret); + + region->mmaps[i].mmap = NULL; + + for (i--; i >= 0; i--) { + vfio_subregion_unmap(region, i); + } + + return ret; +} + +void vfio_region_unmap(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + vfio_subregion_unmap(region, i); + } + } +} + +void vfio_region_exit(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_del_subregion(region->mem, &region->mmaps[i].mem); + } + } + + 
trace_vfio_region_exit(region->vbasedev->name, region->nr); +} + +void vfio_region_finalize(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + munmap(region->mmaps[i].mmap, region->mmaps[i].size); + object_unparent(OBJECT(&region->mmaps[i].mem)); + } + } + + object_unparent(OBJECT(region->mem)); + + g_free(region->mem); + g_free(region->mmaps); + + trace_vfio_region_finalize(region->vbasedev->name, region->nr); + + region->mem = NULL; + region->mmaps = NULL; + region->nr_mmaps = 0; + region->size = 0; + region->flags = 0; + region->nr = 0; +} + +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_set_enabled(&region->mmaps[i].mem, enabled); + } + } + + trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), + enabled); +} diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 1a5d161..564b70e 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -13,19 +13,28 @@ #include <linux/vfio.h> #include "system/kvm.h" #include "system/hostmem.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-container.h" #include "hw/hw.h" -#include "exec/ram_addr.h" +#include "system/ram_addr.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "trace.h" +#include "vfio-helpers.h" + +typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; + uint64_t iova_pgsizes; + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; +} VFIOHostDMAWindow; typedef struct VFIOSpaprContainer { VFIOContainer container; MemoryListener prereg_listener; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + unsigned int levels; } VFIOSpaprContainer; OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR); @@ -230,15 +239,17 @@ static int 
vfio_spapr_remove_window(VFIOContainer *container, return 0; } -static int vfio_spapr_create_window(VFIOContainer *container, +static bool vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, - hwaddr *pgsize) + hwaddr *pgsize, Error **errp) { int ret = 0; VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; - unsigned entries, bits_total, bits_per_level, max_levels; + unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels; struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; long rampagesize = qemu_minrampagesize(); @@ -252,11 +263,11 @@ static int vfio_spapr_create_window(VFIOContainer *container, pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { - error_report("Host doesn't support page size 0x%"PRIx64 - ", the supported mask is 0x%lx", - memory_region_iommu_get_min_page_size(iommu_mr), - bcontainer->pgsizes); - return -EINVAL; + error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64 + ", the supported mask is 0x%lx", + memory_region_iommu_get_min_page_size(iommu_mr), + bcontainer->pgsizes); + return false; } /* @@ -291,28 +302,41 @@ static int vfio_spapr_create_window(VFIOContainer *container, */ bits_per_level = ctz64(qemu_real_host_page_size()) + 8; create.levels = bits_total / bits_per_level; - if (bits_total % bits_per_level) { - ++create.levels; - } - max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size()); - for ( ; create.levels <= max_levels; ++create.levels) { - ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); - if (!ret) { - break; + + ddw_levels = scontainer->levels; + if (ddw_levels > 1) { + if (bits_total % 
bits_per_level) { + ++create.levels; } + max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size()); + for ( ; create.levels <= max_levels; ++create.levels) { + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (!ret) { + break; + } + } + } else { /* ddw_levels == 1 */ + if (create.levels > ddw_levels) { + error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables" + ". Use larger IO page size. Supported mask is 0x%lx", + bcontainer->pgsizes); + return false; + } + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); } + if (ret) { - error_report("Failed to create a window, ret = %d (%m)", ret); - return -errno; + error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret); + return false; } if (create.start_addr != section->offset_within_address_space) { vfio_spapr_remove_window(container, create.start_addr); - error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, - section->offset_within_address_space, - (uint64_t)create.start_addr); - return -EINVAL; + error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx + ", must be %"PRIx64, section->offset_within_address_space, + (uint64_t)create.start_addr); + return false; } trace_vfio_spapr_create_window(create.page_shift, create.levels, @@ -320,7 +344,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, create.start_addr); *pgsize = pagesize; - return 0; + return true; } static bool @@ -377,9 +401,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } } - ret = vfio_spapr_create_window(container, section, &pgsize); - if (ret) { - error_setg_errno(errp, -ret, "Failed to create SPAPR window"); + ret = vfio_spapr_create_window(container, section, &pgsize, errp); + if (!ret) { return false; } @@ -502,6 +525,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer, goto listener_unregister_exit; } + scontainer->levels = info.ddw.levels; + 
if (v2) { bcontainer->pgsizes = info.ddw.pgsizes; /* @@ -534,7 +559,7 @@ listener_unregister_exit: return false; } -static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +static void vfio_iommu_spapr_class_init(ObjectClass *klass, const void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9347e3a..e1728c4 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -1,8 +1,10 @@ # See docs/devel/tracing.rst for syntax documentation. +# +# SPDX-License-Identifier: GPL-2.0-or-later # pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_pci_intx_eoi(const char *name) " (%s) EOI" vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" @@ -35,10 +37,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" -vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" -vfio_attach_device(const char *name, int group_id) " (%s) group %d" -vfio_detach_device(const char *name, int group_id) " (%s) group %d" +vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" +vfio_pci_populate_device_get_irq_info_failure(const 
char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d" vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x" vfio_pci_reset(const char *name) " (%s)" @@ -89,9 +89,7 @@ vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB" vfio_pci_igd_host_bridge_enabled(const char *name) "%s" vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" -# common.c -vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" -vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 +# listener.c vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" @@ -103,10 +101,21 @@ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t si vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]" -vfio_disconnect_container(int fd) "close container->fd=%d" -vfio_put_group(int fd) "close group->fd=%d" -vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" -vfio_put_base_device(int fd) "close vdev->fd=%d" 
+vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 + +# container-base.c +vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 + +# container.c +vfio_container_disconnect(int fd) "close container->fd=%d" +vfio_group_put(int fd) "close group->fd=%d" +vfio_device_get(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" +vfio_device_put(int fd) "close vdev->fd=%d" +vfio_legacy_dma_unmap_overflow_workaround(void) "" + +# region.c +vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" +vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx" vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d" vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]" @@ -116,11 +125,6 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" -vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s 
index %d, %08x/%08x" -vfio_legacy_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 -vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 -vfio_reset_handler(void) "" # platform.c vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" @@ -192,3 +196,9 @@ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" + +# device.c +vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" +vfio_device_reset_handler(void) "" +vfio_device_attach(const char *name, int group_id) " (%s) group %d" +vfio_device_detach(const char *name, int group_id) " (%s) group %d" diff --git a/hw/vfio/trace.h b/hw/vfio/trace.h index 5a343aa..b34b61d 100644 --- a/hw/vfio/trace.h +++ b/hw/vfio/trace.h @@ -1 +1,4 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + */ #include "trace/trace-hw_vfio.h" diff --git a/hw/vfio/vfio-display.h b/hw/vfio/vfio-display.h new file mode 100644 index 0000000..2606c34 --- /dev/null +++ b/hw/vfio/vfio-display.h @@ -0,0 +1,42 @@ +/* + * VFIO display + * + * Copyright Red Hat, Inc. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_DISPLAY_H +#define HW_VFIO_VFIO_DISPLAY_H + +#include "ui/console.h" +#include "hw/display/ramfb.h" +#include "hw/vfio/vfio-region.h" + +typedef struct VFIODMABuf { + QemuDmaBuf *buf; + uint32_t pos_x, pos_y, pos_updates; + uint32_t hot_x, hot_y, hot_updates; + int dmabuf_id; + QTAILQ_ENTRY(VFIODMABuf) next; +} VFIODMABuf; + +typedef struct VFIODisplay { + QemuConsole *con; + RAMFBState *ramfb; + struct vfio_region_info *edid_info; + struct vfio_region_gfx_edid *edid_regs; + uint8_t *edid_blob; + QEMUTimer *edid_link_timer; + struct { + VFIORegion buffer; + DisplaySurface *surface; + } region; + struct { + QTAILQ_HEAD(, VFIODMABuf) bufs; + VFIODMABuf *primary; + VFIODMABuf *cursor; + } dmabuf; +} VFIODisplay; + +#endif /* HW_VFIO_VFIO_DISPLAY_H */ diff --git a/hw/vfio/vfio-helpers.h b/hw/vfio/vfio-helpers.h new file mode 100644 index 0000000..54a327f --- /dev/null +++ b/hw/vfio/vfio-helpers.h @@ -0,0 +1,35 @@ +/* + * VFIO helpers + * + * Copyright Red Hat, Inc. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_HELPERS_H +#define HW_VFIO_VFIO_HELPERS_H + +#ifdef CONFIG_LINUX +#include <linux/vfio.h> + +extern int vfio_kvm_device_fd; + +struct vfio_info_cap_header * +vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); +struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); +struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id); +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail); +#endif + +int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); +struct vfio_device_info *vfio_get_device_info(int fd); + +int vfio_kvm_device_add_fd(int fd, Error **errp); +int vfio_kvm_device_del_fd(int fd, Error **errp); + +#endif /* HW_VFIO_VFIO_HELPERS_H */ diff --git a/hw/vfio/vfio-iommufd.h b/hw/vfio/vfio-iommufd.h new file mode 100644 index 0000000..07ea0f4 --- /dev/null +++ b/hw/vfio/vfio-iommufd.h @@ -0,0 +1,34 @@ +/* + * VFIO iommufd + * + * Copyright Red Hat, Inc. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_IOMMUFD_H +#define HW_VFIO_VFIO_IOMMUFD_H + +#include "hw/vfio/vfio-container-base.h" + +typedef struct VFIODevice VFIODevice; + +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + +typedef struct IOMMUFDBackend IOMMUFDBackend; + +typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; +} VFIOIOMMUFDContainer; + +OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD); + +#endif /* HW_VFIO_VFIO_IOMMUFD_H */ diff --git a/hw/vfio/vfio-listener.h b/hw/vfio/vfio-listener.h new file mode 100644 index 0000000..eb69ddd --- /dev/null +++ b/hw/vfio/vfio-listener.h @@ -0,0 +1,15 @@ +/* + * VFIO MemoryListener services + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_LISTENER_H +#define HW_VFIO_VFIO_LISTENER_H + +bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp); +void vfio_listener_unregister(VFIOContainerBase *bcontainer); + +#endif /* HW_VFIO_VFIO_LISTENER_H */ diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h new file mode 100644 index 0000000..a8b456b --- /dev/null +++ b/hw/vfio/vfio-migration-internal.h @@ -0,0 +1,74 @@ +/* + * VFIO migration + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_MIGRATION_INTERNAL_H +#define HW_VFIO_VFIO_MIGRATION_INTERNAL_H + +#ifdef CONFIG_LINUX +#include <linux/vfio.h> +#endif + +#include "qemu/typedefs.h" +#include "qemu/notify.h" + +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. 
These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOMultifd VFIOMultifd; + +typedef struct VFIOMigration { + struct VFIODevice *vbasedev; + VMChangeStateEntry *vm_state; + NotifierWithReturn migration_state; + uint32_t device_state; + int data_fd; + void *data_buffer; + size_t data_buffer_size; + uint64_t mig_flags; + uint64_t precopy_init_size; + uint64_t precopy_dirty_size; + bool multifd_transfer; + VFIOMultifd *multifd; + bool initial_data_sent; + + bool event_save_iterate_started; + bool event_precopy_empty_hit; +} VFIOMigration; + +bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); +void vfio_migration_exit(VFIODevice *vbasedev); +bool vfio_device_state_is_running(VFIODevice *vbasedev); +bool vfio_device_state_is_precopy(VFIODevice *vbasedev); +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp); +int vfio_load_device_config_state(QEMUFile *f, void *opaque); + +#ifdef CONFIG_LINUX +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp); +#endif + +void vfio_migration_add_bytes_transferred(unsigned long val); + +#endif /* HW_VFIO_VFIO_MIGRATION_INTERNAL_H */ |