Diffstat (limited to 'hw/vfio')
37 files changed, 5552 insertions, 4072 deletions
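Most of the mechanical churn in the hunks below comes from this series renaming the VFIO device helpers (vfio_attach_device becomes vfio_device_attach, vfio_set_irq_signaling becomes vfio_device_irq_set_signaling, g_free(vbasedev->name) becomes vfio_device_free_name) and from replacing open-coded VFIO_DEVICE_GET_IRQ_INFO ioctls with the vfio_device_get_irq_info() wrapper. The following is a minimal sketch of the post-refactor IRQ registration pattern as it appears in the ap.c and ccw.c hunks; example_register_irq is a hypothetical name, and the QEMU-internal types and helpers are assumed from the surrounding patch context, so treat it as illustration rather than a drop-in function.

static bool example_register_irq(VFIODevice *vdev, EventNotifier *notifier,
                                 unsigned int irq, IOHandler *handler,
                                 void *opaque, Error **errp)
{
    struct vfio_irq_info irq_info;   /* stack-allocated; no g_malloc0/argsz dance */
    int fd, ret;

    /* The wrapper fills in argsz/index and issues the ioctl itself. */
    ret = vfio_device_get_irq_info(vdev, irq, &irq_info);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "vfio: Error getting irq info");
        return false;
    }
    if (irq_info.count < 1) {
        error_setg(errp, "vfio: Error getting irq info, count=0");
        return false;
    }

    if (event_notifier_init(notifier, 0)) {
        error_setg(errp, "vfio: Unable to init event notifier");
        return false;
    }

    fd = event_notifier_get_fd(notifier);
    qemu_set_fd_handler(fd, handler, NULL, opaque);

    /* Renamed helper; returns bool, so failure unwinds immediately. */
    if (!vfio_device_irq_set_signaling(vdev, irq, 0,
                                       VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
        qemu_set_fd_handler(fd, NULL, NULL, opaque);
        event_notifier_cleanup(notifier);
        return false;
    }
    return true;
}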
diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba05..27de24e 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + config VFIO bool depends on LINUX @@ -15,22 +17,6 @@ config VFIO_CCW select VFIO depends on LINUX && S390_CCW_VIRTIO -config VFIO_PLATFORM - bool - default y - select VFIO - depends on LINUX && PLATFORM_BUS - -config VFIO_XGMAC - bool - default y - depends on VFIO_PLATFORM - -config VFIO_AMD_XGBE - bool - default y - depends on VFIO_PLATFORM - config VFIO_AP bool default y diff --git a/hw/vfio/amd-xgbe.c b/hw/vfio/amd-xgbe.c deleted file mode 100644 index 5927503..0000000 --- a/hw/vfio/amd-xgbe.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * AMD XGBE VFIO device - * - * Copyright Linaro Limited, 2015 - * - * Authors: - * Eric Auger <eric.auger@linaro.org> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "hw/vfio/vfio-amd-xgbe.h" -#include "migration/vmstate.h" -#include "qemu/module.h" -#include "qemu/error-report.h" - -static void amd_xgbe_realize(DeviceState *dev, Error **errp) -{ - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); - VFIOAmdXgbeDeviceClass *k = VFIO_AMD_XGBE_DEVICE_GET_CLASS(dev); - - warn_report("-device vfio-amd-xgbe is deprecated"); - vdev->compat = g_strdup("amd,xgbe-seattle-v1a"); - vdev->num_compat = 1; - - k->parent_realize(dev, errp); -} - -static const VMStateDescription vfio_platform_amd_xgbe_vmstate = { - .name = "vfio-amd-xgbe", - .unmigratable = 1, -}; - -static void vfio_amd_xgbe_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - VFIOAmdXgbeDeviceClass *vcxc = - VFIO_AMD_XGBE_DEVICE_CLASS(klass); - device_class_set_parent_realize(dc, amd_xgbe_realize, - &vcxc->parent_realize); - dc->desc = "VFIO AMD XGBE"; - dc->vmsd = &vfio_platform_amd_xgbe_vmstate; -} - -static const TypeInfo vfio_amd_xgbe_dev_info = { - .name = TYPE_VFIO_AMD_XGBE, - .parent = TYPE_VFIO_PLATFORM, - .instance_size = sizeof(VFIOAmdXgbeDevice), - .class_init = vfio_amd_xgbe_class_init, - .class_size = sizeof(VFIOAmdXgbeDeviceClass), -}; - -static void register_amd_xgbe_dev_type(void) -{ - type_register_static(&vfio_amd_xgbe_dev_info); -} - -type_init(register_amd_xgbe_dev_type) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index c7ab4ff..7719f24 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -10,16 +10,19 @@ * directory. 
*/ +#include <stdbool.h> #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include <linux/vfio.h> #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/ap-device.h" +#include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" +#include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/option.h" @@ -28,7 +31,7 @@ #include "migration/vmstate.h" #include "hw/qdev-properties.h" #include "hw/s390x/ap-bridge.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "qom/object.h" #define TYPE_VFIO_AP_DEVICE "vfio-ap" @@ -37,8 +40,23 @@ struct VFIOAPDevice { APDevice apdev; VFIODevice vdev; EventNotifier req_notifier; + EventNotifier cfg_notifier; }; +typedef struct APConfigChgEvent { + QTAILQ_ENTRY(APConfigChgEvent) next; +} APConfigChgEvent; + +static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events = + QTAILQ_HEAD_INITIALIZER(cfg_chg_events); + +static QemuMutex cfg_chg_events_lock; + +static void __attribute__((constructor)) vfio_ap_global_init(void) +{ + qemu_mutex_init(&cfg_chg_events_lock); +} + OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) static void vfio_ap_compute_needs_reset(VFIODevice *vdev) @@ -70,14 +88,65 @@ static void vfio_ap_req_notifier_handler(void *opaque) } } +static void vfio_ap_cfg_chg_notifier_handler(void *opaque) +{ + APConfigChgEvent *cfg_chg_event; + VFIOAPDevice *vapdev = opaque; + + if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { + return; + } + + cfg_chg_event = g_new0(APConfigChgEvent, 1); + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next); + } + + css_generate_css_crws(0); + +} + +int ap_chsc_sei_nt0_get_event(void *res) +{ + ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res; + APConfigChgEvent *cfg_chg_event; + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + if (QTAILQ_EMPTY(&cfg_chg_events)) { + return EVENT_INFORMATION_NOT_STORED; + } + + cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events); + QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next); + } + + memset(nt0_res, 0, sizeof(*nt0_res)); + g_free(cfg_chg_event); + nt0_res->flags |= PENDING_EVENT_INFO_BITMASK; + nt0_res->length = sizeof(ChscSeiNt0Res); + nt0_res->code = NT0_RES_RESPONSE_CODE; + nt0_res->nt = NT0_RES_NT_DEFAULT; + nt0_res->rs = NT0_RES_RS_AP_CHANGE; + nt0_res->cc = NT0_RES_CC_AP_CHANGE; + + return EVENT_INFORMATION_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + QEMU_LOCK_GUARD(&cfg_chg_events_lock); + return !QTAILQ_EMPTY(&cfg_chg_events); +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { int fd; - size_t argsz; + int ret; IOHandler *fd_read; EventNotifier *notifier; - g_autofree struct vfio_irq_info *irq_info = NULL; + struct vfio_irq_info irq_info; VFIODevice *vdev = &vapdev->vdev; switch (irq) { @@ -85,6 +154,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, notifier = &vapdev->req_notifier; fd_read = vfio_ap_req_notifier_handler; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + fd_read = vfio_ap_cfg_chg_notifier_handler; + break; default: error_setg(errp, "vfio: Unsupported device irq(%d)", irq); return false; @@ -96,14 +169,15 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - 
irq_info->argsz = argsz; + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } @@ -117,8 +191,8 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, fd = event_notifier_get_fd(notifier); qemu_set_fd_handler(fd, fd_read, NULL, vapdev); - if (!vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, - errp)) { + if (!vfio_device_irq_set_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, + errp)) { qemu_set_fd_handler(fd, NULL, NULL, vapdev); event_notifier_cleanup(notifier); } @@ -136,13 +210,16 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, case VFIO_AP_REQ_IRQ_INDEX: notifier = &vapdev->req_notifier; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + break; default: error_report("vfio: Unsupported device irq(%d)", irq); return; } - if (!vfio_set_irq_signaling(&vapdev->vdev, irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vapdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { warn_reportf_err(err, VFIO_MSG_PREFIX, vapdev->vdev.name); } @@ -162,7 +239,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } - if (!vfio_attach_device(vbasedev->name, vbasedev, + if (!vfio_device_attach(vbasedev->name, vbasedev, &address_space_memory, errp)) { goto error; } @@ -175,11 +252,20 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) warn_report_err(err); } + if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err)) + { + /* + * Report this error, but do not make it a failing condition. + * Lack of this IRQ in the host does not prevent normal operation. + */ + warn_report_err(err); + } + return; error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); } static void vfio_ap_unrealize(DeviceState *dev) @@ -187,8 +273,9 @@ static void vfio_ap_unrealize(DeviceState *dev) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); - vfio_detach_device(&vapdev->vdev); - g_free(vapdev->vdev.name); + vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); + vfio_device_detach(&vapdev->vdev); + vfio_device_free_name(&vapdev->vdev); } static const Property vfio_ap_properties[] = { @@ -241,7 +328,7 @@ static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) } #endif -static void vfio_ap_class_init(ObjectClass *klass, void *data) +static void vfio_ap_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); diff --git a/hw/vfio/calxeda-xgmac.c b/hw/vfio/calxeda-xgmac.c deleted file mode 100644 index a5ef262..0000000 --- a/hw/vfio/calxeda-xgmac.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * calxeda xgmac VFIO device - * - * Copyright Linaro Limited, 2014 - * - * Authors: - * Eric Auger <eric.auger@linaro.org> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" -#include "hw/vfio/vfio-calxeda-xgmac.h" -#include "migration/vmstate.h" -#include "qemu/module.h" -#include "qemu/error-report.h" - -static void calxeda_xgmac_realize(DeviceState *dev, Error **errp) -{ - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); - VFIOCalxedaXgmacDeviceClass *k = VFIO_CALXEDA_XGMAC_DEVICE_GET_CLASS(dev); - - warn_report("-device vfio-calxeda-xgmac is deprecated"); - vdev->compat = g_strdup("calxeda,hb-xgmac"); - vdev->num_compat = 1; - - k->parent_realize(dev, errp); -} - -static const VMStateDescription vfio_platform_calxeda_xgmac_vmstate = { - .name = "vfio-calxeda-xgmac", - .unmigratable = 1, -}; - -static void vfio_calxeda_xgmac_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - VFIOCalxedaXgmacDeviceClass *vcxc = - VFIO_CALXEDA_XGMAC_DEVICE_CLASS(klass); - device_class_set_parent_realize(dc, calxeda_xgmac_realize, - &vcxc->parent_realize); - dc->desc = "VFIO Calxeda XGMAC"; - dc->vmsd = &vfio_platform_calxeda_xgmac_vmstate; -} - -static const TypeInfo vfio_calxeda_xgmac_dev_info = { - .name = TYPE_VFIO_CALXEDA_XGMAC, - .parent = TYPE_VFIO_PLATFORM, - .instance_size = sizeof(VFIOCalxedaXgmacDevice), - .class_init = vfio_calxeda_xgmac_class_init, - .class_size = sizeof(VFIOCalxedaXgmacDeviceClass), -}; - -static void register_calxeda_xgmac_dev_type(void) -{ - type_register_static(&vfio_calxeda_xgmac_dev_info); -} - -type_init(register_calxeda_xgmac_dev_type) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index e5e0d9e..9560b8d 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -21,13 +21,13 @@ #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" #include "hw/s390x/ccw-device.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" @@ -376,8 +376,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, Error **errp) { VFIODevice *vdev = &vcdev->vdev; - g_autofree struct vfio_irq_info *irq_info = NULL; - size_t argsz; + struct vfio_irq_info irq_info; + int ret; int fd; EventNotifier *notifier; IOHandler *fd_read; @@ -406,13 +406,15 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - irq_info->argsz = argsz; - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); + + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } @@ -426,8 +428,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, fd = event_notifier_get_fd(notifier); qemu_set_fd_handler(fd, fd_read, NULL, vcdev); - if (!vfio_set_irq_signaling(vdev, irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { + if (!vfio_device_irq_set_signaling(vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vcdev); event_notifier_cleanup(notifier); } @@ -456,8 +458,8 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, return; } - if (!vfio_set_irq_signaling(&vcdev->vdev, 
irq, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vcdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); } @@ -488,7 +490,7 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) return false; } - ret = vfio_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info); + ret = vfio_device_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, &info); if (ret) { error_setg_errno(errp, -ret, "vfio: Error getting config info"); return false; @@ -502,11 +504,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->io_region_offset = info->offset; vcdev->io_region = g_malloc0(info->size); - g_free(info); /* check for the optional async command region */ - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, &info); if (!ret) { vcdev->async_cmd_region_size = info->size; if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) { @@ -515,11 +516,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->async_cmd_region_offset = info->offset; vcdev->async_cmd_region = g_malloc0(info->size); - g_free(info); } - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); if (!ret) { vcdev->schib_region_size = info->size; if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) { @@ -528,11 +528,10 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->schib_region_offset = info->offset; vcdev->schib_region = g_malloc(info->size); - g_free(info); } - ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, - VFIO_REGION_SUBTYPE_CCW_CRW, &info); + ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_CRW, &info); if (!ret) { vcdev->crw_region_size = info->size; @@ -542,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->crw_region_offset = info->offset; vcdev->crw_region = g_malloc(info->size); - g_free(info); } return true; @@ -552,7 +550,6 @@ out_err: g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); - g_free(info); return false; } @@ -583,7 +580,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) goto out_unrealize; } - if (!vfio_attach_device(cdev->mdevid, vbasedev, + if (!vfio_device_attach(cdev->mdevid, vbasedev, &address_space_memory, errp)) { goto out_attach_dev_err; } @@ -620,9 +617,9 @@ out_irq_notifier_err: out_io_notifier_err: vfio_ccw_put_region(vcdev); out_region_err: - vfio_detach_device(vbasedev); + vfio_device_detach(vbasedev); out_attach_dev_err: - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); out_unrealize: if (cdc->unrealize) { cdc->unrealize(cdev); @@ -639,8 +636,8 @@ static void vfio_ccw_unrealize(DeviceState *dev) vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); - vfio_detach_device(&vcdev->vdev); - g_free(vcdev->vdev.name); + vfio_device_detach(&vcdev->vdev); + vfio_device_free_name(&vcdev->vdev); if (cdc->unrealize) { cdc->unrealize(cdev); @@ -689,7 +686,7 @@ static void vfio_ccw_set_fd(Object *obj, const char *str, Error 
**errp) } #endif -static void vfio_ccw_class_init(ObjectClass *klass, void *data) +static void vfio_ccw_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c deleted file mode 100644 index 749a3fd..0000000 --- a/hw/vfio/container-base.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * VFIO BASE CONTAINER - * - * Copyright (C) 2023 Intel Corporation. - * Copyright Red Hat, Inc. 2023 - * - * Authors: Yi Liu <yi.l.liu@intel.com> - * Eric Auger <eric.auger@redhat.com> - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/error-report.h" -#include "hw/vfio/vfio-container-base.h" - -int vfio_container_dma_map(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); -} - -int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - g_assert(vioc->dma_unmap); - return vioc->dma_unmap(bcontainer, iova, size, iotlb); -} - -bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, - Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - if (!vioc->add_window) { - return true; - } - - return vioc->add_window(bcontainer, section, errp); -} - -void vfio_container_del_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - if (!vioc->del_window) { - return; - } - - return vioc->del_window(bcontainer, section); -} - -int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, - bool start, Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - int ret; - - if (!bcontainer->dirty_pages_supported) { - return 0; - } - - g_assert(vioc->set_dirty_page_tracking); - if (bcontainer->dirty_pages_started == start) { - return 0; - } - - ret = vioc->set_dirty_page_tracking(bcontainer, start, errp); - if (!ret) { - bcontainer->dirty_pages_started = start; - } - - return ret; -} - -int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - g_assert(vioc->query_dirty_bitmap); - return vioc->query_dirty_bitmap(bcontainer, vbmap, iova, size, - errp); -} - -static gpointer copy_iova_range(gconstpointer src, gpointer data) -{ - Range *source = (Range *)src; - Range *dest = g_new(Range, 1); - - range_set_bounds(dest, range_lob(source), range_upb(source)); - return dest; -} - -GList *vfio_container_get_iova_ranges(const VFIOContainerBase *bcontainer) -{ - assert(bcontainer); - return g_list_copy_deep(bcontainer->iova_ranges, copy_iova_range, NULL); -} - -static void vfio_container_instance_finalize(Object *obj) -{ - VFIOContainerBase *bcontainer = VFIO_IOMMU(obj); - VFIOGuestIOMMU *giommu, *tmp; - - QLIST_SAFE_REMOVE(bcontainer, next); - - QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } - - 
g_list_free_full(bcontainer->iova_ranges, g_free); -} - -static void vfio_container_instance_init(Object *obj) -{ - VFIOContainerBase *bcontainer = VFIO_IOMMU(obj); - - bcontainer->error = NULL; - bcontainer->dirty_pages_supported = false; - bcontainer->dma_max_mappings = 0; - bcontainer->iova_ranges = NULL; - QLIST_INIT(&bcontainer->giommu_list); - QLIST_INIT(&bcontainer->vrdl_list); -} - -static const TypeInfo types[] = { - { - .name = TYPE_VFIO_IOMMU, - .parent = TYPE_OBJECT, - .instance_init = vfio_container_instance_init, - .instance_finalize = vfio_container_instance_finalize, - .instance_size = sizeof(VFIOContainerBase), - .class_size = sizeof(VFIOIOMMUClass), - .abstract = true, - }, -}; - -DEFINE_TYPES(types) diff --git a/hw/vfio/container-legacy.c b/hw/vfio/container-legacy.c new file mode 100644 index 0000000..a3615d7 --- /dev/null +++ b/hw/vfio/container-legacy.c @@ -0,0 +1,1260 @@ +/* + * generic functions used by VFIO devices + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> + +#include "hw/vfio/vfio-device.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/physmem.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "system/reset.h" +#include "trace.h" +#include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" +#include "pci.h" +#include "hw/vfio/vfio-container-legacy.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" + +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" + +typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; +static VFIOGroupList vfio_group_list = + QLIST_HEAD_INITIALIZER(vfio_group_list); + +static int vfio_ram_block_discard_disable(VFIOLegacyContainer *container, + bool state) +{ + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); + default: + /* + * VFIO_SPAPR_TCE_IOMMU most probably works just fine with + * RamDiscardManager, however, it is completely untested. + * + * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does + * completely the opposite of managing mapping/pinning dynamically as + * required by RamDiscardManager. We would have to special-case sections + * with a RamDiscardManager. 
+ */ + return ram_block_discard_disable(state); + } +} + +static int vfio_dma_unmap_bitmap(const VFIOLegacyContainer *container, + hwaddr iova, uint64_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOContainer *bcontainer = VFIO_IOMMU(container); + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + VFIOBitmap vbmap; + int ret; + + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); + + unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); + unmap->iova = iova; + unmap->size = size; + unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* + * physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize + * to qemu_real_host_page_size. + */ + bitmap->pgsize = qemu_real_host_page_size(); + bitmap->size = vbmap.size; + bitmap->data = (__u64 *)vbmap.bitmap; + + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); + ret = -E2BIG; + goto unmap_exit; + } + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + if (!ret) { + physical_memory_set_dirty_lebitmap(vbmap.bitmap, + iotlb->translated_addr, vbmap.pages); + } else { + error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); + } + +unmap_exit: + g_free(unmap); + g_free(vbmap.bitmap); + + return ret; +} + +static int vfio_legacy_dma_unmap_one(const VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = iova, + .size = size, + }; + bool need_dirty_sync = false; + int ret; + Error *local_err = NULL; + + g_assert(!cpr_is_incoming()); + + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { + if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && + bcontainer->dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + + need_dirty_sync = true; + } + + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + return -errno; + } + + if (need_dirty_sync) { + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size, + iotlb->translated_addr, &local_err); + if (ret) { + error_report_err(local_err); + return ret; + } + } + + return 0; +} + +/* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +static int vfio_legacy_dma_unmap(const VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) +{ + int ret; + + if (unmap_all) { + /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ + Int128 llsize = int128_rshift(int128_2_64(), 1); + + ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize), + iotlb); + + if (ret == 0) { + ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize), + int128_get64(llsize), iotlb); + } + + } else { + ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb); + } + + return ret; +} + +static int vfio_legacy_dma_map(const VFIOContainer *bcontainer, hwaddr iova, + uint64_t size, void *vaddr, bool readonly, + MemoryRegion *mr) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; + } + + /* + * Try the mapping, if it fails with EBUSY, unmap the region and try + * again. This shouldn't be necessary, but we sometimes see it in + * the VGA ROM space. + */ + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 && + ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { + return 0; + } + + return -errno; +} + +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainer *bcontainer, + bool start, Error **errp) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x", + dirty.flags); + } + + return ret; +} + +static int vfio_legacy_query_dirty_bitmap(const VFIOContainer *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; + + /* + * physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize + * to qemu_real_host_page_size. 
+ */ + range->bitmap.pgsize = qemu_real_host_page_size(); + range->bitmap.size = vbmap->size; + range->bitmap.data = (__u64 *)vbmap->bitmap; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, + "Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64, (uint64_t)range->iova, + (uint64_t)range->size); + } + + g_free(dbitmap); + + return ret; +} + +static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + VFIOContainer *bcontainer) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_iova_range *cap; + + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + if (!hdr) { + return false; + } + + cap = (void *)hdr; + + for (int i = 0; i < cap->nr_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + + return true; +} + +static void vfio_group_add_kvm_device(VFIOGroup *group) +{ + Error *err = NULL; + + if (vfio_kvm_device_add_fd(group->fd, &err)) { + error_reportf_err(err, "group ID %d: ", group->groupid); + } +} + +static void vfio_group_del_kvm_device(VFIOGroup *group) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(group->fd, &err)) { + error_reportf_err(err, "group ID %d: ", group->groupid); + } +} + +/* + * vfio_get_iommu_type - selects the richest iommu_type (v2 first) + */ +static int vfio_get_iommu_type(int container_fd, + Error **errp) +{ + int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; + int i; + + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { + if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { + return iommu_types[i]; + } + } + error_setg(errp, "No available IOMMU models"); + return -EINVAL; +} + +/* + * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type + */ +static const char *vfio_get_iommu_class_name(int iommu_type) +{ + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + return TYPE_VFIO_IOMMU_LEGACY; + break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + return TYPE_VFIO_IOMMU_SPAPR; + break; + default: + g_assert_not_reached(); + }; +} + +static bool vfio_set_iommu(int container_fd, int group_fd, + int *iommu_type, Error **errp) +{ + if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) { + error_setg_errno(errp, errno, "Failed to set group container"); + return false; + } + + while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) { + if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + /* + * On sPAPR, despite the IOMMU subdriver always advertises v1 and + * v2, the running platform may not support v2 and there is no + * way to guess it until an IOMMU group gets added to the container. + * So in case it fails with v2, try v1 as a fallback. + */ + *iommu_type = VFIO_SPAPR_TCE_IOMMU; + continue; + } + error_setg_errno(errp, errno, "Failed to set iommu for container"); + return false; + } + + return true; +} + +static VFIOLegacyContainer *vfio_create_container(int fd, VFIOGroup *group, + Error **errp) +{ + int iommu_type; + const char *vioc_name; + VFIOLegacyContainer *container; + + iommu_type = vfio_get_iommu_type(fd, errp); + if (iommu_type < 0) { + return NULL; + } + + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. 
+ */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + return NULL; + } + + vioc_name = vfio_get_iommu_class_name(iommu_type); + + container = VFIO_IOMMU_LEGACY(object_new(vioc_name)); + container->fd = fd; + container->iommu_type = iommu_type; + return container; +} + +static int vfio_get_iommu_info(VFIOLegacyContainer *container, + struct vfio_iommu_type1_info **info) +{ + + size_t argsz = sizeof(struct vfio_iommu_type1_info); + + *info = g_new0(struct vfio_iommu_type1_info, 1); +again: + (*info)->argsz = argsz; + + if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if (((*info)->argsz > argsz)) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + goto again; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_get_iommu_info_migration(VFIOLegacyContainer *container, + struct vfio_iommu_type1_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { + return; + } + + cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, + header); + + /* + * physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { + bcontainer->dirty_pages_supported = true; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +} + +static bool vfio_legacy_setup(VFIOContainer *bcontainer, Error **errp) +{ + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return false; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return true; +} + +static bool vfio_container_attach_discard_disable( + VFIOLegacyContainer *container, VFIOGroup *group, Error **errp) +{ + int ret; + + /* + * VFIO is currently incompatible with discarding of RAM insofar as the + * madvise to purge (zap) the page from QEMU's address space does not + * interact with the memory API and therefore leaves stale virtual to + * physical mappings in the IOMMU if the page was previously pinned. We + * therefore set discarding broken for each group added to a container, + * whether the container is used individually or shared. 
This provides + * us with options to allow devices within a group to opt-in and allow + * discarding, so long as it is done consistently for a group (for instance + * if the device is an mdev device where it is known that the host vendor + * driver will never pin pages outside of the working set of the guest + * driver, which would thus not be discarding candidates). + * + * The first opportunity to induce pinning occurs here where we attempt to + * attach the group to existing containers within the AddressSpace. If any + * pages are already zapped from the virtual address space, such as from + * previous discards, new pinning will cause valid mappings to be + * re-established. Likewise, when the overall MemoryListener for a new + * container is registered, a replay of mappings within the AddressSpace + * will occur, re-establishing any previously zapped pages as well. + * + * Especially virtio-balloon is currently only prevented from discarding + * new memory, it will not yet set ram_block_discard_set_required() and + * therefore, neither stops us here or deals with the sudden memory + * consumption of inflated memory. + * + * We do support discarding of memory coordinated via the RamDiscardManager + * with some IOMMU types. vfio_ram_block_discard_disable() handles the + * details once we know which type of IOMMU we are using. + */ + + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + } + return !ret; +} + +static bool vfio_container_group_add(VFIOLegacyContainer *container, + VFIOGroup *group, Error **errp) +{ + if (!vfio_container_attach_discard_disable(container, group, errp)) { + return false; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. + */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} + +static void vfio_container_group_del(VFIOLegacyContainer *container, + VFIOGroup *group) +{ + QLIST_REMOVE(group, container_next); + group->container = NULL; + vfio_group_del_kvm_device(group); + vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); +} + +static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, + Error **errp) +{ + VFIOLegacyContainer *container; + VFIOContainer *bcontainer; + int ret, fd = -1; + VFIOAddressSpace *space; + VFIOIOMMUClass *vioc = NULL; + bool new_container = false; + bool group_was_added = false; + + space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); + + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = VFIO_IOMMU_LEGACY(bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } + } + + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. 
+ */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = VFIO_IOMMU_LEGACY(bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } + } + + ret = ioctl(fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + error_setg(errp, "supported vfio version: %d, " + "reported version: %d", VFIO_API_VERSION, ret); + goto fail; + } + + container = vfio_create_container(fd, group, errp); + if (!container) { + goto fail; + } + new_container = true; + bcontainer = VFIO_IOMMU(container); + + if (!vfio_legacy_cpr_register_container(container, errp)) { + goto fail; + } + + vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + assert(vioc->setup); + + if (!vioc->setup(bcontainer, errp)) { + goto fail; + } + + vfio_address_space_insert(space, bcontainer); + + if (!vfio_container_group_add(container, group, errp)) { + goto fail; + } + group_was_added = true; + + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. + */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } + } + + bcontainer->initialized = true; + + return true; + +fail: + if (new_container) { + vfio_listener_unregister(bcontainer); + } + + if (group_was_added) { + vfio_container_group_del(container, group); + } + if (vioc && vioc->release) { + vioc->release(bcontainer); + } + if (new_container) { + vfio_legacy_cpr_unregister_container(container); + object_unref(container); + } + if (fd >= 0) { + close(fd); + } + vfio_address_space_put(space); + + return false; +} + +static void vfio_container_disconnect(VFIOGroup *group) +{ + VFIOLegacyContainer *container = group->container; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + + QLIST_REMOVE(group, container_next); + group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); + + /* + * Explicitly release the listener first before unset container, + * since unset may destroy the backend container if it's the last + * group. + */ + if (QLIST_EMPTY(&container->group_list)) { + vfio_listener_unregister(bcontainer); + if (vioc->release) { + vioc->release(bcontainer); + } + } + + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from container", + group->groupid); + } + + if (QLIST_EMPTY(&container->group_list)) { + VFIOAddressSpace *space = bcontainer->space; + + trace_vfio_container_disconnect(container->fd); + vfio_legacy_cpr_unregister_container(container); + close(container->fd); + object_unref(container); + + vfio_address_space_put(space); + } +} + +static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) +{ + ERRP_GUARD(); + VFIOGroup *group; + char path[32]; + struct vfio_group_status status = { .argsz = sizeof(status) }; + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == groupid) { + /* Found it. Now is it already in the right context? 
*/ + if (VFIO_IOMMU(group->container)->space->as == as) { + return group; + } else { + error_setg(errp, "group %d used in multiple address spaces", + group->groupid); + return NULL; + } + } + } + + group = g_malloc0(sizeof(*group)); + + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); + if (group->fd < 0) { + goto free_group_exit; + } + + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { + error_setg_errno(errp, errno, "failed to get group %d status", groupid); + goto close_fd_exit; + } + + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_setg(errp, "group %d is not viable", groupid); + error_append_hint(errp, + "Please ensure all devices within the iommu_group " + "are bound to their vfio bus driver.\n"); + goto close_fd_exit; + } + + group->groupid = groupid; + QLIST_INIT(&group->device_list); + + if (!vfio_container_connect(group, as, errp)) { + error_prepend(errp, "failed to setup container for group %d: ", + groupid); + goto close_fd_exit; + } + + QLIST_INSERT_HEAD(&vfio_group_list, group, next); + + return group; + +close_fd_exit: + cpr_delete_fd("vfio_group", groupid); + close(group->fd); + +free_group_exit: + g_free(group); + + return NULL; +} + +static void vfio_group_put(VFIOGroup *group) +{ + if (!group || !QLIST_EMPTY(&group->device_list)) { + return; + } + + if (!group->ram_block_discard_allowed) { + vfio_ram_block_discard_disable(group->container, false); + } + vfio_group_del_kvm_device(group); + vfio_container_disconnect(group); + QLIST_REMOVE(group, next); + trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); + close(group->fd); + g_free(group); +} + +static bool vfio_device_get(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp) +{ + g_autofree struct vfio_device_info *info = NULL; + int fd; + + fd = vfio_cpr_group_get_device_fd(group->fd, name); + if (fd < 0) { + error_setg_errno(errp, errno, "error getting device from group %d", + group->groupid); + error_append_hint(errp, + "Verify all devices in group %d are bound to vfio-<bus> " + "or pci-stub and not already in use\n", group->groupid); + return false; + } + + info = vfio_get_device_info(fd); + if (!info) { + error_setg_errno(errp, errno, "error getting device info"); + goto fail; + } + + /* + * Set discarding of RAM as not broken for this group if the driver knows + * the device operates compatibly with discarding. Setting must be + * consistent per group, but since compatibility is really only possible + * with mdev currently, we expect singleton groups. 
+ */ + if (vbasedev->ram_block_discard_allowed != + group->ram_block_discard_allowed) { + if (!QLIST_EMPTY(&group->device_list)) { + error_setg(errp, "Inconsistent setting of support for discarding " + "RAM (e.g., balloon) within group"); + goto fail; + } + + if (!group->ram_block_discard_allowed) { + group->ram_block_discard_allowed = true; + vfio_ram_block_discard_disable(group->container, false); + } + } + + vfio_device_prepare(vbasedev, VFIO_IOMMU(group->container), info); + + vbasedev->fd = fd; + vbasedev->group = group; + QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); + + trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); + + return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; +} + +static void vfio_device_put(VFIODevice *vbasedev) +{ + if (!vbasedev->group) { + return; + } + QLIST_REMOVE(vbasedev, next); + vbasedev->group = NULL; + trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); + close(vbasedev->fd); +} + +static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp) +{ + char *tmp, group_path[PATH_MAX]; + g_autofree char *group_name = NULL; + int ret, groupid; + ssize_t len; + + tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev); + len = readlink(tmp, group_path, sizeof(group_path)); + g_free(tmp); + + if (len <= 0 || len >= sizeof(group_path)) { + ret = len < 0 ? -errno : -ENAMETOOLONG; + error_setg_errno(errp, -ret, "no iommu_group found"); + return ret; + } + + group_path[len] = 0; + + group_name = g_path_get_basename(group_path); + if (sscanf(group_name, "%d", &groupid) != 1) { + error_setg_errno(errp, errno, "failed to read %s", group_path); + return -errno; + } + return groupid; +} + +/* + * vfio_device_attach: attach a device to a security context + * @name and @vbasedev->name are likely to be different depending + * on the type of the device, hence the need for passing @name + */ +static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + int groupid = vfio_device_get_groupid(vbasedev, errp); + VFIODevice *vbasedev_iter; + VFIOGroup *group; + + if (groupid < 0) { + return false; + } + + trace_vfio_device_attach(vbasedev->name, groupid); + + group = vfio_group_get(groupid, as, errp); + if (!group) { + return false; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { + error_setg(errp, "device is already attached"); + goto group_put_exit; + } + } + if (!vfio_device_get(group, name, vbasedev, errp)) { + goto group_put_exit; + } + + if (!vfio_device_hiod_create_and_realize(vbasedev, + TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + errp)) { + goto device_put_exit; + } + + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, + MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, + -1) < 0) { + goto hiod_unref_exit; + } + } + + return true; + +hiod_unref_exit: + object_unref(vbasedev->hiod); +device_put_exit: + vfio_device_put(vbasedev); +group_put_exit: + vfio_group_put(group); + return false; +} + +static void vfio_legacy_detach_device(VFIODevice *vbasedev) +{ + VFIOGroup *group = vbasedev->group; + + trace_vfio_device_detach(vbasedev->name, group->groupid); + + vfio_device_unprepare(vbasedev); + + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); + object_unref(vbasedev->hiod); + vfio_device_put(vbasedev); + vfio_group_put(group); +} + 
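The callbacks defined in this new file are only ever invoked through the VFIOIOMMUClass vtable that vfio_iommu_legacy_class_init() fills in further below. As a hedged sketch of that dispatch, here is roughly what the generic wrapper looks like, adapted from the base-container code this series reworks; the wrapper itself lives in container.c rather than in this file, and the exact signature is inferred from vfio_legacy_dma_map() above:

int vfio_container_dma_map(VFIOContainer *bcontainer, hwaddr iova,
                           uint64_t size, void *vaddr, bool readonly,
                           MemoryRegion *mr)
{
    /* Resolve the concrete backend class of this container object. */
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    g_assert(vioc->dma_map);
    /* For TYPE_VFIO_IOMMU_LEGACY this resolves to vfio_legacy_dma_map(). */
    return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr);
}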
+static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. */ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + !vfio_pci_from_vfio_device(vbasedev_iter)) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? 
strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + !vfio_pci_from_vfio_device(vbasedev_iter)) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_legacy_class_init(ObjectClass *klass, const void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; +}; + +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->agent = opaque; + + return true; +} + +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static GList * +hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod) +{ + VFIODevice *vdev = hiod->agent; + + g_assert(vdev); + return vfio_container_get_iova_ranges(vdev->bcontainer); +} + +static uint64_t +hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod) +{ + VFIODevice *vdev = hiod->agent; + + g_assert(vdev); + return vfio_container_get_page_size_mask(vdev->bcontainer); +} + +static void vfio_iommu_legacy_instance_init(Object *obj) +{ + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(obj); + + QLIST_INIT(&container->group_list); +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, const void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; + hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges; + hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .instance_init = vfio_iommu_legacy_instance_init, + .instance_size = sizeof(VFIOLegacyContainer), + .class_init = vfio_iommu_legacy_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 7c57bdd2..9ddec30 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1,1212 +1,352 @@ /* - * 
generic functions used by VFIO devices + * VFIO BASE CONTAINER * - * Copyright Red Hat, Inc. 2012 + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 * - * Authors: - * Alex Williamson <alex.williamson@redhat.com> + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Based on qemu-kvm device-assignment: - * Adapted for KVM by Qumranet. - * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) - * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) - * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) - * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) - * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + * SPDX-License-Identifier: GPL-2.0-or-later */ -#include "qemu/osdep.h" #include <sys/ioctl.h> #include <linux/vfio.h> -#include "hw/vfio/vfio-common.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" -#include "exec/ram_addr.h" +#include "qemu/osdep.h" +#include "system/tcg.h" +#include "system/ram_addr.h" +#include "qapi/error.h" #include "qemu/error-report.h" -#include "qemu/range.h" +#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" /* vfio_device_reset_handler */ +#include "system/physmem.h" #include "system/reset.h" -#include "trace.h" -#include "qapi/error.h" -#include "pci.h" - -VFIOGroupList vfio_group_list = - QLIST_HEAD_INITIALIZER(vfio_group_list); - -static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) -{ - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - /* - * We support coordinated discarding of RAM via the RamDiscardManager. - */ - return ram_block_uncoordinated_discard_disable(state); - default: - /* - * VFIO_SPAPR_TCE_IOMMU most probably works just fine with - * RamDiscardManager, however, it is completely untested. - * - * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does - * completely the opposite of managing mapping/pinning dynamically as - * required by RamDiscardManager. We would have to special-case sections - * with a RamDiscardManager. - */ - return ram_block_discard_disable(state); - } -} - -static int vfio_dma_unmap_bitmap(const VFIOContainer *container, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) -{ - const VFIOContainerBase *bcontainer = &container->bcontainer; - struct vfio_iommu_type1_dma_unmap *unmap; - struct vfio_bitmap *bitmap; - VFIOBitmap vbmap; - int ret; - - ret = vfio_bitmap_alloc(&vbmap, size); - if (ret) { - return ret; - } - - unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); - - unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); - unmap->iova = iova; - unmap->size = size; - unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; - bitmap = (struct vfio_bitmap *)&unmap->data; - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize - * to qemu_real_host_page_size. 
- */ - bitmap->pgsize = qemu_real_host_page_size(); - bitmap->size = vbmap.size; - bitmap->data = (__u64 *)vbmap.bitmap; - - if (vbmap.size > bcontainer->max_dirty_bitmap_size) { - error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); - ret = -E2BIG; - goto unmap_exit; - } +#include "vfio-helpers.h" - ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); - if (!ret) { - cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, - iotlb->translated_addr, vbmap.pages); - } else { - error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); - } +#include "trace.h" -unmap_exit: - g_free(unmap); - g_free(vbmap.bitmap); +static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = + QLIST_HEAD_INITIALIZER(vfio_address_spaces); - return ret; -} - -/* - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 - */ -static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) +VFIOAddressSpace *vfio_address_space_get(AddressSpace *as) { - const VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - struct vfio_iommu_type1_dma_unmap unmap = { - .argsz = sizeof(unmap), - .flags = 0, - .iova = iova, - .size = size, - }; - bool need_dirty_sync = false; - int ret; - Error *local_err = NULL; - - if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) { - if (!vfio_devices_all_device_dirty_tracking(bcontainer) && - bcontainer->dirty_pages_supported) { - return vfio_dma_unmap_bitmap(container, iova, size, iotlb); - } - - need_dirty_sync = true; - } - - while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { - /* - * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c - * v4.15) where an overflow in its wrap-around check prevents us from - * unmapping the last page of the address space. Test for the error - * condition and re-try the unmap excluding the last page. The - * expectation is that we've never mapped the last page anyway and this - * unmap request comes via vIOMMU support which also makes it unlikely - * that this page is used. This bug was introduced well after type1 v2 - * support was introduced, so we shouldn't need to test for v1. A fix - * is queued for kernel v5.0 so this workaround can be removed once - * affected kernels are sufficiently deprecated. - */ - if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && - container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_legacy_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); - continue; - } - return -errno; - } + VFIOAddressSpace *space; - if (need_dirty_sync) { - ret = vfio_get_dirty_bitmap(bcontainer, iova, size, - iotlb->translated_addr, &local_err); - if (ret) { - error_report_err(local_err); - return ret; + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == as) { + return space; } } - return 0; -} - -static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) -{ - const VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - struct vfio_iommu_type1_dma_map map = { - .argsz = sizeof(map), - .flags = VFIO_DMA_MAP_FLAG_READ, - .vaddr = (__u64)(uintptr_t)vaddr, - .iova = iova, - .size = size, - }; - - if (!readonly) { - map.flags |= VFIO_DMA_MAP_FLAG_WRITE; - } - - /* - * Try the mapping, if it fails with EBUSY, unmap the region and try - * again. This shouldn't be necessary, but we sometimes see it in - * the VGA ROM space. 
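The off-by-one workaround described a few lines up is all about unsigned wrap-around. A hypothetical helper, not QEMU code, that isolates the failing condition:

#include <stdbool.h>
#include <stdint.h>

/*
 * True when the range ends exactly at the top of the 64-bit address
 * space, so iova + size wraps to 0 and trips the buggy kernel check.
 * Example: iova = 0xfffffffffffff000, size = 0x1000.
 */
static bool unmap_end_wraps(uint64_t iova, uint64_t size)
{
    return size != 0 && iova + size == 0;
}

When this holds, the retry above shrinks the request by one minimum IOMMU page (1 << ctz64(pgsizes)) and resubmits.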
- */ - if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && - vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && - ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { - return 0; - } - - return -errno; -} - -static int -vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, - bool start, Error **errp) -{ - const VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - int ret; - struct vfio_iommu_type1_dirty_bitmap dirty = { - .argsz = sizeof(dirty), - }; - - if (start) { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; - } else { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; - } - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); - if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x", - dirty.flags); - } - - return ret; -} - -static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - const VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - struct vfio_iommu_type1_dirty_bitmap *dbitmap; - struct vfio_iommu_type1_dirty_bitmap_get *range; - int ret; - - dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); - - dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; - range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; - range->iova = iova; - range->size = size; - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize - * to qemu_real_host_page_size. - */ - range->bitmap.pgsize = qemu_real_host_page_size(); - range->bitmap.size = vbmap->size; - range->bitmap.data = (__u64 *)vbmap->bitmap; - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); - if (ret) { - ret = -errno; - error_setg_errno(errp, errno, - "Failed to get dirty bitmap for iova: 0x%"PRIx64 - " size: 0x%"PRIx64, (uint64_t)range->iova, - (uint64_t)range->size); - } - - g_free(dbitmap); + /* No suitable VFIOAddressSpace, create a new one */ + space = g_malloc0(sizeof(*space)); + space->as = as; + QLIST_INIT(&space->containers); - return ret; -} - -static struct vfio_info_cap_header * -vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) -{ - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { - return NULL; - } - - return vfio_get_cap((void *)info, info->cap_offset, id); -} - -bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, - unsigned int *avail) -{ - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_dma_avail *cap; - - /* If the capability cannot be found, assume no DMA limiting */ - hdr = vfio_get_iommu_type1_info_cap(info, - VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); - if (!hdr) { - return false; - } - - if (avail != NULL) { - cap = (void *) hdr; - *avail = cap->avail; - } - - return true; -} - -static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainerBase *bcontainer) -{ - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_cap_iova_range *cap; - - hdr = vfio_get_iommu_type1_info_cap(info, - VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); - if (!hdr) { - return false; - } - - cap = (void *)hdr; - - for (int i = 0; i < cap->nr_iovas; i++) { - Range *range = g_new(Range, 1); - - range_set_bounds(range, cap->iova_ranges[i].start, - cap->iova_ranges[i].end); - bcontainer->iova_ranges = - 
range_list_insert(bcontainer->iova_ranges, range); + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_register_reset(vfio_device_reset_handler, NULL); } - return true; -} - -static void vfio_kvm_device_add_group(VFIOGroup *group) -{ - Error *err = NULL; + QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); - if (vfio_kvm_device_add_fd(group->fd, &err)) { - error_reportf_err(err, "group ID %d: ", group->groupid); - } + return space; } -static void vfio_kvm_device_del_group(VFIOGroup *group) +void vfio_address_space_put(VFIOAddressSpace *space) { - Error *err = NULL; - - if (vfio_kvm_device_del_fd(group->fd, &err)) { - error_reportf_err(err, "group ID %d: ", group->groupid); + if (!QLIST_EMPTY(&space->containers)) { + return; } -} -/* - * vfio_get_iommu_type - selects the richest iommu_type (v2 first) - */ -static int vfio_get_iommu_type(int container_fd, - Error **errp) -{ - int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, - VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; - int i; + QLIST_REMOVE(space, list); + g_free(space); - for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { - if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { - return iommu_types[i]; - } + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_unregister_reset(vfio_device_reset_handler, NULL); } - error_setg(errp, "No available IOMMU models"); - return -EINVAL; } -/* - * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type - */ -static const char *vfio_get_iommu_class_name(int iommu_type) +void vfio_address_space_insert(VFIOAddressSpace *space, + VFIOContainer *bcontainer) { - switch (iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - return TYPE_VFIO_IOMMU_LEGACY; - break; - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - return TYPE_VFIO_IOMMU_SPAPR; - break; - default: - g_assert_not_reached(); - }; + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + bcontainer->space = space; } -static bool vfio_set_iommu(int container_fd, int group_fd, - int *iommu_type, Error **errp) +int vfio_container_dma_map(VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + void *vaddr, bool readonly, MemoryRegion *mr) { - if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) { - error_setg_errno(errp, errno, "Failed to set group container"); - return false; - } - - while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) { - if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - /* - * On sPAPR, despite the IOMMU subdriver always advertises v1 and - * v2, the running platform may not support v2 and there is no - * way to guess it until an IOMMU group gets added to the container. - * So in case it fails with v2, try v1 as a fallback. - */ - *iommu_type = VFIO_SPAPR_TCE_IOMMU; - continue; - } - error_setg_errno(errp, errno, "Failed to set iommu for container"); - return false; - } - - return true; -} + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + RAMBlock *rb = mr->ram_block; + int mfd = rb ? 
qemu_ram_get_fd(rb) : -1; -static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, - Error **errp) -{ - int iommu_type; - const char *vioc_name; - VFIOContainer *container; + if (mfd >= 0 && vioc->dma_map_file) { + unsigned long start = vaddr - qemu_ram_get_host_addr(rb); + unsigned long offset = qemu_ram_get_fd_offset(rb); - iommu_type = vfio_get_iommu_type(fd, errp); - if (iommu_type < 0) { - return NULL; + return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset, + readonly); } - - if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { - return NULL; - } - - vioc_name = vfio_get_iommu_class_name(iommu_type); - - container = VFIO_IOMMU_LEGACY(object_new(vioc_name)); - container->fd = fd; - container->iommu_type = iommu_type; - return container; + g_assert(vioc->dma_map); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } -static int vfio_get_iommu_info(VFIOContainer *container, - struct vfio_iommu_type1_info **info) +int vfio_container_dma_unmap(VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - size_t argsz = sizeof(struct vfio_iommu_type1_info); - - *info = g_new0(struct vfio_iommu_type1_info, 1); -again: - (*info)->argsz = argsz; - - if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { - g_free(*info); - *info = NULL; - return -errno; - } - - if (((*info)->argsz > argsz)) { - argsz = (*info)->argsz; - *info = g_realloc(*info, argsz); - goto again; - } - - return 0; + g_assert(vioc->dma_unmap); + return vioc->dma_unmap(bcontainer, iova, size, iotlb, unmap_all); } -static struct vfio_info_cap_header * -vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +bool vfio_container_add_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section, + Error **errp) { - struct vfio_info_cap_header *hdr; - void *ptr = info; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { - return NULL; + if (!vioc->add_window) { + return true; } - for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { - if (hdr->id == id) { - return hdr; - } - } - - return NULL; + return vioc->add_window(bcontainer, section, errp); } -static void vfio_get_iommu_info_migration(VFIOContainer *container, - struct vfio_iommu_type1_info *info) +void vfio_container_del_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section) { - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_cap_migration *cap_mig; - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); - if (!hdr) { + if (!vioc->del_window) { return; } - cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, - header); - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. 
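The goto-again loop in vfio_get_iommu_info above is the standard VFIO argsz handshake: userspace offers a buffer, the kernel writes back the size it actually needs, and the caller grows the buffer and retries until the capability chain fits. A self-contained sketch of the same handshake follows; query_iommu_info is an illustrative name, and unlike the g_realloc-based original (which aborts on allocation failure) it handles that path explicitly:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static struct vfio_iommu_type1_info *query_iommu_info(int container_fd)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);
    struct vfio_iommu_type1_info *info = calloc(1, argsz);

    while (info) {
        struct vfio_iommu_type1_info *bigger;

        info->argsz = argsz;
        if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info)) {
            break;
        }
        if (info->argsz <= argsz) {
            return info;            /* fixed part and capabilities fit */
        }
        argsz = info->argsz;        /* kernel reported the needed size */
        bigger = realloc(info, argsz);
        if (!bigger) {
            break;
        }
        info = bigger;
    }
    free(info);
    return NULL;
}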
- */ - if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - bcontainer->dirty_pages_supported = true; - bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; - } + return vioc->del_window(bcontainer, section); } -static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +int vfio_container_set_dirty_page_tracking(VFIOContainer *bcontainer, + bool start, Error **errp) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - g_autofree struct vfio_iommu_type1_info *info = NULL; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); int ret; - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - return false; - } - - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - bcontainer->pgsizes = info->iova_pgsizes; - } else { - bcontainer->pgsizes = qemu_real_host_page_size(); - } - - if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { - bcontainer->dma_max_mappings = 65535; - } - - vfio_get_info_iova_range(info, bcontainer); - - vfio_get_iommu_info_migration(container, info); - return true; -} - -static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as, - Error **errp) -{ - VFIOContainer *container; - VFIOContainerBase *bcontainer; - int ret, fd; - VFIOAddressSpace *space; - VFIOIOMMUClass *vioc; - - space = vfio_get_address_space(as); - - /* - * VFIO is currently incompatible with discarding of RAM insofar as the - * madvise to purge (zap) the page from QEMU's address space does not - * interact with the memory API and therefore leaves stale virtual to - * physical mappings in the IOMMU if the page was previously pinned. We - * therefore set discarding broken for each group added to a container, - * whether the container is used individually or shared. This provides - * us with options to allow devices within a group to opt-in and allow - * discarding, so long as it is done consistently for a group (for instance - * if the device is an mdev device where it is known that the host vendor - * driver will never pin pages outside of the working set of the guest - * driver, which would thus not be discarding candidates). - * - * The first opportunity to induce pinning occurs here where we attempt to - * attach the group to existing containers within the AddressSpace. If any - * pages are already zapped from the virtual address space, such as from - * previous discards, new pinning will cause valid mappings to be - * re-established. Likewise, when the overall MemoryListener for a new - * container is registered, a replay of mappings within the AddressSpace - * will occur, re-establishing any previously zapped pages as well. - * - * Especially virtio-balloon is currently only prevented from discarding - * new memory, it will not yet set ram_block_discard_set_required() and - * therefore, neither stops us here or deals with the sudden memory - * consumption of inflated memory. - * - * We do support discarding of memory coordinated via the RamDiscardManager - * with some IOMMU types. vfio_ram_block_discard_disable() handles the - * details once we know which type of IOMMU we are using. 
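All the capability lookups above (DMA avail, IOVA ranges, migration) walk the same structure: vfio_info_cap_header records chained by byte offsets from the start of the info buffer, with next == 0 terminating the list. A generic sketch of that walk; find_cap is an illustrative name:

#include <stdint.h>
#include <linux/vfio.h>

static struct vfio_info_cap_header *find_cap(void *buf, uint32_t cap_offset,
                                             uint16_t id)
{
    uint32_t off = cap_offset;

    while (off) {
        struct vfio_info_cap_header *hdr = (void *)((char *)buf + off);

        if (hdr->id == id) {
            return hdr;
        }
        off = hdr->next;    /* 0 terminates the chain */
    }
    return NULL;
}

The caller still has to check the flags word (e.g. VFIO_IOMMU_INFO_CAPS) before trusting cap_offset, as the removed helpers do above.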
- */ - - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOContainer, bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, - "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, - &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); - } - return false; - } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - vfio_kvm_device_add_group(group); - return true; - } - } - - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto put_space_exit; - } - - ret = ioctl(fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - error_setg(errp, "supported vfio version: %d, " - "reported version: %d", VFIO_API_VERSION, ret); - goto close_fd_exit; - } - - container = vfio_create_container(fd, group, errp); - if (!container) { - goto close_fd_exit; - } - bcontainer = &container->bcontainer; - - if (!vfio_cpr_register_container(bcontainer, errp)) { - goto free_container_exit; - } - - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - goto unregister_container_exit; - } - - vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - assert(vioc->setup); - - if (!vioc->setup(bcontainer, errp)) { - goto enable_discards_exit; + if (!bcontainer->dirty_pages_supported) { + return 0; } - vfio_kvm_device_add_group(group); - - vfio_address_space_insert(space, bcontainer); - - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - - bcontainer->listener = vfio_memory_listener; - memory_listener_register(&bcontainer->listener, bcontainer->space->as); - - if (bcontainer->error) { - error_propagate_prepend(errp, bcontainer->error, - "memory listener initialization failed: "); - goto listener_release_exit; + g_assert(vioc->set_dirty_page_tracking); + if (bcontainer->dirty_pages_started == start) { + return 0; } - bcontainer->initialized = true; - - return true; -listener_release_exit: - QLIST_REMOVE(group, container_next); - vfio_kvm_device_del_group(group); - memory_listener_unregister(&bcontainer->listener); - if (vioc->release) { - vioc->release(bcontainer); + ret = vioc->set_dirty_page_tracking(bcontainer, start, errp); + if (!ret) { + bcontainer->dirty_pages_started = start; } -enable_discards_exit: - vfio_ram_block_discard_disable(container, false); - -unregister_container_exit: - vfio_cpr_unregister_container(bcontainer); - -free_container_exit: - object_unref(container); - -close_fd_exit: - close(fd); - -put_space_exit: - vfio_put_address_space(space); - - return false; + return ret; } -static void vfio_disconnect_container(VFIOGroup *group) +static bool vfio_container_devices_dirty_tracking_is_started( + const VFIOContainer *bcontainer) { - VFIOContainer *container = group->container; - VFIOContainerBase *bcontainer = &container->bcontainer; - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + VFIODevice *vbasedev; - QLIST_REMOVE(group, container_next); - group->container = NULL; - - /* - * Explicitly release the listener first before unset container, - * since unset may destroy the backend container if it's the last - * group. 
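Stripped of QEMU's container sharing, RAM-discard handling and QOM plumbing, the bring-up that vfio_connect_container performs above reduces to the canonical VFIO sequence. A minimal sketch that assumes the type1 v2 IOMMU and trims error reporting:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int legacy_container_open(int group_fd)
{
    int fd = open("/dev/vfio/vfio", O_RDWR);

    if (fd < 0) {
        return -1;
    }
    /* version check, then bind the group and pick an IOMMU model */
    if (ioctl(fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
        ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &fd) ||
        ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU)) {
        close(fd);
        return -1;
    }
    return fd;    /* ready for VFIO_IOMMU_MAP_DMA */
}

The real code additionally probes the iommu_types[] list via VFIO_CHECK_EXTENSION and, on sPAPR, retries v1 when v2 is rejected.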
- */ - if (QLIST_EMPTY(&container->group_list)) { - memory_listener_unregister(&bcontainer->listener); - if (vioc->release) { - vioc->release(bcontainer); - } - } - - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { - error_report("vfio: error disconnecting group %d from container", - group->groupid); - } - - if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = bcontainer->space; - - trace_vfio_disconnect_container(container->fd); - vfio_cpr_unregister_container(bcontainer); - close(container->fd); - object_unref(container); - - vfio_put_address_space(space); - } -} - -static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) -{ - ERRP_GUARD(); - VFIOGroup *group; - char path[32]; - struct vfio_group_status status = { .argsz = sizeof(status) }; - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == groupid) { - /* Found it. Now is it already in the right context? */ - if (group->container->bcontainer.space->as == as) { - return group; - } else { - error_setg(errp, "group %d used in multiple address spaces", - group->groupid); - return NULL; - } + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + return false; } } - group = g_malloc0(sizeof(*group)); - - snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open(path, O_RDWR, errp); - if (group->fd < 0) { - goto free_group_exit; - } - - if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { - error_setg_errno(errp, errno, "failed to get group %d status", groupid); - goto close_fd_exit; - } - - if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - error_setg(errp, "group %d is not viable", groupid); - error_append_hint(errp, - "Please ensure all devices within the iommu_group " - "are bound to their vfio bus driver.\n"); - goto close_fd_exit; - } - - group->groupid = groupid; - QLIST_INIT(&group->device_list); - - if (!vfio_connect_container(group, as, errp)) { - error_prepend(errp, "failed to setup container for group %d: ", - groupid); - goto close_fd_exit; - } - - QLIST_INSERT_HEAD(&vfio_group_list, group, next); - - return group; - -close_fd_exit: - close(group->fd); - -free_group_exit: - g_free(group); - - return NULL; + return true; } -static void vfio_put_group(VFIOGroup *group) +bool vfio_container_dirty_tracking_is_started( + const VFIOContainer *bcontainer) { - if (!group || !QLIST_EMPTY(&group->device_list)) { - return; - } - - if (!group->ram_block_discard_allowed) { - vfio_ram_block_discard_disable(group->container, false); - } - vfio_kvm_device_del_group(group); - vfio_disconnect_container(group); - QLIST_REMOVE(group, next); - trace_vfio_put_group(group->fd); - close(group->fd); - g_free(group); + return vfio_container_devices_dirty_tracking_is_started(bcontainer) || + bcontainer->dirty_pages_started; } -static bool vfio_get_device(VFIOGroup *group, const char *name, - VFIODevice *vbasedev, Error **errp) +bool vfio_container_devices_dirty_tracking_is_supported( + const VFIOContainer *bcontainer) { - g_autofree struct vfio_device_info *info = NULL; - int fd; - - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); - if (fd < 0) { - error_setg_errno(errp, errno, "error getting device from group %d", - group->groupid); - error_append_hint(errp, - "Verify all devices in group %d are bound to vfio-<bus> " - "or pci-stub and not already in use\n", group->groupid); - return false; - } + VFIODevice *vbasedev; - info = vfio_get_device_info(fd); - if (!info) { - error_setg_errno(errp, 
errno, "error getting device info"); - close(fd); - return false; - } - - /* - * Set discarding of RAM as not broken for this group if the driver knows - * the device operates compatibly with discarding. Setting must be - * consistent per group, but since compatibility is really only possible - * with mdev currently, we expect singleton groups. - */ - if (vbasedev->ram_block_discard_allowed != - group->ram_block_discard_allowed) { - if (!QLIST_EMPTY(&group->device_list)) { - error_setg(errp, "Inconsistent setting of support for discarding " - "RAM (e.g., balloon) within group"); - close(fd); + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { return false; } - - if (!group->ram_block_discard_allowed) { - group->ram_block_discard_allowed = true; - vfio_ram_block_discard_disable(group->container, false); + if (!vbasedev->dirty_pages_supported) { + return false; } } - vbasedev->fd = fd; - vbasedev->group = group; - QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - - vbasedev->num_irqs = info->num_irqs; - vbasedev->num_regions = info->num_regions; - vbasedev->flags = info->flags; - - trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs); - - vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); - return true; } -static void vfio_put_base_device(VFIODevice *vbasedev) +static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + hwaddr size, void *bitmap) { - if (!vbasedev->group) { - return; - } - QLIST_REMOVE(vbasedev, next); - vbasedev->group = NULL; - trace_vfio_put_base_device(vbasedev->fd); - close(vbasedev->fd); -} + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_report), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_dma_logging_report *report = + (struct vfio_device_feature_dma_logging_report *)feature->data; -static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) -{ - char *tmp, group_path[PATH_MAX]; - g_autofree char *group_name = NULL; - int ret, groupid; - ssize_t len; - - tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev); - len = readlink(tmp, group_path, sizeof(group_path)); - g_free(tmp); - - if (len <= 0 || len >= sizeof(group_path)) { - ret = len < 0 ? 
-errno : -ENAMETOOLONG; - error_setg_errno(errp, -ret, "no iommu_group found"); - return ret; - } + report->iova = iova; + report->length = size; + report->page_size = qemu_real_host_page_size(); + report->bitmap = (uintptr_t)bitmap; - group_path[len] = 0; + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - group_name = g_path_get_basename(group_path); - if (sscanf(group_name, "%d", &groupid) != 1) { - error_setg_errno(errp, errno, "failed to read %s", group_path); - return -errno; - } - return groupid; + return vbasedev->io_ops->device_feature(vbasedev, feature); } -/* - * vfio_attach_device: attach a device to a security context - * @name and @vbasedev->name are likely to be different depending - * on the type of the device, hence the need for passing @name - */ -static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_container_iommu_query_dirty_bitmap( + const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) { - int groupid = vfio_device_groupid(vbasedev, errp); - VFIODevice *vbasedev_iter; - VFIOGroup *group; - VFIOContainerBase *bcontainer; - - if (groupid < 0) { - return false; - } + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - trace_vfio_attach_device(vbasedev->name, groupid); + g_assert(vioc->query_dirty_bitmap); + return vioc->query_dirty_bitmap(bcontainer, vbmap, iova, size, + errp); +} - if (!vfio_device_hiod_realize(vbasedev, errp)) { - return false; - } +static int vfio_container_devices_query_dirty_bitmap( + const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) +{ + VFIODevice *vbasedev; + int ret; - group = vfio_get_group(groupid, as, errp); - if (!group) { - return false; - } + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { + error_setg_errno(errp, -ret, + "%s: Failed to get DMA logging report, iova: " + "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, + vbasedev->name, iova, size); - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { - error_setg(errp, "device is already attached"); - vfio_put_group(group); - return false; + return ret; } } - if (!vfio_get_device(group, name, vbasedev, errp)) { - vfio_put_group(group); - return false; - } - - bcontainer = &group->container->bcontainer; - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); - return true; + return 0; } -static void vfio_legacy_detach_device(VFIODevice *vbasedev) +int vfio_container_query_dirty_bitmap(const VFIOContainer *bcontainer, + uint64_t iova, uint64_t size, + hwaddr translated_addr, Error **errp) { - VFIOGroup *group = vbasedev->group; - - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; - trace_vfio_detach_device(vbasedev->name, group->groupid); - vfio_put_base_device(vbasedev); - vfio_put_group(group); -} + bool all_device_dirty_tracking = + vfio_container_devices_dirty_tracking_is_supported(bcontainer); + uint64_t dirty_pages; + VFIOBitmap vbmap; + int ret; -static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) -{ - VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - VFIOGroup 
*group; - struct vfio_pci_hot_reset_info *info = NULL; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; - - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { + physical_memory_set_dirty_range(translated_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; } - vdev->vbasedev.needs_reset = false; - - ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + ret = vfio_bitmap_alloc(&vbmap, size); if (ret) { - goto out_single; - } - devices = &info->devices[0]; - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } + error_setg_errno(errp, -ret, + "Failed to allocate dirty tracking bitmap"); + return ret; } - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } + if (all_device_dirty_tracking) { + ret = vfio_container_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); + } else { + ret = vfio_container_iommu_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); } - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); if (ret) { - ret = -errno; + goto out; } - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? 
strerror(errno) : "Success"); + dirty_pages = physical_memory_set_dirty_lebitmap(vbmap.bitmap, + translated_addr, + vbmap.pages); + trace_vfio_container_query_dirty_bitmap(iova, size, vbmap.size, + translated_addr, dirty_pages); out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); + g_free(vbmap.bitmap); return ret; } -static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +static gpointer copy_iova_range(gconstpointer src, gpointer data) { - VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); - - vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; - - vioc->setup = vfio_legacy_setup; - vioc->dma_map = vfio_legacy_dma_map; - vioc->dma_unmap = vfio_legacy_dma_unmap; - vioc->attach_device = vfio_legacy_attach_device; - vioc->detach_device = vfio_legacy_detach_device; - vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; - vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; - vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; -}; + Range *source = (Range *)src; + Range *dest = g_new(Range, 1); -static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, - Error **errp) -{ - VFIODevice *vdev = opaque; - - hiod->name = g_strdup(vdev->name); - hiod->agent = opaque; - - return true; + range_set_bounds(dest, range_lob(source), range_upb(source)); + return dest; } -static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, - Error **errp) +GList *vfio_container_get_iova_ranges(const VFIOContainer *bcontainer) { - switch (cap) { - case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return vfio_device_get_aw_bits(hiod->agent); - default: - error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); - return -EINVAL; - } + assert(bcontainer); + return g_list_copy_deep(bcontainer->iova_ranges, copy_iova_range, NULL); } -static GList * -hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod) +static void vfio_container_instance_finalize(Object *obj) { - VFIODevice *vdev = hiod->agent; + VFIOContainer *bcontainer = VFIO_IOMMU(obj); + VFIOGuestIOMMU *giommu, *tmp; - g_assert(vdev); - return vfio_container_get_iova_ranges(vdev->bcontainer); -} + QLIST_SAFE_REMOVE(bcontainer, next); -static uint64_t -hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod) -{ - VFIODevice *vdev = hiod->agent; + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } - g_assert(vdev); - return vfio_container_get_page_size_mask(vdev->bcontainer); + g_list_free_full(bcontainer->iova_ranges, g_free); } -static void 
vfio_iommu_legacy_instance_init(Object *obj) +static void vfio_container_instance_init(Object *obj) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(obj); + VFIOContainer *bcontainer = VFIO_IOMMU(obj); - QLIST_INIT(&container->group_list); + bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; + QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); } -static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) -{ - HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); - - hioc->realize = hiod_legacy_vfio_realize; - hioc->get_cap = hiod_legacy_vfio_get_cap; - hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges; - hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask; -}; - static const TypeInfo types[] = { { - .name = TYPE_VFIO_IOMMU_LEGACY, - .parent = TYPE_VFIO_IOMMU, - .instance_init = vfio_iommu_legacy_instance_init, + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_OBJECT, + .instance_init = vfio_container_instance_init, + .instance_finalize = vfio_container_instance_finalize, .instance_size = sizeof(VFIOContainer), - .class_init = vfio_iommu_legacy_class_init, - }, { - .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, - .parent = TYPE_HOST_IOMMU_DEVICE, - .class_init = hiod_legacy_vfio_class_init, - } + .class_size = sizeof(VFIOIOMMUClass), + .abstract = true, + }, }; DEFINE_TYPES(types) diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c new file mode 100644 index 0000000..8a4d65d --- /dev/null +++ b/hw/vfio/cpr-iommufd.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/vfio-device.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "system/iommufd.h" +#include "vfio-iommufd.h" +#include "trace.h" + +typedef struct CprVFIODevice { + char *name; + unsigned int namelen; + uint32_t ioas_id; + int devid; + uint32_t hwpt_id; + QLIST_ENTRY(CprVFIODevice) next; +} CprVFIODevice; + +static const VMStateDescription vmstate_cpr_vfio_device = { + .name = "cpr vfio device", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprVFIODevice), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprVFIODevice, 0, NULL, namelen), + VMSTATE_INT32(devid, CprVFIODevice), + VMSTATE_UINT32(ioas_id, CprVFIODevice), + VMSTATE_UINT32(hwpt_id, CprVFIODevice), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_QLIST_V(vfio_devices, CprState, 1, vmstate_cpr_vfio_device, + CprVFIODevice, next), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_cpr_save_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = g_new0(CprVFIODevice, 1); + + elem->name = g_strdup(vbasedev->name); + elem->namelen = strlen(vbasedev->name) + 1; + elem->ioas_id = vbasedev->cpr.ioas_id; + elem->devid = vbasedev->devid; + elem->hwpt_id = vbasedev->cpr.hwpt_id; + QLIST_INSERT_HEAD(&cpr_state.vfio_devices, elem, next); +} + +static CprVFIODevice *find_device(const char *name) +{ + CprVFIODeviceList *head = &cpr_state.vfio_devices; + CprVFIODevice *elem; + + QLIST_FOREACH(elem, head, next) { + if 
(!strcmp(elem->name, name)) { + return elem; + } + } + return NULL; +} + +static void vfio_cpr_delete_device(const char *name) +{ + CprVFIODevice *elem = find_device(name); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } +} + +static bool vfio_cpr_find_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = find_device(vbasedev->name); + + if (elem) { + vbasedev->cpr.ioas_id = elem->ioas_id; + vbasedev->devid = elem->devid; + vbasedev->cpr.hwpt_id = elem->hwpt_id; + trace_vfio_cpr_find_device(elem->ioas_id, elem->devid, elem->hwpt_id); + return true; + } + return false; +} + +static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) +{ + if (!iommufd_change_process_capable(be)) { + if (errp) { + error_setg(errp, "vfio iommufd backend does not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + } + return false; + } + return true; +} + +static int iommufd_cpr_pre_save(void *opaque) +{ + IOMMUFDBackend *be = opaque; + + /* + * The process has not changed yet, but proactively try the ioctl, + * and it will fail if any DMA mappings are not supported. + */ + if (!iommufd_change_process_capable(be)) { + error_report("some memory regions do not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + return -1; + } + return 0; +} + +static int iommufd_cpr_post_load(void *opaque, int version_id) +{ + IOMMUFDBackend *be = opaque; + Error *local_err = NULL; + + if (!iommufd_change_process(be, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static const VMStateDescription iommufd_cpr_vmstate = { + .name = "iommufd", + .version_id = 0, + .minimum_version_id = 0, + .pre_save = iommufd_cpr_pre_save, + .post_load = iommufd_cpr_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +bool vfio_iommufd_cpr_register_iommufd(IOMMUFDBackend *be, Error **errp) +{ + Error **cpr_blocker = &be->cpr_blocker; + + if (!vfio_cpr_supported(be, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, + MIG_MODE_CPR_EXEC, -1) == 0; + } + + vmstate_register(NULL, -1, &iommufd_cpr_vmstate, be); + + return true; +} + +void vfio_iommufd_cpr_unregister_iommufd(IOMMUFDBackend *be) +{ + vmstate_unregister(NULL, &iommufd_cpr_vmstate, be); + migrate_del_blocker(&be->cpr_blocker); +} + +bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container, + Error **errp) +{ + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + vfio_cpr_add_kvm_notifier(); + + return true; +} + +void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) +{ + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); +} + +void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev) +{ + if (!cpr_is_incoming()) { + /* + * Beware fd may have already been saved by vfio_device_set_fd, + * so call resave to avoid a duplicate entry. 
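One wire-format detail above is easy to miss: vfio_cpr_save_device stores namelen = strlen(name) + 1, so the VMSTATE_VBUFFER_ALLOC_UINT32 field carries the terminating NUL and the load side gets a ready-to-use C string. A worked example with a hypothetical device name:

#include <assert.h>
#include <string.h>

int main(void)
{
    const char *name = "0000:3b:00.0";        /* hypothetical vbasedev->name */
    unsigned int namelen = strlen(name) + 1;  /* 12 chars + NUL */

    assert(namelen == 13);
    return 0;
}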
+ */ + cpr_resave_fd(vbasedev->name, 0, vbasedev->fd); + vfio_cpr_save_device(vbasedev); + } +} + +void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev) +{ + cpr_delete_fd(vbasedev->name, 0); + vfio_cpr_delete_device(vbasedev->name); +} + +void vfio_cpr_load_device(VFIODevice *vbasedev) +{ + if (cpr_is_incoming()) { + bool ret = vfio_cpr_find_device(vbasedev); + g_assert(ret); + + if (vbasedev->fd < 0) { + vbasedev->fd = cpr_find_fd(vbasedev->name, 0); + } + } +} diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c new file mode 100644 index 0000000..80af746 --- /dev/null +++ b/hw/vfio/cpr-legacy.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qemu/osdep.h" +#include "hw/vfio/vfio-container-legacy.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +static bool vfio_dma_unmap_vaddr_all(VFIOLegacyContainer *container, + Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return false; + } + container->cpr.vaddr_unmapped = true; + return true; +} + +/* + * Set the new @vaddr for any mappings registered during cpr load. + * The incoming state is cleared thereafter. + */ +static int vfio_legacy_cpr_dma_map(const VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, void *vaddr, + bool readonly, MemoryRegion *mr) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_VADDR, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + g_assert(cpr_is_incoming()); + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + return -errno; + } + + return 0; +} + +static void vfio_region_remap(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOLegacyContainer *container = container_of(listener, + VFIOLegacyContainer, + cpr.remap_listener); + vfio_container_region_add(VFIO_IOMMU(container), section, true); +} + +static bool vfio_cpr_supported(VFIOLegacyContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); + return false; + + } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); + return false; + + } else { + return true; + } +} + +static int vfio_container_pre_save(void *opaque) +{ + VFIOLegacyContainer *container = opaque; + Error *local_err = NULL; + + if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static int vfio_container_post_load(void *opaque, int version_id) +{ + VFIOLegacyContainer *container = opaque; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; + Error *local_err = NULL; + + /* During incoming CPR, divert calls to dma_map. 
*/ + vioc->dma_map = vfio_legacy_cpr_dma_map; + + if (!vfio_listener_register(bcontainer, &local_err)) { + error_report_err(local_err); + return -1; + } + + /* Restore original dma_map function */ + vioc->dma_map = saved_dma_map; + + return 0; +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ + .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOLegacyContainer *container = + container_of(notifier, VFIOLegacyContainer, cpr.transfer_notifier); + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + if (e->type != MIG_EVENT_PRECOPY_FAILED) { + return 0; + } + + if (container->cpr.vaddr_unmapped) { + /* + * Force a call to vfio_region_remap for each mapped section by + * temporarily registering a listener, and temporarily diverting + * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. + */ + + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; + vioc->dma_map = vfio_legacy_cpr_dma_map; + + container->cpr.remap_listener = (MemoryListener) { + .name = "vfio cpr recover", + .region_add = vfio_region_remap + }; + memory_listener_register(&container->cpr.remap_listener, + bcontainer->space->as); + memory_listener_unregister(&container->cpr.remap_listener); + container->cpr.vaddr_unmapped = false; + vioc->dma_map = saved_dma_map; + } + return 0; +} + +bool vfio_legacy_cpr_register_container(VFIOLegacyContainer *container, + Error **errp) +{ + VFIOContainer *bcontainer = VFIO_IOMMU(container); + Error **cpr_blocker = &container->cpr.blocker; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + if (!vfio_cpr_supported(container, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, + MIG_MODE_CPR_EXEC, -1) == 0; + } + + vfio_cpr_add_kvm_notifier(); + + vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + migration_add_notifier_modes(&container->cpr.transfer_notifier, + vfio_cpr_fail_notifier, + MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, -1); + return true; +} + +void vfio_legacy_cpr_unregister_container(VFIOLegacyContainer *container) +{ + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migrate_del_blocker(&container->cpr.blocker); + vmstate_unregister(NULL, &vfio_container_vmstate, container); + migration_remove_notifier(&container->cpr.transfer_notifier); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a giommu. + * + * The giommu already exists. Find it and replay it, which calls + * vfio_legacy_cpr_dma_map further down the stack. 
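The recovery path above leans on a property of the memory API that the comment only implies: memory_listener_register() replays ->region_add for every section already present in the address space, so registering and immediately unregistering a listener amounts to a one-shot walk of the current mappings. In isolation the idiom looks like this; an in-tree sketch where walk_mapped_sections is an illustrative name, the header path follows the current tree layout, and priority and the other callbacks are left at their defaults:

#include "qemu/osdep.h"
#include "system/memory.h"

static void walk_mapped_sections(AddressSpace *as,
                                 void (*cb)(MemoryListener *,
                                            MemoryRegionSection *))
{
    MemoryListener walker = {
        .name = "one-shot walk",
        .region_add = cb,
    };

    memory_listener_register(&walker, as);   /* fires cb per section */
    memory_listener_unregister(&walker);     /* no callbacks afterwards */
}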
+ */ +void vfio_cpr_giommu_remap(VFIOContainer *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu = NULL; + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && + giommu->iommu_offset == iommu_offset) { + break; + } + } + g_assert(giommu); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a RamDiscardManager. + * + * The ram discard listener already exists. Call its populate function + * directly, which calls vfio_legacy_cpr_dma_map. + */ +bool vfio_cpr_ram_discard_register_listener(VFIOContainer *bcontainer, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); + + g_assert(vrdl); + return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; +} + +int vfio_cpr_group_get_device_fd(int d, const char *name) +{ + const int id = 0; + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +static bool same_device(int fd1, int fd2) +{ + struct stat st1, st2; + + return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; +} + +bool vfio_cpr_container_match(VFIOLegacyContainer *container, VFIOGroup *group, + int fd) +{ + if (container->fd == fd) { + return true; + } + if (!same_device(container->fd, fd)) { + return false; + } + /* + * Same device, different fd. This occurs when the container fd is + * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS + * produces duplicates. De-dup it. 
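A note on same_device above: it compares st_dev alone, which appears to serve as a light sanity check here, since both descriptors are expected to come from the same /dev/vfio/vfio special file. The general same-open-file idiom for regular files also compares the inode; for contrast, a hypothetical same_file helper:

#include <stdbool.h>
#include <sys/stat.h>

static bool same_file(int fd1, int fd2)
{
    struct stat st1, st2;

    return !fstat(fd1, &st1) && !fstat(fd2, &st2) &&
           st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino;
}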
+ */ + cpr_delete_fd("vfio_container_for_group", group->groupid); + close(fd); + cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3d1c8d2..db462aa 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -6,13 +6,17 @@ */ #include "qemu/osdep.h" -#include "hw/vfio/vfio-common.h" -#include "migration/misc.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/pci.h" +#include "hw/pci/msix.h" +#include "hw/pci/msi.h" +#include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" -static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, - MigrationEvent *e, Error **errp) +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) { if (e->type == MIG_EVENT_PRECOPY_SETUP && !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) { @@ -25,15 +29,266 @@ static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, return 0; } -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp) +#define STRDUP_VECTOR_FD_NAME(vdev, name) \ + g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) + +void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr, + int fd) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_save_fd(fdname, nr, fd); +} + +int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + return cpr_find_fd(fdname, nr); +} + +void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_delete_fd(fdname, nr); +} + +static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, + bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = PCI_DEVICE(vdev); + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; + + vfio_pci_prepare_kvm_msi_virq_batch(vdev); + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i); + if (fd >= 0) { + vfio_pci_vector_init(vdev, i); + vfio_pci_msi_set_handler(vdev, i, true); + } + + if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) { + vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix); + } else { + vdev->msi_vectors[i].virq = -1; + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + vfio_pci_commit_kvm_msi_virq_batch(vdev); + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } +} + +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. 
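The masking in the loop below is easiest to see on a single byte: wherever emulated_config_bits is 0 the kernel owns those bits, so the corresponding cmask bits are cleared and get_pci_config_device ignores them when comparing incoming and current config. A toy illustration, with the nibble split chosen arbitrarily:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint8_t cmask = 0xff;      /* default: every bit checked on load */
    uint8_t emulated = 0x0f;   /* assume QEMU emulates the low nibble */

    cmask &= emulated;         /* kernel-owned high nibble now exempt */
    assert(cmask == 0x0f);
    return 0;
}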
+ */ +static int vfio_cpr_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = PCI_DEVICE(vdev); + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +static int vfio_cpr_pci_post_load(void *opaque, int version_id) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = PCI_DEVICE(vdev); + int nr_vectors; + + vfio_sub_page_bar_update_mappings(vdev); + + if (msix_enabled(pdev)) { + vfio_pci_msix_set_notifiers(vdev); + nr_vectors = vdev->msix->entries; + vfio_cpr_claim_vectors(vdev, nr_vectors, true); + + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_cpr_claim_vectors(vdev, nr_vectors, false); + + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + Error *local_err = NULL; + if (!vfio_pci_intx_enable(vdev, &local_err)) { + error_report_err(local_err); + return -1; + } + } + + return 0; +} + +static bool pci_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + +static const VMStateDescription vfio_intx_vmstate = { + .name = "vfio-cpr-intx", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_BOOL(pending, VFIOINTx), + VMSTATE_UINT32(route.mode, VFIOINTx), + VMSTATE_INT32(route.irq, VFIOINTx), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_VFIO_INTX(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(VFIOINTx), \ + .vmsd = &vfio_intx_vmstate, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ +} + +const VMStateDescription vfio_cpr_pci_vmstate = { + .name = "vfio-cpr-pci", + .version_id = 0, + .minimum_version_id = 0, + .pre_load = vfio_cpr_pci_pre_load, + .post_load = vfio_cpr_pci_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, VFIOPCIDevice), + VMSTATE_MSIX_TEST(parent_obj, VFIOPCIDevice, pci_msix_present), + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), + VMSTATE_END_OF_LIST() + } +}; + +static NotifierWithReturn kvm_close_notifier; + +static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, + Error **errp) +{ + if (e->type == MIG_EVENT_PRECOPY_DONE) { + vfio_kvm_device_close(); + } + return 0; +} + +void vfio_cpr_add_kvm_notifier(void) +{ + if (!kvm_close_notifier.notify) { + migration_add_notifier_modes(&kvm_close_notifier, + vfio_cpr_kvm_close_notifier, + MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, + -1); + } +} + +static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq, bool enable) +{ + if (enable) { + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq); + } else { + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq); + } +} + +static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable) +{ + const char *op = (enable ? 
"enable" : "disable"); + PCIDevice *pdev = PCI_DEVICE(vdev); + int i, nr_vectors, ret = 0; + + if (msix_enabled(pdev)) { + nr_vectors = vdev->msix->entries; + + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, + &vdev->intx.unmask, vdev->intx.route.irq, + enable); + if (ret) { + error_setg_errno(errp, -ret, "failed to %s INTx irq %d", + op, vdev->intx.route.irq); + return ret; + } + vfio_pci_intx_set_handler(vdev, enable); + return ret; + + } else { + return 0; + } + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + if (vector->use) { + ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + NULL, vector->virq, enable); + if (ret) { + error_setg_errno(errp, -ret, + "failed to %s msi vector %d virq %d", + op, i, vector->virq); + return ret; + } + vfio_pci_msi_set_handler(vdev, i, enable); + } + } + + return ret; +} + +/* + * When CPR starts, detach IRQs from the VFIO device so future interrupts + * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts + * that were already posted to the old KVM instance, but not delivered to the + * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance + * in new QEMU. + * + * If CPR fails, reattach the IRQs. + */ +static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOPCIDevice *vdev = + container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier); + + if (e->type == MIG_EVENT_PRECOPY_SETUP) { + return vfio_cpr_set_msi_virq(vdev, errp, false); + } else if (e->type == MIG_EVENT_PRECOPY_FAILED) { + return vfio_cpr_set_msi_virq(vdev, errp, true); + } + return 0; +} + +void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev) { - migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, - vfio_cpr_reboot_notifier, - MIG_MODE_CPR_REBOOT); - return true; + migration_add_notifier_modes(&vdev->cpr.transfer_notifier, + vfio_cpr_pci_notifier, + MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, -1); } -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) +void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev) { - migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migration_remove_notifier(&vdev->cpr.transfer_notifier); } diff --git a/hw/vfio/device.c b/hw/vfio/device.c new file mode 100644 index 0000000..64f8750 --- /dev/null +++ b/hw/vfio/device.c @@ -0,0 +1,594 @@ +/* + * VFIO device + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. 
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+
+#include "hw/vfio/vfio-device.h"
+#include "hw/vfio/pci.h"
+#include "hw/hw.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/units.h"
+#include "migration/cpr.h"
+#include "migration/blocker.h"
+#include "monitor/monitor.h"
+#include "vfio-helpers.h"
+
+VFIODeviceList vfio_device_list =
+    QLIST_HEAD_INITIALIZER(vfio_device_list);
+
+/*
+ * We want to differentiate hot reset of multiple in-use devices vs
+ * hot reset of a single in-use device. VFIO_DEVICE_RESET will already
+ * handle the case of doing hot resets when there is only a single
+ * device per bus. "In-use" here refers to how many VFIODevices are
+ * affected. A hot reset that affects multiple devices, but only a
+ * single in-use device, means that we can call it from our bus
+ * ->reset() callback since the extent is effectively a single
+ * device. This allows us to make use of it in the hotplug path. When
+ * there are multiple in-use devices, we can only trigger the hot
+ * reset during a system reset and thus from our reset handler. We
+ * separate _one vs _multi here so that we don't overlap and do a
+ * double reset on the system reset path, where both our reset handler
+ * and ->reset() callback are used. Calling _one() will only do a hot
+ * reset for the single in-use device case; calling _multi() will do
+ * nothing if a _one() would have been sufficient. 
+ */ +void vfio_device_reset_handler(void *opaque) +{ + VFIODevice *vbasedev; + + trace_vfio_device_reset_handler(); + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->dev->realized) { + vbasedev->ops->vfio_compute_needs_reset(vbasedev); + } + } + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->dev->realized && vbasedev->needs_reset) { + vbasedev->ops->vfio_hot_reset_multi(vbasedev); + } + } +} + +/* + * Common VFIO interrupt disable + */ +void vfio_device_irq_disable(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = index, + .start = 0, + .count = 0, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, + .index = index, + .start = 0, + .count = 1, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +void vfio_device_irq_mask(VFIODevice *vbasedev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, + .index = index, + .start = 0, + .count = 1, + }; + + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); +} + +static inline const char *action_to_str(int action) +{ + switch (action) { + case VFIO_IRQ_SET_ACTION_MASK: + return "MASK"; + case VFIO_IRQ_SET_ACTION_UNMASK: + return "UNMASK"; + case VFIO_IRQ_SET_ACTION_TRIGGER: + return "TRIGGER"; + default: + return "UNKNOWN ACTION"; + } +} + +static const char *index_to_str(VFIODevice *vbasedev, int index) +{ + if (!vfio_pci_from_vfio_device(vbasedev)) { + return NULL; + } + + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + return "INTX"; + case VFIO_PCI_MSI_IRQ_INDEX: + return "MSI"; + case VFIO_PCI_MSIX_IRQ_INDEX: + return "MSIX"; + case VFIO_PCI_ERR_IRQ_INDEX: + return "ERR"; + case VFIO_PCI_REQ_IRQ_INDEX: + return "REQ"; + default: + return NULL; + } +} + +bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex, + int action, int fd, Error **errp) +{ + ERRP_GUARD(); + g_autofree struct vfio_irq_set *irq_set = NULL; + int argsz; + const char *name; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; + irq_set->index = index; + irq_set->start = subindex; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = fd; + + if (!vbasedev->io_ops->set_irqs(vbasedev, irq_set)) { + return true; + } + + error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure"); + + name = index_to_str(vbasedev, index); + if (name) { + error_prepend(errp, "%s-%d: ", name, subindex); + } else { + error_prepend(errp, "index %d-%d: ", index, subindex); + } + error_prepend(errp, + "Failed to %s %s eventfd signaling for interrupt ", + fd < 0 ? 
"tear down" : "set up", action_to_str(action)); + return false; +} + +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->argsz = sizeof(*info); + info->index = index; + + return vbasedev->io_ops->get_irq_info(vbasedev, info); +} + +int vfio_device_get_region_info(VFIODevice *vbasedev, int index, + struct vfio_region_info **info) +{ + size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; + int ret; + + /* check cache */ + if (vbasedev->reginfo[index] != NULL) { + *info = vbasedev->reginfo[index]; + return 0; + } + + *info = g_malloc0(argsz); + + (*info)->index = index; +retry: + (*info)->argsz = argsz; + + ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd); + if (ret != 0) { + g_free(*info); + *info = NULL; + return ret; + } + + if ((*info)->argsz > argsz) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + + if (fd != -1) { + close(fd); + fd = -1; + } + + goto retry; + } + + /* fill cache */ + vbasedev->reginfo[index] = *info; + if (vbasedev->region_fds != NULL) { + vbasedev->region_fds[index] = fd; + } + + return 0; +} + +int vfio_device_get_region_fd(VFIODevice *vbasedev, int index) +{ + return vbasedev->region_fds ? + vbasedev->region_fds[index] : + vbasedev->fd; +} + +int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_type *cap_type; + + if (vfio_device_get_region_info(vbasedev, i, info)) { + continue; + } + + hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); + if (!hdr) { + continue; + } + + cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); + + trace_vfio_device_get_region_info_type(vbasedev->name, i, + cap_type->type, cap_type->subtype); + + if (cap_type->type == type && cap_type->subtype == subtype) { + return 0; + } + } + + *info = NULL; + return -ENODEV; +} + +bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) +{ + struct vfio_region_info *info = NULL; + bool ret = false; + + if (!vfio_device_get_region_info(vbasedev, region, &info)) { + if (vfio_get_region_info_cap(info, cap_type)) { + ret = true; + } + } + + return ret; +} + +bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + ERRP_GUARD(); + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return false; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return false; + } + if (!vbasedev->name) { + + if (vbasedev->dev->id) { + vbasedev->name = g_strdup(vbasedev->dev->id); + return true; + } else { + /* + * Assign a name so any function printing it will not break. + * The fd number changes across processes, so this cannot be + * used as an invariant name for CPR. 
+ */
+                vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
+                error_setg(&vbasedev->cpr.id_blocker,
+                           "vfio device with fd=%d needs an id property",
+                           vbasedev->fd);
+                return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker,
+                                                 errp, MIG_MODE_CPR_TRANSFER,
+                                                 -1) == 0;
+            }
+        }
+    }
+
+    return true;
+}
+
+void vfio_device_free_name(VFIODevice *vbasedev)
+{
+    g_clear_pointer(&vbasedev->name, g_free);
+    migrate_del_blocker(&vbasedev->cpr.id_blocker);
+}
+
+void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
+{
+    vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp);
+}
+
+static VFIODeviceIOOps vfio_device_io_ops_ioctl;
+
+void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
+                      DeviceState *dev, bool ram_discard)
+{
+    vbasedev->type = type;
+    vbasedev->ops = ops;
+    vbasedev->io_ops = &vfio_device_io_ops_ioctl;
+    vbasedev->dev = dev;
+    vbasedev->fd = -1;
+    vbasedev->use_region_fds = false;
+
+    vbasedev->ram_block_discard_allowed = ram_discard;
+}
+
+int vfio_device_get_aw_bits(VFIODevice *vdev)
+{
+    /*
+     * iova_ranges is a sorted list. For old kernels that support
+     * VFIO but do not support querying iova ranges, iova_ranges is NULL,
+     * in which case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
+     */
+    GList *l = g_list_last(vdev->bcontainer->iova_ranges);
+
+    if (l) {
+        Range *range = l->data;
+        return range_get_last_bit(range) + 1;
+    }
+
+    return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
+}
+
+bool vfio_device_is_mdev(VFIODevice *vbasedev)
+{
+    g_autofree char *subsys = NULL;
+    g_autofree char *tmp = NULL;
+
+    if (!vbasedev->sysfsdev) {
+        return false;
+    }
+
+    tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
+    subsys = realpath(tmp, NULL);
+    return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
+}
+
+bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev,
+                                         const char *typename, Error **errp)
+{
+    HostIOMMUDevice *hiod;
+
+    if (vbasedev->mdev) {
+        return true;
+    }
+
+    hiod = HOST_IOMMU_DEVICE(object_new(typename));
+
+    if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
+        object_unref(hiod);
+        return false;
+    }
+
+    vbasedev->hiod = hiod;
+    return true;
+}
+
+VFIODevice *vfio_get_vfio_device(Object *obj)
+{
+    if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
+        return &VFIO_PCI_DEVICE(obj)->vbasedev;
+    } else {
+        return NULL;
+    }
+}
+
+bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
+                                      VFIODevice *vbasedev, AddressSpace *as,
+                                      Error **errp)
+{
+    const VFIOIOMMUClass *ops =
+        VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
+
+    assert(ops);
+
+    return ops->attach_device(name, vbasedev, as, errp);
+}
+
+bool vfio_device_attach(char *name, VFIODevice *vbasedev,
+                        AddressSpace *as, Error **errp)
+{
+    const char *iommu_type = vbasedev->iommufd ?
+ TYPE_VFIO_IOMMU_IOMMUFD : + TYPE_VFIO_IOMMU_LEGACY; + + return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev, + as, errp); +} + +void vfio_device_detach(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); +} + +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainer *bcontainer, + struct vfio_device_info *info) +{ + int i; + + vbasedev->num_irqs = info->num_irqs; + vbasedev->num_regions = info->num_regions; + vbasedev->flags = info->flags; + vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + vbasedev->reginfo = g_new0(struct vfio_region_info *, + vbasedev->num_regions); + if (vbasedev->use_region_fds) { + vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + for (i = 0; i < vbasedev->num_regions; i++) { + vbasedev->region_fds[i] = -1; + } + } +} + +void vfio_device_unprepare(VFIODevice *vbasedev) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vbasedev->reginfo[i]); + if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { + close(vbasedev->region_fds[i]); + } + } + + g_clear_pointer(&vbasedev->reginfo, g_free); + g_clear_pointer(&vbasedev->region_fds, g_free); + + QLIST_REMOVE(vbasedev, container_next); + QLIST_REMOVE(vbasedev, global_next); + vbasedev->bcontainer = NULL; +} + +/* + * Traditional ioctl() based io + */ + +static int vfio_device_io_device_feature(VFIODevice *vbasedev, + struct vfio_device_feature *feature) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info, + int *fd) +{ + int ret; + + *fd = -1; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pread(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data, + bool post) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pwrite(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? 
-errno : ret; +} + +static VFIODeviceIOOps vfio_device_io_ops_ioctl = { + .device_feature = vfio_device_io_device_feature, + .get_region_info = vfio_device_io_get_region_info, + .get_irq_info = vfio_device_io_get_irq_info, + .set_irqs = vfio_device_io_set_irqs, + .region_read = vfio_device_io_region_read, + .region_write = vfio_device_io_region_write, +}; diff --git a/hw/vfio/display.c b/hw/vfio/display.c index ea87830..faacd90 100644 --- a/hw/vfio/display.c +++ b/hw/vfio/display.c @@ -16,9 +16,9 @@ #include "qemu/error-report.h" #include "hw/display/edid.h" -#include "ui/console.h" #include "qapi/error.h" #include "pci.h" +#include "vfio-display.h" #include "trace.h" #ifndef DRM_PLANE_TYPE_PRIMARY @@ -104,7 +104,6 @@ static void vfio_display_edid_update(VFIOPCIDevice *vdev, bool enabled, err: trace_vfio_display_edid_write_error(); - return; } static void vfio_display_edid_ui_info(void *opaque, uint32_t idx, @@ -130,10 +129,10 @@ static bool vfio_display_edid_init(VFIOPCIDevice *vdev, Error **errp) int fd = vdev->vbasedev.fd; int ret; - ret = vfio_get_dev_region_info(&vdev->vbasedev, - VFIO_REGION_TYPE_GFX, - VFIO_REGION_SUBTYPE_GFX_EDID, - &dpy->edid_info); + ret = vfio_device_get_region_info_type(&vdev->vbasedev, + VFIO_REGION_TYPE_GFX, + VFIO_REGION_SUBTYPE_GFX_EDID, + &dpy->edid_info); if (ret) { /* Failed to get GFX edid info, allow to go through without edid. */ return true; @@ -214,6 +213,7 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice *vdev, struct vfio_device_gfx_plane_info plane; VFIODMABuf *dmabuf; int fd, ret; + uint32_t offset = 0; memset(&plane, 0, sizeof(plane)); plane.argsz = sizeof(plane); @@ -246,10 +246,10 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice *vdev, dmabuf = g_new0(VFIODMABuf, 1); dmabuf->dmabuf_id = plane.dmabuf_id; - dmabuf->buf = qemu_dmabuf_new(plane.width, plane.height, - plane.stride, 0, 0, plane.width, + dmabuf->buf = qemu_dmabuf_new(plane.width, plane.height, &offset, + &plane.stride, 0, 0, plane.width, plane.height, plane.drm_format, - plane.drm_format_mod, fd, false, false); + plane.drm_format_mod, &fd, 1, false, false); if (plane_type == DRM_PLANE_TYPE_CURSOR) { vfio_display_update_cursor(dmabuf, &plane); @@ -365,7 +365,7 @@ static bool vfio_display_dmabuf_init(VFIOPCIDevice *vdev, Error **errp) &vfio_display_dmabuf_ops, vdev); if (vdev->enable_ramfb) { - vdev->dpy->ramfb = ramfb_setup(errp); + vdev->dpy->ramfb = ramfb_setup(vdev->use_legacy_x86_rom, errp); if (!vdev->dpy->ramfb) { return false; } @@ -494,7 +494,7 @@ static bool vfio_display_region_init(VFIOPCIDevice *vdev, Error **errp) &vfio_display_region_ops, vdev); if (vdev->enable_ramfb) { - vdev->dpy->ramfb = ramfb_setup(errp); + vdev->dpy->ramfb = ramfb_setup(vdev->use_legacy_x86_rom, errp); if (!vdev->dpy->ramfb) { return false; } diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 4b255d4..23d13e5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -22,242 +22,11 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> -#include "hw/vfio/vfio-common.h" -#include "hw/vfio/pci.h" +#include "system/kvm.h" +#include "hw/vfio/vfio-device.h" #include "hw/hw.h" -#include "trace.h" #include "qapi/error.h" -#include "qemu/error-report.h" -#include "qemu/units.h" -#include "monitor/monitor.h" - -/* - * Common VFIO interrupt disable - */ -void vfio_disable_irqindex(VFIODevice *vbasedev, int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, - .index = index, - .start = 0, - .count = 0, 
- }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, - .index = index, - .start = 0, - .count = 1, - }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) -{ - struct vfio_irq_set irq_set = { - .argsz = sizeof(irq_set), - .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, - .index = index, - .start = 0, - .count = 1, - }; - - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); -} - -static inline const char *action_to_str(int action) -{ - switch (action) { - case VFIO_IRQ_SET_ACTION_MASK: - return "MASK"; - case VFIO_IRQ_SET_ACTION_UNMASK: - return "UNMASK"; - case VFIO_IRQ_SET_ACTION_TRIGGER: - return "TRIGGER"; - default: - return "UNKNOWN ACTION"; - } -} - -static const char *index_to_str(VFIODevice *vbasedev, int index) -{ - if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { - return NULL; - } - - switch (index) { - case VFIO_PCI_INTX_IRQ_INDEX: - return "INTX"; - case VFIO_PCI_MSI_IRQ_INDEX: - return "MSI"; - case VFIO_PCI_MSIX_IRQ_INDEX: - return "MSIX"; - case VFIO_PCI_ERR_IRQ_INDEX: - return "ERR"; - case VFIO_PCI_REQ_IRQ_INDEX: - return "REQ"; - default: - return NULL; - } -} - -bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, - int action, int fd, Error **errp) -{ - ERRP_GUARD(); - g_autofree struct vfio_irq_set *irq_set = NULL; - int argsz; - const char *name; - int32_t *pfd; - - argsz = sizeof(*irq_set) + sizeof(*pfd); - - irq_set = g_malloc0(argsz); - irq_set->argsz = argsz; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; - irq_set->index = index; - irq_set->start = subindex; - irq_set->count = 1; - pfd = (int32_t *)&irq_set->data; - *pfd = fd; - - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { - return true; - } - - error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure"); - - name = index_to_str(vbasedev, index); - if (name) { - error_prepend(errp, "%s-%d: ", name, subindex); - } else { - error_prepend(errp, "index %d-%d: ", index, subindex); - } - error_prepend(errp, - "Failed to %s %s eventfd signaling for interrupt ", - fd < 0 ? "tear down" : "set up", action_to_str(action)); - return false; -} - -/* - * IO Port/MMIO - Beware of the endians, VFIO is always little endian - */ -void vfio_region_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIORegion *region = opaque; - VFIODevice *vbasedev = region->vbasedev; - union { - uint8_t byte; - uint16_t word; - uint32_t dword; - uint64_t qword; - } buf; - - switch (size) { - case 1: - buf.byte = data; - break; - case 2: - buf.word = cpu_to_le16(data); - break; - case 4: - buf.dword = cpu_to_le32(data); - break; - case 8: - buf.qword = cpu_to_le64(data); - break; - default: - hw_error("vfio: unsupported write size, %u bytes", size); - break; - } - - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", - __func__, vbasedev->name, region->nr, - addr, data, size); - } - - trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); - - /* - * A read or write to a BAR always signals an INTx EOI. This will - * do nothing if not pending (including not in INTx mode). 
We assume - * that a BAR access is in response to an interrupt and that BAR - * accesses will service the interrupt. Unfortunately, we don't know - * which access will service the interrupt, so we're potentially - * getting quite a few host interrupts per guest interrupt. - */ - vbasedev->ops->vfio_eoi(vbasedev); -} - -uint64_t vfio_region_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIORegion *region = opaque; - VFIODevice *vbasedev = region->vbasedev; - union { - uint8_t byte; - uint16_t word; - uint32_t dword; - uint64_t qword; - } buf; - uint64_t data = 0; - - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", - __func__, vbasedev->name, region->nr, - addr, size); - return (uint64_t)-1; - } - switch (size) { - case 1: - data = buf.byte; - break; - case 2: - data = le16_to_cpu(buf.word); - break; - case 4: - data = le32_to_cpu(buf.dword); - break; - case 8: - data = le64_to_cpu(buf.qword); - break; - default: - hw_error("vfio: unsupported read size, %u bytes", size); - break; - } - - trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); - - /* Same as write above */ - vbasedev->ops->vfio_eoi(vbasedev); - - return data; -} - -const MemoryRegionOps vfio_region_ops = { - .read = vfio_region_read, - .write = vfio_region_write, - .endianness = DEVICE_LITTLE_ENDIAN, - .valid = { - .min_access_size = 1, - .max_access_size = 8, - }, - .impl = { - .min_access_size = 1, - .max_access_size = 8, - }, -}; +#include "vfio-helpers.h" int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size) { @@ -306,435 +75,154 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } -static int vfio_setup_region_sparse_mmaps(VFIORegion *region, - struct vfio_region_info *info) +struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) { - struct vfio_info_cap_header *hdr; - struct vfio_region_info_cap_sparse_mmap *sparse; - int i, j; - - hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); - if (!hdr) { - return -ENODEV; - } - - sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); - - trace_vfio_region_sparse_mmap_header(region->vbasedev->name, - region->nr, sparse->nr_areas); - - region->mmaps = g_new0(VFIOMmap, sparse->nr_areas); - - for (i = 0, j = 0; i < sparse->nr_areas; i++) { - if (sparse->areas[i].size) { - trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset, - sparse->areas[i].offset + - sparse->areas[i].size - 1); - region->mmaps[j].offset = sparse->areas[i].offset; - region->mmaps[j].size = sparse->areas[i].size; - j++; - } + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; } - region->nr_mmaps = j; - region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap)); - - return 0; + return vfio_get_cap((void *)info, info->cap_offset, id); } -int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, - int index, const char *name) +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail) { - g_autofree struct vfio_region_info *info = NULL; - int ret; + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_dma_avail *cap; - ret = vfio_get_region_info(vbasedev, index, &info); - if (ret) { - return ret; + /* If the capability cannot be found, assume no DMA limiting */ + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); + if (!hdr) 
{ + return false; } - region->vbasedev = vbasedev; - region->flags = info->flags; - region->size = info->size; - region->fd_offset = info->offset; - region->nr = index; - - if (region->size) { - region->mem = g_new0(MemoryRegion, 1); - memory_region_init_io(region->mem, obj, &vfio_region_ops, - region, name, region->size); - - if (!vbasedev->no_mmap && - region->flags & VFIO_REGION_INFO_FLAG_MMAP) { - - ret = vfio_setup_region_sparse_mmaps(region, info); - - if (ret) { - region->nr_mmaps = 1; - region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); - region->mmaps[0].offset = 0; - region->mmaps[0].size = region->size; - } - } + if (avail != NULL) { + cap = (void *) hdr; + *avail = cap->avail; } - trace_vfio_region_setup(vbasedev->name, index, name, - region->flags, region->fd_offset, region->size); - return 0; + return true; } -static void vfio_subregion_unmap(VFIORegion *region, int index) -{ - trace_vfio_region_unmap(memory_region_name(®ion->mmaps[index].mem), - region->mmaps[index].offset, - region->mmaps[index].offset + - region->mmaps[index].size - 1); - memory_region_del_subregion(region->mem, ®ion->mmaps[index].mem); - munmap(region->mmaps[index].mmap, region->mmaps[index].size); - object_unparent(OBJECT(®ion->mmaps[index].mem)); - region->mmaps[index].mmap = NULL; -} +#ifdef CONFIG_KVM +/* + * We have a single VFIO pseudo device per KVM VM. Once created it lives + * for the life of the VM. Closing the file descriptor only drops our + * reference to it and the device's reference to kvm. Therefore once + * initialized, this file descriptor is only released on QEMU exit and + * we'll re-use it should another vfio device be attached before then. + */ +int vfio_kvm_device_fd = -1; +#endif -int vfio_region_mmap(VFIORegion *region) +void vfio_kvm_device_close(void) { - int i, ret, prot = 0; - char *name; - - if (!region->mem) { - return 0; - } - - prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; - prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; - - for (i = 0; i < region->nr_mmaps; i++) { - size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); - void *map_base, *map_align; - - /* - * Align the mmap for more efficient mapping in the kernel. Ideally - * we'd know the PMD and PUD mapping sizes to use as discrete alignment - * intervals, but we don't. As of Linux v6.12, the largest PUD size - * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set - * on x86_64). Align by power-of-two size, capped at 1GiB. - * - * NB. qemu_memalign() and friends actually allocate memory, whereas - * the region size here can exceed host memory, therefore we manually - * create an oversized anonymous mapping and clean it up for alignment. 
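
The alignment trick described in the comment above is self-contained enough to sketch on its own: over-allocate an anonymous PROT_NONE mapping, then trim it so an aligned hole remains for the later MAP_FIXED device mapping. A minimal sketch assuming nothing beyond POSIX mmap, with illustrative names:

    #include <stddef.h>
    #include <stdint.h>
    #include <sys/mman.h>

    /* Reserve a power-of-two aligned window without committing memory:
     * map size + align bytes of PROT_NONE anonymous memory, then unmap
     * the unaligned head and the leftover tail. The survivor is exactly
     * [aligned, aligned + size), ready for MAP_FIXED. */
    static void *reserve_aligned(size_t size, size_t align)
    {
        uint8_t *base = mmap(NULL, size + align, PROT_NONE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) {
            return NULL;
        }
        uint8_t *aligned = (uint8_t *)(((uintptr_t)base + align - 1) &
                                       ~((uintptr_t)align - 1));
        if (aligned != base) {
            munmap(base, aligned - base);                 /* trim head */
        }
        munmap(aligned + size, align - (aligned - base)); /* trim tail */
        return aligned;
    }
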
- */ - map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (map_base == MAP_FAILED) { - ret = -errno; - goto no_mmap; - } - - map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); - munmap(map_base, map_align - map_base); - munmap(map_align + region->mmaps[i].size, - align - (map_align - map_base)); - - region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, - MAP_SHARED | MAP_FIXED, - region->vbasedev->fd, - region->fd_offset + - region->mmaps[i].offset); - if (region->mmaps[i].mmap == MAP_FAILED) { - ret = -errno; - goto no_mmap; - } - - name = g_strdup_printf("%s mmaps[%d]", - memory_region_name(region->mem), i); - memory_region_init_ram_device_ptr(®ion->mmaps[i].mem, - memory_region_owner(region->mem), - name, region->mmaps[i].size, - region->mmaps[i].mmap); - g_free(name); - memory_region_add_subregion(region->mem, region->mmaps[i].offset, - ®ion->mmaps[i].mem); - - trace_vfio_region_mmap(memory_region_name(®ion->mmaps[i].mem), - region->mmaps[i].offset, - region->mmaps[i].offset + - region->mmaps[i].size - 1); +#ifdef CONFIG_KVM + kvm_close(); + if (vfio_kvm_device_fd != -1) { + close(vfio_kvm_device_fd); + vfio_kvm_device_fd = -1; } - - return 0; - -no_mmap: - trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, - region->fd_offset + region->mmaps[i].offset, - region->fd_offset + region->mmaps[i].offset + - region->mmaps[i].size - 1, ret); - - region->mmaps[i].mmap = NULL; - - for (i--; i >= 0; i--) { - vfio_subregion_unmap(region, i); - } - - return ret; +#endif } -void vfio_region_unmap(VFIORegion *region) +int vfio_kvm_device_add_fd(int fd, Error **errp) { - int i; - - if (!region->mem) { - return; - } +#ifdef CONFIG_KVM + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_FILE, + .attr = KVM_DEV_VFIO_FILE_ADD, + .addr = (uint64_t)(unsigned long)&fd, + }; - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - vfio_subregion_unmap(region, i); - } + if (!kvm_enabled()) { + return 0; } -} - -void vfio_region_exit(VFIORegion *region) -{ - int i; - if (!region->mem) { - return; - } + if (vfio_kvm_device_fd < 0) { + struct kvm_create_device cd = { + .type = KVM_DEV_TYPE_VFIO, + }; - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); + if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { + error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); + return -errno; } - } - trace_vfio_region_exit(region->vbasedev->name, region->nr); -} - -void vfio_region_finalize(VFIORegion *region) -{ - int i; - - if (!region->mem) { - return; + vfio_kvm_device_fd = cd.fd; } - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - munmap(region->mmaps[i].mmap, region->mmaps[i].size); - object_unparent(OBJECT(®ion->mmaps[i].mem)); - } + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", + fd); + return -errno; } - - object_unparent(OBJECT(region->mem)); - - g_free(region->mem); - g_free(region->mmaps); - - trace_vfio_region_finalize(region->vbasedev->name, region->nr); - - region->mem = NULL; - region->mmaps = NULL; - region->nr_mmaps = 0; - region->size = 0; - region->flags = 0; - region->nr = 0; +#endif + return 0; } -void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) +int vfio_kvm_device_del_fd(int fd, Error **errp) { - int i; - - if (!region->mem) { - return; - } +#ifdef 
CONFIG_KVM + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_FILE, + .attr = KVM_DEV_VFIO_FILE_DEL, + .addr = (uint64_t)(unsigned long)&fd, + }; - for (i = 0; i < region->nr_mmaps; i++) { - if (region->mmaps[i].mmap) { - memory_region_set_enabled(®ion->mmaps[i].mem, enabled); - } + if (vfio_kvm_device_fd < 0) { + error_setg(errp, "KVM VFIO device isn't created yet"); + return -EINVAL; } - trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), - enabled); -} - -int vfio_get_region_info(VFIODevice *vbasedev, int index, - struct vfio_region_info **info) -{ - size_t argsz = sizeof(struct vfio_region_info); - - *info = g_malloc0(argsz); - - (*info)->index = index; -retry: - (*info)->argsz = argsz; - - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { - g_free(*info); - *info = NULL; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, + "Failed to remove fd %d from KVM VFIO device", fd); return -errno; } - - if ((*info)->argsz > argsz) { - argsz = (*info)->argsz; - *info = g_realloc(*info, argsz); - - goto retry; - } - +#endif return 0; } -int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, - uint32_t subtype, struct vfio_region_info **info) +struct vfio_device_info *vfio_get_device_info(int fd) { - int i; - - for (i = 0; i < vbasedev->num_regions; i++) { - struct vfio_info_cap_header *hdr; - struct vfio_region_info_cap_type *cap_type; - - if (vfio_get_region_info(vbasedev, i, info)) { - continue; - } + struct vfio_device_info *info; + uint32_t argsz = sizeof(*info); - hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); - if (!hdr) { - g_free(*info); - continue; - } - - cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); - - trace_vfio_get_dev_region(vbasedev->name, i, - cap_type->type, cap_type->subtype); + info = g_malloc0(argsz); - if (cap_type->type == type && cap_type->subtype == subtype) { - return 0; - } - - g_free(*info); - } - - *info = NULL; - return -ENODEV; -} - -bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) -{ - g_autofree struct vfio_region_info *info = NULL; - bool ret = false; - - if (!vfio_get_region_info(vbasedev, region, &info)) { - if (vfio_get_region_info_cap(info, cap_type)) { - ret = true; - } - } - - return ret; -} +retry: + info->argsz = argsz; -bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) -{ - ERRP_GUARD(); - struct stat st; - - if (vbasedev->fd < 0) { - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); - return false; - } - /* User may specify a name, e.g: VFIO platform device */ - if (!vbasedev->name) { - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } - } else { - if (!vbasedev->iommufd) { - error_setg(errp, "Use FD passing only with iommufd backend"); - return false; - } - /* - * Give a name with fd so any function printing out vbasedev->name - * will not break. 
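
vfio_get_device_info() above, like vfio_device_get_region_info() earlier, follows the standard VFIO handshake: call once, let the kernel report the argsz it actually needs, grow the buffer, and retry. A generic sketch of the idiom (the struct here is a hypothetical stand-in; every VFIO *_GET_INFO structure begins with argsz/flags):

    /* Generic VFIO "argsz" grow-and-retry handshake, sketched with a
     * hypothetical header layout and a generic info ioctl request. */
    struct info_hdr { uint32_t argsz; uint32_t flags; };

    static void *get_info_grow(int fd, unsigned long request)
    {
        uint32_t argsz = sizeof(struct info_hdr);
        struct info_hdr *info = g_malloc0(argsz);

        for (;;) {
            info->argsz = argsz;
            if (ioctl(fd, request, info)) {
                g_free(info);
                return NULL;            /* errno holds the failure */
            }
            if (info->argsz <= argsz) {
                return info;            /* everything fit */
            }
            argsz = info->argsz;        /* kernel wants more room */
            info = g_realloc(info, argsz);
        }
    }
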
- */ - if (!vbasedev->name) { - vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); - } + if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { + g_free(info); + return NULL; } - return true; -} - -void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) -{ - ERRP_GUARD(); - int fd = monitor_fd_param(monitor_cur(), str, errp); - - if (fd < 0) { - error_prepend(errp, "Could not parse remote object fd %s:", str); - return; + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; } - vbasedev->fd = fd; -} - -void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard) -{ - vbasedev->type = type; - vbasedev->ops = ops; - vbasedev->dev = dev; - vbasedev->fd = -1; - vbasedev->ram_block_discard_allowed = ram_discard; + return info; } -int vfio_device_get_aw_bits(VFIODevice *vdev) +bool vfio_arch_wants_loading_config_after_iter(void) { /* - * iova_ranges is a sorted list. For old kernels that support - * VFIO but not support query of iova ranges, iova_ranges is NULL, - * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. + * Starting the config load only after all iterables were loaded (during + * non-iterables loading phase) is required for ARM64 due to this platform + * VFIO dependency on interrupt controller being loaded first. + * + * See commit d329f5032e17 ("vfio: Move the saving of the config space to + * the right place in VFIO migration"). */ - GList *l = g_list_last(vdev->bcontainer->iova_ranges); - - if (l) { - Range *range = l->data; - return range_get_last_bit(range) + 1; - } - - return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; -} - -bool vfio_device_is_mdev(VFIODevice *vbasedev) -{ - g_autofree char *subsys = NULL; - g_autofree char *tmp = NULL; - - if (!vbasedev->sysfsdev) { - return false; - } - - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); -} - -bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) -{ - HostIOMMUDevice *hiod = vbasedev->hiod; - - if (!hiod) { - return true; - } - - return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); -} - -VFIODevice *vfio_get_vfio_device(Object *obj) -{ - if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) { - return &VFIO_PCI(obj)->vbasedev; - } else { - return NULL; - } +#if defined(TARGET_ARM) + return true; +#else + return false; +#endif } diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 265fffc..4bfa2e0 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -103,6 +103,7 @@ static int igd_gen(VFIOPCIDevice *vdev) /* * Unfortunately, Intel changes it's specification quite often. This makes * it impossible to use a suitable default value for unknown devices. + * Return -1 for not applying any generation-specific quirks. 
*/ return -1; } @@ -112,6 +113,7 @@ static int igd_gen(VFIOPCIDevice *vdev) #define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ #define IGD_BDSM_GEN11 0xc0 /* Base Data of Stolen Memory of gen 11 and later */ +#define IGD_GMCH_VGA_DISABLE BIT(1) #define IGD_GMCH_GEN6_GMS_SHIFT 3 /* SNB_GMCH in i915 */ #define IGD_GMCH_GEN6_GMS_MASK 0x1f #define IGD_GMCH_GEN8_GMS_SHIFT 8 /* BDW_GMCH in i915 */ @@ -182,34 +184,25 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); - pci_set_long(vdev->pdev.config + IGD_ASLS, 0); - pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); - return true; } -static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) +static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, + struct vfio_region_info **opregion) { - g_autofree struct vfio_region_info *opregion = NULL; int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); return false; } - if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { + /* Hotplugging is not supported for opregion access */ + if (DEVICE(vdev)->hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -267,11 +260,12 @@ static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev, static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev, struct vfio_region_info *info) { + PCIDevice *pdev = PCI_DEVICE(vdev); PCIBus *bus; PCIDevice *host_bridge; int ret; - bus = pci_device_root_bus(&vdev->pdev); + bus = pci_device_root_bus(pdev); host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0)); if (!host_bridge) { @@ -301,7 +295,8 @@ static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp) } } -static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data) +static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); @@ -317,7 +312,7 @@ static const TypeInfo vfio_pci_igd_lpc_bridge_info = { .name = "vfio-pci-igd-lpc-bridge", .parent = TYPE_PCI_DEVICE, .class_init = vfio_pci_igd_lpc_bridge_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, }, @@ -333,13 +328,14 @@ type_init(vfio_pci_igd_register_types) static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, struct vfio_region_info *info) { + PCIDevice *pdev = PCI_DEVICE(vdev); PCIDevice *lpc_bridge; int ret; - lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + lpc_bridge = pci_find_device(pci_device_root_bus(pdev), 0, PCI_DEVFN(0x1f, 0)); if (!lpc_bridge) { - lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev), + lpc_bridge = pci_create_simple(pci_device_root_bus(pdev), PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge"); } @@ -354,15 +350,16 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, static bool 
vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *host = NULL; - g_autofree struct vfio_region_info *lpc = NULL; + struct vfio_region_info *host = NULL; + struct vfio_region_info *lpc = NULL; + PCIDevice *pdev = PCI_DEVICE(vdev); PCIDevice *lpc_bridge; int ret; /* * Copying IDs or creating new devices are not supported on hotplug */ - if (vdev->pdev.qdev.hotplugged) { + if (DEVICE(vdev)->hotplugged) { error_setg(errp, "IGD LPC is not supported on hotplugged device"); return false; } @@ -372,7 +369,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) * can stuff host values into, so if there's already one there and it's not * one we can hack on, this quirk is no-go. Sorry Q35. */ - lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + lpc_bridge = pci_find_device(pci_device_root_bus(pdev), 0, PCI_DEVFN(0x1f, 0)); if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), "vfio-pci-igd-lpc-bridge")) { @@ -385,7 +382,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) * Check whether we have all the vfio device specific regions to * support LPC quirk (added in Linux v4.6). */ - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); if (ret) { @@ -393,7 +390,7 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) return false; } - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); if (ret) { @@ -418,6 +415,44 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) return true; } +static bool vfio_pci_igd_override_gms(int gen, uint32_t gms, uint32_t *gmch) +{ + bool ret = false; + + if (gen == -1) { + error_report("x-igd-gms is not supported on this device"); + } else if (gen < 8) { + if (gms <= 0x10) { + *gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN6_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x10"); + } + } else if (gen == 8) { + if (gms <= 0x40) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x40"); + } + } else { + /* 0x0 to 0x40: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if ((gms <= 0x40) || (gms >= 0xf0 && gms <= 0xfe)) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, + "x-igd-gms", "0~0x40 or 0xf0~0xfe"); + } + } + + return ret; +} + #define IGD_GGC_MMIO_OFFSET 0x108040 #define IGD_BDSM_MMIO_OFFSET 0x1080C0 @@ -427,41 +462,35 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) VFIOConfigMirrorQuirk *ggc_mirror, *bdsm_mirror; int gen; - /* - * This must be an Intel VGA device at address 00:02.0 for us to even - * consider enabling legacy mode. Some driver have dependencies on the PCI - * bus address. 
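
The ranges accepted by vfio_pci_igd_override_gms() above encode stolen-memory sizes differently per generation. For gen 9+, the two ranges noted in the comments decode as below (a sketch implied by those comments; igd_stolen_memory_size() itself is not shown in this hunk):

    /* Gen9+ GMS decode implied by the ranges above:
     *   0x00..0x40 -> gms * 32 MiB                  (0 MiB .. 2 GiB)
     *   0xf0..0xfe -> 4 MiB + (gms - 0xf0) * 4 MiB  (4 MiB .. 60 MiB)
     * Sketch only, not a copy of igd_stolen_memory_size(). */
    static uint64_t gen9_gms_to_bytes(uint32_t gms)
    {
        if (gms <= 0x40) {
            return (uint64_t)gms * 32 * MiB;
        }
        if (gms >= 0xf0 && gms <= 0xfe) {
            return ((uint64_t)(gms - 0xf0) + 1) * 4 * MiB;
        }
        return 0; /* reserved encoding */
    }
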
- */
     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
-        !vfio_is_vga(vdev) || nr != 0) {
+        !vfio_is_base_display(vdev) || nr != 0) {
         return;
     }
 
-    /*
-     * Only on IGD devices of gen 11 and above, the BDSM register is mirrored
-     * into MMIO space and read from MMIO space by the Windows driver.
-     */
+    /* Only IGD Gen6-12 devices need quirks in BAR 0 */
     gen = igd_gen(vdev);
     if (gen < 6) {
         return;
     }
 
-    ggc_quirk = vfio_quirk_alloc(1);
-    ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror));
-    ggc_mirror->mem = ggc_quirk->mem;
-    ggc_mirror->vdev = vdev;
-    ggc_mirror->bar = nr;
-    ggc_mirror->offset = IGD_GGC_MMIO_OFFSET;
-    ggc_mirror->config_offset = IGD_GMCH;
-
-    memory_region_init_io(ggc_mirror->mem, OBJECT(vdev),
-                          &vfio_generic_mirror_quirk, ggc_mirror,
-                          "vfio-igd-ggc-quirk", 2);
-    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
-                                        ggc_mirror->offset, ggc_mirror->mem,
-                                        1);
-
-    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next);
+    if (vdev->igd_gms) {
+        ggc_quirk = vfio_quirk_alloc(1);
+        ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror));
+        ggc_mirror->mem = ggc_quirk->mem;
+        ggc_mirror->vdev = vdev;
+        ggc_mirror->bar = nr;
+        ggc_mirror->offset = IGD_GGC_MMIO_OFFSET;
+        ggc_mirror->config_offset = IGD_GMCH;
+
+        memory_region_init_io(ggc_mirror->mem, OBJECT(vdev),
+                              &vfio_generic_mirror_quirk, ggc_mirror,
+                              "vfio-igd-ggc-quirk", 2);
+        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
+                                            ggc_mirror->offset, ggc_mirror->mem,
+                                            1);
+
+        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next);
+    }
 
     bdsm_quirk = vfio_quirk_alloc(1);
     bdsm_mirror = bdsm_quirk->data = g_malloc0(sizeof(*bdsm_mirror));
@@ -483,46 +512,43 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
 {
+    struct vfio_region_info *opregion = NULL;
+    PCIDevice *pdev = PCI_DEVICE(vdev);
     int ret, gen;
-    uint64_t gms_size;
+    uint64_t gms_size = 0;
     uint64_t *bdsm_size;
     uint32_t gmch;
     bool legacy_mode_enabled = false;
     Error *err = NULL;
 
-    /*
-     * This must be an Intel VGA device at address 00:02.0 for us to even
-     * consider enabling legacy mode. The vBIOS has dependencies on the
-     * PCI bus address.
-     */
     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
-        !vfio_is_vga(vdev)) {
+        !vfio_is_base_display(vdev)) {
         return true;
     }
 
-    /*
-     * IGD is not a standard, they like to change their specs often.  We
-     * only attempt to support back to SandBridge and we hope that newer
-     * devices maintain compatibility with generation 8. 
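
Both quirks registered above follow the same shape: a small MMIO window inside BAR 0 whose accesses are redirected to an emulated config-space register, keeping the guest's MMIO and config views coherent. As a summary sketch (the GGC pairing is explicit in the code above; the BDSM mirror is wired up the same way by the bdsm_quirk that follows):

    /* Sketch: the pairing established above, as data. */
    static const struct {
        uint32_t bar0_offset;    /* mirror window inside BAR 0 */
        uint32_t config_offset;  /* emulated config register behind it */
    } igd_bar0_mirrors[] = {
        { IGD_GGC_MMIO_OFFSET /* 0x108040 */, IGD_GMCH /* 0x50 */ },
    };
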
- */
-    gen = igd_gen(vdev);
-    if (gen == -1) {
-        error_report("IGD device %s is unsupported in legacy mode, "
-                     "try SandyBridge or newer", vdev->vbasedev.name);
+    /* IGD devices always come with an OpRegion */
+    if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) {
         return true;
     }
+    info_report("OpRegion detected on Intel display %x.", vdev->device_id);
 
-    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
+    gen = igd_gen(vdev);
+    gmch = vfio_pci_read_config(pdev, IGD_GMCH, 4);
 
     /*
      * For backward compatibility, enable legacy mode when
+     * - Device generation is 6 to 9 (inclusive)
+     * - IGD exposes itself as VGA controller and claims VGA cycles on host
      * - Machine type is i440fx (pc_piix)
      * - IGD device is at guest BDF 00:02.0
      * - Not manually disabled by x-igd-legacy-mode=off
      */
     if ((vdev->igd_legacy_mode != ON_OFF_AUTO_OFF) &&
+        vfio_is_vga(vdev) &&
+        (gen >= 6 && gen <= 9) &&
+        !(gmch & IGD_GMCH_VGA_DISABLE) &&
         !strcmp(MACHINE_GET_CLASS(qdev_get_machine())->family, "pc_piix") &&
-        (&vdev->pdev == pci_find_device(pci_device_root_bus(&vdev->pdev),
+        (pdev == pci_find_device(pci_device_root_bus(pdev),
         0, PCI_DEVFN(0x2, 0)))) {
         /*
          * IGD legacy mode requires:
          * - VBIOS in ROM BAR or file
          * - VGA IO/MMIO ranges are claimed by IGD
          * - OpRegion
          * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host
          */
-        g_autofree struct vfio_region_info *rom = NULL;
+        struct vfio_region_info *rom = NULL;
 
         legacy_mode_enabled = true;
         info_report("IGD legacy mode enabled, "
@@ -542,22 +568,24 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
          * there's no ROM, there's no point in setting up this quirk.
          * NB. We only seem to get BIOS ROMs, so UEFI VM would need CSM support.
          */
-        ret = vfio_get_region_info(&vdev->vbasedev,
-                                   VFIO_PCI_ROM_REGION_INDEX, &rom);
-        if ((ret || !rom->size) && !vdev->pdev.romfile) {
+        ret = vfio_device_get_region_info(&vdev->vbasedev,
+                                          VFIO_PCI_ROM_REGION_INDEX, &rom);
+        if ((ret || !rom->size) && !pdev->romfile) {
             error_setg(&err, "Device has no ROM");
             goto error;
         }
 
         /*
-         * If IGD VGA Disable is clear (expected) and VGA is not already
-         * enabled, try to enable it.  Probably shouldn't be using legacy mode
-         * without VGA, but also no point in us enabling VGA if disabled in
-         * hardware.
+         * If VGA is not already enabled, try to enable it. We shouldn't be
+         * using legacy mode without VGA. 
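
In user-facing terms, the gating above maps onto properties of the vfio-pci device. An illustrative command-line fragment (the host BDF and the gms value are assumptions, not taken from this patch):

    -device vfio-pci,host=0000:00:02.0,addr=2.0,x-igd-legacy-mode=on,x-igd-gms=2

x-igd-legacy-mode accepts on/off/auto, and x-igd-gms only overrides the stolen-memory size when set, as handled further below.
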
 */
-    if (!(gmch & 0x2) && !vdev->vga && !vfio_populate_vga(vdev, &err)) {
-        error_setg(&err, "Unable to enable VGA access");
-        goto error;
+    if (!vdev->vga) {
+        if (vfio_populate_vga(vdev, &err)) {
+            vfio_pci_config_register_vga(vdev);
+        } else {
+            error_setg(&err, "Unable to enable VGA access");
+            goto error;
+        }
     }
 
     /* Enable OpRegion and LPC bridge quirk */
@@ -565,13 +593,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
         vdev->features |= VFIO_FEATURE_ENABLE_IGD_LPC;
     } else if (vdev->igd_legacy_mode == ON_OFF_AUTO_ON) {
         error_setg(&err,
-                   "Machine is not i440fx or assigned BDF is not 00:02.0");
+                   "Machine is not i440fx, assigned BDF is not 00:02.0, "
+                   "or device %04x (gen %d) doesn't support legacy mode",
+                   vdev->device_id, gen);
         goto error;
     }
 
     /* Setup OpRegion access */
     if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) &&
-        !vfio_pci_igd_setup_opregion(vdev, errp)) {
+        !vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
         goto error;
     }
 
@@ -579,7 +609,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
     if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_LPC) &&
         !vfio_pci_igd_setup_lpc_bridge(vdev, errp)) {
         goto error;
-    }
+    }
+
+    /*
+     * ASLS (the OpRegion address) is read-only, emulated. It contains an
+     * HPA; guest firmware needs to reprogram it with a GPA.
+     */
+    pci_set_long(pdev->config + IGD_ASLS, 0);
+    pci_set_long(pdev->wmask + IGD_ASLS, ~0);
+    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
 
     /*
      * Allow user to override dsm size using x-igd-gms option, in multiples of
@@ -587,56 +625,44 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
      * set from DVMT Pre-Allocated option in host BIOS.
      */
     if (vdev->igd_gms) {
-        if (gen < 8) {
-            if (vdev->igd_gms <= 0x10) {
-                gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT);
-                gmch |= vdev->igd_gms << IGD_GMCH_GEN6_GMS_SHIFT;
-            } else {
-                error_report(QERR_INVALID_PARAMETER_VALUE,
-                             "x-igd-gms", "0~0x10");
-            }
-        } else {
-            if (vdev->igd_gms <= 0x40) {
-                gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT);
-                gmch |= vdev->igd_gms << IGD_GMCH_GEN8_GMS_SHIFT;
-            } else {
-                error_report(QERR_INVALID_PARAMETER_VALUE,
-                             "x-igd-gms", "0~0x40");
-            }
+        if (!vfio_pci_igd_override_gms(gen, vdev->igd_gms, &gmch)) {
+            return false;
         }
+
+        /* GMCH is read-only, emulated */
+        pci_set_long(pdev->config + IGD_GMCH, gmch);
+        pci_set_long(pdev->wmask + IGD_GMCH, 0);
+        pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
     }
 
-    gms_size = igd_stolen_memory_size(gen, gmch);
+    if (gen > 0) {
+        gms_size = igd_stolen_memory_size(gen, gmch);
+
+        /* BDSM is read-write, emulated. BIOS needs to be able to write it */
+        if (gen < 11) {
+            pci_set_long(pdev->config + IGD_BDSM, 0);
+            pci_set_long(pdev->wmask + IGD_BDSM, ~0);
+            pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
+        } else {
+            pci_set_quad(pdev->config + IGD_BDSM_GEN11, 0);
+            pci_set_quad(pdev->wmask + IGD_BDSM_GEN11, ~0);
+            pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0);
+        }
+    }
 
     /*
      * Request reserved memory for stolen memory via fw_cfg. VM firmware
     * must allocate a 1MB aligned reserved memory region below 4GB with
-     * the requested size (in bytes) for use by the Intel PCI class VGA
-     * device at VM address 00:02.0. The base address of this reserved
-     * memory region must be written to the device BDSM register at PCI
-     * config offset 0x5C.
+     * the requested size (in bytes) for use by the IGD device. The base
+     * address of this reserved memory region must be written to the
+     * device BDSM register.
+     * For newer devices without a BDSM register, this fw_cfg item is 0.
      */
     bdsm_size = g_malloc(sizeof(*bdsm_size));
     *bdsm_size = cpu_to_le64(gms_size);
     fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
                     bdsm_size, sizeof(*bdsm_size));
 
-    /* GMCH is read-only, emulated */
-    pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
-    pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
-    pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
-
-    /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
-    if (gen < 11) {
-        pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
-        pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
-        pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
-    } else {
-        pci_set_quad(vdev->pdev.config + IGD_BDSM_GEN11, 0);
-        pci_set_quad(vdev->pdev.wmask + IGD_BDSM_GEN11, ~0);
-        pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0);
-    }
-
     trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB));
 
     return true;
@@ -663,8 +689,27 @@ error:
      */
 static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp)
 {
+    struct vfio_region_info *opregion = NULL;
+    int gen;
+
+    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
+        !vfio_is_vga(vdev)) {
+        return true;
+    }
+
+    /* FIXME: Cherryview is Gen8, but doesn't support GVT-g */
+    gen = igd_gen(vdev);
+    if (gen != 8 && gen != 9) {
+        return true;
+    }
+
+    if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) {
+        /* Should never reach here, KVMGT always emulates OpRegion */
+        return false;
+    }
+
     if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) &&
-        !vfio_pci_igd_setup_opregion(vdev, errp)) {
+        !vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
         return false;
     }
diff --git a/hw/vfio/iommufd-stubs.c b/hw/vfio/iommufd-stubs.c
new file mode 100644
index 0000000..0be5276
--- /dev/null
+++ b/hw/vfio/iommufd-stubs.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2025 Oracle and/or its affiliates. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "migration/cpr.h" +#include "migration/vmstate.h" + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 42c8412..68470d5 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -15,34 +15,69 @@ #include <linux/vfio.h> #include <linux/iommufd.h> -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "qemu/error-report.h" #include "trace.h" #include "qapi/error.h" #include "system/iommufd.h" #include "hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" +#include "migration/cpr.h" #include "pci.h" +#include "vfio-iommufd.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" -static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" + +static int iommufd_cdev_map(const VFIOContainer *bcontainer, hwaddr iova, + uint64_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); return iommufd_backend_map_dma(container->be, container->ioas_id, iova, size, vaddr, readonly); } -static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) +static int iommufd_cdev_map_file(const VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + int fd, unsigned long start, bool readonly) +{ + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); + + return iommufd_backend_map_file_dma(container->be, + container->ioas_id, + iova, size, fd, start, readonly); +} + +static int iommufd_cdev_unmap(const VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); + + /* unmap in halves */ + if (unmap_all) { + Int128 llsize = int128_rshift(int128_2_64(), 1); + int ret; + + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + 0, int128_get64(llsize)); + + if (ret == 0) { + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + int128_get64(llsize), + int128_get64(llsize)); + } + + return ret; + } /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ return iommufd_backend_unmap_dma(container->be, @@ -84,6 +119,10 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) goto err_kvm_device_add; } + if (cpr_is_incoming()) { + goto skip_bind; + } + /* Bind device to iommufd */ bind.iommufd = iommufd->fd; if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) { @@ -95,6 +134,8 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) vbasedev->devid = bind.out_devid; trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, vbasedev->fd, vbasedev->devid); + +skip_bind: return true; err_bind: iommufd_cdev_kvm_device_del(vbasedev); @@ -115,11 +156,10 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt 
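/*
 * Note on the "unmap in halves" path above: a full 64-bit span cannot be
 * expressed as a single (iova, size) pair since the size would wrap to
 * zero, so the range is cut at 1 << 63 and unmapped twice; a minimal
 * sketch of the same arithmetic:
 *
 *     uint64_t half = 1ULL << 63;   // int128_rshift(int128_2_64(), 1)
 *     ret = iommufd_backend_unmap_dma(be, ioas_id, 0, half);
 *     if (!ret) {
 *         ret = iommufd_backend_unmap_dma(be, ioas_id, half, half);
 *     }
 */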
*hwpt) return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } -static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, +static int iommufd_set_dirty_page_tracking(const VFIOContainer *bcontainer, bool start, Error **errp) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); VFIOIOASHwpt *hwpt; QLIST_FOREACH(hwpt, &container->hwpt_list, next) { @@ -146,13 +186,11 @@ err: return -EINVAL; } -static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, +static int iommufd_query_dirty_bitmap(const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) { - VFIOIOMMUFDContainer *container = container_of(bcontainer, - VFIOIOMMUFDContainer, - bcontainer); + VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); unsigned long page_size = qemu_real_host_page_size(); VFIOIOASHwpt *hwpt; @@ -280,14 +318,23 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, { ERRP_GUARD(); IOMMUFDBackend *iommufd = vbasedev->iommufd; - uint32_t flags = 0; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + uint32_t type, flags = 0; + uint64_t hw_caps; VFIOIOASHwpt *hwpt; uint32_t hwpt_id; int ret; /* Try to find a domain */ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (!cpr_is_incoming()) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + } else if (vbasedev->cpr.hwpt_id == hwpt->hwpt_id) { + ret = 0; + } else { + continue; + } + if (ret) { /* -EINVAL means the domain is incompatible with the device. */ if (ret == -EINVAL) { @@ -304,6 +351,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } else { vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return true; @@ -317,10 +365,20 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, * vfio_migration_realize() may decide to use VF dirty tracking * instead. 
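     *
     * (The capability is now probed directly with
     * iommufd_backend_get_device_info() rather than read from the cached
     * vbasedev->hiod->caps.hw_caps, consistent with the HostIOMMUDevice
     * being realized only after attachment elsewhere in this patch; see
     * the "Do not move this code before attachment" note further down.)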
*/ - if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid, + &type, NULL, 0, &hw_caps, errp)) { + return false; + } + + if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } + if (cpr_is_incoming()) { + hwpt_id = vbasedev->cpr.hwpt_id; + goto skip_alloc; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -328,25 +386,26 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt_id); + return false; + } + +skip_alloc: hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); - if (ret) { - iommufd_backend_free_id(container->be, hwpt->hwpt_id); - g_free(hwpt); - return false; - } - vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); - container->bcontainer.dirty_pages_supported |= + bcontainer->dirty_pages_supported |= vbasedev->iommu_dirty_tracking; - if (container->bcontainer.dirty_pages_supported && + if (bcontainer->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { warn_report("IOMMU instance for device %s doesn't support dirty tracking", vbasedev->name); @@ -378,7 +437,9 @@ static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, return iommufd_cdev_autodomains_get(vbasedev, container, errp); } - return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + /* If CPR, we are already attached to ioas_id. 
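     * CPR (the checkpoint/restart live-update path) preserves the VFIO
     * device fd across the QEMU re-exec, and the kernel keeps the IOAS
     * attachment alive with it, so re-issuing the attach here would be
     * redundant; only QEMU's bookkeeping is rebuilt on the incoming side.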
*/ + return cpr_is_incoming() || + !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } static void iommufd_cdev_detach_container(VFIODevice *vbasedev, @@ -398,12 +459,13 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); if (!QLIST_EMPTY(&bcontainer->device_list)) { return; } - memory_listener_unregister(&bcontainer->listener); + vfio_iommufd_cpr_unregister_container(container); + vfio_listener_unregister(bcontainer); iommufd_backend_free_id(container->be, container->ioas_id); object_unref(container); } @@ -419,7 +481,7 @@ static int iommufd_cdev_ram_block_discard_disable(bool state) static bool iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, uint32_t ioas_id, Error **errp) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); g_autofree struct iommu_ioas_iova_ranges *info = NULL; struct iommu_iova_range *iova_ranges; int sz, fd = container->be->fd; @@ -461,16 +523,19 @@ error: static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; VFIOIOMMUFDContainer *container; VFIOAddressSpace *space; struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, devfd; + bool res; uint32_t ioas_id; Error *err = NULL; const VFIOIOMMUClass *iommufd_vioc = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + vfio_cpr_load_device(vbasedev); + if (vbasedev->fd < 0) { devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); if (devfd < 0) { @@ -485,27 +550,25 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_connect_bind; } - space = vfio_get_address_space(as); - - /* - * The HostIOMMUDevice data from legacy backend is static and doesn't need - * any information from the (type1-iommu) backend to be initialized. In - * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd - * FD to be connected and having a devid to be able to successfully call - * iommufd_backend_get_device_info(). 
- */ - if (!vfio_device_hiod_realize(vbasedev, errp)) { - goto err_alloc_ioas; - } + space = vfio_address_space_get(as); /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + container = VFIO_IOMMU_IOMMUFD(bcontainer); if (VFIO_IOMMU_GET_CLASS(bcontainer) != iommufd_vioc || vbasedev->iommufd != container->be) { continue; } - if (!iommufd_cdev_attach_container(vbasedev, container, &err)) { + + if (!cpr_is_incoming()) { + res = iommufd_cdev_attach_container(vbasedev, container, &err); + } else if (vbasedev->cpr.ioas_id == container->ioas_id) { + res = true; + } else { + continue; + } + + if (!res) { const char *msg = error_get_pretty(err); trace_iommufd_cdev_fail_attach_existing_container(msg); @@ -522,6 +585,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, } } + if (cpr_is_incoming()) { + ioas_id = vbasedev->cpr.ioas_id; + goto skip_ioas_alloc; + } + /* Need to allocate a new dedicated container */ if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) { goto err_alloc_ioas; @@ -529,12 +597,14 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); +skip_ioas_alloc: container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; QLIST_INIT(&container->hwpt_list); + vbasedev->cpr.ioas_id = ioas_id; - bcontainer = &container->bcontainer; + bcontainer = VFIO_IOMMU(container); vfio_address_space_insert(space, bcontainer); if (!iommufd_cdev_attach_container(vbasedev, container, errp)) { @@ -555,12 +625,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, bcontainer->pgsizes = qemu_real_host_page_size(); } - bcontainer->listener = vfio_memory_listener; - memory_listener_register(&bcontainer->listener, bcontainer->space->as); + if (!vfio_listener_register(bcontainer, errp)) { + goto err_listener_register; + } - if (bcontainer->error) { - error_propagate_prepend(errp, bcontainer->error, - "memory listener initialization failed: "); + if (!vfio_iommufd_cpr_register_container(container, errp)) { goto err_listener_register; } @@ -573,7 +642,12 @@ found_container: goto err_listener_register; } - if (!vfio_cpr_register_container(bcontainer, errp)) { + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. 
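+     * hiod_iommufd_vfio_realize() below consumes both values:
+     *
+     *     idev->devid   = vdev->devid;          // from VFIO_DEVICE_BIND_IOMMUFD
+     *     idev->hwpt_id = vdev->hwpt->hwpt_id;  // from iommufd_backend_alloc_hwpt()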
+ */ + if (!vfio_device_hiod_create_and_realize(vbasedev, + TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; } @@ -585,14 +659,8 @@ found_container: iommufd_cdev_ram_block_discard_disable(false); } - vbasedev->group = 0; - vbasedev->num_irqs = dev_info.num_irqs; - vbasedev->num_regions = dev_info.num_regions; - vbasedev->flags = dev_info.flags; - vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + vfio_device_prepare(vbasedev, bcontainer, &dev_info); + vfio_iommufd_cpr_register_device(vbasedev); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); @@ -605,7 +673,7 @@ err_discard_disable: err_attach_container: iommufd_cdev_container_destroy(container); err_alloc_ioas: - vfio_put_address_space(space); + vfio_address_space_put(space); iommufd_cdev_unbind_and_disconnect(vbasedev); err_connect_bind: close(vbasedev->fd); @@ -614,24 +682,22 @@ err_connect_bind: static void iommufd_cdev_detach(VFIODevice *vbasedev) { - VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOContainer *bcontainer = vbasedev->bcontainer; VFIOAddressSpace *space = bcontainer->space; - VFIOIOMMUFDContainer *container = container_of(bcontainer, - VFIOIOMMUFDContainer, - bcontainer); - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; + VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); + + vfio_device_unprepare(vbasedev); if (!vbasedev->ram_block_discard_allowed) { iommufd_cdev_ram_block_discard_disable(false); } - vfio_cpr_unregister_container(bcontainer); + object_unref(vbasedev->hiod); iommufd_cdev_detach_container(vbasedev, container); iommufd_cdev_container_destroy(container); - vfio_put_address_space(space); + vfio_address_space_put(space); + vfio_iommufd_cpr_unregister_device(vbasedev); iommufd_cdev_unbind_and_disconnect(vbasedev); close(vbasedev->fd); } @@ -665,8 +731,8 @@ iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, } vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); - if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || - vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { + if (!vfio_pci_from_vfio_device(vbasedev_tmp) || + !vbasedev_tmp->dev->realized) { return NULL; } @@ -786,13 +852,12 @@ out_single: return ret; } -static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); - vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; - vioc->dma_map = iommufd_cdev_map; + vioc->dma_map_file = iommufd_cdev_map_file; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; @@ -801,21 +866,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = 
HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -824,6 +906,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -846,13 +933,17 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) } -static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/common.c b/hw/vfio/listener.c index 1a0d929..c6bb58f 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/listener.c @@ -25,12 +25,11 @@ #endif #include <linux/vfio.h> -#include "hw/vfio/vfio-common.h" -#include "hw/vfio/pci.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" -#include "exec/ram_addr.h" #include "exec/target_page.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/pci.h" +#include "system/address-spaces.h" +#include "system/memory.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -41,163 +40,23 @@ #include "trace.h" #include "qapi/error.h" #include "migration/misc.h" -#include "migration/blocker.h" #include "migration/qemu-file.h" #include "system/tcg.h" #include "system/tpm.h" - -VFIODeviceList vfio_device_list = - QLIST_HEAD_INITIALIZER(vfio_device_list); -static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = - QLIST_HEAD_INITIALIZER(vfio_address_spaces); - -#ifdef CONFIG_KVM -/* - * We have a single VFIO pseudo device per KVM VM. Once created it lives - * for the life of the VM. Closing the file descriptor only drops our - * reference to it and the device's reference to kvm. Therefore once - * initialized, this file descriptor is only released on QEMU exit and - * we'll re-use it should another vfio device be attached before then. 
- */ -int vfio_kvm_device_fd = -1; -#endif +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" /* * Device state interfaces */ -bool vfio_mig_active(void) -{ - VFIODevice *vbasedev; - - if (QLIST_EMPTY(&vfio_device_list)) { - return false; - } - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->migration_blocker) { - return false; - } - } - return true; -} - -static Error *multiple_devices_migration_blocker; - -/* - * Multiple devices migration is allowed only if all devices support P2P - * migration. Single device migration is allowed regardless of P2P migration - * support. - */ -static bool vfio_multiple_devices_migration_is_supported(void) -{ - VFIODevice *vbasedev; - unsigned int device_num = 0; - bool all_support_p2p = true; - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->migration) { - device_num++; - - if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { - all_support_p2p = false; - } - } - } - - return all_support_p2p || device_num <= 1; -} - -int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) -{ - int ret; - - if (vfio_multiple_devices_migration_is_supported()) { - return 0; - } - - if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { - error_setg(errp, "Multiple VFIO devices migration is supported only if " - "all of them support P2P migration"); - return -EINVAL; - } - - if (multiple_devices_migration_blocker) { - return 0; - } - - error_setg(&multiple_devices_migration_blocker, - "Multiple VFIO devices migration is supported only if all of " - "them support P2P migration"); - ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp); - - return ret; -} - -void vfio_unblock_multiple_devices_migration(void) -{ - if (!multiple_devices_migration_blocker || - !vfio_multiple_devices_migration_is_supported()) { - return; - } - - migrate_del_blocker(&multiple_devices_migration_blocker); -} - -bool vfio_viommu_preset(VFIODevice *vbasedev) -{ - return vbasedev->bcontainer->space->as != &address_space_memory; -} - -static void vfio_set_migration_error(int ret) -{ - if (migration_is_running()) { - migration_file_set_error(ret, NULL); - } -} - -bool vfio_device_state_is_running(VFIODevice *vbasedev) -{ - VFIOMigration *migration = vbasedev->migration; - - return migration->device_state == VFIO_DEVICE_STATE_RUNNING || - migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; -} - -bool vfio_device_state_is_precopy(VFIODevice *vbasedev) -{ - VFIOMigration *migration = vbasedev->migration; - - return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || - migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; -} - -static bool vfio_devices_all_device_dirty_tracking_started( - const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (!vbasedev->dirty_tracking) { - return false; - } - } - - return true; -} - -bool vfio_devices_all_dirty_tracking_started( - const VFIOContainerBase *bcontainer) -{ - return vfio_devices_all_device_dirty_tracking_started(bcontainer) || - bcontainer->dirty_pages_started; -} -static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) +static bool vfio_log_sync_needed(const VFIOContainer *bcontainer) { VFIODevice *vbasedev; - if (!vfio_devices_all_dirty_tracking_started(bcontainer)) { + if (!vfio_container_dirty_tracking_is_started(bcontainer)) { return false; } @@ -217,22 +76,6 @@ static bool 
vfio_log_sync_needed(const VFIOContainerBase *bcontainer) return true; } -bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { - return false; - } - if (!vbasedev->dirty_pages_supported) { - return false; - } - } - - return true; -} - static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -247,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -275,14 +119,16 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -291,9 +137,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { - error_report("Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); - vfio_set_migration_error(-EINVAL); + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + if (migration_is_running()) { + migration_file_set_error(-EINVAL, local_err); + } else { + error_report_err(local_err); + } return; } @@ -302,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). 
But after * vfio_dma_map has set up the mapping the pages will be @@ -315,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -324,13 +179,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } } else { ret = vfio_container_dma_unmap(bcontainer, iova, - iotlb->addr_mask + 1, iotlb); + iotlb->addr_mask + 1, iotlb, false); if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, - iotlb->addr_mask + 1, ret, strerror(-ret)); - vfio_set_migration_error(ret); + error_setg(&local_err, + "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, ret, strerror(-ret)); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } out: @@ -342,13 +202,13 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; + VFIOContainer *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL, false); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -360,7 +220,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; + VFIOContainer *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -380,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -390,8 +250,9 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, return 0; } -static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) +static bool vfio_ram_discard_register_listener(VFIOContainer *bcontainer, + MemoryRegionSection *section, + Error **errp) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); int target_page_size = qemu_target_page_size(); @@ -456,16 +317,18 @@ static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, if (vrdl_mappings + max_memslots - vrdl_count > bcontainer->dma_max_mappings) { - warn_report("%s: possibly running out of DMA mappings. E.g., try" + error_setg(errp, "%s: possibly running out of DMA mappings. E.g., try" " increasing the 'block-size' of virtio-mem devies." 
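/*
 * A worked example of the limit checked here: a 1 TiB virtio-mem device
 * with a 2 MiB block size can fragment into up to
 *
 *     1 TiB / 2 MiB = 524288
 *
 * discontiguous mappings, far beyond a dma_max_mappings of 65535 (the
 * usual VFIO type1 dma_entry_limit default), hence the advice to raise
 * 'block-size'.
 */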
" Maximum possible DMA mappings: %d, Maximum possible" " memslots: %d", __func__, bcontainer->dma_max_mappings, max_memslots); + return false; } } + return true; } -static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, +static void vfio_ram_discard_unregister_listener(VFIOContainer *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -533,7 +396,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, +static bool vfio_get_section_iova_range(VFIOContainer *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -558,23 +421,76 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, return true; } +static void vfio_listener_begin(MemoryListener *listener) +{ + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); + void (*listener_begin)(VFIOContainer *bcontainer); + + listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + + if (listener_begin) { + listener_begin(bcontainer); + } +} + +static void vfio_listener_commit(MemoryListener *listener) +{ + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); + void (*listener_commit)(VFIOContainer *bcontainer); + + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; + + if (listener_commit) { + listener_commit(bcontainer); + } +} + static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) { /* * MMIO region mapping failures are not fatal but in this case PCI * peer-to-peer transactions are broken. */ - if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) { + if (vfio_pci_from_vfio_device(vbasedev)) { error_append_hint(errp, "%s: PCI peer-to-peer transactions " "on BARs are not supported.\n", vbasedev->name); } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainer *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); + vfio_container_region_add(bcontainer, section, false); +} + +void vfio_container_region_add(VFIOContainer *bcontainer, + MemoryRegionSection *section, + bool cpr_remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -610,6 +526,11 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + + if (cpr_remap) { + vfio_cpr_giommu_remap(bcontainer, section); + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -652,7 +573,17 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. 
*/ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(bcontainer, section); + if (!cpr_remap) { + if (!vfio_ram_discard_register_listener(bcontainer, section, &err)) { + goto fail; + } + } else if (!vfio_cpr_ram_discard_register_listener(bcontainer, + section)) { + error_setg(&err, + "vfio_cpr_ram_discard_register_listener for %s failed", + memory_region_name(section->mr)); + goto fail; + } return; } @@ -678,7 +609,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -725,8 +656,8 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -775,27 +706,20 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(bcontainer, section); + vfio_ram_discard_unregister_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } if (try_unmap) { + bool unmap_all = false; + if (int128_eq(llsize, int128_2_64())) { - /* The unmap ioctl doesn't accept a full 64-bit span. */ - llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - iova += int128_get64(llsize); + unmap_all = true; + llsize = int128_zero(); } - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), + NULL, unmap_all); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -820,13 +744,13 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainerBase *bcontainer) + VFIOContainer *bcontainer) { VFIOPCIDevice *pcidev; VFIODevice *vbasedev; @@ -835,7 +759,7 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, owner = memory_region_owner(section->mr); QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + if (!vfio_pci_from_vfio_device(vbasedev)) { continue; } pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -911,7 +835,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, +static void vfio_dirty_tracking_init(VFIOContainer *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -936,7 +860,7 @@ static void 
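/*
 * vfio_listener_region_del() above no longer open-codes the two-halves
 * split for a 2^64-byte range; it forwards unmap_all = true and leaves
 * the split to the container backend (iommufd_cdev_unmap() earlier in
 * this patch, or presumably VFIO_DMA_UNMAP_FLAG_ALL on the legacy type1
 * path):
 *
 *     if (int128_eq(llsize, int128_2_64())) {
 *         unmap_all = true;
 *         llsize = int128_zero();
 *     }
 */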
vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) +static void vfio_devices_dma_logging_stop(VFIOContainer *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; @@ -948,20 +872,24 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + int ret; + if (!vbasedev->dirty_tracking) { continue; } - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + ret = vbasedev->io_ops->device_feature(vbasedev, feature); + + if (ret != 0) { warn_report("%s: Failed to stop DMA logging, err %d (%s)", - vbasedev->name, -errno, strerror(errno)); + vbasedev->name, -ret, strerror(-ret)); } vbasedev->dirty_tracking = false; } } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, +vfio_device_feature_dma_logging_start_create(VFIOContainer *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -1034,7 +962,7 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, +static bool vfio_devices_dma_logging_start(VFIOContainer *bcontainer, Error **errp) { struct vfio_device_feature *feature; @@ -1055,10 +983,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, continue; } - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + ret = vbasedev->io_ops->device_feature(vbasedev, feature); if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "%s: Failed to start DMA logging", + error_setg_errno(errp, -ret, "%s: Failed to start DMA logging", vbasedev->name); goto out; } @@ -1079,11 +1006,11 @@ static bool vfio_listener_log_global_start(MemoryListener *listener, Error **errp) { ERRP_GUARD(); - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); bool ret; - if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { ret = vfio_devices_dma_logging_start(bcontainer, errp); } else { ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; @@ -1097,12 +1024,12 @@ static bool vfio_listener_log_global_start(MemoryListener *listener, static void vfio_listener_log_global_stop(MemoryListener *listener) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); Error *local_err = NULL; int ret = 0; - if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { vfio_devices_dma_logging_stop(bcontainer); } else { ret = vfio_container_set_dirty_page_tracking(bcontainer, false, @@ -1112,102 +1039,12 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) if (ret) { error_prepend(&local_err, "vfio: Could not stop dirty page tracking - "); - error_report_err(local_err); - vfio_set_migration_error(ret); - } -} - -static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, - hwaddr size, void *bitmap) -{ - uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + - sizeof(struct 
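/*
 * Note the transport change in the hunks above: the raw
 * ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature) calls became
 * vbasedev->io_ops->device_feature(vbasedev, feature), which reports a
 * negative errno in its return value rather than through errno; the
 * indirection presumably exists so a non-ioctl transport can supply its
 * own implementation of the feature call.
 */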
vfio_device_feature_dma_logging_report), - sizeof(uint64_t))] = {}; - struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - struct vfio_device_feature_dma_logging_report *report = - (struct vfio_device_feature_dma_logging_report *)feature->data; - - report->iova = iova; - report->length = size; - report->page_size = qemu_real_host_page_size(); - report->bitmap = (uintptr_t)bitmap; - - feature->argsz = sizeof(buf); - feature->flags = VFIO_DEVICE_FEATURE_GET | - VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { - return -errno; - } - - return 0; -} - -int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - VFIODevice *vbasedev; - int ret; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - ret = vfio_device_dma_logging_report(vbasedev, iova, size, - vbmap->bitmap); - if (ret) { - error_setg_errno(errp, -ret, - "%s: Failed to get DMA logging report, iova: " - "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, - vbasedev->name, iova, size); - - return ret; + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); } } - - return 0; -} - -int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, - uint64_t size, ram_addr_t ram_addr, Error **errp) -{ - bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(bcontainer); - uint64_t dirty_pages; - VFIOBitmap vbmap; - int ret; - - if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { - cpu_physical_memory_set_dirty_range(ram_addr, size, - tcg_enabled() ? DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); - return 0; - } - - ret = vfio_bitmap_alloc(&vbmap, size); - if (ret) { - error_setg_errno(errp, -ret, - "Failed to allocate dirty tracking bitmap"); - return ret; - } - - if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } else { - ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } - - if (ret) { - goto out; - } - - dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, - vbmap.pages); - - trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); -out: - g_free(vbmap.bitmap); - - return ret; } typedef struct { @@ -1220,34 +1057,37 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; - ram_addr_t translated_addr; + hwaddr translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { - error_report("Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? 
iotlb->target_as->name : "none"); goto out; } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { - error_report_err(local_err); + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; - ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); if (ret) { error_prepend(&local_err, "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, iotlb->addr_mask + 1); - error_report_err(local_err); } out_unlock: @@ -1255,17 +1095,21 @@ out_unlock: out: if (ret) { - vfio_set_migration_error(ret); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } -static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, +static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section, void *opaque) { const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; - const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + - section->offset_within_region; + const hwaddr translated_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; VFIORamDiscardListener *vrdl = opaque; Error *local_err = NULL; int ret; @@ -1274,8 +1118,8 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. */ - ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, - &local_err); + ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, + translated_addr, &local_err); if (ret) { error_report_err(local_err); } @@ -1283,34 +1127,23 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, } static int -vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - * which correspond to populated parts. Replay all populated parts. 
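     * ram_discard_manager_replay_populated() invokes the callback once
     * per populated sub-section with the opaque pointer passed here and
     * stops at the first non-zero return, i.e. roughly:
     *
     *     int ret = 0;
     *     FOREACH_POPULATED_SUBSECTION(section, s) {   // illustrative macro
     *         ret = vfio_ram_discard_query_dirty_bitmap(&s, opaque);
     *         if (ret) {
     *             break;
     *         }
     *     }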
*/ return ram_discard_manager_replay_populated(rdm, section, - vfio_ram_discard_get_dirty_bitmap, + vfio_ram_discard_query_dirty_bitmap, &vrdl); } -static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_sync_iommu_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section) { VFIOGuestIOMMU *giommu; @@ -1347,10 +1180,10 @@ static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, return 0; } -static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_sync_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section, Error **errp) { - ram_addr_t ram_addr; + hwaddr translated_addr; if (memory_region_is_iommu(section->mr)) { return vfio_sync_iommu_dirty_bitmap(bcontainer, section); @@ -1365,19 +1198,19 @@ static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, return ret; } - ram_addr = memory_region_get_ram_addr(section->mr) + - section->offset_within_region; + translated_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; - return vfio_get_dirty_bitmap(bcontainer, + return vfio_container_query_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), - int128_get64(section->size), ram_addr, errp); + int128_get64(section->size), translated_addr, errp); } static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); int ret; Error *local_err = NULL; @@ -1388,14 +1221,19 @@ static void vfio_listener_log_sync(MemoryListener *listener, if (vfio_log_sync_needed(bcontainer)) { ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); if (ret) { - error_report_err(local_err); - vfio_set_migration_error(ret); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } } } } -const MemoryListener vfio_memory_listener = { +static const MemoryListener vfio_memory_listener = { .name = "vfio", + .begin = vfio_listener_begin, + .commit = vfio_listener_commit, .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, .log_global_start = vfio_listener_log_global_start, @@ -1403,184 +1241,21 @@ const MemoryListener vfio_memory_listener = { .log_sync = vfio_listener_log_sync, }; -void vfio_reset_handler(void *opaque) +bool vfio_listener_register(VFIOContainer *bcontainer, Error **errp) { - VFIODevice *vbasedev; + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); - trace_vfio_reset_handler(); - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->dev->realized) { - vbasedev->ops->vfio_compute_needs_reset(vbasedev); - } - } - - QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { - if (vbasedev->dev->realized && vbasedev->needs_reset) { - vbasedev->ops->vfio_hot_reset_multi(vbasedev); - } - } -} - -int vfio_kvm_device_add_fd(int fd, Error **errp) -{ -#ifdef CONFIG_KVM - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_FILE, - .attr = KVM_DEV_VFIO_FILE_ADD, - .addr = (uint64_t)(unsigned long)&fd, - }; - - if (!kvm_enabled()) { - return 0; - } - - if (vfio_kvm_device_fd < 0) { - struct kvm_create_device cd = { - .type = KVM_DEV_TYPE_VFIO, - }; - - if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { - error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); - return 
-errno; - } - - vfio_kvm_device_fd = cd.fd; - } - - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", - fd); - return -errno; - } -#endif - return 0; -} - -int vfio_kvm_device_del_fd(int fd, Error **errp) -{ -#ifdef CONFIG_KVM - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_FILE, - .attr = KVM_DEV_VFIO_FILE_DEL, - .addr = (uint64_t)(unsigned long)&fd, - }; - - if (vfio_kvm_device_fd < 0) { - error_setg(errp, "KVM VFIO device isn't created yet"); - return -EINVAL; - } - - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, - "Failed to remove fd %d from KVM VFIO device", fd); - return -errno; - } -#endif - return 0; -} - -VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) -{ - VFIOAddressSpace *space; - - QLIST_FOREACH(space, &vfio_address_spaces, list) { - if (space->as == as) { - return space; - } - } - - /* No suitable VFIOAddressSpace, create a new one */ - space = g_malloc0(sizeof(*space)); - space->as = as; - QLIST_INIT(&space->containers); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_register_reset(vfio_reset_handler, NULL); - } - - QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); - - return space; -} - -void vfio_put_address_space(VFIOAddressSpace *space) -{ - if (!QLIST_EMPTY(&space->containers)) { - return; - } - - QLIST_REMOVE(space, list); - g_free(space); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_unregister_reset(vfio_reset_handler, NULL); - } -} - -void vfio_address_space_insert(VFIOAddressSpace *space, - VFIOContainerBase *bcontainer) -{ - QLIST_INSERT_HEAD(&space->containers, bcontainer, next); - bcontainer->space = space; -} - -struct vfio_device_info *vfio_get_device_info(int fd) -{ - struct vfio_device_info *info; - uint32_t argsz = sizeof(*info); - - info = g_malloc0(argsz); - -retry: - info->argsz = argsz; - - if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { - g_free(info); - return NULL; - } - - if (info->argsz > argsz) { - argsz = info->argsz; - info = g_realloc(info, argsz); - goto retry; - } - - return info; -} - -bool vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) -{ - const VFIOIOMMUClass *ops = - VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); - HostIOMMUDevice *hiod = NULL; - - if (vbasedev->iommufd) { - ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); - } - - assert(ops); - - - if (!vbasedev->mdev) { - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - vbasedev->hiod = hiod; - } - - if (!ops->attach_device(name, vbasedev, as, errp)) { - object_unref(hiod); - vbasedev->hiod = NULL; + if (bcontainer->error) { + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); return false; } return true; } -void vfio_detach_device(VFIODevice *vbasedev) +void vfio_listener_unregister(VFIOContainer *bcontainer) { - if (!vbasedev->bcontainer) { - return; - } - object_unref(vbasedev->hiod); - VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); + memory_listener_unregister(&bcontainer->listener); } diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index a8939c8..82f6869 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -1,7 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + vfio_ss = ss.source_set() vfio_ss.add(files( - 'common.c', + 'listener.c', 'container.c', + 'container-legacy.c', + 'helpers.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: 
files('spapr.c')) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( @@ -9,24 +13,24 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'pci.c', )) vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) -vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c')) vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) -system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) -system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( - 'helpers.c', - 'container-base.c', + 'cpr.c', + 'cpr-legacy.c', + 'device.c', 'migration.c', 'migration-multifd.c', - 'cpr.c', + 'region.c', )) system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', + 'cpr-iommufd.c', )) +system_ss.add(when: 'CONFIG_IOMMUFD', if_false: files('iommufd-stubs.c')) system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', )) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 378f6f3..e478503 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -10,10 +10,9 @@ */ #include "qemu/osdep.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" #include "migration/misc.h" #include "qapi/error.h" -#include "qemu/bswap.h" #include "qemu/error-report.h" #include "qemu/lockable.h" #include "qemu/main-loop.h" @@ -21,7 +20,9 @@ #include "io/channel-buffer.h" #include "migration/qemu-file.h" #include "migration-multifd.h" +#include "vfio-migration-internal.h" #include "trace.h" +#include "vfio-helpers.h" #define VFIO_DEVICE_STATE_CONFIG_STATE (1) @@ -34,6 +35,18 @@ typedef struct VFIODeviceStatePacket { uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; +bool vfio_load_config_after_iter(VFIODevice *vbasedev) +{ + if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_ON) { + return true; + } else if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_OFF) { + return false; + } + + assert(vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_AUTO); + return vfio_arch_wants_loading_config_after_iter(); +} + /* type safety */ typedef struct VFIOStateBuffers { GArray *array; @@ -49,12 +62,16 @@ typedef struct VFIOMultifd { bool load_bufs_thread_running; bool load_bufs_thread_want_exit; + bool load_bufs_iter_done; + QemuCond load_bufs_iter_done_cond; + VFIOStateBuffers load_bufs; QemuCond load_bufs_buffer_ready_cond; QemuCond load_bufs_thread_finished_cond; QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ uint32_t load_buf_idx; uint32_t load_buf_idx_last; + size_t load_buf_queued_pending_buffers_size; } VFIOMultifd; static void vfio_state_buffer_clear(gpointer data) @@ -111,6 +128,7 @@ static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; VFIOStateBuffer *lb; + size_t data_size = packet_total_size - sizeof(*packet); vfio_state_buffers_assert_init(&multifd->load_bufs); if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { @@ -126,8 +144,19 @@ static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, assert(packet->idx >= multifd->load_buf_idx); - lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); - lb->len = packet_total_size - sizeof(*packet); + multifd->load_buf_queued_pending_buffers_size += data_size; + if 
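/*
 * The counter was just raised by data_size above; it drops again in
 * vfio_load_state_buffer_write() as bytes reach the device fd, so at any
 * moment it bounds the memory held by queued-but-unwritten state packets.
 */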
(multifd->load_buf_queued_pending_buffers_size > + vbasedev->migration_max_queued_buffers_size) { + error_setg(errp, + "%s: queuing state buffer %" PRIu32 + " would exceed the size max of %" PRIu64, + vbasedev->name, packet->idx, + vbasedev->migration_max_queued_buffers_size); + return false; + } + + lb->data = g_memdup2(&packet->data, data_size); + lb->len = data_size; lb->is_present = true; return true; @@ -311,6 +340,9 @@ static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, assert(wr_ret <= buf_len); buf_len -= wr_ret; buf_cur += wr_ret; + + assert(multifd->load_buf_queued_pending_buffers_size >= wr_ret); + multifd->load_buf_queued_pending_buffers_size -= wr_ret; } trace_vfio_load_state_device_buffer_load_end(vbasedev->name, @@ -393,6 +425,22 @@ static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) multifd->load_buf_idx++; } + if (vfio_load_config_after_iter(vbasedev)) { + while (!multifd->load_bufs_iter_done) { + qemu_cond_wait(&multifd->load_bufs_iter_done_cond, + &multifd->load_bufs_mutex); + + /* + * Need to re-check cancellation immediately after wait in case + * cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + } + } + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { goto thread_exit; } @@ -412,6 +460,48 @@ thread_exit: return ret; } +int vfio_load_state_config_load_ready(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + int ret = 0; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_report("%s: got DEV_CONFIG_LOAD_READY outside multifd transfer", + vbasedev->name); + return -EINVAL; + } + + if (!vfio_load_config_after_iter(vbasedev)) { + error_report("%s: got DEV_CONFIG_LOAD_READY but was disabled", + vbasedev->name); + return -EINVAL; + } + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + if (multifd->load_bufs_iter_done) { + /* Can't print error here as we're outside BQL */ + ret = -EINVAL; + break; + } + + multifd->load_bufs_iter_done = true; + qemu_cond_signal(&multifd->load_bufs_iter_done_cond); + } + bql_lock(); + + if (ret) { + error_report("%s: duplicate DEV_CONFIG_LOAD_READY", + vbasedev->name); + } + + return ret; +} + static VFIOMultifd *vfio_multifd_new(void) { VFIOMultifd *multifd = g_new(VFIOMultifd, 1); @@ -422,8 +512,12 @@ static VFIOMultifd *vfio_multifd_new(void) multifd->load_buf_idx = 0; multifd->load_buf_idx_last = UINT32_MAX; + multifd->load_buf_queued_pending_buffers_size = 0; qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + multifd->load_bufs_iter_done = false; + qemu_cond_init(&multifd->load_bufs_iter_done_cond); + multifd->load_bufs_thread_running = false; multifd->load_bufs_thread_want_exit = false; qemu_cond_init(&multifd->load_bufs_thread_finished_cond); @@ -447,6 +541,7 @@ static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) multifd->load_bufs_thread_want_exit = true; qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_signal(&multifd->load_bufs_iter_done_cond); qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, &multifd->load_bufs_mutex); } @@ -459,6 +554,7 @@ static void vfio_multifd_free(VFIOMultifd *multifd) vfio_load_cleanup_load_bufs_thread(multifd); qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); + 
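/*
 * load_bufs_iter_done_cond backs the ordering handshake added above: the
 * load-bufs thread may block until vfio_load_state_config_load_ready()
 * reports the DEV_CONFIG_LOAD_READY message before it loads the device
 * config, and cleanup signals the same condvar so a cancelled migration
 * cannot leave that thread stuck.
 */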
qemu_cond_destroy(&multifd->load_bufs_iter_done_cond); vfio_state_buffers_destroy(&multifd->load_bufs); qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); qemu_mutex_destroy(&multifd->load_bufs_mutex); @@ -575,14 +671,14 @@ vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, return false; } - vfio_mig_add_bytes_transferred(packet_len); + vfio_migration_add_bytes_transferred(packet_len); return true; } /* * This thread is spawned by the migration core directly via - * .save_live_complete_precopy_thread SaveVMHandler. + * .save_complete_precopy_thread SaveVMHandler. * * It exits after either: * * completing saving the remaining device state and device config, OR: @@ -591,7 +687,7 @@ vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, * multifd_device_state_save_thread_should_exit() returning true. */ bool -vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, +vfio_multifd_save_complete_precopy_thread(SaveCompletePrecopyThreadData *d, Error **errp) { VFIODevice *vbasedev = d->handler_opaque; @@ -645,7 +741,7 @@ vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, goto thread_exit; } - vfio_mig_add_bytes_transferred(packet_size); + vfio_migration_add_bytes_transferred(packet_size); } if (!vfio_save_complete_precopy_thread_config_state(vbasedev, diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index a664051..82d2d3a 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -12,7 +12,7 @@ #ifndef HW_VFIO_MIGRATION_MULTIFD_H #define HW_VFIO_MIGRATION_MULTIFD_H -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp); void vfio_multifd_cleanup(VFIODevice *vbasedev); @@ -20,13 +20,16 @@ void vfio_multifd_cleanup(VFIODevice *vbasedev); bool vfio_multifd_transfer_supported(void); bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); +bool vfio_load_config_after_iter(VFIODevice *vbasedev); bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, Error **errp); +int vfio_load_state_config_load_ready(VFIODevice *vbasedev); + void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f); bool -vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, +vfio_multifd_save_complete_precopy_thread(SaveCompletePrecopyThreadData *d, Error **errp); int vfio_multifd_switchover_start(VFIODevice *vbasedev); diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index fbff46c..4c06e3d 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -16,7 +16,8 @@ #include <sys/ioctl.h> #include "system/runstate.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-migration.h" #include "migration/misc.h" #include "migration/savevm.h" #include "migration/vmstate.h" @@ -30,6 +31,7 @@ #include "pci.h" #include "trace.h" #include "hw/hw.h" +#include "vfio-migration-internal.h" /* * This is an arbitrary size based on migration of mlx5 devices, where typically @@ -373,7 +375,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - vfio_mig_add_bytes_transferred(data_size); + vfio_migration_add_bytes_transferred(data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -673,7 +675,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque) 
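    /* With multifd transfer the device config state travels over the multifd channels; the main stream gets either a LOAD_READY marker (telling the destination its deferred config load may proceed) or a dummy EOS. */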
int ret; if (vfio_multifd_transfer_enabled(vbasedev)) { - vfio_multifd_emit_dummy_eos(vbasedev, f); + if (vfio_load_config_after_iter(vbasedev)) { + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY); + } else { + vfio_multifd_emit_dummy_eos(vbasedev, f); + } return; } @@ -782,6 +788,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) return ret; } + case VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY: + { + return vfio_load_state_config_load_ready(vbasedev); + } default: error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); return -EINVAL; @@ -822,7 +832,7 @@ static const SaveVMHandlers savevm_vfio_handlers = { .state_pending_exact = vfio_state_pending_exact, .is_active_iterate = vfio_is_active_iterate, .save_live_iterate = vfio_save_iterate, - .save_live_complete_precopy = vfio_save_complete_precopy, + .save_complete = vfio_save_complete_precopy, .save_state = vfio_save_state, .load_setup = vfio_load_setup, .load_cleanup = vfio_load_cleanup, @@ -833,7 +843,7 @@ static const SaveVMHandlers savevm_vfio_handlers = { */ .load_state_buffer = vfio_multifd_load_state_buffer, .switchover_start = vfio_switchover_start, - .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread, + .save_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread, }; /* ---------------------------------------------------------------------- */ @@ -1014,13 +1024,72 @@ static int vfio_migration_init(VFIODevice *vbasedev) vfio_vmstate_change_prepare : NULL; migration->vm_state = qdev_add_vm_change_state_handler_full( - vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev); + vbasedev->dev, vfio_vmstate_change, prepare_cb, NULL, vbasedev); migration_add_notifier(&migration->migration_state, vfio_migration_state_notifier); return 0; } +static Error *multiple_devices_migration_blocker; + +/* + * Multiple devices migration is allowed only if all devices support P2P + * migration. Single device migration is allowed regardless of P2P migration + * support. 
+ */ +static bool vfio_multiple_devices_migration_is_supported(void) +{ + VFIODevice *vbasedev; + unsigned int device_num = 0; + bool all_support_p2p = true; + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->migration) { + device_num++; + + if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { + all_support_p2p = false; + } + } + } + + return all_support_p2p || device_num <= 1; +} + +static int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) +{ + if (vfio_multiple_devices_migration_is_supported()) { + return 0; + } + + if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { + error_setg(errp, "Multiple VFIO devices migration is supported only if " + "all of them support P2P migration"); + return -EINVAL; + } + + if (multiple_devices_migration_blocker) { + return 0; + } + + error_setg(&multiple_devices_migration_blocker, + "Multiple VFIO devices migration is supported only if all of " + "them support P2P migration"); + return migrate_add_blocker_normal(&multiple_devices_migration_blocker, + errp); +} + +static void vfio_unblock_multiple_devices_migration(void) +{ + if (!multiple_devices_migration_blocker || + !vfio_multiple_devices_migration_is_supported()) { + return; + } + + migrate_del_blocker(&multiple_devices_migration_blocker); +} + static void vfio_migration_deinit(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -1047,21 +1116,42 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) /* ---------------------------------------------------------------------- */ -int64_t vfio_mig_bytes_transferred(void) +int64_t vfio_migration_bytes_transferred(void) { return MIN(qatomic_read(&bytes_transferred), INT64_MAX); } -void vfio_reset_bytes_transferred(void) +void vfio_migration_reset_bytes_transferred(void) { qatomic_set(&bytes_transferred, 0); } -void vfio_mig_add_bytes_transferred(unsigned long val) +void vfio_migration_add_bytes_transferred(unsigned long val) { qatomic_add(&bytes_transferred, val); } +bool vfio_migration_active(void) +{ + VFIODevice *vbasedev; + + if (QLIST_EMPTY(&vfio_device_list)) { + return false; + } + + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->migration_blocker) { + return false; + } + } + return true; +} + +static bool vfio_viommu_preset(VFIODevice *vbasedev) +{ + return vbasedev->bcontainer->space->as != &address_space_memory; +} + /* * Return true when either migration initialized or blocker registered. 
* Currently only return false when adding blocker fails which will @@ -1138,3 +1228,19 @@ void vfio_migration_exit(VFIODevice *vbasedev) migrate_del_blocker(&vbasedev->migration_blocker); } + +bool vfio_device_state_is_running(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->device_state == VFIO_DEVICE_STATE_RUNNING || + migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; +} + +bool vfio_device_state_is_precopy(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || + migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; +} diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 3f00225..b5da6af 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -113,6 +113,7 @@ static uint64_t vfio_generic_window_quirk_data_read(void *opaque, { VFIOConfigWindowQuirk *window = opaque; VFIOPCIDevice *vdev = window->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint64_t data; /* Always read data reg, discard if window enabled */ @@ -120,7 +121,7 @@ static uint64_t vfio_generic_window_quirk_data_read(void *opaque, addr + window->data_offset, size); if (window->window_enabled) { - data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); + data = vfio_pci_read_config(pdev, window->address_val, size); trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, memory_region_name(window->data_mem), data); } @@ -133,9 +134,10 @@ static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, { VFIOConfigWindowQuirk *window = opaque; VFIOPCIDevice *vdev = window->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); if (window->window_enabled) { - vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); + vfio_pci_write_config(pdev, window->address_val, data, size); trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, memory_region_name(window->data_mem), data); return; @@ -156,6 +158,7 @@ static uint64_t vfio_generic_quirk_mirror_read(void *opaque, { VFIOConfigMirrorQuirk *mirror = opaque; VFIOPCIDevice *vdev = mirror->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint64_t data; /* Read and discard in case the hardware cares */ @@ -163,7 +166,7 @@ static uint64_t vfio_generic_quirk_mirror_read(void *opaque, addr + mirror->offset, size); addr += mirror->config_offset; - data = vfio_pci_read_config(&vdev->pdev, addr, size); + data = vfio_pci_read_config(pdev, addr, size); trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, memory_region_name(mirror->mem), addr, data); @@ -175,9 +178,10 @@ static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, { VFIOConfigMirrorQuirk *mirror = opaque; VFIOPCIDevice *vdev = mirror->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); addr += mirror->config_offset; - vfio_pci_write_config(&vdev->pdev, addr, data, size); + vfio_pci_write_config(pdev, addr, data, size); trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, memory_region_name(mirror->mem), addr, data); @@ -211,7 +215,8 @@ static uint64_t vfio_ati_3c3_quirk_read(void *opaque, hwaddr addr, unsigned size) { VFIOPCIDevice *vdev = opaque; - uint64_t data = vfio_pci_read_config(&vdev->pdev, + PCIDevice *pdev = PCI_DEVICE(vdev); + uint64_t data = vfio_pci_read_config(pdev, PCI_BASE_ADDRESS_4 + 1, size); trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data); @@ -563,6 +568,7 @@ static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, { VFIONvidia3d0Quirk *quirk = opaque; VFIOPCIDevice *vdev = quirk->vdev; + 
PCIDevice *pdev = PCI_DEVICE(vdev); VFIONvidia3d0State old_state = quirk->state; uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], addr + 0x10, size); @@ -573,7 +579,7 @@ static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); - data = vfio_pci_read_config(&vdev->pdev, offset, size); + data = vfio_pci_read_config(pdev, offset, size); trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name, offset, size, data); } @@ -586,6 +592,7 @@ static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, { VFIONvidia3d0Quirk *quirk = opaque; VFIOPCIDevice *vdev = quirk->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); VFIONvidia3d0State old_state = quirk->state; quirk->state = NONE; @@ -599,7 +606,7 @@ static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); - vfio_pci_write_config(&vdev->pdev, offset, data, size); + vfio_pci_write_config(pdev, offset, data, size); trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name, offset, data, size); return; @@ -815,7 +822,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, { VFIOConfigMirrorQuirk *mirror = opaque; VFIOPCIDevice *vdev = mirror->vdev; - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); LastDataSet *last = (LastDataSet *)&mirror->data; vfio_generic_quirk_mirror_write(opaque, addr, data, size); @@ -1005,6 +1012,7 @@ static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, { VFIOrtl8168Quirk *rtl = opaque; VFIOPCIDevice *vdev = rtl->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); rtl->enabled = false; @@ -1013,7 +1021,7 @@ static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, rtl->addr = (uint32_t)data; if (data & 0x80000000U) { /* Do write */ - if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { + if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { hwaddr offset = data & 0xfff; uint64_t val = rtl->data; @@ -1021,7 +1029,7 @@ static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, (uint16_t)offset, val); /* Write to the proper guest MSI-X table instead */ - memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, + memory_region_dispatch_write(&pdev->msix_table_mmio, offset, val, size_memop(size) | MO_LE, MEMTXATTRS_UNSPECIFIED); @@ -1049,11 +1057,12 @@ static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, { VFIOrtl8168Quirk *rtl = opaque; VFIOPCIDevice *vdev = rtl->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size); - if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { + if (rtl->enabled && (pdev->cap_present & QEMU_PCI_CAP_MSIX)) { hwaddr offset = rtl->addr & 0xfff; - memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, + memory_region_dispatch_read(&pdev->msix_table_mmio, offset, &data, size_memop(size) | MO_LE, MEMTXATTRS_UNSPECIFIED); trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); @@ -1150,15 +1159,12 @@ void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) { - int i, j; + int i; for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); QLIST_REMOVE(quirk, next); - for (j = 0; j < quirk->nr_mem; j++) { - 
object_unparent(OBJECT(&quirk->mem[j])); - } g_free(quirk->mem); g_free(quirk->data); g_free(quirk); @@ -1198,14 +1204,10 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; - int i; while (!QLIST_EMPTY(&bar->quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); QLIST_REMOVE(quirk, next); - for (i = 0; i < quirk->nr_mem; i++) { - object_unparent(OBJECT(&quirk->mem[i])); - } g_free(quirk->mem); g_free(quirk->data); g_free(quirk); @@ -1297,7 +1299,7 @@ static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) static int vfio_radeon_reset(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); int i, ret = 0; uint32_t data; @@ -1454,7 +1456,7 @@ static bool is_valid_std_cap_offset(uint8_t pos) static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) { ERRP_GUARD(); - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); int ret, pos; bool c8_conflict = false, d4_conflict = false; uint8_t tmp; @@ -1547,6 +1549,7 @@ static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) { ERRP_GUARD(); + PCIDevice *pdev = PCI_DEVICE(vdev); uint8_t membar_phys[16]; int ret, pos = 0xE8; @@ -1565,7 +1568,7 @@ static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) return false; } - ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos, + ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, VMD_SHADOW_CAP_LEN, errp); if (ret < 0) { error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: "); @@ -1574,10 +1577,10 @@ static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN); pos += PCI_CAP_FLAGS; - pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); - pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); - pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */ - memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); + pci_set_byte(pdev->config + pos++, VMD_SHADOW_CAP_LEN); + pci_set_byte(pdev->config + pos++, VMD_SHADOW_CAP_VER); + pci_set_long(pdev->config + pos, 0x53484457); /* SHDW */ + memcpy(pdev->config + pos + 4, membar_phys, 16); return true; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 7f1532f..06b06af 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -29,7 +29,9 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "hw/vfio/vfio-cpr.h" #include "migration/vmstate.h" +#include "migration/cpr.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -44,8 +46,8 @@ #include "migration/blocker.h" #include "migration/qemu-file.h" #include "system/iommufd.h" - -#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" /* Protected by BQL */ static KVMRouteChange vfio_route_change; @@ -54,6 +56,36 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +/* Create new or reuse existing eventfd */ +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) +{ + int fd, ret; + + fd = vfio_cpr_load_vector_fd(vdev, name, nr); + if (fd >= 0) { + event_notifier_init_fd(e, fd); + return true; + } + + 
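+    /* No vector fd was saved across CPR: create a fresh eventfd and record it below so a later cpr save can find it. */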
ret = event_notifier_init(e, 0); + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + return false; + } + + fd = event_notifier_get_fd(e); + vfio_cpr_save_vector_fd(vdev, name, nr, fd); + return true; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + vfio_cpr_delete_vector_fd(vdev, name, nr); + event_notifier_cleanup(e); +} + /* * Disabling BAR mmapping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -85,6 +117,7 @@ static void vfio_intx_mmap_enable(void *opaque) static void vfio_intx_interrupt(void *opaque) { VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = PCI_DEVICE(vdev); if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) { return; @@ -93,7 +126,7 @@ trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin); vdev->intx.pending = true; - pci_irq_assert(&vdev->pdev); + pci_irq_assert(pdev); vfio_mmap_set_enabled(vdev, false); if (vdev->intx.mmap_timeout) { timer_mod(vdev->intx.mmap_timer, @@ -101,24 +134,26 @@ } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + PCIDevice *pdev = PCI_DEVICE(vdev); if (!vdev->intx.pending) { return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); - vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + pci_irq_deassert(pdev); + vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); } static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) { #ifdef CONFIG_KVM + PCIDevice *pdev = PCI_DEVICE(vdev); int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt); if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || @@ -129,13 +164,12 @@ /* Get to a known interrupt state */ qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); + pci_irq_deassert(pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -147,15 +181,15 @@ goto fail_irqfd; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_UNMASK, - event_notifier_get_fd(&vdev->intx.unmask), - errp)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_UNMASK, + event_notifier_get_fd(&vdev->intx.unmask), + errp)) { goto fail_vfio; } /* Let'em rip */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.kvm_accel = true; @@ -167,19 +201,51 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd,
vfio_intx_interrupt, NULL, vdev); - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); return false; #else return true; #endif } +static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) +{ +#ifdef CONFIG_KVM + if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || + vdev->intx.route.mode != PCI_INTX_ENABLED || + !kvm_resamplefds_enabled()) { + return true; + } + + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { + return false; + } + + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, + &vdev->intx.interrupt, + &vdev->intx.unmask, + vdev->intx.route.irq)) { + error_setg_errno(errp, errno, "failed to setup resample irqfd"); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); + return false; + } + + vdev->intx.kvm_accel = true; + trace_vfio_intx_enable_kvm(vdev->vbasedev.name); + return true; +#else + return true; +#endif +} + static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) { #ifdef CONFIG_KVM + PCIDevice *pdev = PCI_DEVICE(vdev); + if (!vdev->intx.kvm_accel) { return; } @@ -188,9 +254,9 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) * Get to a known state, hardware masked, QEMU ready to accept new * interrupts, QEMU IRQ de-asserted. */ - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); + pci_irq_deassert(pdev); /* Tell KVM to stop listening for an INTx irqfd */ if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, @@ -199,7 +265,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. 
*/ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -208,7 +274,7 @@ vdev->intx.kvm_accel = false; /* If we've missed an event, let it re-fire through QEMU */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); trace_vfio_intx_disable_kvm(vdev->vbasedev.name); #endif @@ -234,19 +300,19 @@ } /* Re-enable the interrupt in case we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { return; } - route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin); + route = pci_device_route_intx_to_irq(pdev, vdev->intx.pin); if (pci_intx_route_changed(&vdev->intx.route, &route)) { vfio_intx_update(vdev, &route); @@ -263,20 +329,26 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) { - uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); + PCIDevice *pdev = PCI_DEVICE(vdev); + uint8_t pin = vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { return true; } - vfio_disable_interrupts(vdev); + /* + * Do not alter interrupt state during vfio_realize and cpr load. + * The incoming state is cleared thereafter. + */ + if (!cpr_is_incoming()) { + vfio_disable_interrupts(vdev); + } vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ - pci_config_set_interrupt_pin(vdev->pdev.config, pin); + pci_config_set_interrupt_pin(pdev->config, pin); #ifdef CONFIG_KVM /* @@ -284,23 +356,30 @@ * where we won't actually use the result anyway.
*/ if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) { - vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev, + vdev->intx.route = pci_device_route_intx_to_irq(pdev, vdev->intx.pin); } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + + if (cpr_is_incoming()) { + if (!vfio_cpr_intx_enable_kvm(vdev, &err)) { + warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } + goto skip_signaling; + } + + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -308,6 +387,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } +skip_signaling: vdev->interrupt = VFIO_INT_INTx; trace_vfio_intx_enable(vdev->vbasedev.name); @@ -316,24 +396,38 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) static void vfio_intx_disable(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); int fd; timer_del(vdev->intx.mmap_timer); vfio_intx_disable_kvm(vdev); - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); + pci_irq_deassert(pdev); vfio_mmap_set_enabled(vdev, true); fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + +void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable) +{ + int fd = event_notifier_get_fd(&vdev->intx.interrupt); + IOHandler *handler = (enable ? 
vfio_intx_interrupt : NULL); + + qemu_set_fd_handler(fd, handler, NULL, vdev); +} + /* * MSI/X */ @@ -341,6 +435,7 @@ static void vfio_msi_interrupt(void *opaque) { VFIOMSIVector *vector = opaque; VFIOPCIDevice *vdev = vector->vdev; + PCIDevice *pdev = PCI_DEVICE(vdev); MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector); void (*notify)(PCIDevice *dev, unsigned vector); MSIMessage msg; @@ -355,9 +450,9 @@ static void vfio_msi_interrupt(void *opaque) notify = msix_notify; /* A masked vector firing needs to use the PBA, enable it */ - if (msix_is_masked(&vdev->pdev, nr)) { + if (msix_is_masked(pdev, nr)) { set_bit(nr, vdev->msix->pending); - memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true); + memory_region_set_enabled(&pdev->msix_pba_mmio, true); trace_vfio_msix_pba_enable(vdev->vbasedev.name); } } else if (vdev->interrupt == VFIO_INT_MSI) { @@ -367,9 +462,18 @@ static void vfio_msi_interrupt(void *opaque) abort(); } - msg = get_msg(&vdev->pdev, nr); + msg = get_msg(pdev, nr); trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); - notify(&vdev->pdev, nr); + notify(pdev, nr); +} + +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + int fd = event_notifier_get_fd(&vector->interrupt); + IOHandler *handler = (enable ? vfio_msi_interrupt : NULL); + + qemu_set_fd_handler(fd, handler, NULL, vector); } /* @@ -379,7 +483,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; - int ret = 0, argsz; + int argsz; int32_t *fd; argsz = sizeof(*irq_set) + sizeof(*fd); @@ -394,13 +498,12 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - - return ret; + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) { + PCIDevice *pdev = PCI_DEVICE(vdev); struct vfio_irq_set *irq_set; int ret = 0, i, argsz; int32_t *fds; @@ -443,7 +546,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) */ if (vdev->msi_vectors[i].use) { if (vdev->msi_vectors[i].virq < 0 || - (msix && msix_is_masked(&vdev->pdev, i))) { + (msix && msix_is_masked(pdev, i))) { fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); } else { fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt); @@ -453,31 +556,36 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { + PCIDevice *pdev = PCI_DEVICE(vdev); + if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; } vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change, - vector_n, &vdev->pdev); + vector_n, pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto 
fail_notifier; } @@ -489,19 +597,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -511,10 +620,47 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = PCI_DEVICE(vdev); + Error *local_err = NULL; + + vector->vdev = vdev; + vector->virq = -1; + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -524,13 +670,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -542,19 +682,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -576,27 +716,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, if (!vdev->defer_kvm_irq_routing) { if (vdev->msix->noresize && resizing) { - 
vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_set_irq_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -604,7 +731,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, clear_bit(nr, vdev->msix->pending); if (find_first_bit(vdev->msix->pending, vdev->nr_vectors) == vdev->nr_vectors) { - memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false); + memory_region_set_enabled(&pdev->msix_pba_mmio, false); trace_vfio_msix_pba_disable(vdev->vbasedev.name); } @@ -614,12 +741,21 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, static int vfio_msix_vector_use(PCIDevice *pdev, unsigned int nr, MSIMessage msg) { + /* + * Ignore the callback from msix_set_vector_notifiers during resume. + * The necessary subset of these actions is called from + * vfio_cpr_claim_vectors during post load. + */ + if (cpr_is_incoming()) { + return 0; + } + return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt); } static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -636,7 +772,7 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) int32_t fd = event_notifier_get_fd(&vector->interrupt); Error *err = NULL; - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -644,14 +780,22 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = PCI_DEVICE(vdev); + + msix_set_vector_notifiers(pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); +} + +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -661,12 +805,13 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } static void vfio_msix_enable(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); int ret; vfio_disable_interrupts(vdev); @@ -681,19 +826,20 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * 
performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); - if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { /* @@ -710,7 +856,8 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) */ ret = vfio_enable_msix_no_vec(vdev); if (ret) { - error_report("vfio: failed to enable MSI-X, %d", ret); + error_report("vfio: failed to enable MSI-X, %s", + strerror(-ret)); } } @@ -719,30 +866,33 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) static void vfio_msi_enable(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); int ret, i; vfio_disable_interrupts(vdev); - vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); + vdev->nr_vectors = msi_nr_vectors_allocated(pdev); retry: /* * Setting vector notifiers needs to enable route for each vector. * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -752,10 +902,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. 
*/ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -763,7 +913,8 @@ retry: ret = vfio_enable_vectors(vdev, false); if (ret) { if (ret < 0) { - error_report("vfio: Error: Failed to setup MSI fds: %m"); + error_report("vfio: Error: Failed to setup MSI fds: %s", + strerror(-ret)); } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); @@ -797,11 +948,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -813,10 +964,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) static void vfio_msix_disable(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); Error *err = NULL; int i; - msix_unset_vector_notifiers(&vdev->pdev); + msix_unset_vector_notifiers(pdev); /* * MSI-X will only release vectors if MSI-X is still enabled on the @@ -824,8 +976,8 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) */ for (i = 0; i < vdev->nr_vectors; i++) { if (vdev->msi_vectors[i].use) { - vfio_msix_vector_release(&vdev->pdev, i); - msix_vector_unuse(&vdev->pdev, i); + vfio_msix_vector_release(pdev, i); + msix_vector_unuse(pdev, i); } } @@ -833,7 +985,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) * Always clear MSI-X IRQ index. A PF device could have enabled * MSI-X with no vectors. See vfio_msix_enable(). 
*/ - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); vfio_msi_disable_common(vdev); if (!vfio_intx_enable(vdev, &err)) { @@ -850,7 +1002,7 @@ static void vfio_msi_disable(VFIOPCIDevice *vdev) { Error *err = NULL; - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); + vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); vfio_msi_disable_common(vdev); vfio_intx_enable(vdev, &err); if (err) { @@ -862,6 +1014,7 @@ static void vfio_msi_disable(VFIOPCIDevice *vdev) static void vfio_update_msi(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); int i; for (i = 0; i < vdev->nr_vectors; i++) { @@ -872,25 +1025,29 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) continue; } - msg = msi_get_message(&vdev->pdev, i); - vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev); + msg = msi_get_message(pdev, i); + vfio_update_kvm_msi_virq(vector, msg, pdev); } } static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - g_autofree struct vfio_region_info *reg_info = NULL; + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; + int ret; + + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, + ®_info); - if (vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, ®_info)) { - error_report("vfio: Error getting ROM info: %m"); + if (ret != 0) { + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); return; } - trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size, + trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -899,8 +1056,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) if (!vdev->rom_size) { vdev->rom_read_failed = true; - error_report("vfio-pci: Cannot read device rom at " - "%s", vdev->vbasedev.name); + error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name); error_printf("Device option ROM contents are probably invalid " "(check dmesg).\nSkip option ROM probe with rombar=0, " "or load from file with romfile=\n"); @@ -911,18 +1067,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, - size, vdev->rom_offset + off); + bytes = vbasedev->io_ops->region_read(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); + if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strreaderror(bytes)); + break; } } @@ -958,6 +1118,24 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) } } +/* "Raw" read of underlying config space. */ +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); +} + +/* "Raw" write of underlying config space. 
*/ +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data, false); +} + static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) { VFIOPCIDevice *vdev = opaque; @@ -1010,14 +1188,14 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; char *name; - int fd = vdev->vbasedev.fd; - if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { + if (pdev->romfile || !pdev->rom_bar) { /* Since pci handles romfile, just print a message and return */ - if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) { + if (vfio_opt_rom_in_denylist(vdev) && pdev->romfile) { warn_report("Device at %s is known to cause system instability" " issues during option rom execution", vdev->vbasedev.name); @@ -1030,11 +1208,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. */ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } @@ -1045,7 +1224,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) } if (vfio_opt_rom_in_denylist(vdev)) { - if (vdev->pdev.rom_bar > 0) { + if (pdev->rom_bar > 0) { warn_report("Device at %s is known to cause system instability" " issues during option rom execution", vdev->vbasedev.name); @@ -1064,12 +1243,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name); - memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), + memory_region_init_io(&pdev->rom, OBJECT(vdev), &vfio_rom_ops, vdev, name, size); g_free(name); - pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, - PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); + pci_register_bar(pdev, PCI_ROM_SLOT, + PCI_BASE_ADDRESS_SPACE_MEMORY, &pdev->rom); vdev->rom_read_failed = false; } @@ -1167,7 +1346,7 @@ static const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1213,7 +1392,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1226,12 +1406,12 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) 
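    /* Any bits not emulated by QEMU are read from the device's real config space below. */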
if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - vdev->config_offset + addr); + ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, + strreaderror(ret)); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1246,16 +1426,19 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le); + if (ret != len) { + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, + strwriteerror(ret)); } /* MSI/MSI-X Enabling/Disabling */ @@ -1338,14 +1521,17 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev) static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) { + PCIDevice *pdev = PCI_DEVICE(vdev); uint16_t ctrl; bool msi_64bit, msi_maskbit; int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", + strreaderror(ret)); return false; } ctrl = le16_to_cpu(ctrl); @@ -1356,7 +1542,7 @@ trace_vfio_msi_setup(vdev->vbasedev.name, pos); - ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err); + ret = msi_init(pdev, pos, entries, msi_64bit, msi_maskbit, &err); if (ret < 0) { if (ret == -ENOTSUP) { return true; @@ -1378,8 +1564,8 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) * If the host driver allows mapping of MSI-X data, we are going to * map the entire BAR and emulate the MSI-X table on top of that.
*/ - if (vfio_has_region_cap(&vdev->vbasedev, region->nr, - VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { + if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr, + VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { return; } @@ -1549,34 +1735,39 @@ static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) */ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) { + PCIDevice *pdev = PCI_DEVICE(vdev); uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int ret, fd = vdev->vbasedev.fd; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_MSIX_IRQ_INDEX }; + struct vfio_irq_info irq_info; VFIOMSIXInfo *msix; + int ret; - pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); + pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); if (!pos) { return true; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed to read PCI MSIX FLAGS: %s", + strreaderror(ret)); return false; } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE, + sizeof(table), &table); + if (ret != sizeof(table)) { + error_setg(errp, "failed to read PCI MSIX TABLE: %s", + strreaderror(ret)); return false; } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA, + sizeof(pba), &pba); + if (ret != sizeof(pba)) { + error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret)); return false; } @@ -1591,7 +1782,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); g_free(msix); @@ -1643,12 +1835,13 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) { + PCIDevice *pdev = PCI_DEVICE(vdev); int ret; Error *err = NULL; vdev->msix->pending = g_new0(unsigned long, BITS_TO_LONGS(vdev->msix->entries)); - ret = msix_init(&vdev->pdev, vdev->msix->entries, + ret = msix_init(pdev, vdev->msix->entries, vdev->bars[vdev->msix->table_bar].mr, vdev->msix->table_bar, vdev->msix->table_offset, vdev->bars[vdev->msix->pba_bar].mr, @@ -1680,7 +1873,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) * vector-use notifier is called, which occurs on unmask, we test whether * PBA emulation is needed and again disable if not. 
*/ - memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false); + memory_region_set_enabled(&pdev->msix_pba_mmio, false); /* * The emulated machine may provide a paravirt interface for MSIX setup @@ -1692,18 +1885,20 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) */ if (object_property_get_bool(OBJECT(qdev_get_machine()), "vfio-no-msix-emulation", NULL)) { - memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); + memory_region_set_enabled(&pdev->msix_table_mmio, false); } return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { - msi_uninit(&vdev->pdev); + PCIDevice *pdev = PCI_DEVICE(vdev); + + msi_uninit(pdev); if (vdev->msix) { - msix_uninit(&vdev->pdev, + msix_uninit(pdev, vdev->bars[vdev->msix->table_bar].mr, vdev->bars[vdev->msix->pba_bar].mr); g_free(vdev->msix->pending); @@ -1735,10 +1930,10 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret)); return; } @@ -1748,6 +1943,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) @@ -1761,6 +1959,7 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) { + PCIDevice *pdev = PCI_DEVICE(vdev); VFIOBAR *bar = &vdev->bars[nr]; char *name; @@ -1782,7 +1981,7 @@ static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) } } - pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr); + pci_register_bar(pdev, nr, bar->type, bar->mr); } static void vfio_bars_register(VFIOPCIDevice *vdev) @@ -1794,8 +1993,9 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); int i; for (i = 0; i < PCI_ROM_SLOT; i++) { @@ -1809,7 +2009,7 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev) } if (vdev->vga) { - pci_unregister_vga(&vdev->pdev); + pci_unregister_vga(pdev); vfio_vga_quirk_exit(vdev); } } @@ -1825,7 +2025,6 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev) vfio_region_finalize(&bar->region); if (bar->mr) { assert(bar->size); - object_unparent(OBJECT(bar->mr)); g_free(bar->mr); bar->mr = NULL; } @@ -1833,9 +2032,6 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev) if (vdev->vga) { vfio_vga_quirk_finalize(vdev); - for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { - object_unparent(OBJECT(&vdev->vga->region[i].mem)); - } g_free(vdev->vga); } } @@ -1881,8 +2077,10 @@ static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos, uint16_t val, uint16_t mask) { - vfio_set_word_bits(vdev->pdev.config + pos, val, mask); - vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask); + PCIDevice *pdev = PCI_DEVICE(vdev); + + vfio_set_word_bits(pdev->config + pos, val, mask); + 
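+    /* Clearing wmask for the emulated bits makes them read-only to the guest. */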
vfio_set_word_bits(pdev->wmask + pos, ~mask, mask); vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask); } @@ -1894,8 +2092,10 @@ static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask) static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos, uint32_t val, uint32_t mask) { - vfio_set_long_bits(vdev->pdev.config + pos, val, mask); - vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask); + PCIDevice *pdev = PCI_DEVICE(vdev); + + vfio_set_long_bits(pdev->config + pos, val, mask); + vfio_set_long_bits(pdev->wmask + pos, ~mask, mask); vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask); } @@ -1903,7 +2103,8 @@ static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev) { struct vfio_device_info_cap_pci_atomic_comp *cap; g_autofree struct vfio_device_info *info = NULL; - PCIBus *bus = pci_get_bus(&vdev->pdev); + PCIDevice *pdev = PCI_DEVICE(vdev); + PCIBus *bus = pci_get_bus(pdev); PCIDevice *parent = bus->parent_dev; struct vfio_info_cap_header *hdr; uint32_t mask = 0; @@ -1919,8 +2120,8 @@ static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev) if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap || pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT || pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 || - vdev->pdev.devfn || - vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + pdev->devfn || + pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { return; } @@ -1964,8 +2165,10 @@ static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev) static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev) { + PCIDevice *pdev = PCI_DEVICE(vdev); + if (vdev->clear_parent_atomics_on_exit) { - PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev; + PCIDevice *parent = pci_get_bus(pdev)->parent_dev; uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2; pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 | @@ -1977,10 +2180,11 @@ static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev) static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, Error **errp) { + PCIDevice *pdev = PCI_DEVICE(vdev); uint16_t flags; uint8_t type; - flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS); + flags = pci_get_word(pdev->config + pos + PCI_CAP_FLAGS); type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; if (type != PCI_EXP_TYPE_ENDPOINT && @@ -1992,8 +2196,8 @@ static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, return false; } - if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) { - PCIBus *bus = pci_get_bus(&vdev->pdev); + if (!pci_bus_is_express(pci_get_bus(pdev))) { + PCIBus *bus = pci_get_bus(pdev); PCIDevice *bridge; /* @@ -2025,7 +2229,7 @@ static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, return true; } - } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) { + } else if (pci_bus_is_root(pci_get_bus(pdev))) { /* * On a Root Complex bus Endpoints become Root Complex Integrated * Endpoints, which changes the type and clears the LNK & LNK2 fields. 
@@ -2093,20 +2297,20 @@ static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, 1, PCI_EXP_FLAGS_VERS); } - pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size, - errp); + pos = pci_add_capability(pdev, PCI_CAP_ID_EXP, pos, size, errp); if (pos < 0) { return false; } - vdev->pdev.exp.exp_cap = pos; + pdev->exp.exp_cap = pos; return true; } static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) { - uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); + PCIDevice *pdev = PCI_DEVICE(vdev); + uint32_t cap = pci_get_long(pdev->config + pos + PCI_EXP_DEVCAP); if (cap & PCI_EXP_DEVCAP_FLR) { trace_vfio_check_pcie_flr(vdev->vbasedev.name); @@ -2116,7 +2320,8 @@ static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) { - uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); + PCIDevice *pdev = PCI_DEVICE(vdev); + uint16_t csr = pci_get_word(pdev->config + pos + PCI_PM_CTRL); if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { trace_vfio_check_pm_reset(vdev->vbasedev.name); @@ -2126,7 +2331,8 @@ static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) { - uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); + PCIDevice *pdev = PCI_DEVICE(vdev); + uint8_t cap = pci_get_byte(pdev->config + pos + PCI_AF_CAP); if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { trace_vfio_check_af_flr(vdev->vbasedev.name); @@ -2137,7 +2343,7 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, Error **errp) { - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp); if (pos < 0) { @@ -2159,7 +2365,7 @@ static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos, static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) { ERRP_GUARD(); - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint8_t cap_id, next, size; bool ret; @@ -2245,17 +2451,18 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) { + PCIDevice *pdev = PCI_DEVICE(vdev); uint32_t ctrl; int i, nbar; - ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL); + ctrl = pci_get_long(pdev->config + pos + PCI_REBAR_CTRL); nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT; for (i = 0; i < nbar; i++) { uint32_t cap; int size; - ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8)); + ctrl = pci_get_long(pdev->config + pos + PCI_REBAR_CTRL + (i * 8)); size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT; /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */ @@ -2293,7 +2500,7 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) static void vfio_add_ext_cap(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint32_t header; uint16_t cap_id, next, size; uint8_t cap_ver; @@ -2383,12 +2590,11 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } g_free(config); - return; } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); if 
(!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || !pdev->config[PCI_CAPABILITY_LIST]) { @@ -2405,7 +2611,7 @@ static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); uint16_t cmd; vfio_disable_interrupts(vdev); @@ -2442,21 +2648,23 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; if (!vfio_intx_enable(vdev, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = vfio_pci_config_space_write(vdev, addr, len, &val); + if (ret != len) { + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, strwriteerror(ret)); } } @@ -2599,8 +2807,8 @@ static const VMStateDescription vmstate_vfio_pci_config = { .version_id = 1, .minimum_version_id = 1, .fields = (const VMStateField[]) { - VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), - VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_PCI_DEVICE(parent_obj, VFIOPCIDevice), + VMSTATE_MSIX_TEST(parent_obj, VFIOPCIDevice, vfio_msix_present), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription * const []) { @@ -2613,23 +2821,26 @@ static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL, - errp); + return vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL, + errp); } static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - PCIDevice *pdev = &vdev->pdev; + PCIDevice *pdev = PCI_DEVICE(vdev); pcibus_t old_addr[PCI_NUM_REGIONS - 1]; int bar, ret; + Error *local_err = NULL; for (bar = 0; bar < PCI_ROM_SLOT; bar++) { old_addr[bar] = pdev->io_regions[bar].addr; } - ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1, + &local_err); if (ret) { + error_report_err(local_err); return ret; } @@ -2657,10 +2868,33 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) return ret; } +/* Transform from VFIODevice to VFIOPCIDevice. Returns NULL on failure.
*/ +VFIOPCIDevice *vfio_pci_from_vfio_device(VFIODevice *vbasedev) +{ + if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) { + return container_of(vbasedev, VFIOPCIDevice, vbasedev); + } + return NULL; +} + +void vfio_sub_page_bar_update_mappings(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = PCI_DEVICE(vdev); + int page_size = qemu_real_host_page_size(); + int bar; + + for (bar = 0; bar < PCI_ROM_SLOT; bar++) { + PCIIORegion *r = &pdev->io_regions[bar]; + if (r->addr != PCI_BAR_UNMAPPED && r->size > 0 && r->size < page_size) { + vfio_sub_page_bar_update_mapping(pdev, bar); + } + } +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2669,10 +2903,10 @@ static VFIODeviceOps vfio_pci_ops = { bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; int ret; - ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); if (ret) { error_setg_errno(errp, -ret, "failed getting region info for VGA region index %d", @@ -2724,18 +2958,15 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) "vfio-vga-io@0x3c0", QEMU_PCI_VGA_IO_HI_SIZE); - pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); - return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { + PCIDevice *pdev = PCI_DEVICE(vdev); VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + struct vfio_region_info *reg_info = NULL; + struct vfio_irq_info irq_info; int i, ret = -1; /* Sanity check device */ @@ -2770,21 +3001,21 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) QLIST_INIT(&vdev->bars[i].quirks); } - ret = vfio_get_region_info(vbasedev, - VFIO_PCI_CONFIG_REGION_INDEX, ®_info); + ret = vfio_device_get_region_info(vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, ®_info); if (ret) { error_setg_errno(errp, -ret, "failed to get config info"); return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); vdev->config_size = reg_info->size; if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { - vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; + pdev->cap_present &= ~QEMU_PCI_CAP_EXPRESS; } vdev->config_offset = reg_info->offset; @@ -2796,12 +3027,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); + trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if 
(irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2813,11 +3042,24 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { - vfio_detach_device(&vdev->vbasedev); + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + vfio_cpr_pci_unregister_device(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + + vfio_device_detach(&vdev->vbasedev); - g_free(vdev->vbasedev.name); + vfio_device_free_name(&vdev->vbasedev); g_free(vdev->msix); } @@ -2849,7 +3091,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2858,8 +3100,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2867,11 +3110,16 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + return; + } + + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2884,13 +3132,13 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2908,35 +3156,43 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_REQ_IRQ_INDEX }; + struct vfio_irq_info irq_info; Error *err = NULL; int32_t fd; + int ret; if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, + &irq_info); + if (ret < 0 
|| irq_info.count < 1) { return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { + error_report_err(err); return; } fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + vdev->req_enabled = true; + return; + } + + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2950,87 +3206,43 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +void vfio_pci_config_register_vga(VFIOPCIDevice *vdev) { - ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI(pdev); - VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; - char uuid[UUID_STR_LEN]; - g_autofree char *name = NULL; - - if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { - if (!(~vdev->host.domain || ~vdev->host.bus || - ~vdev->host.slot || ~vdev->host.function)) { - error_setg(errp, "No provided host device"); - error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " -#ifdef CONFIG_IOMMUFD - "or -device vfio-pci,fd=DEVICE_FD " -#endif - "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); - return; - } - vbasedev->sysfsdev = - g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", - vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); - } - - if (!vfio_device_get_name(vbasedev, errp)) { - return; - } - - /* - * Mediated devices *might* operate compatibly with discarding of RAM, but - * we cannot know for certain, it depends on whether the mdev vendor driver - * stays in sync with the active working set of the guest driver. Prevent - * the x-balloon-allowed option unless this is minimally an mdev device. 
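Both notifier paths above follow one pattern: allocate an eventfd-backed EventNotifier, wire its fd into the main loop, then ask the kernel to signal it for the given IRQ index, unwinding in reverse order on failure. Condensed into a single helper for illustration (the API names are the ones used in the hunks above; the helper itself is hypothetical and not part of the patch, and would sit inside pci.c with its existing includes):

    static void register_one_notifier(VFIOPCIDevice *vdev, EventNotifier *n,
                                      const char *name, int irq_index,
                                      IOHandler *handler)
    {
        Error *err = NULL;
        int32_t fd;

        if (!vfio_notifier_init(vdev, n, name, 0, &err)) {
            error_report_err(err);
            return;
        }

        fd = event_notifier_get_fd(n);
        qemu_set_fd_handler(fd, handler, NULL, vdev);    /* main-loop side */

        if (!vfio_device_irq_set_signaling(&vdev->vbasedev, irq_index, 0,
                                           VFIO_IRQ_SET_ACTION_TRIGGER, fd,
                                           &err)) {      /* kernel side */
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
            qemu_set_fd_handler(fd, NULL, NULL, vdev);
            vfio_notifier_cleanup(vdev, n, name, 0);
        }
    }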
- */ - vbasedev->mdev = vfio_device_is_mdev(vbasedev); + PCIDevice *pdev = PCI_DEVICE(vdev); + assert(vdev->vga != NULL); - trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - - if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { - error_setg(errp, "x-balloon-allowed only potentially compatible " - "with mdev devices"); - goto error; - } - - if (!qemu_uuid_is_null(&vdev->vf_token)) { - qemu_uuid_unparse(&vdev->vf_token, uuid); - name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); - } else { - name = g_strdup(vbasedev->name); - } + pci_register_vga(pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); +} - if (!vfio_attach_device(name, vbasedev, - pci_device_iommu_address_space(pdev), errp)) { - goto error; - } +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = PCI_DEVICE(vdev); + VFIODevice *vbasedev = &vdev->vbasedev; + uint32_t config_space_size; + int ret; - if (!vfio_populate_device(vdev, errp)) { - goto error; - } + config_space_size = MIN(pci_config_size(pdev), vdev->config_size); /* Get a copy of config space */ - ret = pread(vbasedev->fd, vdev->pdev.config, - MIN(pci_config_size(&vdev->pdev), vdev->config_size), - vdev->config_offset); - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { - ret = ret < 0 ? -errno : -EFAULT; - error_setg_errno(errp, -ret, "failed to read device config space"); - goto error; + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + pdev->config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; } /* vfio emulates a lot for us, but some bits need extra love */ @@ -3049,7 +3261,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->vendor_id != PCI_ANY_ID) { if (vdev->vendor_id >= 0xffff) { error_setg(errp, "invalid PCI vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id); @@ -3060,7 +3272,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->device_id != PCI_ANY_ID) { if (vdev->device_id > 0xffff) { error_setg(errp, "invalid PCI device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); @@ -3071,7 +3283,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_vendor_id != PCI_ANY_ID) { if (vdev->sub_vendor_id > 0xffff) { error_setg(errp, "invalid PCI subsystem vendor ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, vdev->sub_vendor_id, ~0); @@ -3082,22 +3294,39 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vdev->sub_device_id != PCI_ANY_ID) { if (vdev->sub_device_id > 0xffff) { error_setg(errp, "invalid PCI subsystem device ID provided"); - goto error; + return false; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); trace_vfio_pci_emulated_sub_device_id(vbasedev->name, vdev->sub_device_id); } + /* + * Class code is a 24-bit value at config space 0x09. Allow overriding it + * with any 24-bit value. 
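A worked example of the override described above and implemented just below (illustrative values, not from the patch): setting x-pci-class-code=0x030000 (VGA-compatible display controller) produces

    uint32_t val  = 0x030000 << 8;  /* 0x03000000: class code in the upper 24 bits */
    uint32_t mask = ~0xff;          /* emulate those bits; the low revision-ID
                                       byte still comes from the physical device */
    /* vfio_add_emulated_long(vdev, PCI_CLASS_REVISION, val, mask); */

so the guest reads the overridden class code while the hardware's own revision ID stays visible.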
+ */ + if (vdev->class_code != PCI_ANY_ID) { + if (vdev->class_code > 0xffffff) { + error_setg(errp, "invalid PCI class code provided"); + return false; + } + /* Higher 24 bits of PCI_CLASS_REVISION are class code */ + vfio_add_emulated_long(vdev, PCI_CLASS_REVISION, + vdev->class_code << 8, ~0xff); + trace_vfio_pci_emulated_class_code(vbasedev->name, vdev->class_code); + } else { + vdev->class_code = pci_get_long(pdev->config + PCI_CLASS_REVISION) >> 8; + } + /* QEMU can change multi-function devices to single function, or reverse */ vdev->emulated_config_bits[PCI_HEADER_TYPE] = PCI_HEADER_TYPE_MULTI_FUNCTION; /* Restore or clear multifunction, this is always controlled by QEMU */ - if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { - vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; + if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + pdev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; } else { - vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; + pdev->config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; } /* @@ -3105,26 +3334,137 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) * BAR, such as might be the case with the option ROM, we can get * confusing, unwritable, residual addresses from the host here. */ - memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); - memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); + memset(&pdev->config[PCI_BASE_ADDRESS_0], 0, 24); + memset(&pdev->config[PCI_ROM_ADDRESS], 0, 4); vfio_pci_size_rom(vdev); vfio_bars_prepare(vdev); if (!vfio_msix_early_setup(vdev, errp)) { - goto error; + return false; } vfio_bars_register(vdev); + if (vdev->vga && vfio_is_vga(vdev)) { + vfio_pci_config_register_vga(vdev); + } + + return true; +} + +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = PCI_DEVICE(vdev); + + /* QEMU emulates all of MSI & MSIX */ + if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { + memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, + MSIX_CAP_LENGTH); + } + + if (pdev->cap_present & QEMU_PCI_CAP_MSI) { + memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, + vdev->msi_cap_size); + } + + if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + vfio_intx_mmap_enable, vdev); + pci_device_set_intx_routing_notifier(pdev, + vfio_intx_routing_notifier); + vdev->irqchip_change_notifier.notify = vfio_irqchip_change; + kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); + + /* + * During CPR, do not call vfio_intx_enable at this time. Instead, + * call it from vfio_pci_post_load after the intx routing data has + * been loaded from vmstate. 
+ */ + if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) { + timer_free(vdev->intx.mmap_timer); + pci_device_set_intx_routing_notifier(pdev, NULL); + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + return false; + } + } + return true; +} + +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) +{ + ERRP_GUARD(); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; + int i; + char uuid[UUID_STR_LEN]; + g_autofree char *name = NULL; + + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { + error_setg(errp, "No provided host device"); + error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif + "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); + return; + } + vbasedev->sysfsdev = + g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", + vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); + } + + if (!vfio_device_get_name(vbasedev, errp)) { + return; + } + + /* + * Mediated devices *might* operate compatibly with discarding of RAM, but + * we cannot know for certain, it depends on whether the mdev vendor driver + * stays in sync with the active working set of the guest driver. Prevent + * the x-balloon-allowed option unless this is minimally an mdev device. + */ + vbasedev->mdev = vfio_device_is_mdev(vbasedev); + + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); + + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { + error_setg(errp, "x-balloon-allowed only potentially compatible " + "with mdev devices"); + goto error; + } + + if (!qemu_uuid_is_null(&vdev->vf_token)) { + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { + name = g_strdup(vbasedev->name); + } + + if (!vfio_device_attach(name, vbasedev, + pci_device_iommu_address_space(pdev), errp)) { + goto error; + } + + if (!vfio_pci_populate_device(vdev, errp)) { + goto error; + } + + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; + } + if (!vbasedev->mdev && !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { error_prepend(errp, "Failed to set vIOMMU: "); goto out_teardown; } - if (!vfio_add_capabilities(vdev, errp)) { + if (!vfio_pci_add_capabilities(vdev, errp)) { goto out_unset_idev; } @@ -3140,27 +3480,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } - /* QEMU emulates all of MSI & MSIX */ - if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { - memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, - MSIX_CAP_LENGTH); - } - - if (pdev->cap_present & QEMU_PCI_CAP_MSI) { - memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, - vdev->msi_cap_size); - } - - if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { - vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, - vfio_intx_mmap_enable, vdev); - pci_device_set_intx_routing_notifier(&vdev->pdev, - vfio_intx_routing_notifier); - vdev->irqchip_change_notifier.notify = vfio_irqchip_change; - kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - if (!vfio_intx_enable(vdev, errp)) { - goto out_deregister; - } + if (!vfio_pci_interrupt_setup(vdev, errp)) { + goto out_unset_idev; } if (vdev->display != ON_OFF_AUTO_OFF) { @@ -3203,9 +3524,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - 
vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); + vfio_cpr_pci_register_device(vdev); return; @@ -3213,7 +3535,7 @@ out_deregister: if (vdev->interrupt == VFIO_INT_INTx) { vfio_intx_disable(vdev); } - pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + pci_device_set_intx_routing_notifier(pdev, NULL); if (vdev->irqchip_change_notifier.notify) { kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); } @@ -3225,38 +3547,27 @@ out_unset_idev: pci_device_unset_iommu_device(pdev); } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } -static void vfio_instance_finalize(Object *obj) +static void vfio_pci_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); - pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + pci_device_set_intx_routing_notifier(pdev, NULL); if (vdev->irqchip_change_notifier.notify) { kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); } @@ -3264,9 +3575,9 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); if (!vbasedev->mdev) { pci_device_unset_iommu_device(pdev); @@ -3275,7 +3586,12 @@ static void vfio_exitfn(PCIDevice *pdev) static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI(dev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(dev); + + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } trace_vfio_pci_reset(vdev->vbasedev.name); @@ -3312,10 +3628,10 @@ post_reset: vfio_pci_post_reset(vdev); } -static void vfio_instance_init(Object *obj) +static void vfio_pci_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3334,11 +3650,43 @@ static void vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. 
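The cpr_is_incoming() guards introduced across these hunks all protect the same invariant during checkpoint/restart: a restored device already has live kernel state, so QEMU must neither reset it nor reprogram its IRQ signaling before the vmstate load completes. The recurring shape (pattern only, using names from the hunks above):

    if (cpr_is_incoming()) {
        /* device state survives from the previous QEMU; skip the side effect */
        return;
    }
    /* cold-plug path: reset / vfio_device_irq_set_signaling() as usual */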
+ */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; +} + +static void vfio_pci_device_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO PCI base device"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; } +static const TypeInfo vfio_pci_device_info = { + .name = TYPE_VFIO_PCI_DEVICE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VFIOPCIDevice), + .abstract = true, + .class_init = vfio_pci_device_class_init, + .interfaces = (const InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, +}; + static PropertyInfo vfio_pci_migration_multifd_transfer_prop; -static const Property vfio_pci_dev_properties[] = { +static const Property vfio_pci_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), @@ -3359,7 +3707,7 @@ static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true), DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false), DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice, @@ -3370,6 +3718,11 @@ static const Property vfio_pci_dev_properties[] = { vbasedev.migration_multifd_transfer, vfio_pci_migration_multifd_transfer_prop, OnOffAuto, .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), + DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice, + vbasedev.migration_load_config_after_iter, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice, + vbasedev.migration_max_queued_buffers_size, UINT64_MAX), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3390,6 +3743,8 @@ static const Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-class-code", VFIOPCIDevice, + class_code, PCI_ANY_ID), DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice, nv_gpudirect_clique, @@ -3406,26 +3761,24 @@ static const Property vfio_pci_dev_properties[] = { #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); + vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif -static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); device_class_set_legacy_reset(dc, vfio_pci_reset); - device_class_set_props(dc, vfio_pci_dev_properties); + device_class_set_props(dc, vfio_pci_properties); #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = 
"VFIO-based PCI device assignment"; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); - pdc->realize = vfio_realize; - pdc->exit = vfio_exitfn; - pdc->config_read = vfio_pci_read_config; - pdc->config_write = vfio_pci_write_config; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", @@ -3546,33 +3899,44 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) "x-migration-multifd-transfer", "Transfer this device state via " "multifd channels when live migrating it"); -} - -static const TypeInfo vfio_pci_dev_info = { + object_class_property_set_description(klass, /* 10.1 */ + "x-migration-load-config-after-iter", + "Start the config load only after " + "all iterables were loaded (during " + "non-iterables loading phase) when " + "doing live migration of device state " + "via multifd channels"); + object_class_property_set_description(klass, /* 10.1 */ + "x-migration-max-queued-buffers-size", + "Maximum size of in-flight VFIO " + "device state buffers queued at the " + "destination when doing live " + "migration of device state via " + "multifd channels"); +} + +static const TypeInfo vfio_pci_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(VFIOPCIDevice), - .class_init = vfio_pci_dev_class_init, - .instance_init = vfio_instance_init, - .instance_finalize = vfio_instance_finalize, - .interfaces = (InterfaceInfo[]) { - { INTERFACE_PCIE_DEVICE }, - { INTERFACE_CONVENTIONAL_PCI_DEVICE }, - { } - }, + .parent = TYPE_VFIO_PCI_DEVICE, + .class_init = vfio_pci_class_init, + .instance_init = vfio_pci_init, + .instance_finalize = vfio_pci_finalize, }; -static const Property vfio_pci_dev_nohotplug_properties[] = { +static const Property vfio_pci_nohotplug_properties[] = { DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_BOOL("use-legacy-x86-rom", VFIOPCIDevice, + use_legacy_x86_rom, false), DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate, ON_OFF_AUTO_AUTO), }; -static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) +static void vfio_pci_nohotplug_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); + device_class_set_props(dc, vfio_pci_nohotplug_properties); dc->hotpluggable = false; object_class_property_set_description(klass, /* 3.1 */ @@ -3583,13 +3947,16 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) "x-ramfb-migrate", "Override default migration support for ramfb support " "(DEBUG)"); + object_class_property_set_description(klass, /* 10.1 */ + "use-legacy-x86-rom", + "Controls loading of a legacy VGA BIOS ROM"); } -static const TypeInfo vfio_pci_nohotplug_dev_info = { +static const TypeInfo vfio_pci_nohotplug_info = { .name = TYPE_VFIO_PCI_NOHOTPLUG, .parent = TYPE_VFIO_PCI, .instance_size = sizeof(VFIOPCIDevice), - .class_init = vfio_pci_nohotplug_dev_class_init, + .class_init = vfio_pci_nohotplug_class_init, }; static void register_vfio_pci_dev_type(void) @@ -3605,8 +3972,9 @@ static void register_vfio_pci_dev_type(void) vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; - type_register_static(&vfio_pci_dev_info); - type_register_static(&vfio_pci_nohotplug_dev_info); + type_register_static(&vfio_pci_device_info); + type_register_static(&vfio_pci_info); + type_register_static(&vfio_pci_nohotplug_info); } 
type_init(register_vfio_pci_dev_type) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index d94ecab..0f78cf9 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -12,14 +12,17 @@ #ifndef HW_VFIO_VFIO_PCI_H #define HW_VFIO_VFIO_PCI_H -#include "exec/memory.h" +#include "system/memory.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/types.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-region.h" #include "qemu/event_notifier.h" #include "qemu/queue.h" #include "qemu/timer.h" #include "qom/object.h" #include "system/kvm.h" +#include "vfio-display.h" #define PCI_ANY_ID (~0) @@ -114,13 +117,14 @@ typedef struct VFIOMSIXInfo { uint32_t pba_offset; unsigned long *pending; bool noresize; + MemoryRegion *pba_region; } VFIOMSIXInfo; -#define TYPE_VFIO_PCI "vfio-pci" -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_DEVICE) struct VFIOPCIDevice { - PCIDevice pdev; + PCIDevice parent_obj; + VFIODevice vbasedev; VFIOINTx intx; unsigned int config_size; @@ -146,6 +150,7 @@ struct VFIOPCIDevice { uint32_t device_id; uint32_t sub_vendor_id; uint32_t sub_device_id; + uint32_t class_code; uint32_t features; #define VFIO_FEATURE_ENABLE_VGA_BIT 0 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) @@ -177,12 +182,14 @@ struct VFIOPCIDevice { bool no_kvm_ioeventfd; bool no_vfio_ioeventfd; bool enable_ramfb; + bool use_legacy_x86_rom; OnOffAuto ramfb_migrate; bool defer_kvm_irq_routing; bool clear_parent_atomics_on_exit; bool skip_vsc_check; VFIODisplay *dpy; Notifier irqchip_change_notifier; + VFIOPCICPR cpr; }; /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ @@ -194,12 +201,25 @@ static inline bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t de static inline bool vfio_is_vga(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; - uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE); + return (vdev->class_code >> 8) == PCI_CLASS_DISPLAY_VGA; +} - return class == PCI_CLASS_DISPLAY_VGA; +static inline bool vfio_is_base_display(VFIOPCIDevice *vdev) +{ + return (vdev->class_code >> 16) == PCI_BASE_CLASS_DISPLAY; } +/* MSI/MSI-X/INTx */ +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr); +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix); +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable); +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev); +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable); + uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); @@ -207,6 +227,19 @@ void vfio_pci_write_config(PCIDevice *pdev, uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); +/** + * vfio_pci_from_vfio_device: Transform from VFIODevice to + * VFIOPCIDevice + * + * This function checks if the given @vbasedev is a VFIO PCI device. + * If it is, it returns the containing VFIOPCIDevice. + * + * @vbasedev: The VFIODevice to transform + * + * Return: The VFIOPCIDevice on success, NULL on failure. 
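A sketch of the intended call pattern (the caller is hypothetical; the helper and vfio_sub_page_bar_update_mappings() are the ones declared in this hunk):

    static void handle_generic_vfio_event(VFIODevice *vbasedev)
    {
        VFIOPCIDevice *vdev = vfio_pci_from_vfio_device(vbasedev);

        if (!vdev) {
            return;   /* not a PCI device (e.g. vfio-ap, vfio-ccw) */
        }
        vfio_sub_page_bar_update_mappings(vdev);
    }

This replaces open-coded container_of() casts with one guarded by the vbasedev->type tag.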
+ */ +VFIOPCIDevice *vfio_pci_from_vfio_device(VFIODevice *vbasedev); +void vfio_sub_page_bar_update_mappings(VFIOPCIDevice *vdev); bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev); bool vfio_config_quirk_setup(VFIOPCIDevice *vdev, Error **errp); void vfio_vga_quirk_setup(VFIOPCIDevice *vdev); @@ -238,4 +271,16 @@ void vfio_display_finalize(VFIOPCIDevice *vdev); extern const VMStateDescription vfio_display_vmstate; +void vfio_pci_bars_exit(VFIOPCIDevice *vdev); +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_config_register_vga(VFIOPCIDevice *vdev); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_eoi(VFIODevice *vbasedev); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev); +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c deleted file mode 100644 index 67bc574..0000000 --- a/hw/vfio/platform.c +++ /dev/null @@ -1,716 +0,0 @@ -/* - * vfio based device assignment support - platform devices - * - * Copyright Linaro Limited, 2014 - * - * Authors: - * Kim Phillips <kim.phillips@linaro.org> - * Eric Auger <eric.auger@linaro.org> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Based on vfio based PCI device assignment support: - * Copyright Red Hat, Inc. 2012 - */ - -#include "qemu/osdep.h" -#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ -#include "qapi/error.h" -#include <sys/ioctl.h> -#include <linux/vfio.h> - -#include "hw/vfio/vfio-platform.h" -#include "system/iommufd.h" -#include "migration/vmstate.h" -#include "qemu/error-report.h" -#include "qemu/lockable.h" -#include "qemu/main-loop.h" -#include "qemu/module.h" -#include "qemu/range.h" -#include "exec/memory.h" -#include "exec/address-spaces.h" -#include "qemu/queue.h" -#include "hw/sysbus.h" -#include "trace.h" -#include "hw/irq.h" -#include "hw/platform-bus.h" -#include "hw/qdev-properties.h" -#include "system/kvm.h" - -/* - * Functions used whatever the injection method - */ - -static inline bool vfio_irq_is_automasked(VFIOINTp *intp) -{ - return intp->flags & VFIO_IRQ_INFO_AUTOMASKED; -} - -/** - * vfio_init_intp - allocate, initialize the IRQ struct pointer - * and add it into the list of IRQs - * @vbasedev: the VFIO device handle - * @info: irq info struct retrieved from VFIO driver - * @errp: error object - */ -static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, - struct vfio_irq_info info, Error **errp) -{ - int ret; - VFIOPlatformDevice *vdev = - container_of(vbasedev, VFIOPlatformDevice, vbasedev); - SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev); - VFIOINTp *intp; - - intp = g_malloc0(sizeof(*intp)); - intp->vdev = vdev; - intp->pin = info.index; - intp->flags = info.flags; - intp->state = VFIO_IRQ_INACTIVE; - intp->kvm_accel = false; - - sysbus_init_irq(sbdev, &intp->qemuirq); - - /* Get an eventfd for trigger */ - intp->interrupt = g_new0(EventNotifier, 1); - ret = event_notifier_init(intp->interrupt, 0); - if (ret) { - g_free(intp->interrupt); - g_free(intp); - error_setg_errno(errp, -ret, - "failed to initialize trigger eventfd notifier"); - return NULL; - } - if (vfio_irq_is_automasked(intp)) { - /* Get an eventfd for resample/unmask */ - 
intp->unmask = g_new0(EventNotifier, 1); - ret = event_notifier_init(intp->unmask, 0); - if (ret) { - g_free(intp->interrupt); - g_free(intp->unmask); - g_free(intp); - error_setg_errno(errp, -ret, - "failed to initialize resample eventfd notifier"); - return NULL; - } - } - - QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); - return intp; -} - -/** - * vfio_set_trigger_eventfd - set VFIO eventfd handling - * - * @intp: IRQ struct handle - * @handler: handler to be called on eventfd signaling - * - * Setup VFIO signaling and attach an optional user-side handler - * to the eventfd - */ -static int vfio_set_trigger_eventfd(VFIOINTp *intp, - eventfd_user_side_handler_t handler) -{ - VFIODevice *vbasedev = &intp->vdev->vbasedev; - int32_t fd = event_notifier_get_fd(intp->interrupt); - Error *err = NULL; - - qemu_set_fd_handler(fd, (IOHandler *)handler, NULL, intp); - - if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); - qemu_set_fd_handler(fd, NULL, NULL, NULL); - return -EINVAL; - } - - return 0; -} - -/* - * Functions only used when eventfds are handled on user-side - * ie. without irqfd - */ - -/** - * vfio_mmap_set_enabled - enable/disable the fast path mode - * @vdev: the VFIO platform device - * @enabled: the target mmap state - * - * enabled = true ~ fast path = MMIO region is mmaped (no KVM TRAP); - * enabled = false ~ slow path = MMIO region is trapped and region callbacks - * are called; slow path enables to trap the device IRQ status register reset -*/ - -static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled) -{ - int i; - - for (i = 0; i < vdev->vbasedev.num_regions; i++) { - vfio_region_mmaps_set_enabled(vdev->regions[i], enabled); - } -} - -/** - * vfio_intp_mmap_enable - timer function, restores the fast path - * if there is no more active IRQ - * @opaque: actually points to the VFIO platform device - * - * Called on mmap timer timeout, this function checks whether the - * IRQ is still active and if not, restores the fast path. - * by construction a single eventfd is handled at a time. - * if the IRQ is still active, the timer is re-programmed. - */ -static void vfio_intp_mmap_enable(void *opaque) -{ - VFIOINTp *tmp; - VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque; - - QEMU_LOCK_GUARD(&vdev->intp_mutex); - QLIST_FOREACH(tmp, &vdev->intp_list, next) { - if (tmp->state == VFIO_IRQ_ACTIVE) { - trace_vfio_platform_intp_mmap_enable(tmp->pin); - /* re-program the timer to check active status later */ - timer_mod(vdev->mmap_timer, - qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + - vdev->mmap_timeout); - return; - } - } - vfio_mmap_set_enabled(vdev, true); -} - -/** - * vfio_intp_inject_pending_lockheld - Injects a pending IRQ - * @opaque: opaque pointer, in practice the VFIOINTp handle - * - * The function is called on a previous IRQ completion, from - * vfio_platform_eoi, while the intp_mutex is locked. - * Also in such situation, the slow path already is set and - * the mmap timer was already programmed. 
- */ -static void vfio_intp_inject_pending_lockheld(VFIOINTp *intp) -{ - trace_vfio_platform_intp_inject_pending_lockheld(intp->pin, - event_notifier_get_fd(intp->interrupt)); - - intp->state = VFIO_IRQ_ACTIVE; - - /* trigger the virtual IRQ */ - qemu_set_irq(intp->qemuirq, 1); -} - -/** - * vfio_intp_interrupt - The user-side eventfd handler - * @opaque: opaque pointer which in practice is the VFIOINTp handle - * - * the function is entered in event handler context: - * the vIRQ is injected into the guest if there is no other active - * or pending IRQ. - */ -static void vfio_intp_interrupt(VFIOINTp *intp) -{ - int ret; - VFIOINTp *tmp; - VFIOPlatformDevice *vdev = intp->vdev; - bool delay_handling = false; - - QEMU_LOCK_GUARD(&vdev->intp_mutex); - if (intp->state == VFIO_IRQ_INACTIVE) { - QLIST_FOREACH(tmp, &vdev->intp_list, next) { - if (tmp->state == VFIO_IRQ_ACTIVE || - tmp->state == VFIO_IRQ_PENDING) { - delay_handling = true; - break; - } - } - } - if (delay_handling) { - /* - * the new IRQ gets a pending status and is pushed in - * the pending queue - */ - intp->state = VFIO_IRQ_PENDING; - trace_vfio_intp_interrupt_set_pending(intp->pin); - QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue, - intp, pqnext); - event_notifier_test_and_clear(intp->interrupt); - return; - } - - trace_vfio_platform_intp_interrupt(intp->pin, - event_notifier_get_fd(intp->interrupt)); - - ret = event_notifier_test_and_clear(intp->interrupt); - if (!ret) { - error_report("Error when clearing fd=%d (ret = %d)", - event_notifier_get_fd(intp->interrupt), ret); - } - - intp->state = VFIO_IRQ_ACTIVE; - - /* sets slow path */ - vfio_mmap_set_enabled(vdev, false); - - /* trigger the virtual IRQ */ - qemu_set_irq(intp->qemuirq, 1); - - /* - * Schedule the mmap timer which will restore fastpath when no IRQ - * is active anymore - */ - if (vdev->mmap_timeout) { - timer_mod(vdev->mmap_timer, - qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + - vdev->mmap_timeout); - } -} - -/** - * vfio_platform_eoi - IRQ completion routine - * @vbasedev: the VFIO device handle - * - * De-asserts the active virtual IRQ and unmasks the physical IRQ - * (effective for level sensitive IRQ auto-masked by the VFIO driver). - * Then it handles next pending IRQ if any. - * eoi function is called on the first access to any MMIO region - * after an IRQ was triggered, trapped since slow path was set. - * It is assumed this access corresponds to the IRQ status - * register reset. With such a mechanism, a single IRQ can be - * handled at a time since there is no way to know which IRQ - * was completed by the guest (we would need additional details - * about the IRQ status register mask). 
- */ -static void vfio_platform_eoi(VFIODevice *vbasedev) -{ - VFIOINTp *intp; - VFIOPlatformDevice *vdev = - container_of(vbasedev, VFIOPlatformDevice, vbasedev); - - QEMU_LOCK_GUARD(&vdev->intp_mutex); - QLIST_FOREACH(intp, &vdev->intp_list, next) { - if (intp->state == VFIO_IRQ_ACTIVE) { - trace_vfio_platform_eoi(intp->pin, - event_notifier_get_fd(intp->interrupt)); - intp->state = VFIO_IRQ_INACTIVE; - - /* deassert the virtual IRQ */ - qemu_set_irq(intp->qemuirq, 0); - - if (vfio_irq_is_automasked(intp)) { - /* unmasks the physical level-sensitive IRQ */ - vfio_unmask_single_irqindex(vbasedev, intp->pin); - } - - /* a single IRQ can be active at a time */ - break; - } - } - /* in case there are pending IRQs, handle the first one */ - if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) { - intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue); - vfio_intp_inject_pending_lockheld(intp); - QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); - } -} - -/** - * vfio_start_eventfd_injection - starts the virtual IRQ injection using - * user-side handled eventfds - * @sbdev: the sysbus device handle - * @irq: the qemu irq handle - */ - -static void vfio_start_eventfd_injection(SysBusDevice *sbdev, qemu_irq irq) -{ - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); - VFIOINTp *intp; - - QLIST_FOREACH(intp, &vdev->intp_list, next) { - if (intp->qemuirq == irq) { - break; - } - } - assert(intp); - - if (vfio_set_trigger_eventfd(intp, vfio_intp_interrupt)) { - abort(); - } -} - -/* - * Functions used for irqfd - */ - -/** - * vfio_set_resample_eventfd - sets the resamplefd for an IRQ - * @intp: the IRQ struct handle - * programs the VFIO driver to unmask this IRQ when the - * intp->unmask eventfd is triggered - */ -static int vfio_set_resample_eventfd(VFIOINTp *intp) -{ - int32_t fd = event_notifier_get_fd(intp->unmask); - VFIODevice *vbasedev = &intp->vdev->vbasedev; - Error *err = NULL; - - qemu_set_fd_handler(fd, NULL, NULL, NULL); - if (!vfio_set_irq_signaling(vbasedev, intp->pin, 0, - VFIO_IRQ_SET_ACTION_UNMASK, fd, &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); - return -EINVAL; - } - return 0; -} - -/** - * vfio_start_irqfd_injection - starts the virtual IRQ injection using - * irqfd - * - * @sbdev: the sysbus device handle - * @irq: the qemu irq handle - * - * In case the irqfd setup fails, we fallback to userspace handled eventfd - */ -static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) -{ - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); - VFIOINTp *intp; - - if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() || - !vdev->irqfd_allowed) { - goto fail_irqfd; - } - - QLIST_FOREACH(intp, &vdev->intp_list, next) { - if (intp->qemuirq == irq) { - break; - } - } - assert(intp); - - if (kvm_irqchip_add_irqfd_notifier(kvm_state, intp->interrupt, - intp->unmask, irq) < 0) { - goto fail_irqfd; - } - - if (vfio_set_trigger_eventfd(intp, NULL) < 0) { - goto fail_vfio; - } - if (vfio_irq_is_automasked(intp)) { - if (vfio_set_resample_eventfd(intp) < 0) { - goto fail_vfio; - } - trace_vfio_platform_start_level_irqfd_injection(intp->pin, - event_notifier_get_fd(intp->interrupt), - event_notifier_get_fd(intp->unmask)); - } else { - trace_vfio_platform_start_edge_irqfd_injection(intp->pin, - event_notifier_get_fd(intp->interrupt)); - } - - intp->kvm_accel = true; - - return; -fail_vfio: - kvm_irqchip_remove_irqfd_notifier(kvm_state, intp->interrupt, irq); - abort(); -fail_irqfd: - vfio_start_eventfd_injection(sbdev, irq); - return; -} - -/* VFIO 
skeleton */ - -static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) -{ - vbasedev->needs_reset = true; -} - -/* not implemented yet */ -static int vfio_platform_hot_reset_multi(VFIODevice *vbasedev) -{ - return -1; -} - -/** - * vfio_populate_device - Allocate and populate MMIO region - * and IRQ structs according to driver returned information - * @vbasedev: the VFIO device handle - * @errp: error object - * - */ -static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp) -{ - VFIOINTp *intp, *tmp; - int i, ret = -1; - VFIOPlatformDevice *vdev = - container_of(vbasedev, VFIOPlatformDevice, vbasedev); - - if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PLATFORM)) { - error_setg(errp, "this isn't a platform device"); - return false; - } - - vdev->regions = g_new0(VFIORegion *, vbasedev->num_regions); - - for (i = 0; i < vbasedev->num_regions; i++) { - char *name = g_strdup_printf("VFIO %s region %d\n", vbasedev->name, i); - - vdev->regions[i] = g_new0(VFIORegion, 1); - ret = vfio_region_setup(OBJECT(vdev), vbasedev, - vdev->regions[i], i, name); - g_free(name); - if (ret) { - error_setg_errno(errp, -ret, "failed to get region %d info", i); - goto reg_error; - } - } - - vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, - vfio_intp_mmap_enable, vdev); - - QSIMPLEQ_INIT(&vdev->pending_intp_queue); - - for (i = 0; i < vbasedev->num_irqs; i++) { - struct vfio_irq_info irq = { .argsz = sizeof(irq) }; - - irq.index = i; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); - if (ret) { - error_setg_errno(errp, -ret, "failed to get device irq info"); - goto irq_err; - } else { - trace_vfio_platform_populate_interrupts(irq.index, - irq.count, - irq.flags); - intp = vfio_init_intp(vbasedev, irq, errp); - if (!intp) { - goto irq_err; - } - } - } - return true; -irq_err: - timer_del(vdev->mmap_timer); - QLIST_FOREACH_SAFE(intp, &vdev->intp_list, next, tmp) { - QLIST_REMOVE(intp, next); - g_free(intp); - } -reg_error: - for (i = 0; i < vbasedev->num_regions; i++) { - if (vdev->regions[i]) { - vfio_region_finalize(vdev->regions[i]); - } - g_free(vdev->regions[i]); - } - g_free(vdev->regions); - return false; -} - -/* specialized functions for VFIO Platform devices */ -static VFIODeviceOps vfio_platform_ops = { - .vfio_compute_needs_reset = vfio_platform_compute_needs_reset, - .vfio_hot_reset_multi = vfio_platform_hot_reset_multi, - .vfio_eoi = vfio_platform_eoi, -}; - -/** - * vfio_base_device_init - perform preliminary VFIO setup - * @vbasedev: the VFIO device handle - * @errp: error object - * - * Implement the VFIO command sequence that allows to discover - * assigned device resources: group extraction, device - * fd retrieval, resource query. 
- * Precondition: the device name must be initialized - */ -static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp) -{ - /* @fd takes precedence over @sysfsdev which takes precedence over @host */ - if (vbasedev->fd < 0 && vbasedev->sysfsdev) { - g_free(vbasedev->name); - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } else if (vbasedev->fd < 0) { - if (!vbasedev->name || strchr(vbasedev->name, '/')) { - error_setg(errp, "wrong host device name"); - return false; - } - - vbasedev->sysfsdev = g_strdup_printf("/sys/bus/platform/devices/%s", - vbasedev->name); - } - - if (!vfio_device_get_name(vbasedev, errp)) { - return false; - } - - if (!vfio_attach_device(vbasedev->name, vbasedev, - &address_space_memory, errp)) { - return false; - } - - if (vfio_populate_device(vbasedev, errp)) { - return true; - } - - vfio_detach_device(vbasedev); - return false; -} - -/** - * vfio_platform_realize - the device realize function - * @dev: device state pointer - * @errp: error - * - * initialize the device, its memory regions and IRQ structures - * IRQ are started separately - */ -static void vfio_platform_realize(DeviceState *dev, Error **errp) -{ - ERRP_GUARD(); - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); - SysBusDevice *sbdev = SYS_BUS_DEVICE(dev); - VFIODevice *vbasedev = &vdev->vbasedev; - int i; - - warn_report("-device vfio-platform is deprecated"); - qemu_mutex_init(&vdev->intp_mutex); - - trace_vfio_platform_realize(vbasedev->sysfsdev ? - vbasedev->sysfsdev : vbasedev->name, - vdev->compat); - - if (!vfio_base_device_init(vbasedev, errp)) { - goto init_err; - } - - if (!vdev->compat) { - GError *gerr = NULL; - gchar *contents; - gsize length; - char *path; - - path = g_strdup_printf("%s/of_node/compatible", vbasedev->sysfsdev); - if (!g_file_get_contents(path, &contents, &length, &gerr)) { - error_setg(errp, "%s", gerr->message); - g_error_free(gerr); - g_free(path); - return; - } - g_free(path); - vdev->compat = contents; - for (vdev->num_compat = 0; length; vdev->num_compat++) { - size_t skip = strlen(contents) + 1; - contents += skip; - length -= skip; - } - } - - for (i = 0; i < vbasedev->num_regions; i++) { - if (vfio_region_mmap(vdev->regions[i])) { - warn_report("%s mmap unsupported, performance may be slow", - memory_region_name(vdev->regions[i]->mem)); - } - sysbus_init_mmio(sbdev, vdev->regions[i]->mem); - } - return; - -init_err: - if (vdev->vbasedev.name) { - error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } else { - error_prepend(errp, "vfio error: "); - } -} - -static const VMStateDescription vfio_platform_vmstate = { - .name = "vfio-platform", - .unmigratable = 1, -}; - -static const Property vfio_platform_dev_properties[] = { - DEFINE_PROP_STRING("host", VFIOPlatformDevice, vbasedev.name), - DEFINE_PROP_STRING("sysfsdev", VFIOPlatformDevice, vbasedev.sysfsdev), - DEFINE_PROP_BOOL("x-no-mmap", VFIOPlatformDevice, vbasedev.no_mmap, false), - DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, - mmap_timeout, 1100), - DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), -#ifdef CONFIG_IOMMUFD - DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, - TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), -#endif -}; - -static void vfio_platform_instance_init(Object *obj) -{ - VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); - VFIODevice *vbasedev = &vdev->vbasedev; - - vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, - DEVICE(vdev), false); -} - -#ifdef CONFIG_IOMMUFD -static void 
vfio_platform_set_fd(Object *obj, const char *str, Error **errp) -{ - vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); -} -#endif - -static void vfio_platform_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass); - - dc->realize = vfio_platform_realize; - device_class_set_props(dc, vfio_platform_dev_properties); -#ifdef CONFIG_IOMMUFD - object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); -#endif - dc->vmsd = &vfio_platform_vmstate; - dc->desc = "VFIO-based platform device assignment"; - sbc->connect_irq_notifier = vfio_start_irqfd_injection; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); - - object_class_property_set_description(klass, /* 2.4 */ - "host", - "Host device name of assigned device"); - object_class_property_set_description(klass, /* 2.4 and 2.5 */ - "x-no-mmap", - "Disable MMAP for device. Allows to trace MMIO " - "accesses (DEBUG)"); - object_class_property_set_description(klass, /* 2.4 */ - "mmap-timeout-ms", - "When EOI is not provided by KVM/QEMU, wait time " - "(milliseconds) to re-enable device direct access " - "after level interrupt (DEBUG)"); - object_class_property_set_description(klass, /* 2.4 */ - "x-irqfd", - "Allow disabling irqfd support (DEBUG)"); - object_class_property_set_description(klass, /* 2.6 */ - "sysfsdev", - "Host sysfs path of assigned device"); -#ifdef CONFIG_IOMMUFD - object_class_property_set_description(klass, /* 9.0 */ - "iommufd", - "Set host IOMMUFD backend device"); -#endif -} - -static const TypeInfo vfio_platform_dev_info = { - .name = TYPE_VFIO_PLATFORM, - .parent = TYPE_DYNAMIC_SYS_BUS_DEVICE, - .instance_size = sizeof(VFIOPlatformDevice), - .instance_init = vfio_platform_instance_init, - .class_init = vfio_platform_class_init, - .class_size = sizeof(VFIOPlatformDeviceClass), -}; - -static void register_vfio_platform_dev_type(void) -{ - type_register_static(&vfio_platform_dev_info); -} - -type_init(register_vfio_platform_dev_type) diff --git a/hw/vfio/region.c b/hw/vfio/region.c new file mode 100644 index 0000000..b165ab0 --- /dev/null +++ b/hw/vfio/region.c @@ -0,0 +1,400 @@ +/* + * VFIO regions + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. 
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> + +#include "hw/vfio/vfio-region.h" +#include "hw/vfio/vfio-device.h" +#include "hw/hw.h" +#include "trace.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/units.h" +#include "monitor/monitor.h" +#include "vfio-helpers.h" + +/* + * IO Port/MMIO - Beware of the endians, VFIO is always little endian + */ +void vfio_region_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIORegion *region = opaque; + VFIODevice *vbasedev = region->vbasedev; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + int ret; + + switch (size) { + case 1: + buf.byte = data; + break; + case 2: + buf.word = cpu_to_le16(data); + break; + case 4: + buf.dword = cpu_to_le32(data); + break; + case 8: + buf.qword = cpu_to_le64(data); + break; + default: + hw_error("vfio: unsupported write size, %u bytes", size); + break; + } + + ret = vbasedev->io_ops->region_write(vbasedev, region->nr, + addr, size, &buf, region->post_wr); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 + ",%d) failed: %s", + __func__, vbasedev->name, region->nr, + addr, data, size, strwriteerror(ret)); + } + + trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); + + /* + * A read or write to a BAR always signals an INTx EOI. This will + * do nothing if not pending (including not in INTx mode). We assume + * that a BAR access is in response to an interrupt and that BAR + * accesses will service the interrupt. Unfortunately, we don't know + * which access will service the interrupt, so we're potentially + * getting quite a few host interrupts per guest interrupt. 
+ */ + vbasedev->ops->vfio_eoi(vbasedev); +} + +uint64_t vfio_region_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIORegion *region = opaque; + VFIODevice *vbasedev = region->vbasedev; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + uint64_t data = 0; + int ret; + + ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", + __func__, vbasedev->name, region->nr, + addr, size, strreaderror(ret)); + return (uint64_t)-1; + } + switch (size) { + case 1: + data = buf.byte; + break; + case 2: + data = le16_to_cpu(buf.word); + break; + case 4: + data = le32_to_cpu(buf.dword); + break; + case 8: + data = le64_to_cpu(buf.qword); + break; + default: + hw_error("vfio: unsupported read size, %u bytes", size); + break; + } + + trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); + + /* Same as write above */ + vbasedev->ops->vfio_eoi(vbasedev); + + return data; +} + +static const MemoryRegionOps vfio_region_ops = { + .read = vfio_region_read, + .write = vfio_region_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i, j; + + hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + if (!hdr) { + return -ENODEV; + } + + sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); + + trace_vfio_region_sparse_mmap_header(region->vbasedev->name, + region->nr, sparse->nr_areas); + + region->mmaps = g_new0(VFIOMmap, sparse->nr_areas); + + for (i = 0, j = 0; i < sparse->nr_areas; i++) { + if (sparse->areas[i].size) { + trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset, + sparse->areas[i].offset + + sparse->areas[i].size - 1); + region->mmaps[j].offset = sparse->areas[i].offset; + region->mmaps[j].size = sparse->areas[i].size; + j++; + } + } + + region->nr_mmaps = j; + region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap)); + + return 0; +}
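The sparse-mmap parser above hinges on vfio_get_region_info_cap(), declared in vfio-helpers.h later in this patch, which locates a capability in the chain the kernel appends to struct vfio_region_info. A minimal sketch of such a lookup, assuming the standard linux/vfio.h layout; the function name is illustrative, not the code this commit adds:

    #include <stdint.h>
    #include <linux/vfio.h>

    /* 'next' in vfio_info_cap_header is an offset from the start of the
     * info structure; an offset of 0 terminates the chain. */
    static struct vfio_info_cap_header *
    region_info_cap_find(struct vfio_region_info *info, uint16_t id)
    {
        struct vfio_info_cap_header *hdr;
        uint32_t next;

        if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
            return NULL; /* kernel stored no capability chain */
        }
        for (next = info->cap_offset; next; next = hdr->next) {
            hdr = (struct vfio_info_cap_header *)((uint8_t *)info + next);
            if (hdr->id == id) {
                return hdr;
            }
        }
        return NULL;
    }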
+ +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, + int index, const char *name) +{ + struct vfio_region_info *info = NULL; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret) { + return ret; + } + + region->vbasedev = vbasedev; + region->flags = info->flags; + region->size = info->size; + region->fd_offset = info->offset; + region->nr = index; + region->post_wr = false; + + if (region->size) { + region->mem = g_new0(MemoryRegion, 1); + memory_region_init_io(region->mem, obj, &vfio_region_ops, + region, name, region->size); + + if (!vbasedev->no_mmap && + region->flags & VFIO_REGION_INFO_FLAG_MMAP) { + + ret = vfio_setup_region_sparse_mmaps(region, info); + + if (ret) { + region->nr_mmaps = 1; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; + } + } + } + + trace_vfio_region_setup(vbasedev->name, index, name, + region->flags, region->fd_offset, region->size); + return 0; +} + +static void vfio_subregion_unmap(VFIORegion *region, int index) +{ + trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem), + region->mmaps[index].offset, + region->mmaps[index].offset + + region->mmaps[index].size - 1); + memory_region_del_subregion(region->mem, &region->mmaps[index].mem); + munmap(region->mmaps[index].mmap, region->mmaps[index].size); + object_unparent(OBJECT(&region->mmaps[index].mem)); + region->mmaps[index].mmap = NULL; +} + +int vfio_region_mmap(VFIORegion *region) +{ + int i, ret, prot = 0; + char *name; + int fd; + + if (!region->mem) { + return 0; + } + + prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; + + for (i = 0; i < region->nr_mmaps; i++) { + size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); + void *map_base, *map_align; + + /* + * Align the mmap for more efficient mapping in the kernel. Ideally + * we'd know the PMD and PUD mapping sizes to use as discrete alignment + * intervals, but we don't. As of Linux v6.12, the largest PUD size + * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set + * on x86_64). Align by power-of-two size, capped at 1GiB. + * + * NB. qemu_memalign() and friends actually allocate memory, whereas + * the region size here can exceed host memory, therefore we manually + * create an oversized anonymous mapping and clean it up for alignment. + */ + map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map_base == MAP_FAILED) { + ret = -errno; + goto no_mmap; + } + + fd = vfio_device_get_region_fd(region->vbasedev, region->nr); + + map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); + munmap(map_base, map_align - map_base); + munmap(map_align + region->mmaps[i].size, + align - (map_align - map_base)); + + region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, + MAP_SHARED | MAP_FIXED, fd, + region->fd_offset + + region->mmaps[i].offset); + if (region->mmaps[i].mmap == MAP_FAILED) { + ret = -errno; + goto no_mmap; + } + + name = g_strdup_printf("%s mmaps[%d]", + memory_region_name(region->mem), i); + memory_region_init_ram_device_ptr(&region->mmaps[i].mem, + memory_region_owner(region->mem), + name, region->mmaps[i].size, + region->mmaps[i].mmap); + g_free(name); + memory_region_add_subregion(region->mem, region->mmaps[i].offset, + &region->mmaps[i].mem); + + trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem), + region->mmaps[i].offset, + region->mmaps[i].offset + + region->mmaps[i].size - 1); + } + + return 0; + +no_mmap: + trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, + region->fd_offset + region->mmaps[i].offset, + region->fd_offset + region->mmaps[i].offset + + region->mmaps[i].size - 1, ret); + + region->mmaps[i].mmap = NULL; + + for (i--; i >= 0; i--) { + vfio_subregion_unmap(region, i); + } + + return ret; +}
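The alignment dance in vfio_region_mmap() is easier to see in isolation: reserve an oversized PROT_NONE placeholder, trim its head and tail so only an aligned hole remains, then map the real region over that hole with MAP_FIXED. A self-contained sketch, assuming a power-of-two align as in the code above; the helper name and elided error handling are illustrative:

    #include <stdint.h>
    #include <sys/mman.h>

    static void *mmap_aligned(int fd, off_t offset, size_t size, size_t align)
    {
        uint8_t *base, *aligned;

        /* Oversized anonymous reservation; PROT_NONE, so nothing is
         * committed even when 'size' exceeds host memory. */
        base = mmap(NULL, size + align, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) {
            return MAP_FAILED;
        }

        /* Round up inside the reservation, then return the unused head
         * and tail to the kernel. */
        aligned = (uint8_t *)(((uintptr_t)base + align - 1) &
                              ~(uintptr_t)(align - 1));
        munmap(base, aligned - base);
        munmap(aligned + size, align - (aligned - base));

        /* MAP_FIXED is safe here: it can only clobber the placeholder. */
        return mmap(aligned, size, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_FIXED, fd, offset);
    }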
+ +void vfio_region_unmap(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + vfio_subregion_unmap(region, i); + } + } +} + +void vfio_region_exit(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_del_subregion(region->mem, &region->mmaps[i].mem); + } + } + + trace_vfio_region_exit(region->vbasedev->name, region->nr); +} + +void vfio_region_finalize(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + munmap(region->mmaps[i].mmap, region->mmaps[i].size); + } + } + + g_free(region->mem); + g_free(region->mmaps); + + trace_vfio_region_finalize(region->vbasedev->name, region->nr); + + region->mem = NULL; + region->mmaps = NULL; + region->nr_mmaps = 0; + region->size = 0; + region->flags = 0; + region->nr = 0; +} + +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_set_enabled(&region->mmaps[i].mem, enabled); + } + } + + trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), + enabled); +} diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 1a5d161..0f23681 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -13,20 +13,29 @@ #include <linux/vfio.h> #include "system/kvm.h" #include "system/hostmem.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-container-legacy.h" #include "hw/hw.h" -#include "exec/ram_addr.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "trace.h" +#include "vfio-helpers.h" + +typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; + uint64_t iova_pgsizes; + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; +} VFIOHostDMAWindow; + +struct VFIOSpaprContainer { + VFIOLegacyContainer parent_obj; -typedef struct VFIOSpaprContainer { - VFIOContainer container; MemoryListener prereg_listener; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; -} VFIOSpaprContainer; + unsigned int levels; +}; OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR); @@ -52,8 +61,8 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, { VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, prereg_listener); - VFIOContainer *container = &scontainer->container; - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(scontainer); + VFIOContainer *bcontainer = VFIO_IOMMU(container); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -112,7 +121,7 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, { VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, prereg_listener); - VFIOContainer *container = &scontainer->container; + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(scontainer); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -209,7 +218,7 @@ static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, return hostwin_found ? 
hostwin : NULL; } -static int vfio_spapr_remove_window(VFIOContainer *container, +static int vfio_spapr_remove_window(VFIOLegacyContainer *container, hwaddr offset_within_address_space) { struct vfio_iommu_spapr_tce_remove remove = { @@ -230,15 +239,16 @@ static int vfio_spapr_remove_window(VFIOContainer *container, return 0; } -static int vfio_spapr_create_window(VFIOContainer *container, +static bool vfio_spapr_create_window(VFIOLegacyContainer *container, MemoryRegionSection *section, - hwaddr *pgsize) + hwaddr *pgsize, Error **errp) { int ret = 0; - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(bcontainer); IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; - unsigned entries, bits_total, bits_per_level, max_levels; + unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels; struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; long rampagesize = qemu_minrampagesize(); @@ -252,11 +262,11 @@ static int vfio_spapr_create_window(VFIOContainer *container, pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { - error_report("Host doesn't support page size 0x%"PRIx64 - ", the supported mask is 0x%lx", - memory_region_iommu_get_min_page_size(iommu_mr), - bcontainer->pgsizes); - return -EINVAL; + error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64 + ", the supported mask is 0x%lx", + memory_region_iommu_get_min_page_size(iommu_mr), + bcontainer->pgsizes); + return false; } /* @@ -291,28 +301,41 @@ static int vfio_spapr_create_window(VFIOContainer *container, */ bits_per_level = ctz64(qemu_real_host_page_size()) + 8; create.levels = bits_total / bits_per_level; - if (bits_total % bits_per_level) { - ++create.levels; - } - max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size()); - for ( ; create.levels <= max_levels; ++create.levels) { - ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); - if (!ret) { - break; + + ddw_levels = scontainer->levels; + if (ddw_levels > 1) { + if (bits_total % bits_per_level) { + ++create.levels; } + max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size()); + for ( ; create.levels <= max_levels; ++create.levels) { + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (!ret) { + break; + } + } + } else { /* ddw_levels == 1 */ + if (create.levels > ddw_levels) { + error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables" + ". Use larger IO page size. 
Supported mask is 0x%lx", + bcontainer->pgsizes); + return false; + } + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); } + if (ret) { - error_report("Failed to create a window, ret = %d (%m)", ret); - return -errno; + error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret); + return false; } if (create.start_addr != section->offset_within_address_space) { vfio_spapr_remove_window(container, create.start_addr); - error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, - section->offset_within_address_space, - (uint64_t)create.start_addr); - return -EINVAL; + error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx + ", must be %"PRIx64, section->offset_within_address_space, + (uint64_t)create.start_addr); + return false; } trace_vfio_spapr_create_window(create.page_shift, create.levels, @@ -320,18 +343,16 @@ static int vfio_spapr_create_window(VFIOContainer *container, create.start_addr); *pgsize = pagesize; - return 0; + return true; } static bool -vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, +vfio_spapr_container_add_section_window(VFIOContainer *bcontainer, MemoryRegionSection *section, Error **errp) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -377,9 +398,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } } - ret = vfio_spapr_create_window(container, section, &pgsize); - if (ret) { - error_setg_errno(errp, -ret, "Failed to create SPAPR window"); + ret = vfio_spapr_create_window(container, section, &pgsize, errp); + if (!ret) { return false; } @@ -417,13 +437,11 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } static void -vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, +vfio_spapr_container_del_section_window(VFIOContainer *bcontainer, MemoryRegionSection *section) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; @@ -440,12 +458,10 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, } } -static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +static void vfio_spapr_container_release(VFIOContainer *bcontainer) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { @@ -458,13 +474,11 @@ static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) } } -static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer, +static bool vfio_spapr_container_setup(VFIOContainer *bcontainer, Error **errp) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - 
bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; @@ -502,6 +516,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer, goto listener_unregister_exit; } + scontainer->levels = info.ddw.levels; + if (v2) { bcontainer->pgsizes = info.ddw.pgsizes; /* @@ -534,7 +550,7 @@ listener_unregister_exit: return false; } -static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +static void vfio_iommu_spapr_class_init(ObjectClass *klass, const void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9347e3a..1e89544 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -1,8 +1,10 @@ # See docs/devel/tracing.rst for syntax documentation. +# +# SPDX-License-Identifier: GPL-2.0-or-later # pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_pci_intx_eoi(const char *name) " (%s) EOI" vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" @@ -35,10 +37,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" -vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" -vfio_attach_device(const char *name, int group_id) " (%s) group %d" -vfio_detach_device(const char *name, int group_id) " (%s) group %d" +vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" +vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d" vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x" vfio_pci_reset(const char *name) " (%s)" @@ -48,6 +48,7 @@ vfio_pci_emulated_vendor_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_device_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_sub_vendor_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_sub_device_id(const char *name, uint16_t val) "%s 0x%04x" +vfio_pci_emulated_class_code(const char *name, uint32_t val) "%s 0x%06x" # pci-quirks.c vfio_quirk_rom_in_denylist(const char *name, uint16_t vid, uint16_t did) "%s %04x:%04x" @@ -89,9 +90,7 @@ vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB" vfio_pci_igd_host_bridge_enabled(const char *name) "%s" vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" -# common.c -vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned 
size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" -vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 +# listener.c vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" @@ -103,10 +102,20 @@ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t si vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]" -vfio_disconnect_container(int fd) "close container->fd=%d" -vfio_put_group(int fd) "close group->fd=%d" -vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" -vfio_put_base_device(int fd) "close vdev->fd=%d" +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 + +# container.c +vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t translated_addr, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64" dirty_pages=%"PRIu64 + +# container-legacy.c +vfio_container_disconnect(int fd) "close container->fd=%d" +vfio_group_put(int fd) "close group->fd=%d" +vfio_device_get(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" +vfio_device_put(int fd) "close vdev->fd=%d" + +# region.c +vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" +vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx" vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d" vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]" @@ -116,22 +125,6 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" -vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" -vfio_legacy_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t 
start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 -vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 -vfio_reset_handler(void) "" - -# platform.c -vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" -vfio_platform_eoi(int pin, int fd) "EOI IRQ pin %d (fd=%d)" -vfio_platform_intp_mmap_enable(int pin) "IRQ #%d still active, stay in slow path" -vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)" -vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)" -vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x" -vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" -vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" -vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d" # spapr.c vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "0x%"PRIx64" - 0x%"PRIx64 @@ -192,3 +185,12 @@ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" + +# cpr-iommufd.c +vfio_cpr_find_device(uint32_t ioas_id, int devid, uint32_t hwpt_id) "ioas_id %u, devid %d, hwpt_id %u" + +# device.c +vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" +vfio_device_reset_handler(void) "" +vfio_device_attach(const char *name, int group_id) " (%s) group %d" +vfio_device_detach(const char *name, int group_id) " (%s) group %d" diff --git a/hw/vfio/trace.h b/hw/vfio/trace.h index 5a343aa..b34b61d 100644 --- a/hw/vfio/trace.h +++ b/hw/vfio/trace.h @@ -1 +1,4 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + */ #include "trace/trace-hw_vfio.h" diff --git a/hw/vfio/types.h b/hw/vfio/types.h new file mode 100644 index 0000000..5482d90 --- /dev/null +++ b/hw/vfio/types.h @@ -0,0 +1,23 @@ +/* + * VFIO types definition + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#ifndef HW_VFIO_VFIO_TYPES_H +#define HW_VFIO_VFIO_TYPES_H + +/* + * TYPE_VFIO_PCI_DEVICE is an abstract type used to share code + * between VFIO implementations that use a kernel driver + * with those that use user sockets. + */ +#define TYPE_VFIO_PCI_DEVICE "vfio-pci-device" + +#define TYPE_VFIO_PCI "vfio-pci" +/* TYPE_VFIO_PCI shares struct VFIOPCIDevice. */ + +#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" + +#endif /* HW_VFIO_VFIO_TYPES_H */ diff --git a/hw/vfio/vfio-display.h b/hw/vfio/vfio-display.h new file mode 100644 index 0000000..2606c34 --- /dev/null +++ b/hw/vfio/vfio-display.h @@ -0,0 +1,42 @@ +/* + * VFIO display + * + * Copyright Red Hat, Inc. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_DISPLAY_H +#define HW_VFIO_VFIO_DISPLAY_H + +#include "ui/console.h" +#include "hw/display/ramfb.h" +#include "hw/vfio/vfio-region.h" + +typedef struct VFIODMABuf { + QemuDmaBuf *buf; + uint32_t pos_x, pos_y, pos_updates; + uint32_t hot_x, hot_y, hot_updates; + int dmabuf_id; + QTAILQ_ENTRY(VFIODMABuf) next; +} VFIODMABuf; + +typedef struct VFIODisplay { + QemuConsole *con; + RAMFBState *ramfb; + struct vfio_region_info *edid_info; + struct vfio_region_gfx_edid *edid_regs; + uint8_t *edid_blob; + QEMUTimer *edid_link_timer; + struct { + VFIORegion buffer; + DisplaySurface *surface; + } region; + struct { + QTAILQ_HEAD(, VFIODMABuf) bufs; + VFIODMABuf *primary; + VFIODMABuf *cursor; + } dmabuf; +} VFIODisplay; + +#endif /* HW_VFIO_VFIO_DISPLAY_H */ diff --git a/hw/vfio/vfio-helpers.h b/hw/vfio/vfio-helpers.h new file mode 100644 index 0000000..ce31758 --- /dev/null +++ b/hw/vfio/vfio-helpers.h @@ -0,0 +1,37 @@ +/* + * VFIO helpers + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_HELPERS_H +#define HW_VFIO_VFIO_HELPERS_H + +#ifdef CONFIG_LINUX +#include <linux/vfio.h> + +extern int vfio_kvm_device_fd; + +struct vfio_info_cap_header * +vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); +struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); +struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id); +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail); +#endif + +int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); +struct vfio_device_info *vfio_get_device_info(int fd); + +int vfio_kvm_device_add_fd(int fd, Error **errp); +int vfio_kvm_device_del_fd(int fd, Error **errp); + +bool vfio_arch_wants_loading_config_after_iter(void); + +#endif /* HW_VFIO_VFIO_HELPERS_H */ diff --git a/hw/vfio/vfio-iommufd.h b/hw/vfio/vfio-iommufd.h new file mode 100644 index 0000000..6b28e1f --- /dev/null +++ b/hw/vfio/vfio-iommufd.h @@ -0,0 +1,35 @@ +/* + * VFIO iommufd + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_IOMMUFD_H +#define HW_VFIO_VFIO_IOMMUFD_H + +#include "hw/vfio/vfio-container.h" + +typedef struct VFIODevice VFIODevice; + +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + +typedef struct IOMMUFDBackend IOMMUFDBackend; + +struct VFIOIOMMUFDContainer { + VFIOContainer parent_obj; + + IOMMUFDBackend *be; + uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; +}; + +OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD); + +#endif /* HW_VFIO_VFIO_IOMMUFD_H */ diff --git a/hw/vfio/vfio-listener.h b/hw/vfio/vfio-listener.h new file mode 100644 index 0000000..a90674c --- /dev/null +++ b/hw/vfio/vfio-listener.h @@ -0,0 +1,15 @@ +/* + * VFIO MemoryListener services + * + * Copyright Red Hat, Inc. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_LISTENER_H +#define HW_VFIO_VFIO_LISTENER_H + +bool vfio_listener_register(VFIOContainer *bcontainer, Error **errp); +void vfio_listener_unregister(VFIOContainer *bcontainer); + +#endif /* HW_VFIO_VFIO_LISTENER_H */ diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h new file mode 100644 index 0000000..814fbd9 --- /dev/null +++ b/hw/vfio/vfio-migration-internal.h @@ -0,0 +1,74 @@ +/* + * VFIO migration + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_MIGRATION_INTERNAL_H +#define HW_VFIO_VFIO_MIGRATION_INTERNAL_H + +#ifdef CONFIG_LINUX +#include <linux/vfio.h> +#endif + +#include "qemu/notify.h" + +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY (0xffffffffef100006ULL) + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOMultifd VFIOMultifd; + +typedef struct VFIOMigration { + struct VFIODevice *vbasedev; + VMChangeStateEntry *vm_state; + NotifierWithReturn migration_state; + uint32_t device_state; + int data_fd; + void *data_buffer; + size_t data_buffer_size; + uint64_t mig_flags; + uint64_t precopy_init_size; + uint64_t precopy_dirty_size; + bool multifd_transfer; + VFIOMultifd *multifd; + bool initial_data_sent; + + bool event_save_iterate_started; + bool event_precopy_empty_hit; +} VFIOMigration; + +bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); +void vfio_migration_exit(VFIODevice *vbasedev); +bool vfio_device_state_is_running(VFIODevice *vbasedev); +bool vfio_device_state_is_precopy(VFIODevice *vbasedev); +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp); +int vfio_load_device_config_state(QEMUFile *f, void *opaque); + +#ifdef CONFIG_LINUX +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp); +#endif + +void vfio_migration_add_bytes_transferred(unsigned long val); + +#endif /* HW_VFIO_VFIO_MIGRATION_INTERNAL_H */
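The delimiter scheme documented in the header above packs three fields into each 64-bit marker: an all-ones upper word, the 0xef10 magic, and 16 flag bits. A short sketch of how a stream reader could classify an incoming quadword under that layout; these helpers are illustrative, not part of this commit:

    #include <stdbool.h>
    #include <stdint.h>

    #define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL)

    /* True if 'data' carries the delimiter signature: MSB 32 bits all
     * ones followed by the 0xef10 "emulated function IO" magic. */
    static bool vfio_mig_is_delimiter(uint64_t data)
    {
        return (data & 0xffffffffffff0000ULL) == 0xffffffffef100000ULL;
    }

    /* The low 16 bits then select the specific marker. */
    static bool vfio_mig_is_end_of_state(uint64_t data)
    {
        return vfio_mig_is_delimiter(data) &&
               (uint16_t)data == (uint16_t)VFIO_MIG_FLAG_END_OF_STATE;
    }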
diff --git a/hw/vfio/vfio-region.h new file mode 100644 index 0000000..ede6e0c --- /dev/null +++ b/hw/vfio/vfio-region.h @@ -0,0 +1,48 @@ +/* + * VFIO region + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_REGION_H +#define HW_VFIO_REGION_H + +#include "system/memory.h" + +typedef struct VFIOMmap { + MemoryRegion mem; + void *mmap; + off_t offset; + size_t size; +} VFIOMmap; + +typedef struct VFIODevice VFIODevice; + +typedef struct VFIORegion { + struct VFIODevice *vbasedev; + off_t fd_offset; /* offset of region within device fd */ + MemoryRegion *mem; /* slow, read/write access */ + size_t size; + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ + uint32_t nr_mmaps; + VFIOMmap *mmaps; + uint8_t nr; /* cache the region number for debug */ + bool post_wr; /* writes can be posted */ +} VFIORegion; + + +void vfio_region_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size); +uint64_t vfio_region_read(void *opaque, + hwaddr addr, unsigned size); +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, + int index, const char *name); +int vfio_region_mmap(VFIORegion *region); +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled); +void vfio_region_unmap(VFIORegion *region); +void vfio_region_exit(VFIORegion *region); +void vfio_region_finalize(VFIORegion *region); + +#endif /* HW_VFIO_REGION_H */
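Taken together, vfio-region.h and the new region.c give a device backend a small lifecycle: vfio_region_setup() queries the kernel and creates the slow-path MemoryRegion, vfio_region_mmap() optionally layers mmap fast paths on top, vfio_region_mmaps_set_enabled() toggles them, and vfio_region_exit()/vfio_region_finalize() tear everything down. A hedged sketch of a caller, modeled on how the removed platform.c consumed this API; the wrapper below is illustrative, not code from this series:

    #include "qapi/error.h"
    #include "qemu/error-report.h"
    #include "hw/vfio/vfio-device.h"
    #include "hw/vfio/vfio-region.h"

    /* Set up region 'nr' of an already-attached device, preferring the
     * mmap fast path but falling back to vfio_region_read()/write(). */
    static bool device_init_region(Object *owner, VFIODevice *vbasedev,
                                   VFIORegion *region, int nr, Error **errp)
    {
        int ret = vfio_region_setup(owner, vbasedev, region, nr, "region");

        if (ret) {
            error_setg_errno(errp, -ret, "failed to get region %d info", nr);
            return false;
        }

        if (vfio_region_mmap(region)) {
            /* Accesses stay on the slow MemoryRegionOps path. */
            warn_report("%s mmap unsupported, performance may be slow",
                        memory_region_name(region->mem));
        }
        return true;
    }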