author     Stefan Hajnoczi <stefanha@redhat.com>  2025-03-07 07:39:21 +0800
committer  Stefan Hajnoczi <stefanha@redhat.com>  2025-03-07 07:39:21 +0800
commit     2400fad572906127e9d453b92f90806d66583dc7 (patch)
tree       c2fe81ac6cbce922a1b4e4e27c17cdbac82723f0 /hw
parent     e17f08b5fbaeb71daee175f835b1b00d841391b6 (diff)
parent     59a67e70950bcc2002d3a8d22a17743e0f70da96 (diff)
Merge tag 'pull-vfio-20250306' of https://github.com/legoater/qemu into staging
vfio queue:
* Added property documentation
* Added minor fixes
* Implemented basic PCI PM capability backing
* Promoted new IGD maintainer
* Deprecated vfio-platform
* Extended VFIO migration with multifd support
# -----BEGIN PGP SIGNATURE-----
#
# iQIyBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmfJrZoACgkQUaNDx8/7
# 7KFE2A/0Dmief9u/dDJIKGIDa+iawcf4hu8iX4v5pB0DlGniT3rgK8WMGnhDpPxq
# Q4wsKfo+JJ2q6msInrT7Ckqyydu9nQztI3vwmfMuWxLhTMyH28K96ptwPqIZBjOx
# rPTEXfnVX4W3tpn1+48S+vefWVa/gkBkIvv7RpK18rMBXv1kDeyOvc/d2dbAt7ft
# zJc4f8gH3jfQzGwmnYVZU1yPrZN7p6zhYR/AD3RQOY97swgZIEyYxXhOuTPiCuEC
# zC+2AMKi9nmnCG6x/mnk7l2yJXSlv7lJdqcjYZhJ9EOIYfiUGTREYIgQbARcafE/
# 4KSg2QR35BoUd4YrmEWxXJCRf3XnyWXDY36dDKVhC0OHng1F/U44HuL4QxwoTIay
# s1SP/DHcvDiPAewVTvdgt7Iwfn9xGhcQO2pkrxBoNLB5JYwW+R6mG7WXeDv1o3GT
# QosTu1fXZezQqFd4v6+q5iRNS2KtBZLTspwAmVdywEFUs+ZLBRlC+bodYlinZw6B
# Yl/z0LfAEh4J55QmX2espbp8MH1+mALuW2H2tgSGSrTBX1nwxZFI5veFzPepgF2S
# eTx69BMjiNMwzIjq1T7e9NpDCceiW0fXDu7IK1MzYhqg1nM9lX9AidhFTeiF2DB2
# EPb3ljy/8fyxcPKa1T9X47hQaSjbMwofaO8Snoh0q0jokY246Q==
# =hIBw
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 06 Mar 2025 22:13:46 HKT
# gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full]
# gpg: aka "Cédric Le Goater <clg@kaod.org>" [full]
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1
* tag 'pull-vfio-20250306' of https://github.com/legoater/qemu: (42 commits)
hw/core/machine: Add compat for x-migration-multifd-transfer VFIO property
vfio/migration: Make x-migration-multifd-transfer VFIO property mutable
vfio/migration: Add x-migration-multifd-transfer VFIO property
vfio/migration: Multifd device state transfer support - send side
vfio/migration: Multifd device state transfer support - config loading support
migration/qemu-file: Define g_autoptr() cleanup function for QEMUFile
vfio/migration: Multifd device state transfer support - load thread
vfio/migration: Multifd device state transfer support - received buffers queuing
vfio/migration: Setup and cleanup multifd transfer in these general methods
vfio/migration: Multifd setup/cleanup functions and associated VFIOMultifd
vfio/migration: Multifd device state transfer - add support checking function
vfio/migration: Multifd device state transfer support - basic types
vfio/migration: Move migration channel flags to vfio-common.h header file
vfio/migration: Add vfio_add_bytes_transferred()
vfio/migration: Convert bytes_transferred counter to atomic
vfio/migration: Add load_device_config_state_start trace event
migration: Add save_live_complete_precopy_thread handler
migration/multifd: Add multifd_device_state_supported()
migration/multifd: Make MultiFDSendData a struct
migration/multifd: Device state transfer support - send side
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
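The "PCI PM capability backing" item lands in hw/pci/pci.c below: the guest-writable power-state field of PMCSR is tracked, and BARs stay mapped only while the device remains in D0. A minimal, self-contained model of that gating (PCI_PM_CTRL_STATE_MASK matches Linux's pci_regs.h; pm_state_after_write() and the main() driver are illustrative, not QEMU API):

    #include <stdint.h>
    #include <stdio.h>

    #define PCI_PM_CTRL_STATE_MASK 0x0003 /* PMCSR bits [1:0]: D0..D3hot */

    /* Only the power-state field of PMCSR is guest-writable (wmask). */
    static uint8_t pm_state_after_write(uint16_t pmcsr)
    {
        return pmcsr & PCI_PM_CTRL_STATE_MASK;
    }

    int main(void)
    {
        uint16_t guest_write = 0x0003; /* guest requests D3hot */
        uint8_t state = pm_state_after_write(guest_write);

        /* pci_update_mappings() keeps BARs mapped only while in D0. */
        printf("PM state: D%u -> BARs %s\n", (unsigned)state,
               state ? "unmapped" : "mapped");
        return 0;
    }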
Diffstat (limited to 'hw')
-rw-r--r--  hw/core/machine.c                |    2
-rw-r--r--  hw/net/e1000e.c                  |    3
-rw-r--r--  hw/net/eepro100.c                |    4
-rw-r--r--  hw/net/igb.c                     |    3
-rw-r--r--  hw/nvme/ctrl.c                   |    3
-rw-r--r--  hw/pci-bridge/pcie_pci_bridge.c  |    3
-rw-r--r--  hw/pci/pci.c                     |   93
-rw-r--r--  hw/pci/trace-events              |    2
-rw-r--r--  hw/vfio/amd-xgbe.c               |    2
-rw-r--r--  hw/vfio/ap.c                     |    9
-rw-r--r--  hw/vfio/calxeda-xgmac.c          |    2
-rw-r--r--  hw/vfio/ccw.c                    |   27
-rw-r--r--  hw/vfio/meson.build              |    1
-rw-r--r--  hw/vfio/migration-multifd.c      |  679
-rw-r--r--  hw/vfio/migration-multifd.h      |   34
-rw-r--r--  hw/vfio/migration.c              |  106
-rw-r--r--  hw/vfio/pci.c                    |  180
-rw-r--r--  hw/vfio/pci.h                    |    1
-rw-r--r--  hw/vfio/platform.c               |   25
-rw-r--r--  hw/vfio/trace-events             |   13
-rw-r--r--  hw/virtio/virtio-pci.c           |   11
21 files changed, 1125 insertions, 78 deletions
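The bulk of the insertions is the new hw/vfio/migration-multifd.c, which frames VFIO device state into packets carried over multifd channels. A compact sketch of that wire framing and the receive-side checks, mirroring VFIODeviceStatePacket and vfio_multifd_load_state_buffer() in the diff below (the struct here drops the trailing flexible data[] member; packet_valid() and main() are illustrative, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    #define VFIO_DEVICE_STATE_CONFIG_STATE       (1u)
    #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0u)

    typedef struct {
        uint32_t version; /* must equal VER_CURRENT */
        uint32_t idx;     /* buffer index; UINT32_MAX is invalid */
        uint32_t flags;   /* CONFIG_STATE marks the last packet in the stream */
        /* uint8_t data[] follows on the wire */
    } VFIODeviceStatePacketHdr;

    static int packet_valid(const VFIODeviceStatePacketHdr *p, size_t total)
    {
        if (total < sizeof(*p)) {
            return 0; /* too short to contain a header */
        }
        if (p->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
            return 0; /* unknown version */
        }
        if (p->idx == UINT32_MAX) {
            return 0; /* reserved index */
        }
        return 1;
    }

    int main(void)
    {
        VFIODeviceStatePacketHdr p = {
            .version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT,
            .idx = 3,
            .flags = VFIO_DEVICE_STATE_CONFIG_STATE,
        };

        printf("valid=%d last=%d\n",
               packet_valid(&p, sizeof(p) + 16),
               !!(p.flags & VFIO_DEVICE_STATE_CONFIG_STATE));
        return 0;
    }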
diff --git a/hw/core/machine.c b/hw/core/machine.c index b68b8b9..f52a4f2 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -44,6 +44,8 @@ GlobalProperty hw_compat_9_2[] = { { "virtio-balloon-pci-non-transitional", "vectors", "0" }, { "virtio-mem-pci", "vectors", "0" }, { "migration", "multifd-clean-tls-termination", "false" }, + { "migration", "send-switchover-start", "off"}, + { "vfio-pci", "x-migration-multifd-transfer", "off" }, }; const size_t hw_compat_9_2_len = G_N_ELEMENTS(hw_compat_9_2); diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c index f637853..b72cbab 100644 --- a/hw/net/e1000e.c +++ b/hw/net/e1000e.c @@ -372,8 +372,7 @@ static int e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 6d85322..29a3986 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -551,9 +551,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp) if (info->power_management) { /* Power Management Capabilities */ int cfg_offset = 0xdc; - int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM, - cfg_offset, PCI_PM_SIZEOF, - errp); + int r = pci_pm_init(&s->dev, cfg_offset, errp); if (r < 0) { return; } diff --git a/hw/net/igb.c b/hw/net/igb.c index c965fc2..e318df4 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -356,8 +356,7 @@ static int igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e62c6a3..518d02d 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8600,8 +8600,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) Error *err = NULL; int ret; - ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &err); + ret = pci_pm_init(pci_dev, offset, &err); if (err) { error_report_err(err); return ret; diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c index fd4514a..2429503 100644 --- a/hw/pci-bridge/pcie_pci_bridge.c +++ b/hw/pci-bridge/pcie_pci_bridge.c @@ -52,11 +52,10 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) goto cap_error; } - pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp); + pos = pci_pm_init(d, 0, errp); if (pos < 0) { goto pm_error; } - d->exp.pm_cap = pos; pci_set_word(d->config + pos + PCI_PM_PMC, 0x3); pcie_cap_arifwd_init(d); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index e3c28668..2844ec5 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -435,6 +435,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) attrs, NULL); } +/* + * Register and track a PM capability. If wmask is also enabled for the power + * state field of the pmcsr register, guest writes may change the device PM + * state. BAR access is only enabled while the device is in the D0 state. + * Return the capability offset or negative error code. 
+ */ +int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp) +{ + int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp); + + if (cap < 0) { + return cap; + } + + d->pm_cap = cap; + d->cap_present |= QEMU_PCI_CAP_PM; + + return cap; +} + +static uint8_t pci_pm_state(PCIDevice *d) +{ + uint16_t pmcsr; + + if (!(d->cap_present & QEMU_PCI_CAP_PM)) { + return 0; + } + + pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL); + + return pmcsr & PCI_PM_CTRL_STATE_MASK; +} + +/* + * Update the PM capability state based on the new value stored in config + * space respective to the old, pre-write state provided. If the new value + * is rejected (unsupported or invalid transition) restore the old value. + * Return the resulting PM state. + */ +static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old) +{ + uint16_t pmc; + uint8_t new; + + if (!(d->cap_present & QEMU_PCI_CAP_PM) || + !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) { + return old; + } + + new = pci_pm_state(d); + if (new == old) { + return old; + } + + pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC); + + /* + * Transitions to D1 & D2 are only allowed if supported. Devices may + * only transition to higher D-states or to D0. + */ + if ((!(pmc & PCI_PM_CAP_D1) && new == 1) || + (!(pmc & PCI_PM_CAP_D2) && new == 2) || + (old && new && new < old)) { + pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL, + old); + trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d), + PCI_SLOT(d->devfn), PCI_FUNC(d->devfn), + old, new); + return old; + } + + trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn), + PCI_FUNC(d->devfn), old, new); + return new; +} + static void pci_reset_regions(PCIDevice *dev) { int r; @@ -474,6 +552,11 @@ static void pci_do_device_reset(PCIDevice *dev) pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE)); dev->config[PCI_CACHE_LINE_SIZE] = 0x0; + /* Default PM state is D0 */ + if (dev->cap_present & QEMU_PCI_CAP_PM) { + pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + } pci_reset_regions(dev); pci_update_mappings(dev); @@ -1606,7 +1689,7 @@ static void pci_update_mappings(PCIDevice *d) continue; new_addr = pci_bar_address(d, i, r->type, r->size); - if (!d->enabled) { + if (!d->enabled || pci_pm_state(d)) { new_addr = PCI_BAR_UNMAPPED; } @@ -1672,6 +1755,7 @@ uint32_t pci_default_read_config(PCIDevice *d, void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l) { + uint8_t new_pm_state, old_pm_state = pci_pm_state(d); int i, was_irq_disabled = pci_irq_disabled(d); uint32_t val = val_in; @@ -1684,11 +1768,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask); d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */ } + + new_pm_state = pci_pm_update(d, addr, l, old_pm_state); + if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) || ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) || ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) || - range_covers_byte(addr, l, PCI_COMMAND)) + range_covers_byte(addr, l, PCI_COMMAND) || + !!new_pm_state != !!old_pm_state) { pci_update_mappings(d); + } if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { pci_update_irq_disabled(d, was_irq_disabled); diff --git a/hw/pci/trace-events 
b/hw/pci/trace-events index e98f575..6a99689 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -1,6 +1,8 @@ # See docs/devel/tracing.rst for syntax documentation. # pci.c +pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d" +pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d" pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s" diff --git a/hw/vfio/amd-xgbe.c b/hw/vfio/amd-xgbe.c index aaa9690..5927503 100644 --- a/hw/vfio/amd-xgbe.c +++ b/hw/vfio/amd-xgbe.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-amd-xgbe.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void amd_xgbe_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOAmdXgbeDeviceClass *k = VFIO_AMD_XGBE_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-amd-xgbe is deprecated"); vdev->compat = g_strdup("amd,xgbe-seattle-v1a"); vdev->num_compat = 1; diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 30b08ad..c7ab4ff 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -257,6 +257,15 @@ static void vfio_ap_class_init(ObjectClass *klass, void *data) dc->hotpluggable = true; device_class_set_legacy_reset(dc, vfio_ap_reset); dc->bus_type = TYPE_AP_BUS; + + object_class_property_set_description(klass, /* 3.1 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_ap_info = { diff --git a/hw/vfio/calxeda-xgmac.c b/hw/vfio/calxeda-xgmac.c index b016d42..a5ef262 100644 --- a/hw/vfio/calxeda-xgmac.c +++ b/hw/vfio/calxeda-xgmac.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-calxeda-xgmac.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void calxeda_xgmac_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOCalxedaXgmacDeviceClass *k = VFIO_CALXEDA_XGMAC_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-calxeda-xgmac is deprecated"); vdev->compat = g_strdup("calxeda,hb-xgmac"); vdev->num_compat = 1; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 67bc137..e5e0d9e 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -51,17 +51,8 @@ struct VFIOCCWDevice { EventNotifier crw_notifier; EventNotifier req_notifier; bool force_orb_pfch; - bool warned_orb_pfch; }; -static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch, - const char *msg) -{ - warn_report_once_cond(&vcdev->warned_orb_pfch, - "vfio-ccw (devno %x.%x.%04x): %s", - sch->cssid, sch->ssid, sch->devno, msg); -} - static void vfio_ccw_compute_needs_reset(VFIODevice *vdev) { vdev->needs_reset = false; @@ -83,7 +74,8 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch) if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) { sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; - warn_once_pfch(vcdev, sch, "PFCH flag forced"); + 
warn_report_once("vfio-ccw (devno %x.%x.%04x): PFCH flag forced", + sch->cssid, sch->ssid, sch->devno); } QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB)); @@ -717,6 +709,21 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data) cdc->handle_halt = vfio_ccw_handle_halt; cdc->handle_clear = vfio_ccw_handle_clear; cdc->handle_store = vfio_ccw_handle_store; + + object_class_property_set_description(klass, /* 2.10 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 3.0 */ + "force-orb-pfch", + "Force unlimited prefetch"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.2 */ + "loadparm", + "Define which devices that can be used for booting"); } static const TypeInfo vfio_ccw_info = { diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bba776f..260d65f 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,6 +5,7 @@ vfio_ss.add(files( 'container-base.c', 'container.c', 'migration.c', + 'migration-multifd.c', 'cpr.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c new file mode 100644 index 0000000..2337247 --- /dev/null +++ b/hw/vfio/migration-multifd.c @@ -0,0 +1,679 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/vfio/vfio-common.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "io/channel-buffer.h" +#include "migration/qemu-file.h" +#include "migration-multifd.h" +#include "trace.h" + +#define VFIO_DEVICE_STATE_CONFIG_STATE (1) + +#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) + +typedef struct VFIODeviceStatePacket { + uint32_t version; + uint32_t idx; + uint32_t flags; + uint8_t data[0]; +} QEMU_PACKED VFIODeviceStatePacket; + +/* type safety */ +typedef struct VFIOStateBuffers { + GArray *array; +} VFIOStateBuffers; + +typedef struct VFIOStateBuffer { + bool is_present; + char *data; + size_t len; +} VFIOStateBuffer; + +typedef struct VFIOMultifd { + bool load_bufs_thread_running; + bool load_bufs_thread_want_exit; + + VFIOStateBuffers load_bufs; + QemuCond load_bufs_buffer_ready_cond; + QemuCond load_bufs_thread_finished_cond; + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ + uint32_t load_buf_idx; + uint32_t load_buf_idx_last; +} VFIOMultifd; + +static void vfio_state_buffer_clear(gpointer data) +{ + VFIOStateBuffer *lb = data; + + if (!lb->is_present) { + return; + } + + g_clear_pointer(&lb->data, g_free); + lb->is_present = false; +} + +static void vfio_state_buffers_init(VFIOStateBuffers *bufs) +{ + bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); + g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); +} + +static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) +{ + g_clear_pointer(&bufs->array, g_array_unref); +} + +static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) +{ + assert(bufs->array); +} + +static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) +{ + return 
bufs->array->len; +} + +static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, + unsigned int size) +{ + g_array_set_size(bufs->array, size); +} + +static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, + unsigned int idx) +{ + return &g_array_index(bufs->array, VFIOStateBuffer, idx); +} + +/* called with load_bufs_mutex locked */ +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, + VFIODeviceStatePacket *packet, + size_t packet_total_size, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + + vfio_state_buffers_assert_init(&multifd->load_bufs); + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); + if (lb->is_present) { + error_setg(errp, "%s: state buffer %" PRIu32 " already filled", + vbasedev->name, packet->idx); + return false; + } + + assert(packet->idx >= multifd->load_buf_idx); + + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); + lb->len = packet_total_size - sizeof(*packet); + lb->is_present = true; + + return true; +} + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_setg(errp, + "%s: got device state packet but not doing multifd transfer", + vbasedev->name); + return false; + } + + assert(multifd); + + if (data_size < sizeof(*packet)) { + error_setg(errp, "%s: packet too short at %zu (min is %zu)", + vbasedev->name, data_size, sizeof(*packet)); + return false; + } + + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { + error_setg(errp, "%s: packet has unknown version %" PRIu32, + vbasedev->name, packet->version); + return false; + } + + if (packet->idx == UINT32_MAX) { + error_setg(errp, "%s: packet index is invalid", vbasedev->name); + return false; + } + + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); + + /* + * Holding BQL here would violate the lock order and can cause + * a deadlock once we attempt to lock load_bufs_mutex below. 
+ */ + assert(!bql_locked()); + + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + /* config state packet should be the last one in the stream */ + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { + multifd->load_buf_idx_last = packet->idx; + } + + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, + errp)) { + return false; + } + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + } + + return true; +} + +static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; + uint64_t mig_header; + int ret; + + assert(multifd->load_buf_idx == multifd->load_buf_idx_last); + lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); + assert(lb->is_present); + + bioc = qio_channel_buffer_new(lb->len); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); + + f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); + qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); + + ret = qemu_fflush(f_out); + if (ret) { + error_setg(errp, "%s: load config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); + f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); + + mig_header = qemu_get_be64(f_in); + if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { + error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, + vbasedev->name, mig_header); + return false; + } + + bql_lock(); + ret = vfio_load_device_config_state(f_in, vbasedev); + bql_unlock(); + + if (ret < 0) { + error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", + vbasedev->name, ret); + return false; + } + + return true; +} + +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) +{ + VFIOStateBuffer *lb; + unsigned int bufs_len; + + bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); + if (multifd->load_buf_idx >= bufs_len) { + assert(multifd->load_buf_idx == bufs_len); + return NULL; + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, + multifd->load_buf_idx); + if (!lb->is_present) { + return NULL; + } + + return lb; +} + +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, + VFIOStateBuffer *lb, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + g_autofree char *buf = NULL; + char *buf_cur; + size_t buf_len; + + if (!lb->len) { + return true; + } + + trace_vfio_load_state_device_buffer_load_start(vbasedev->name, + multifd->load_buf_idx); + + /* lb might become re-allocated when we drop the lock */ + buf = g_steal_pointer(&lb->data); + buf_cur = buf; + buf_len = lb->len; + while (buf_len > 0) { + ssize_t wr_ret; + int errno_save; + + /* + * Loading data to the device takes a while, + * drop the lock during this process. 
+ */ + qemu_mutex_unlock(&multifd->load_bufs_mutex); + wr_ret = write(migration->data_fd, buf_cur, buf_len); + errno_save = errno; + qemu_mutex_lock(&multifd->load_bufs_mutex); + + if (wr_ret < 0) { + error_setg(errp, + "%s: writing state buffer %" PRIu32 " failed: %d", + vbasedev->name, multifd->load_buf_idx, errno_save); + return false; + } + + assert(wr_ret <= buf_len); + buf_len -= wr_ret; + buf_cur += wr_ret; + } + + trace_vfio_load_state_device_buffer_load_end(vbasedev->name, + multifd->load_buf_idx); + + return true; +} + +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, + bool *should_quit) +{ + return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); +} + +/* + * This thread is spawned by vfio_multifd_switchover_start() which gets + * called upon encountering the switchover point marker in main migration + * stream. + * + * It exits after either: + * * completing loading the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by it setting should_quit + * or by vfio_load_cleanup_load_bufs_thread() setting + * multifd->load_bufs_thread_want_exit. + */ +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + bool ret = false; + + trace_vfio_load_bufs_thread_start(vbasedev->name); + + assert(multifd); + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); + + assert(multifd->load_bufs_thread_running); + + while (true) { + VFIOStateBuffer *lb; + + /* + * Always check cancellation first after the buffer_ready wait below in + * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); + + lb = vfio_load_state_buffer_get(multifd); + if (!lb) { + trace_vfio_load_state_device_buffer_starved(vbasedev->name, + multifd->load_buf_idx); + qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, + &multifd->load_bufs_mutex); + continue; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last) { + break; + } + + if (multifd->load_buf_idx == 0) { + trace_vfio_load_state_device_buffer_start(vbasedev->name); + } + + if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { + goto thread_exit; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { + trace_vfio_load_state_device_buffer_end(vbasedev->name); + } + + multifd->load_buf_idx++; + } + + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + /* + * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that + * this thread is exiting. 
+ */ + multifd->load_bufs_thread_running = false; + qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); + + trace_vfio_load_bufs_thread_end(vbasedev->name); + + return ret; +} + +static VFIOMultifd *vfio_multifd_new(void) +{ + VFIOMultifd *multifd = g_new(VFIOMultifd, 1); + + vfio_state_buffers_init(&multifd->load_bufs); + + qemu_mutex_init(&multifd->load_bufs_mutex); + + multifd->load_buf_idx = 0; + multifd->load_buf_idx_last = UINT32_MAX; + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + + multifd->load_bufs_thread_running = false; + multifd->load_bufs_thread_want_exit = false; + qemu_cond_init(&multifd->load_bufs_thread_finished_cond); + + return multifd; +} + +/* + * Terminates vfio_load_bufs_thread by setting + * multifd->load_bufs_thread_want_exit and signalling all the conditions + * the thread could be blocked on. + * + * Waits for the thread to signal that it had finished. + */ +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) +{ + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + while (multifd->load_bufs_thread_running) { + multifd->load_bufs_thread_want_exit = true; + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, + &multifd->load_bufs_mutex); + } + } + bql_lock(); +} + +static void vfio_multifd_free(VFIOMultifd *multifd) +{ + vfio_load_cleanup_load_bufs_thread(multifd); + + qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); + vfio_state_buffers_destroy(&multifd->load_bufs); + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); + qemu_mutex_destroy(&multifd->load_bufs_mutex); + + g_free(multifd); +} + +void vfio_multifd_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + g_clear_pointer(&migration->multifd, vfio_multifd_free); +} + +bool vfio_multifd_transfer_supported(void) +{ + return multifd_device_state_supported() && + migrate_send_switchover_start(); +} + +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->multifd_transfer; +} + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + + /* + * Make a copy of this setting at the start in case it is changed + * mid-migration. + */ + if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { + migration->multifd_transfer = vfio_multifd_transfer_supported(); + } else { + migration->multifd_transfer = + vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; + } + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing further to check or do */ + return true; + } + + if (!vfio_multifd_transfer_supported()) { + error_setg(errp, + "%s: Multifd device transfer requested but unsupported in the current config", + vbasedev->name); + return false; + } + + if (alloc_multifd) { + assert(!migration->multifd); + migration->multifd = vfio_multifd_new(); + } + + return true; +} + +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) +{ + assert(vfio_multifd_transfer_enabled(vbasedev)); + + /* + * Emit dummy NOP data on the main migration channel since the actual + * device state transfer is done via multifd channels. 
+ */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +} + +static bool +vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, + char *idstr, + uint32_t instance_id, + uint32_t idx, + Error **errp) +{ + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f = NULL; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + size_t packet_len; + + bioc = qio_channel_buffer_new(0); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); + + f = qemu_file_new_output(QIO_CHANNEL(bioc)); + + if (vfio_save_device_config_state(f, vbasedev, errp)) { + return false; + } + + ret = qemu_fflush(f); + if (ret) { + error_setg(errp, "%s: save config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + packet_len = sizeof(*packet) + bioc->usage; + packet = g_malloc0(packet_len); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + packet->idx = idx; + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; + memcpy(&packet->data, bioc->data, bioc->usage); + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_len)) { + error_setg(errp, "%s: multifd config data queuing failed", + vbasedev->name); + return false; + } + + vfio_mig_add_bytes_transferred(packet_len); + + return true; +} + +/* + * This thread is spawned by the migration core directly via + * .save_live_complete_precopy_thread SaveVMHandler. + * + * It exits after either: + * * completing saving the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by + * multifd_device_state_save_thread_should_exit() returning true. + */ +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp) +{ + VFIODevice *vbasedev = d->handler_opaque; + VFIOMigration *migration = vbasedev->migration; + bool ret = false; + g_autofree VFIODeviceStatePacket *packet = NULL; + uint32_t idx; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing to do, vfio_save_complete_precopy() does the transfer. 
*/ + return true; + } + + trace_vfio_save_complete_precopy_thread_start(vbasedev->name, + d->idstr, d->instance_id); + + /* We reach here with device state STOP or STOP_COPY only */ + if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP, errp)) { + goto thread_exit; + } + + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + + for (idx = 0; ; idx++) { + ssize_t data_size; + size_t packet_size; + + if (multifd_device_state_save_thread_should_exit()) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + data_size = read(migration->data_fd, &packet->data, + migration->data_buffer_size); + if (data_size < 0) { + error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", + vbasedev->name, idx, errno); + goto thread_exit; + } else if (data_size == 0) { + break; + } + + packet->idx = idx; + packet_size = sizeof(*packet) + data_size; + + if (!multifd_queue_device_state(d->idstr, d->instance_id, + (char *)packet, packet_size)) { + error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); + goto thread_exit; + } + + vfio_mig_add_bytes_transferred(packet_size); + } + + if (!vfio_save_complete_precopy_thread_config_state(vbasedev, + d->idstr, + d->instance_id, + idx, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); + + return ret; +} + +int vfio_multifd_switchover_start(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + assert(!multifd->load_bufs_thread_running); + multifd->load_bufs_thread_running = true; + } + bql_lock(); + + qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); + + return 0; +} diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h new file mode 100644 index 0000000..a664051 --- /dev/null +++ b/hw/vfio/migration-multifd.h @@ -0,0 +1,34 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_MIGRATION_MULTIFD_H +#define HW_VFIO_MIGRATION_MULTIFD_H + +#include "hw/vfio/vfio-common.h" + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp); +void vfio_multifd_cleanup(VFIODevice *vbasedev); + +bool vfio_multifd_transfer_supported(void); +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp); + +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f); + +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp); + +int vfio_multifd_switchover_start(VFIODevice *vbasedev); + +#endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index adfa752..416643d 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -23,6 +23,7 @@ #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" +#include "migration-multifd.h" #include "qapi/error.h" #include "qapi/qapi-events-vfio.h" #include "exec/ramlist.h" @@ -32,30 +33,13 @@ #include "hw/hw.h" /* - * Flags to be used as unique delimiters for VFIO devices in the migration - * stream. These flags are composed as: - * 0xffffffff => MSB 32-bit all 1s - * 0xef10 => Magic ID, represents emulated (virtual) function IO - * 0x0000 => 16-bits reserved for flags - * - * The beginning of state information is marked by _DEV_CONFIG_STATE, - * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a - * certain state information is marked by _END_OF_STATE. - */ -#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) -#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) -#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) -#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) -#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) - -/* * This is an arbitrary size based on migration of mlx5 devices, where typically * total device migration size is on the order of 100s of MB. Testing with * larger values, e.g. 128MB and 1GB, did not show a performance improvement. 
*/ #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB) -static int64_t bytes_transferred; +static unsigned long bytes_transferred; static const char *mig_state_to_str(enum vfio_device_mig_state state) { @@ -136,10 +120,10 @@ static void vfio_migration_set_device_state(VFIODevice *vbasedev, vfio_migration_send_event(vbasedev); } -static int vfio_migration_set_state(VFIODevice *vbasedev, - enum vfio_device_mig_state new_state, - enum vfio_device_mig_state recover_state, - Error **errp) +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp) { VFIOMigration *migration = vbasedev->migration; uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + @@ -254,8 +238,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, return ret; } -static int vfio_save_device_config_state(QEMUFile *f, void *opaque, - Error **errp) +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; int ret; @@ -280,11 +263,13 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque, return ret; } -static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +int vfio_load_device_config_state(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; uint64_t data; + trace_vfio_load_device_config_state_start(vbasedev->name); + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { int ret; @@ -303,7 +288,7 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return -EINVAL; } - trace_vfio_load_device_config_state(vbasedev->name); + trace_vfio_load_device_config_state_end(vbasedev->name); return qemu_file_get_error(f); } @@ -389,7 +374,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - bytes_transferred += data_size; + vfio_mig_add_bytes_transferred(data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -467,6 +452,10 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; int ret; + if (!vfio_multifd_setup(vbasedev, false, errp)) { + return -EINVAL; + } + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); vfio_query_stop_copy_size(vbasedev, &stop_copy_size); @@ -523,6 +512,9 @@ static void vfio_save_cleanup(void *opaque) Error *local_err = NULL; int ret; + /* Currently a NOP, done for symmetry with load_cleanup() */ + vfio_multifd_cleanup(vbasedev); + /* * Changing device state from STOP_COPY to STOP can take time. Do it here, * after migration has completed, so it won't increase downtime. 
@@ -645,6 +637,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) int ret; Error *local_err = NULL; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return 0; + } + trace_vfio_save_complete_precopy_start(vbasedev->name); /* We reach here with device state STOP or STOP_COPY only */ @@ -676,6 +673,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque) Error *local_err = NULL; int ret; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return; + } + ret = vfio_save_device_config_state(f, opaque, &local_err); if (ret) { error_prepend(&local_err, @@ -688,15 +690,28 @@ static void vfio_save_state(QEMUFile *f, void *opaque) static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + if (!vfio_multifd_setup(vbasedev, true, errp)) { + return -EINVAL; + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, + migration->device_state, errp); + if (ret) { + return ret; + } - return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, - vbasedev->migration->device_state, errp); + return 0; } static int vfio_load_cleanup(void *opaque) { VFIODevice *vbasedev = opaque; + vfio_multifd_cleanup(vbasedev); + vfio_migration_cleanup(vbasedev); trace_vfio_load_cleanup(vbasedev->name); @@ -717,6 +732,13 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) switch (data) { case VFIO_MIG_FLAG_DEV_CONFIG_STATE: { + if (vfio_multifd_transfer_enabled(vbasedev)) { + error_report("%s: got DEV_CONFIG_STATE in main migration " + "channel but doing multifd transfer", + vbasedev->name); + return -EINVAL; + } + return vfio_load_device_config_state(f, opaque); } case VFIO_MIG_FLAG_DEV_SETUP_STATE: @@ -782,6 +804,17 @@ static bool vfio_switchover_ack_needed(void *opaque) return vfio_precopy_supported(vbasedev); } +static int vfio_switchover_start(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + if (vfio_multifd_transfer_enabled(vbasedev)) { + return vfio_multifd_switchover_start(vbasedev); + } + + return 0; +} + static const SaveVMHandlers savevm_vfio_handlers = { .save_prepare = vfio_save_prepare, .save_setup = vfio_save_setup, @@ -796,6 +829,12 @@ static const SaveVMHandlers savevm_vfio_handlers = { .load_cleanup = vfio_load_cleanup, .load_state = vfio_load_state, .switchover_ack_needed = vfio_switchover_ack_needed, + /* + * Multifd support + */ + .load_state_buffer = vfio_multifd_load_state_buffer, + .switchover_start = vfio_switchover_start, + .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread, }; /* ---------------------------------------------------------------------- */ @@ -1011,12 +1050,17 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) int64_t vfio_mig_bytes_transferred(void) { - return bytes_transferred; + return MIN(qatomic_read(&bytes_transferred), INT64_MAX); } void vfio_reset_bytes_transferred(void) { - bytes_transferred = 0; + qatomic_set(&bytes_transferred, 0); +} + +void vfio_mig_add_bytes_transferred(unsigned long val) +{ + qatomic_add(&bytes_transferred, val); } /* diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 89d900e..fdbc158 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2215,8 +2215,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) break; case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); - vdev->pm_cap = pos; - ret = 
pci_add_capability(pdev, cap_id, pos, size, errp) >= 0; + ret = pci_pm_init(pdev, pos, errp) >= 0; + /* + * PCI-core config space emulation needs write access to the power + * state enabled for tracking BAR mapping relative to PM state. + */ + pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); break; case PCI_CAP_ID_AF: vfio_check_af_flr(vdev, pos); @@ -2406,18 +2410,27 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_disable_interrupts(vdev); + /* + * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. + * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + /* Make sure the device is in D0 */ - if (vdev->pm_cap) { + if (pdev->pm_cap) { uint16_t pmcsr; uint8_t state; - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); /* vfio handles the necessary delay here */ - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { error_report("vfio: Unable to power on device, stuck in D%d", @@ -2425,15 +2438,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) } } } - - /* - * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. - * Also put INTx Disable in known state. - */ - cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_INTX_DISABLE); - vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } void vfio_pci_post_reset(VFIOPCIDevice *vdev) @@ -3353,6 +3357,8 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } +static PropertyInfo vfio_pci_migration_multifd_transfer_prop; + static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), @@ -3377,6 +3383,10 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), + DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice, + vbasedev.migration_multifd_transfer, + vfio_pci_migration_multifd_transfer_prop, OnOffAuto, + .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3433,6 +3443,126 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) pdc->exit = vfio_exitfn; pdc->config_read = vfio_pci_read_config; pdc->config_write = vfio_pci_write_config; + + object_class_property_set_description(klass, /* 1.3 */ + "host", + "Host PCI address [domain:]<bus:slot.function> of assigned device"); + object_class_property_set_description(klass, /* 1.3 */ + "x-intx-mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after INTx (DEBUG)"); + 
object_class_property_set_description(klass, /* 1.5 */ + "x-vga", + "Expose VGA address spaces for device"); + object_class_property_set_description(klass, /* 2.3 */ + "x-req", + "Disable device request notification support (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-intx", + "Disable direct VFIO->KVM INTx injection. Allows to " + "trace INTx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msi", + "Disable direct VFIO->KVM MSI injection. Allows to " + "trace MSI interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msix", + "Disable direct VFIO->KVM MSIx injection. Allows to " + "trace MSIx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-vendor-id", + "Override PCI Vendor ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-device-id", + "Override PCI device ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-vendor-id", + "Override PCI Subsystem Vendor ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-device-id", + "Override PCI Subsystem Device ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 2.7 */ + "x-igd-opregion", + "Expose host IGD OpRegion to guest"); + object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */ + "x-igd-gms", + "Override IGD data stolen memory size (32MiB units)"); + object_class_property_set_description(klass, /* 2.11 */ + "x-nv-gpudirect-clique", + "Add NVIDIA GPUDirect capability indicating P2P DMA " + "clique for device [0-15]"); + object_class_property_set_description(klass, /* 2.12 */ + "x-no-geforce-quirks", + "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). " + "Improves performance"); + object_class_property_set_description(klass, /* 2.12 */ + "display", + "Enable display support for device, ex. vGPU"); + object_class_property_set_description(klass, /* 2.12 */ + "x-msix-relocation", + "Specify MSI-X MMIO relocation to the end of specified " + "existing BAR or new BAR to avoid virtualization overhead " + "due to adjacent device registers"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-kvm-ioeventfd", + "Disable registration of ioeventfds with KVM (DEBUG)"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-vfio-ioeventfd", + "Disable linking of KVM ioeventfds to VFIO ioeventfds " + "(DEBUG)"); + object_class_property_set_description(klass, /* 3.1 */ + "x-balloon-allowed", + "Override allowing ballooning with device (DEBUG, DANGER)"); + object_class_property_set_description(klass, /* 3.2 */ + "xres", + "Set X display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 3.2 */ + "yres", + "Set Y display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 5.2 */ + "x-pre-copy-dirty-page-tracking", + "Disable dirty pages tracking during iterative phase " + "(DEBUG)"); + object_class_property_set_description(klass, /* 5.2, 8.0 non-experimental */ + "enable-migration", + "Enable device migration. 
Also requires a host VFIO PCI " + "variant or mdev driver with migration support enabled"); + object_class_property_set_description(klass, /* 8.1 */ + "vf-token", + "Specify UUID VF token. Required for VF when PF is owned " + "by another VFIO driver"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.1 */ + "x-device-dirty-page-tracking", + "Disable device dirty page tracking and use " + "container-based dirty page tracking (DEBUG)"); + object_class_property_set_description(klass, /* 9.1 */ + "migration-events", + "Emit VFIO migration QAPI event when a VFIO device " + "changes its migration state. For management applications"); + object_class_property_set_description(klass, /* 9.1 */ + "skip-vsc-check", + "Skip config space check for Vendor Specific Capability. " + "Setting to false will enforce strict checking of VSC content " + "(DEBUG)"); + object_class_property_set_description(klass, /* 10.0 */ + "x-migration-multifd-transfer", + "Transfer this device state via " + "multifd channels when live migrating it"); } static const TypeInfo vfio_pci_dev_info = { @@ -3461,6 +3591,15 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); dc->hotpluggable = false; + + object_class_property_set_description(klass, /* 3.1 */ + "ramfb", + "Enable ramfb to provide pre-boot graphics for devices " + "enabling display option"); + object_class_property_set_description(klass, /* 8.2 */ + "x-ramfb-migrate", + "Override default migration support for ramfb support " + "(DEBUG)"); } static const TypeInfo vfio_pci_nohotplug_dev_info = { @@ -3472,6 +3611,17 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { static void register_vfio_pci_dev_type(void) { + /* + * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can + * run for a long time before being migrated so it is desirable to have a + * fallback mechanism to the old way of transferring VFIO device state if + * it turns to be necessary. + * The following makes this type of property have the same mutability level + * as ordinary migration parameters. + */ + vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; + vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 43c1666..d638c78 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -160,7 +160,6 @@ struct VFIOPCIDevice { int32_t bootindex; uint32_t igd_gms; OffAutoPCIBAR msix_relo; - uint8_t pm_cap; uint8_t nv_gpudirect_clique; bool pci_aer; bool req_enabled; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index f491f4d..67bc574 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -575,6 +575,7 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i; + warn_report("-device vfio-platform is deprecated"); qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? 
@@ -672,6 +673,30 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + object_class_property_set_description(klass, /* 2.4 */ + "host", + "Host device name of assigned device"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after level interrupt (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "x-irqfd", + "Allow disabling irqfd support (DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_platform_dev_info = { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index c5385e1..9347e3a 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -149,10 +149,19 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" # migration.c +vfio_load_bufs_thread_start(const char *name) " (%s)" +vfio_load_bufs_thread_end(const char *name) " (%s)" vfio_load_cleanup(const char *name) " (%s)" -vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_device_config_state_start(const char *name) " (%s)" +vfio_load_device_config_state_end(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d" +vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_start(const char *name) " (%s)" +vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_end(const char *name) " (%s)" vfio_migration_realize(const char *name) " (%s)" vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s" vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s" @@ -162,6 +171,8 @@ vfio_save_block_precopy_empty_hit(const char *name) " (%s)" vfio_save_cleanup(const char *name) " (%s)" vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" vfio_save_complete_precopy_start(const char *name) " (%s)" +vfio_save_complete_precopy_thread_start(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32 +vfio_save_complete_precopy_thread_end(const char *name, int ret) " (%s) ret %d" vfio_save_device_config_state(const char *name) " (%s)" vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64 vfio_save_iterate_start(const char *name) " (%s)" diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index c773a91..3ca3f84 100644 --- a/hw/virtio/virtio-pci.c +++ 
b/hw/virtio/virtio-pci.c @@ -2204,14 +2204,11 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); - pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, - PCI_PM_SIZEOF, errp); + pos = pci_pm_init(pci_dev, 0, errp); if (pos < 0) { return; } - pci_dev->exp.pm_cap = pos; - /* * Indicates that this function complies with revision 1.2 of the * PCI Power Management Interface Specification. @@ -2310,11 +2307,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev) { uint16_t pmcsr; - if (!pci_is_express(dev) || !dev->exp.pm_cap) { + if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) { return false; } - pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL); + pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL); /* * When No_Soft_Reset bit is set and the device @@ -2343,7 +2340,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type) if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) { pci_word_test_and_clear_mask( - dev->config + dev->exp.pm_cap + PCI_PM_CTRL, + dev->config + dev->pm_cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); } } |
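A closing note on the new x-migration-multifd-transfer property: as the comment in register_vfio_pci_dev_type() above explains, it is an on/off/auto tri-state kept runtime-mutable, and vfio_multifd_setup() snapshots it at migration start, with AUTO resolving to whether multifd device-state transfer is supported. A minimal model of that resolution (the stub stands in for multifd_device_state_supported() && migrate_send_switchover_start(); all names here are illustrative, not QEMU API):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { ON_OFF_AUTO_AUTO, ON_OFF_AUTO_ON, ON_OFF_AUTO_OFF } OnOffAuto;

    static bool multifd_transfer_supported_stub(void)
    {
        /* Stand-in for multifd_device_state_supported() &&
         * migrate_send_switchover_start(). */
        return true;
    }

    /* Copy of the property is taken once at setup so a mid-migration
     * change of the mutable property cannot flip the transfer mode. */
    static bool resolve_multifd_transfer(OnOffAuto prop)
    {
        if (prop == ON_OFF_AUTO_AUTO) {
            return multifd_transfer_supported_stub();
        }
        return prop == ON_OFF_AUTO_ON;
    }

    int main(void)
    {
        printf("auto -> %d, on -> %d, off -> %d\n",
               resolve_multifd_transfer(ON_OFF_AUTO_AUTO),
               resolve_multifd_transfer(ON_OFF_AUTO_ON),
               resolve_multifd_transfer(ON_OFF_AUTO_OFF));
        return 0;
    }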