commit     408015a97dbe48a9dde8c0d2526c9312691952e7
tree       b41b1a9349392da698def212ff60e19f7043f851
parent     f7884164cbe3743c3bd2acc9daf877497fdb5fa3
parent     0cc889c8826cefa5b80110d31a62273b56aa1832
author     Richard Henderson <richard.henderson@linaro.org>  2023-06-30 08:11:08 +0200
committer  Richard Henderson <richard.henderson@linaro.org>  2023-06-30 08:11:08 +0200
Merge tag 'pull-vfio-20230630' of https://github.com/legoater/qemu into staging
vfio queue:
* migration: New switchover ack to reduce downtime
* VFIO migration pre-copy support
* Removal of the VFIO migration experimental flag
* Alternate offset for GPUDirect Cliques
* Misc fixes
# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmSeVHYACgkQUaNDx8/7
# 7KHeZw/+LRe9QQpx8hU//vKBvLet2QvI3WUaXGHiHbblbRT6HhiHjWHB2/8j6jji
# QhAGJ6w9yoKODyY0kGpVFEnkmXOKyqwWssBheV219ntZs09pFGxZr/ldUhT22aBN
# kH8mHU9BZ3J+zF/kKphpcIC1sPxVu/DlrtnJu5vDGuRAOu8+3kFV217JC1yGs1Vh
# n+KOho8a8oP9qxtzfvQ9iZ4dpBOOKpE9vscS12wJAlen93AGB6esR7VaLxDjExRP
# yL1pguQ8ZZ1gEXXbXO62djKo3IViobtD08KmCXTzQ6TVquLleJzqgjp+A0THnYAe
# J9Rlja7LpsO9MYSxmRE9WcQccC+sAGn/t/ufB0tL8zR43FvfhbF5H0PzBBY0H7YA
# JlzN+fgrKEEHJwMhXANNvSddhWCwvrkjNxo/80u3ySYMQR1Hav/tsXYBlk16e5nS
# fmtrFGTwhsVdy1Q6ZqEOyTni1eiYt5stEQMZFODdUNj6b9FugSZ0BK+2WN/M0CzU
# 6mKmJQgZAG/nBoRJm/XCO5OKQ6wm/4tm6F4HSH5EJ6mDT+DqETAk4GRUWTbYa2/G
# yAAOlhTMu8Xc/NhMeJ7Z99dyq0SM8pi/XpVEIv7p9yBak8ix60iCWZtDE8vlDv3M
# UfMVMTAvTS30kbS6FDN2Yyl6l8/ETdcwVIN4l02ipGzpMCtn9EQ=
# =dKUj
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 30 Jun 2023 06:05:10 AM CEST
# gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@kaod.org>" [undefined]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1
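# The warning above only means the signing key is not in the local web of
# trust. One way to check such a pull tag locally, assuming the maintainer's
# key (fingerprint above) has been imported into the local keyring:
#
#     # fetch the signed tag from the submitter's tree, then verify it
#     git fetch https://github.com/legoater/qemu tag pull-vfio-20230630
#     git verify-tag pull-vfio-20230630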
* tag 'pull-vfio-20230630' of https://github.com/legoater/qemu:
vfio/pci: Free leaked timer in vfio_realize error path
vfio/pci: Fix a segfault in vfio_realize
MAINTAINERS: Promote Cédric to VFIO co-maintainer
vfio/migration: Make VFIO migration non-experimental
vfio/migration: Reset bytes_transferred properly
vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI retry path
hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques
vfio: Implement a common device info helper
vfio/migration: Add support for switchover ack capability
vfio/migration: Add VFIO migration pre-copy support
vfio/migration: Store VFIO migration flags in VFIOMigration
vfio/migration: Refactor vfio_save_block() to return saved data size
tests: Add migration switchover ack capability test
migration: Enable switchover ack capability
migration: Implement switchover ack logic
migration: Add switchover ack capability
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
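
The switchover-ack capability merged here is turned on through the standard
migrate-set-capabilities QMP command, on the source and destination alike, and
requires return-path to be enabled as well. A minimal sketch, using the
capability names introduced and required by this series:

    { "execute": "migrate-set-capabilities",
      "arguments": { "capabilities": [
          { "capability": "return-path",    "state": true },
          { "capability": "switchover-ack", "state": true } ] } }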
 MAINTAINERS                   |   2
 docs/devel/vfio-migration.rst |  45
 hw/s390x/s390-pci-vfio.c      |  37
 hw/vfio/common.c              |  68
 hw/vfio/migration.c           | 305
 hw/vfio/pci-quirks.c          |  41
 hw/vfio/pci.c                 |  15
 hw/vfio/trace-events          |   6
 include/hw/vfio/vfio-common.h |  12
 include/migration/register.h  |   2
 migration/migration.c         |  33
 migration/migration.h         |  15
 migration/options.c           |  17
 migration/options.h           |   1
 migration/savevm.c            |  55
 migration/savevm.h            |   1
 migration/target.c            |  17
 migration/trace-events        |   3
 qapi/migration.json           |  12
 tests/qtest/migration-test.c  |  31
 20 files changed, 600 insertions(+), 118 deletions(-)
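
One user-visible effect of dropping the experimental flag is worth noting
before the diff: the x-enable-migration bool property becomes enable-migration
with on/off/auto values, defaulting to auto, as seen in the hw/vfio/pci.c hunk
below. A hedged command-line sketch (the PCI host address is a placeholder):

    # 'auto' (the default) enables migration when the device and kernel
    # support it, and registers a migration blocker otherwise;
    # 'on' makes realize fail instead of adding a blocker.
    -device vfio-pci,host=0000:01:00.0,enable-migration=on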
diff --git a/MAINTAINERS b/MAINTAINERS
index aba0772..4feea49 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c
 VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
-R: Cédric Le Goater <clg@redhat.com>
+M: Cédric Le Goater <clg@redhat.com>
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/
diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index 1b68ccf..b433cb5 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -7,12 +7,21 @@ the guest is running on source host and restoring this saved state on the
 destination host. This document details how saving and restoring of VFIO
 devices is done in QEMU.
 
-Migration of VFIO devices currently consists of a single stop-and-copy phase.
-During the stop-and-copy phase the guest is stopped and the entire VFIO device
-data is transferred to the destination.
-
-The pre-copy phase of migration is currently not supported for VFIO devices.
-Support for VFIO pre-copy will be added later on.
+Migration of VFIO devices consists of two phases: the optional pre-copy phase,
+and the stop-and-copy phase. The pre-copy phase is iterative and allows to
+accommodate VFIO devices that have a large amount of data that needs to be
+transferred. The iterative pre-copy phase of migration allows for the guest to
+continue whilst the VFIO device state is transferred to the destination, this
+helps to reduce the total downtime of the VM. VFIO devices opt-in to pre-copy
+support by reporting the VFIO_MIGRATION_PRE_COPY flag in the
+VFIO_DEVICE_FEATURE_MIGRATION ioctl.
+
+When pre-copy is supported, it's possible to further reduce downtime by
+enabling "switchover-ack" migration capability.
+VFIO migration uAPI defines "initial bytes" as part of its pre-copy data stream
+and recommends that the initial bytes are sent and loaded in the destination
+before stopping the source VM. Enabling this migration capability will
+guarantee that and thus, can potentially reduce downtime even further.
 
 Note that currently VFIO migration is supported only for a single device. This
 is due to VFIO migration's lack of P2P support. However, P2P support is planned
@@ -29,10 +38,23 @@ VFIO implements the device hooks for the iterative approach as follows:
 * A ``load_setup`` function that sets the VFIO device on the destination in
   _RESUMING state.
 
+* A ``state_pending_estimate`` function that reports an estimate of the
+  remaining pre-copy data that the vendor driver has yet to save for the VFIO
+  device.
+
 * A ``state_pending_exact`` function that reads pending_bytes from the vendor
   driver, which indicates the amount of data that the vendor driver has yet to
   save for the VFIO device.
 
+* An ``is_active_iterate`` function that indicates ``save_live_iterate`` is
+  active only when the VFIO device is in pre-copy states.
+
+* A ``save_live_iterate`` function that reads the VFIO device's data from the
+  vendor driver during iterative pre-copy phase.
+
+* A ``switchover_ack_needed`` function that checks if the VFIO device uses
+  "switchover-ack" migration capability when this capability is enabled.
+
 * A ``save_state`` function to save the device config space if it is present.
 
 * A ``save_live_complete_precopy`` function that sets the VFIO device in
@@ -111,8 +133,10 @@ Flow of state changes during Live migration
 ===========================================
 
 Below is the flow of state change during live migration.
-The values in the brackets represent the VM state, the migration state, and
+The values in the parentheses represent the VM state, the migration state, and
 the VFIO device state, respectively.
+The text in the square brackets represents the flow if the VFIO device supports
+pre-copy.
 
 Live migration save path
 ------------------------
@@ -124,11 +148,12 @@ Live migration save path
                                   |
                      migrate_init spawns migration_thread
                 Migration thread then calls each device's .save_setup()
-                       (RUNNING, _SETUP, _RUNNING)
+                  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
                                   |
-                      (RUNNING, _ACTIVE, _RUNNING)
-         If device is active, get pending_bytes by .state_pending_exact()
+                 (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
+  If device is active, get pending_bytes by .state_pending_{estimate,exact}()
       If total pending_bytes >= threshold_size, call .save_live_iterate()
+                [Data of VFIO device for pre-copy phase is copied]
         Iterate till total pending bytes converge and are less than threshold
                                   |
   On migration completion, vCPU stops and calls .save_live_complete_precopy for
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index f51190d..59a2e03 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -289,38 +289,11 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev,
     memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS);
 }
 
-static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev,
-                                                uint32_t argsz)
+static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev)
 {
-    struct vfio_device_info *info = g_malloc0(argsz);
-    VFIOPCIDevice *vfio_pci;
-    int fd;
+    VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
 
-    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
-    fd = vfio_pci->vbasedev.fd;
-
-    /*
-     * If the specified argsz is not large enough to contain all capabilities
-     * it will be updated upon return from the ioctl. Retry until we have
-     * a big enough buffer to hold the entire capability chain. On error,
-     * just exit and rely on CLP defaults.
-     */
-retry:
-    info->argsz = argsz;
-
-    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
-        trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name);
-        g_free(info);
-        return NULL;
-    }
-
-    if (info->argsz > argsz) {
-        argsz = info->argsz;
-        info = g_realloc(info, argsz);
-        goto retry;
-    }
-
-    return info;
+    return vfio_get_device_info(vfio_pci->vbasedev.fd);
 }
 
 /*
@@ -335,7 +308,7 @@ bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh)
 
     assert(fh);
 
-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
     if (!info) {
         return false;
     }
@@ -356,7 +329,7 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev)
 {
     g_autofree struct vfio_device_info *info = NULL;
 
-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
     if (!info) {
         return;
     }
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index fa8fd94..77e2ee0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -381,7 +381,7 @@ static unsigned int vfio_migratable_device_num(void)
     return device_num;
 }
 
-int vfio_block_multiple_devices_migration(Error **errp)
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
 {
     int ret;
 
@@ -390,6 +390,12 @@ int vfio_block_multiple_devices_migration(Error **errp)
         return 0;
     }
 
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp, "Migration is currently not supported with multiple "
+                         "VFIO devices");
+        return -EINVAL;
+    }
+
     error_setg(&multiple_devices_migration_blocker,
                "Migration is currently not supported with multiple "
                "VFIO devices");
@@ -427,7 +433,7 @@ static bool vfio_viommu_preset(void)
     return false;
 }
 
-int vfio_block_giommu_migration(Error **errp)
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
 {
     int ret;
 
@@ -436,6 +442,12 @@ int vfio_block_giommu_migration(Error **errp)
         return 0;
     }
 
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp,
+                   "Migration is currently not supported with vIOMMU enabled");
+        return -EINVAL;
+    }
+
     error_setg(&giommu_migration_blocker,
                "Migration is currently not supported with vIOMMU enabled");
     ret = migrate_add_blocker(giommu_migration_blocker, errp);
@@ -492,7 +504,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
         }
 
         if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
-            migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+            (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+             migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
             return false;
         }
     }
@@ -537,7 +550,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
             return false;
         }
 
-        if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+        if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+            migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
             continue;
         } else {
             return false;
@@ -2844,11 +2858,35 @@ void vfio_put_group(VFIOGroup *group)
     }
 }
 
+struct vfio_device_info *vfio_get_device_info(int fd)
+{
+    struct vfio_device_info *info;
+    uint32_t argsz = sizeof(*info);
+
+    info = g_malloc0(argsz);
+
+retry:
+    info->argsz = argsz;
+
+    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
+        g_free(info);
+        return NULL;
+    }
+
+    if (info->argsz > argsz) {
+        argsz = info->argsz;
+        info = g_realloc(info, argsz);
+        goto retry;
+    }
+
+    return info;
+}
+
 int vfio_get_device(VFIOGroup *group, const char *name,
                     VFIODevice *vbasedev, Error **errp)
 {
-    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
-    int ret, fd;
+    g_autofree struct vfio_device_info *info = NULL;
+    int fd;
 
     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
     if (fd < 0) {
@@ -2860,11 +2898,11 @@ int vfio_get_device(VFIOGroup *group, const char *name,
         return fd;
     }
 
-    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
-    if (ret) {
+    info = vfio_get_device_info(fd);
+    if (!info) {
         error_setg_errno(errp, errno, "error getting device info");
         close(fd);
-        return ret;
+        return -1;
     }
 
     /*
@@ -2892,14 +2930,14 @@ int vfio_get_device(VFIOGroup *group, const char *name,
     vbasedev->group = group;
     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 
-    vbasedev->num_irqs = dev_info.num_irqs;
-    vbasedev->num_regions = dev_info.num_regions;
-    vbasedev->flags = dev_info.flags;
+    vbasedev->num_irqs = info->num_irqs;
+    vbasedev->num_regions = info->num_regions;
+    vbasedev->flags = info->flags;
+
+    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
 
-    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
-                          dev_info.num_irqs);
+    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
 
-    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
     return 0;
 }
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 6b58ddd..1db7d52 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -18,6 +18,8 @@
 #include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "migration/migration.h"
+#include "migration/options.h"
+#include "migration/savevm.h"
 #include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
@@ -45,6 +47,7 @@
 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
 
 /*
  * This is an arbitrary size based on migration of mlx5 devices, where typically
@@ -68,6 +71,8 @@ static const char *mig_state_to_str(enum vfio_device_mig_state state)
         return "STOP_COPY";
     case VFIO_DEVICE_STATE_RESUMING:
         return "RESUMING";
+    case VFIO_DEVICE_STATE_PRE_COPY:
+        return "PRE_COPY";
     default:
         return "UNKNOWN STATE";
     }
@@ -241,18 +246,45 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
     return 0;
 }
 
-/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
-static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
+static int vfio_query_precopy_size(VFIOMigration *migration)
+{
+    struct vfio_precopy_info precopy = {
+        .argsz = sizeof(precopy),
+    };
+
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+
+    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
+        return -errno;
+    }
+
+    migration->precopy_init_size = precopy.initial_bytes;
+    migration->precopy_dirty_size = precopy.dirty_bytes;
+
+    return 0;
+}
+
+/* Returns the size of saved data on success and -errno on error */
+static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 {
     ssize_t data_size;
 
     data_size = read(migration->data_fd, migration->data_buffer,
                      migration->data_buffer_size);
     if (data_size < 0) {
+        /*
+         * Pre-copy emptied all the device state for now. For more information,
+         * please refer to the Linux kernel VFIO uAPI.
+         */
+        if (errno == ENOMSG) {
+            return 0;
+        }
+
         return -errno;
     }
+
     if (data_size == 0) {
-        return 1;
+        return 0;
     }
 
     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
@@ -262,7 +294,39 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 
     trace_vfio_save_block(migration->vbasedev->name, data_size);
 
-    return qemu_file_get_error(f);
+    return qemu_file_get_error(f) ?: data_size;
+}
+
+static void vfio_update_estimated_pending_data(VFIOMigration *migration,
+                                               uint64_t data_size)
+{
+    if (!data_size) {
+        /*
+         * Pre-copy emptied all the device state for now, update estimated sizes
+         * accordingly.
+         */
+        migration->precopy_init_size = 0;
+        migration->precopy_dirty_size = 0;
+
+        return;
+    }
+
+    if (migration->precopy_init_size) {
+        uint64_t init_size = MIN(migration->precopy_init_size, data_size);
+
+        migration->precopy_init_size -= init_size;
+        data_size -= init_size;
+    }
+
+    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
+                                         data_size);
+}
+
+static bool vfio_precopy_supported(VFIODevice *vbasedev)
+{
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -285,6 +349,28 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
         return -ENOMEM;
     }
 
+    if (vfio_precopy_supported(vbasedev)) {
+        int ret;
+
+        switch (migration->device_state) {
+        case VFIO_DEVICE_STATE_RUNNING:
+            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
+                                           VFIO_DEVICE_STATE_RUNNING);
+            if (ret) {
+                return ret;
+            }
+
+            vfio_query_precopy_size(migration);
+
+            break;
+        case VFIO_DEVICE_STATE_STOP:
+            /* vfio_save_complete_precopy() will go to STOP_COPY */
+            break;
+        default:
+            return -EINVAL;
+        }
+    }
+
     trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
 
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
@@ -299,26 +385,43 @@ static void vfio_save_cleanup(void *opaque)
 
     g_free(migration->data_buffer);
     migration->data_buffer = NULL;
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+    migration->initial_data_sent = false;
     vfio_migration_cleanup(vbasedev);
     trace_vfio_save_cleanup(vbasedev->name);
 }
 
+static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
+                                        uint64_t *can_postcopy)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
+        return;
+    }
+
+    *must_precopy +=
+        migration->precopy_init_size + migration->precopy_dirty_size;
+
+    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
+                                      *can_postcopy,
+                                      migration->precopy_init_size,
+                                      migration->precopy_dirty_size);
+}
+
 /*
  * Migration size of VFIO devices can be as little as a few KBs or as big as
  * many GBs. This value should be big enough to cover the worst case.
  */
 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
 
-/*
- * Only exact function is implemented and not estimate function. The reason is
- * that during pre-copy phase of migration the estimate function is called
- * repeatedly while pending RAM size is over the threshold, thus migration
- * can't converge and querying the VFIO device pending data size is useless.
- */
 static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                      uint64_t *can_postcopy)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
 
     /*
@@ -328,16 +431,64 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
     *must_precopy += stop_copy_size;
 
+    if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+        vfio_query_precopy_size(migration);
+
+        *must_precopy +=
+            migration->precopy_init_size + migration->precopy_dirty_size;
+    }
+
     trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
-                                   stop_copy_size);
+                                   stop_copy_size, migration->precopy_init_size,
+                                   migration->precopy_dirty_size);
+}
+
+static bool vfio_is_active_iterate(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+}
+
+static int vfio_save_iterate(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    ssize_t data_size;
+
+    data_size = vfio_save_block(f, migration);
+    if (data_size < 0) {
+        return data_size;
+    }
+
+    vfio_update_estimated_pending_data(migration, data_size);
+
+    if (migrate_switchover_ack() && !migration->precopy_init_size &&
+        !migration->initial_data_sent) {
+        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
+        migration->initial_data_sent = true;
+    } else {
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+    }
+
+    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
+                            migration->precopy_dirty_size);
+
+    /*
+     * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
+     * Return 1 so following handlers will not be potentially blocked.
+     */
+    return 1;
 }
 
 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
+    ssize_t data_size;
     int ret;
 
-    /* We reach here with device state STOP only */
+    /* We reach here with device state STOP or STOP_COPY only */
     ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                    VFIO_DEVICE_STATE_STOP);
     if (ret) {
@@ -345,11 +496,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     }
 
     do {
-        ret = vfio_save_block(f, vbasedev->migration);
-        if (ret < 0) {
-            return ret;
+        data_size = vfio_save_block(f, vbasedev->migration);
+        if (data_size < 0) {
+            return data_size;
         }
-    } while (!ret);
+    } while (data_size);
 
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
     ret = qemu_file_get_error(f);
@@ -439,6 +590,24 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
             }
             break;
         }
+        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
+        {
+            if (!vfio_precopy_supported(vbasedev) ||
+                !migrate_switchover_ack()) {
+                error_report("%s: Received INIT_DATA_SENT but switchover ack "
+                             "is not used", vbasedev->name);
+                return -EINVAL;
+            }
+
+            ret = qemu_loadvm_approve_switchover();
+            if (ret) {
+                error_report(
+                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
+                    vbasedev->name, ret, strerror(-ret));
+            }
+
+            return ret;
+        }
         default:
             error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
             return -EINVAL;
@@ -453,15 +622,26 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
     return ret;
 }
 
+static bool vfio_switchover_ack_needed(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    return vfio_precopy_supported(vbasedev);
+}
+
 static const SaveVMHandlers savevm_vfio_handlers = {
     .save_setup = vfio_save_setup,
     .save_cleanup = vfio_save_cleanup,
+    .state_pending_estimate = vfio_state_pending_estimate,
     .state_pending_exact = vfio_state_pending_exact,
+    .is_active_iterate = vfio_is_active_iterate,
+    .save_live_iterate = vfio_save_iterate,
     .save_live_complete_precopy = vfio_save_complete_precopy,
     .save_state = vfio_save_state,
     .load_setup = vfio_load_setup,
     .load_cleanup = vfio_load_cleanup,
     .load_state = vfio_load_state,
+    .switchover_ack_needed = vfio_switchover_ack_needed,
 };
 
 /* ---------------------------------------------------------------------- */
@@ -469,13 +649,18 @@ static const SaveVMHandlers savevm_vfio_handlers = {
 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     enum vfio_device_mig_state new_state;
     int ret;
 
     if (running) {
         new_state = VFIO_DEVICE_STATE_RUNNING;
     } else {
-        new_state = VFIO_DEVICE_STATE_STOP;
+        new_state =
+            (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
+             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
+                VFIO_DEVICE_STATE_STOP_COPY :
+                VFIO_DEVICE_STATE_STOP;
     }
 
     /*
@@ -512,7 +697,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
     case MIGRATION_STATUS_CANCELLING:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_FAILED:
-        bytes_transferred = 0;
         /*
          * If setting the device in RUNNING state fails, the device should
         * be reset. To do so, use ERROR state as a recover state.
@@ -540,14 +724,6 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
     feature->argsz = sizeof(buf);
     feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-        if (errno == ENOTTY) {
-            error_report("%s: VFIO migration is not supported in kernel",
-                         vbasedev->name);
-        } else {
-            error_report("%s: Failed to query VFIO migration support, err: %s",
-                         vbasedev->name, strerror(errno));
-        }
-
         return -errno;
     }
 
@@ -602,6 +778,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
     migration->vbasedev = vbasedev;
     migration->device_state = VFIO_DEVICE_STATE_RUNNING;
     migration->data_fd = -1;
+    migration->mig_flags = mig_flags;
 
     vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
 
@@ -625,6 +802,27 @@ static int vfio_migration_init(VFIODevice *vbasedev)
     return 0;
 }
 
+static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
+{
+    int ret;
+
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_propagate(errp, err);
+        return -EINVAL;
+    }
+
+    vbasedev->migration_blocker = error_copy(err);
+    error_free(err);
+
+    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
+    if (ret < 0) {
+        error_free(vbasedev->migration_blocker);
+        vbasedev->migration_blocker = NULL;
+    }
+
+    return ret;
+}
+
 /* ---------------------------------------------------------------------- */
 
 int64_t vfio_mig_bytes_transferred(void)
@@ -632,42 +830,61 @@ int64_t vfio_mig_bytes_transferred(void)
     return bytes_transferred;
 }
 
+void vfio_reset_bytes_transferred(void)
+{
+    bytes_transferred = 0;
+}
+
 int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
 {
-    int ret = -ENOTSUP;
+    Error *err = NULL;
+    int ret;
 
-    if (!vbasedev->enable_migration) {
-        goto add_blocker;
+    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
+        error_setg(&err, "%s: Migration is disabled for VFIO device",
+                   vbasedev->name);
+        return vfio_block_migration(vbasedev, err, errp);
    }
 
     ret = vfio_migration_init(vbasedev);
     if (ret) {
-        goto add_blocker;
+        if (ret == -ENOTTY) {
+            error_setg(&err, "%s: VFIO migration is not supported in kernel",
+                       vbasedev->name);
+        } else {
+            error_setg(&err,
+                       "%s: Migration couldn't be initialized for VFIO device, "
+                       "err: %d (%s)",
+                       vbasedev->name, ret, strerror(-ret));
+        }
+
+        return vfio_block_migration(vbasedev, err, errp);
+    }
+
+    if (!vbasedev->dirty_pages_supported) {
+        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
+            error_setg(&err,
+                       "%s: VFIO device doesn't support device dirty tracking",
+                       vbasedev->name);
+            return vfio_block_migration(vbasedev, err, errp);
+        }
+
+        warn_report("%s: VFIO device doesn't support device dirty tracking",
+                    vbasedev->name);
     }
 
-    ret = vfio_block_multiple_devices_migration(errp);
+    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
     if (ret) {
         return ret;
     }
 
-    ret = vfio_block_giommu_migration(errp);
+    ret = vfio_block_giommu_migration(vbasedev, errp);
     if (ret) {
         return ret;
     }
 
-    trace_vfio_migration_probe(vbasedev->name);
+    trace_vfio_migration_realize(vbasedev->name);
     return 0;
-
-add_blocker:
-    error_setg(&vbasedev->migration_blocker,
-               "VFIO device doesn't support migration");
-
-    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
-    if (ret < 0) {
-        error_free(vbasedev->migration_blocker);
-        vbasedev->migration_blocker = NULL;
-    }
-    return ret;
 }
 
 void vfio_migration_exit(VFIODevice *vbasedev)
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index f0147a0..0ed2fcd 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -1490,6 +1490,9 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
  * +---------------------------------+---------------------------------+
  *
  * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
+ *
+ * Specification for Turning and later GPU architectures:
+ * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf
  */
 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                        const char *name, void *opaque,
@@ -1530,7 +1533,9 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = {
 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
 {
     PCIDevice *pdev = &vdev->pdev;
-    int ret, pos = 0xC8;
+    int ret, pos;
+    bool c8_conflict = false, d4_conflict = false;
+    uint8_t tmp;
 
     if (vdev->nv_gpudirect_clique == 0xFF) {
         return 0;
@@ -1547,6 +1552,40 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
         return -EINVAL;
     }
 
+    /*
+     * Per the updated specification above, it's recommended to use offset
+     * D4h for Turing and later GPU architectures due to a conflict of the
+     * MSI-X capability at C8h.  We don't know how to determine the GPU
+     * architecture, instead we walk the capability chain to mark conflicts
+     * and choose one or error based on the result.
+     *
+     * NB. Cap list head in pdev->config is already cleared, read from device.
+     */
+    ret = pread(vdev->vbasedev.fd, &tmp, 1,
+                vdev->config_offset + PCI_CAPABILITY_LIST);
+    if (ret != 1 || !tmp) {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
+        return -EINVAL;
+    }
+
+    do {
+        if (tmp == 0xC8) {
+            c8_conflict = true;
+        } else if (tmp == 0xD4) {
+            d4_conflict = true;
+        }
+        tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT];
+    } while (tmp);
+
+    if (!c8_conflict) {
+        pos = 0xC8;
+    } else if (!d4_conflict) {
+        pos = 0xD4;
+    } else {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
+        return -EINVAL;
+    }
+
     ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
     if (ret < 0) {
         error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 73874a9..ab6645b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -663,6 +663,8 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
 
     vfio_disable_interrupts(vdev);
 
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
     /*
      * Setting vector notifiers needs to enable route for each vector.
      * Deferring to commit the KVM routes once rather than per vector
@@ -670,8 +672,6 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
      */
     vfio_prepare_kvm_msi_virq_batch(vdev);
 
-    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
-retry:
     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 
     for (i = 0; i < vdev->nr_vectors; i++) {
@@ -3221,7 +3221,12 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 out_deregister:
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
-    kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    if (vdev->irqchip_change_notifier.notify) {
+        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    }
+    if (vdev->intx.mmap_timer) {
+        timer_free(vdev->intx.mmap_timer);
+    }
 out_teardown:
     vfio_teardown_msi(vdev);
     vfio_bars_exit(vdev);
@@ -3347,8 +3352,8 @@ static Property vfio_pci_dev_properties[] = {
                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
-    DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
-                     vbasedev.enable_migration, false),
+    DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
+                            vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
                      vbasedev.ram_block_discard_allowed, false),
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index cfb60c3..ee7509e 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -155,13 +155,15 @@ vfio_load_cleanup(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d"
-vfio_migration_probe(const char *name) " (%s)"
+vfio_migration_realize(const char *name) " (%s)"
 vfio_migration_set_state(const char *name, const char *state) " (%s) state %s"
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_cleanup(const char *name) " (%s)"
 vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
 vfio_save_device_config_state(const char *name) " (%s)"
+vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
-vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64
+vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
+vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index eed244f..93429b9 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -66,6 +66,10 @@ typedef struct VFIOMigration {
     int data_fd;
     void *data_buffer;
     size_t data_buffer_size;
+    uint64_t mig_flags;
+    uint64_t precopy_init_size;
+    uint64_t precopy_dirty_size;
+    bool initial_data_sent;
 } VFIOMigration;
 
 typedef struct VFIOAddressSpace {
@@ -135,7 +139,7 @@ typedef struct VFIODevice {
     bool needs_reset;
     bool no_mmap;
     bool ram_block_discard_allowed;
-    bool enable_migration;
+    OnOffAuto enable_migration;
     VFIODeviceOps *ops;
     unsigned int num_irqs;
     unsigned int num_regions;
@@ -212,6 +216,7 @@ void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
+struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_get_device(VFIOGroup *group, const char *name,
                     VFIODevice *vbasedev, Error **errp);
 
@@ -220,10 +225,11 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
 extern VFIOGroupList vfio_group_list;
 
 bool vfio_mig_active(void);
-int vfio_block_multiple_devices_migration(Error **errp);
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
-int vfio_block_giommu_migration(Error **errp);
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp);
 int64_t vfio_mig_bytes_transferred(void);
+void vfio_reset_bytes_transferred(void);
 
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
diff --git a/include/migration/register.h b/include/migration/register.h
index a8dfd8f..90914f3 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -71,6 +71,8 @@ typedef struct SaveVMHandlers {
     int (*load_cleanup)(void *opaque);
     /* Called when postcopy migration wants to resume from failure */
     int (*resume_prepare)(MigrationState *s, void *opaque);
+    /* Checks if switchover ack should be used. Called only in dest */
+    bool (*switchover_ack_needed)(void *opaque);
 } SaveVMHandlers;
 
 int register_savevm_live(const char *idstr,
diff --git a/migration/migration.c b/migration/migration.c
index dc05c6f..096e819 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -78,6 +78,7 @@ enum mig_rp_message_type {
     MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
     MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
     MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
+    MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
 
     MIG_RP_MSG_MAX
 };
@@ -760,6 +761,11 @@ bool migration_has_all_channels(void)
     return true;
 }
 
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
+{
+    return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
+}
+
 /*
  * Send a 'SHUT' message on the return channel with the given value
  * to indicate that we've finished with the RP.  Non-0 value indicates
@@ -1405,6 +1411,7 @@ void migrate_init(MigrationState *s)
     s->vm_old_state = -1;
     s->iteration_initial_bytes = 0;
     s->threshold_size = 0;
+    s->switchover_acked = false;
 }
 
 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1621,6 +1628,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
      */
     memset(&mig_stats, 0, sizeof(mig_stats));
     memset(&compression_counters, 0, sizeof(compression_counters));
+    reset_vfio_bytes_transferred();
 
     return true;
 }
@@ -1721,6 +1729,7 @@ static struct rp_cmd_args {
     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
+    [MIG_RP_MSG_SWITCHOVER_ACK] = { .len =  0, .name = "SWITCHOVER_ACK" },
     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
 };
 
@@ -1959,6 +1968,11 @@ retry:
             }
             break;
 
+        case MIG_RP_MSG_SWITCHOVER_ACK:
+            ms->switchover_acked = true;
+            trace_source_return_path_thread_switchover_acked();
+            break;
+
         default:
             break;
         }
@@ -2693,6 +2707,20 @@ static void migration_update_counters(MigrationState *s,
                               bandwidth, s->threshold_size);
 }
 
+static bool migration_can_switchover(MigrationState *s)
+{
+    if (!migrate_switchover_ack()) {
+        return true;
+    }
+
+    /* No reason to wait for switchover ACK if VM is stopped */
+    if (!runstate_is_running()) {
+        return true;
+    }
+
+    return s->switchover_acked;
+}
+
 /* Migration thread iteration status */
 typedef enum {
     MIG_ITERATE_RESUME, /* Resume current iteration */
@@ -2708,6 +2736,7 @@ static MigIterateState migration_iteration_run(MigrationState *s)
 {
     uint64_t must_precopy, can_postcopy;
     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+    bool can_switchover = migration_can_switchover(s);
 
     qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
     uint64_t pending_size = must_precopy + can_postcopy;
@@ -2720,14 +2749,14 @@ static MigIterateState migration_iteration_run(MigrationState *s)
         trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
     }
 
-    if (!pending_size || pending_size < s->threshold_size) {
+    if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
         trace_migration_thread_low_pending(pending_size);
         migration_completion(s);
         return MIG_ITERATE_BREAK;
     }
 
     /* Still a significant amount to transfer */
-    if (!in_postcopy && must_precopy <= s->threshold_size &&
+    if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
         qatomic_read(&s->start_postcopy)) {
         if (postcopy_start(s)) {
             error_report("%s: postcopy failed to start", __func__);
diff --git a/migration/migration.h b/migration/migration.h
index 30c3e97..a80b22b 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -210,6 +210,13 @@ struct MigrationIncomingState {
      * contains valid information.
      */
     QemuMutex page_request_mutex;
+
+    /*
+     * Number of devices that have yet to approve switchover. When this reaches
+     * zero an ACK that it's OK to do switchover is sent to the source. No lock
+     * is needed as this field is updated serially.
+     */
+    unsigned int switchover_ack_pending_num;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
@@ -440,6 +447,12 @@ struct MigrationState {
 
     /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
     JSONWriter *vmdesc;
+
+    /*
+     * Indicates whether an ACK from the destination that it's OK to do
+     * switchover has been received.
+     */
+    bool switchover_acked;
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
@@ -480,6 +493,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                  char *block_name);
 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
 
 void dirty_bitmap_mig_before_vm_start(void);
 void dirty_bitmap_mig_cancel_outgoing(void);
@@ -500,6 +514,7 @@ bool migration_rate_limit(void);
 void migration_cancel(const Error *error);
 
 void populate_vfio_info(MigrationInfo *info);
+void reset_vfio_bytes_transferred(void);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 
 #endif
diff --git a/migration/options.c b/migration/options.c
index b62ab30..5a9505a 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -185,6 +185,8 @@ Property migration_properties[] = {
     DEFINE_PROP_MIG_CAP("x-zero-copy-send",
             MIGRATION_CAPABILITY_ZERO_COPY_SEND),
 #endif
+    DEFINE_PROP_MIG_CAP("x-switchover-ack",
+                        MIGRATION_CAPABILITY_SWITCHOVER_ACK),
 
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -308,6 +310,13 @@ bool migrate_return_path(void)
     return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
 }
 
+bool migrate_switchover_ack(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->capabilities[MIGRATION_CAPABILITY_SWITCHOVER_ACK];
+}
+
 bool migrate_validate_uuid(void)
 {
     MigrationState *s = migrate_get_current();
@@ -547,6 +556,14 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
         }
     }
 
+    if (new_caps[MIGRATION_CAPABILITY_SWITCHOVER_ACK]) {
+        if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+            error_setg(errp, "Capability 'switchover-ack' requires capability "
+                             "'return-path'");
+            return false;
+        }
+    }
+
     return true;
 }
diff --git a/migration/options.h b/migration/options.h
index 45991af..9aaf363 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -40,6 +40,7 @@ bool migrate_postcopy_ram(void);
 bool migrate_rdma_pin_all(void);
 bool migrate_release_ram(void);
 bool migrate_return_path(void);
+bool migrate_switchover_ack(void);
 bool migrate_validate_uuid(void);
 bool migrate_xbzrle(void);
 bool migrate_zero_blocks(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index bc28408..95c2abf4 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1622,6 +1622,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     migrate_init(ms);
     memset(&mig_stats, 0, sizeof(mig_stats));
     memset(&compression_counters, 0, sizeof(compression_counters));
+    reset_vfio_bytes_transferred();
     ms->to_dst_file = f;
 
     qemu_mutex_unlock_iothread();
@@ -2360,6 +2361,21 @@ static int loadvm_process_command(QEMUFile *f)
             error_report("CMD_OPEN_RETURN_PATH failed");
             return -1;
         }
+
+        /*
+         * Switchover ack is enabled but no device uses it, so send an ACK to
+         * source that it's OK to switchover. Do it here, after return path has
+         * been created.
+         */
+        if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {
+            int ret = migrate_send_rp_switchover_ack(mis);
+            if (ret) {
+                error_report(
+                    "Could not send switchover ack RP MSG, err %d (%s)", ret,
+                    strerror(-ret));
+                return ret;
+            }
+        }
         break;
 
     case MIG_CMD_PING:
@@ -2586,6 +2602,23 @@ static int qemu_loadvm_state_header(QEMUFile *f)
     return 0;
 }
 
+static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->switchover_ack_needed) {
+            continue;
+        }
+
+        if (se->ops->switchover_ack_needed(se->opaque)) {
+            mis->switchover_ack_pending_num++;
+        }
+    }
+
+    trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
+}
+
 static int qemu_loadvm_state_setup(QEMUFile *f)
 {
     SaveStateEntry *se;
@@ -2789,6 +2822,10 @@ int qemu_loadvm_state(QEMUFile *f)
         return -EINVAL;
     }
 
+    if (migrate_switchover_ack()) {
+        qemu_loadvm_state_switchover_ack_needed(mis);
+    }
+
     cpu_synchronize_all_pre_loadvm();
 
     ret = qemu_loadvm_state_main(f, mis);
@@ -2862,6 +2899,24 @@ int qemu_load_device_state(QEMUFile *f)
     return 0;
 }
 
+int qemu_loadvm_approve_switchover(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    if (!mis->switchover_ack_pending_num) {
+        return -EINVAL;
+    }
+
+    mis->switchover_ack_pending_num--;
+    trace_loadvm_approve_switchover(mis->switchover_ack_pending_num);
+
+    if (mis->switchover_ack_pending_num) {
+        return 0;
+    }
+
+    return migrate_send_rp_switchover_ack(mis);
+}
+
 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
                    bool has_devices, strList *devices, Error **errp)
 {
diff --git a/migration/savevm.h b/migration/savevm.h
index fb63673..e894bbc 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -65,6 +65,7 @@ int qemu_loadvm_state(QEMUFile *f);
 void qemu_loadvm_state_cleanup(void);
 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
+int qemu_loadvm_approve_switchover(void);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                     bool in_postcopy,
                                                     bool inactivate_disks);
diff --git a/migration/target.c b/migration/target.c
index 00ca007..f39c9a8 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -14,12 +14,25 @@
 #include "hw/vfio/vfio-common.h"
 #endif
 
+#ifdef CONFIG_VFIO
 void populate_vfio_info(MigrationInfo *info)
 {
-#ifdef CONFIG_VFIO
     if (vfio_mig_active()) {
         info->vfio = g_malloc0(sizeof(*info->vfio));
         info->vfio->transferred = vfio_mig_bytes_transferred();
     }
-#endif
 }
+
+void reset_vfio_bytes_transferred(void)
+{
+    vfio_reset_bytes_transferred();
+}
+#else
+void populate_vfio_info(MigrationInfo *info)
+{
+}
+
+void reset_vfio_bytes_transferred(void)
+{
+}
+#endif
diff --git a/migration/trace-events b/migration/trace-events
index cdaef7a..5259c10 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -7,6 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
 qemu_loadvm_state_post_main(int ret) "%d"
 qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
 qemu_savevm_send_packaged(void) ""
+loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
 loadvm_state_setup(void) ""
 loadvm_state_cleanup(void) ""
 loadvm_handle_cmd_packaged(unsigned int length) "%u"
@@ -23,6 +24,7 @@ loadvm_postcopy_ram_handle_discard_end(void) ""
 loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
 loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
 loadvm_process_command_ping(uint32_t val) "0x%x"
+loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
 postcopy_ram_listen_thread_exit(void) ""
 postcopy_ram_listen_thread_start(void) ""
 qemu_savevm_send_postcopy_advise(void) ""
@@ -180,6 +182,7 @@ source_return_path_thread_loop_top(void) ""
 source_return_path_thread_pong(uint32_t val) "0x%x"
 source_return_path_thread_shut(uint32_t val) "0x%x"
 source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
+source_return_path_thread_switchover_acked(void) ""
 migration_thread_low_pending(uint64_t pending) "%" PRIu64
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64
 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
diff --git a/qapi/migration.json b/qapi/migration.json
index 5bb5ab8..47dfef0 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -487,6 +487,16 @@
 #     and should not affect the correctness of postcopy migration.
 #     (since 7.1)
 #
+# @switchover-ack: If enabled, migration will not stop the source VM
+#     and complete the migration until an ACK is received from the
+#     destination that it's OK to do so.  Exactly when this ACK is
+#     sent depends on the migrated devices that use this feature.
+#     For example, a device can use it to make sure some of its data
+#     is sent and loaded in the destination before doing switchover.
+#     This can reduce downtime if devices that support this capability
+#     are present.  'return-path' capability must be enabled to use
+#     it.  (since 8.1)
+#
 # Features:
 #
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -502,7 +512,7 @@
            'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
            { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
            'validate-uuid', 'background-snapshot',
-           'zero-copy-send', 'postcopy-preempt'] }
+           'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b0c355b..b9cc194 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1693,6 +1693,33 @@ static void test_precopy_tcp_plain(void)
     test_precopy_common(&args);
 }
 
+static void *test_migrate_switchover_ack_start(QTestState *from, QTestState *to)
+{
+    migrate_set_capability(from, "return-path", true);
+    migrate_set_capability(to, "return-path", true);
+
+    migrate_set_capability(from, "switchover-ack", true);
+    migrate_set_capability(to, "switchover-ack", true);
+
+    return NULL;
+}
+
+static void test_precopy_tcp_switchover_ack(void)
+{
+    MigrateCommon args = {
+        .listen_uri = "tcp:127.0.0.1:0",
+        .start_hook = test_migrate_switchover_ack_start,
+        /*
+         * Source VM must be running in order to consider the switchover ACK
+         * when deciding to do switchover or not.
+         */
+        .live = true,
+    };
+
+    test_precopy_common(&args);
+}
+
 #ifdef CONFIG_GNUTLS
 static void test_precopy_tcp_tls_psk_match(void)
 {
@@ -2737,6 +2764,10 @@ int main(int argc, char **argv)
 #endif /* CONFIG_GNUTLS */
 
     qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain);
+
+    qtest_add_func("/migration/precopy/tcp/plain/switchover-ack",
+                   test_precopy_tcp_switchover_ack);
+
 #ifdef CONFIG_GNUTLS
     qtest_add_func("/migration/precopy/tcp/tls/psk/match",
                    test_precopy_tcp_tls_psk_match);
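
Finally, a note on the GPUDirect Cliques change above: the quirk is still
requested through the existing x-nv-gpudirect-clique vfio-pci property; this
merge only changes where the vendor-specific capability lands in config space
(offset C8h, falling back to D4h when C8h is already occupied, e.g. by MSI-X
on Turing and later GPUs). A hedged usage sketch, with placeholder host
addresses, assigning two GPUs to the same peer-to-peer clique:

    -device vfio-pci,host=0000:41:00.0,x-nv-gpudirect-clique=0 \
    -device vfio-pci,host=0000:42:00.0,x-nv-gpudirect-clique=0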