diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2023-02-20 17:08:05 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2023-02-20 17:08:05 +0000 |
commit | 9b0699ab801405fe5bdf1adea83bceac9ec62f97 (patch) | |
tree | 6abd869ae56a840e6e7235ac4b3304775d6ca141 /hw | |
parent | 0aaf44776e00d9008806a4731a03271f039515a1 (diff) | |
parent | 57edb7e44489ec4d85075acba47223127ecf1521 (diff) | |
download | qemu-9b0699ab801405fe5bdf1adea83bceac9ec62f97.zip qemu-9b0699ab801405fe5bdf1adea83bceac9ec62f97.tar.gz qemu-9b0699ab801405fe5bdf1adea83bceac9ec62f97.tar.bz2 |
Merge tag 'vfio-updates-20230216.0' of https://gitlab.com/alex.williamson/qemu into staging
VFIO updates 2023-02-16
* Initial v2 migration support for vfio (Avihai Horon)
* Add Cédric as vfio reviewer (Cédric Le Goater)
# -----BEGIN PGP SIGNATURE-----
#
# iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmPumhUbHGFsZXgud2ls
# bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsijnMP/0Rz/lsGxym76mXtr5WY
# OR5SDFpifpaUVi+1xTugYFPnZvN+RdnlcLrcp1g8G+lmd4ANqwT0b9XTTTI8WTau
# DhSHW/05WgAOrf/jOSV29oNSf7jtGJZcDbAy8f5NXxwK/IRlJEDJfCaqxwYSyYf1
# nfC0ZwMTrBrA6pzF5OzIJSkhl/uPwlTsBxRnbN86Z22rE128ASjUtj1jir4rPLg0
# ClUn7Rrdk/Y6uXIB9c6TFC+wmG0QAVsklWIeNLUFWUak4H0gqp7AUmMlJV99i5Q7
# 3H4Zjspwn79llvGm4X1QpuLaop2QaIQaW4FTpzRSftelEosjIjkTCMrWTb4MKff1
# cgT0dmC1Hht+zQ0MPbmgeaiwPH/V7r+J9GffG6p2b4itdHmrKVsqKQMSQS/IJFBw
# eiO1rENRXNcTnC29jPUhe1IS1DEwCNkWm9NgJoC5WPJYQXsiEvo4YDH/30FnByXg
# KQdd5OxR7o6qJM5e4PUn4wd9sHsYU8IsIEJdKnynoS9qUdPqv0tJ+tLYWcBhQPJq
# M8R+mDwImMzw0bgurg4607VgL9HJEXna2rgdd5hcMq88M+M5OpmowXlk4TTY4Ha9
# lmWSndYJG6npNY4NXcxbe4x5H8ndvHcO+g3weynsxPFjnL959NzQyWNFXFDBqBg3
# fhNVqYTrMOcEN5uv18o+mnsG
# =oK7/
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 16 Feb 2023 21:03:17 GMT
# gpg: using RSA key 42F6C04E540BD1A99E7B8A90239B9B6E3BB08B22
# gpg: issuer "alex.williamson@redhat.com"
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex@shazbot.org>" [full]
# gpg: aka "Alex Williamson <alwillia@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex.l.williamson@gmail.com>" [full]
# Primary key fingerprint: 42F6 C04E 540B D1A9 9E7B 8A90 239B 9B6E 3BB0 8B22
* tag 'vfio-updates-20230216.0' of https://gitlab.com/alex.williamson/qemu:
MAINTAINERS: Add myself as VFIO reviewer
docs/devel: Align VFIO migration docs to v2 protocol
vfio: Alphabetize migration section of VFIO trace-events file
vfio/migration: Remove VFIO migration protocol v1
vfio/migration: Implement VFIO migration protocol v2
vfio/migration: Rename functions/structs related to v1 protocol
vfio/migration: Move migration v1 logic to vfio_migration_init()
vfio/migration: Block multiple devices migration
vfio/common: Change vfio_devices_all_running_and_saving() logic to equivalent one
vfio/migration: Allow migration without VFIO IOMMU dirty tracking support
vfio/migration: Fix NULL pointer dereference bug
linux-headers: Update to v6.2-rc8
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/vfio/common.c | 92 | ||||
-rw-r--r-- | hw/vfio/migration.c | 736 | ||||
-rw-r--r-- | hw/vfio/trace-events | 28 |
3 files changed, 341 insertions, 515 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 130e5d1..bab83c0 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -40,6 +40,8 @@ #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" +#include "migration/misc.h" +#include "migration/blocker.h" #include "sysemu/tpm.h" VFIOGroupList vfio_group_list = @@ -336,6 +338,58 @@ bool vfio_mig_active(void) return true; } +static Error *multiple_devices_migration_blocker; + +static unsigned int vfio_migratable_device_num(void) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + unsigned int device_num = 0; + + QLIST_FOREACH(group, &vfio_group_list, next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->migration) { + device_num++; + } + } + } + + return device_num; +} + +int vfio_block_multiple_devices_migration(Error **errp) +{ + int ret; + + if (multiple_devices_migration_blocker || + vfio_migratable_device_num() <= 1) { + return 0; + } + + error_setg(&multiple_devices_migration_blocker, + "Migration is currently not supported with multiple " + "VFIO devices"); + ret = migrate_add_blocker(multiple_devices_migration_blocker, errp); + if (ret < 0) { + error_free(multiple_devices_migration_blocker); + multiple_devices_migration_blocker = NULL; + } + + return ret; +} + +void vfio_unblock_multiple_devices_migration(void) +{ + if (!multiple_devices_migration_blocker || + vfio_migratable_device_num() > 1) { + return; + } + + migrate_del_blocker(multiple_devices_migration_blocker); + error_free(multiple_devices_migration_blocker); + multiple_devices_migration_blocker = NULL; +} + static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; @@ -354,8 +408,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return false; } - if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) - && (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) { + if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && + migration->device_state == VFIO_DEVICE_STATE_RUNNING) { return false; } } @@ -363,13 +417,16 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } -static bool vfio_devices_all_running_and_saving(VFIOContainer *container) +/* + * Check if all VFIO devices are running and migration is active, which is + * essentially equivalent to the migration being in pre-copy phase. + */ +static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) { VFIOGroup *group; VFIODevice *vbasedev; - MigrationState *ms = migrate_get_current(); - if (!migration_is_setup_or_active(ms->state)) { + if (!migration_is_active(migrate_get_current())) { return false; } @@ -381,8 +438,7 @@ static bool vfio_devices_all_running_and_saving(VFIOContainer *container) return false; } - if ((migration->device_state & VFIO_DEVICE_STATE_V1_SAVING) && - (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) { + if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) { continue; } else { return false; @@ -461,7 +517,7 @@ static int vfio_dma_unmap(VFIOContainer *container, }; if (iotlb && container->dirty_pages_supported && - vfio_devices_all_running_and_saving(container)) { + vfio_devices_all_running_and_mig_active(container)) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -488,6 +544,12 @@ static int vfio_dma_unmap(VFIOContainer *container, return -errno; } + if (iotlb && vfio_devices_all_running_and_mig_active(container)) { + cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + } + return 0; } @@ -1201,6 +1263,10 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) .argsz = sizeof(dirty), }; + if (!container->dirty_pages_supported) { + return; + } + if (start) { dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; } else { @@ -1236,6 +1302,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, uint64_t pages; int ret; + if (!container->dirty_pages_supported) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; + } + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); @@ -1409,8 +1482,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, { VFIOContainer *container = container_of(listener, VFIOContainer, listener); - if (vfio_listener_skipped_section(section) || - !container->dirty_pages_supported) { + if (vfio_listener_skipped_section(section)) { return; } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 83d2d44..a2c3d9b 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -10,6 +10,7 @@ #include "qemu/osdep.h" #include "qemu/main-loop.h" #include "qemu/cutils.h" +#include "qemu/units.h" #include <linux/vfio.h> #include <sys/ioctl.h> @@ -44,310 +45,124 @@ #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +/* + * This is an arbitrary size based on migration of mlx5 devices, where typically + * total device migration size is on the order of 100s of MB. Testing with + * larger values, e.g. 128MB and 1GB, did not show a performance improvement. + */ +#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB) + static int64_t bytes_transferred; -static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, - off_t off, bool iswrite) +static const char *mig_state_to_str(enum vfio_device_mig_state state) { - int ret; - - ret = iswrite ? pwrite(vbasedev->fd, val, count, off) : - pread(vbasedev->fd, val, count, off); - if (ret < count) { - error_report("vfio_mig_%s %d byte %s: failed at offset 0x%" - HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count, - vbasedev->name, off, strerror(errno)); - return (ret < 0) ? ret : -EINVAL; + switch (state) { + case VFIO_DEVICE_STATE_ERROR: + return "ERROR"; + case VFIO_DEVICE_STATE_STOP: + return "STOP"; + case VFIO_DEVICE_STATE_RUNNING: + return "RUNNING"; + case VFIO_DEVICE_STATE_STOP_COPY: + return "STOP_COPY"; + case VFIO_DEVICE_STATE_RESUMING: + return "RESUMING"; + default: + return "UNKNOWN STATE"; } - return 0; } -static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count, - off_t off, bool iswrite) -{ - int ret, done = 0; - __u8 *tbuf = buf; - - while (count) { - int bytes = 0; - - if (count >= 8 && !(off % 8)) { - bytes = 8; - } else if (count >= 4 && !(off % 4)) { - bytes = 4; - } else if (count >= 2 && !(off % 2)) { - bytes = 2; - } else { - bytes = 1; - } - - ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite); - if (ret) { - return ret; - } - - count -= bytes; - done += bytes; - off += bytes; - tbuf += bytes; - } - return done; -} - -#define vfio_mig_read(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, false) -#define vfio_mig_write(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, true) - -#define VFIO_MIG_STRUCT_OFFSET(f) \ - offsetof(struct vfio_device_migration_info, f) -/* - * Change the device_state register for device @vbasedev. Bits set in @mask - * are preserved, bits set in @value are set, and bits not set in either @mask - * or @value are cleared in device_state. If the register cannot be accessed, - * the resulting state would be invalid, or the device enters an error state, - * an error is returned. - */ - -static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, - uint32_t value) +static int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state) { VFIOMigration *migration = vbasedev->migration; - VFIORegion *region = &migration->region; - off_t dev_state_off = region->fd_offset + - VFIO_MIG_STRUCT_OFFSET(device_state); - uint32_t device_state; + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_mig_state), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_mig_state *mig_state = + (struct vfio_device_feature_mig_state *)feature->data; int ret; - ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), - dev_state_off); - if (ret < 0) { - return ret; - } - - device_state = (device_state & mask) | value; - - if (!VFIO_DEVICE_STATE_VALID(device_state)) { - return -EINVAL; - } - - ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state), - dev_state_off); - if (ret < 0) { - int rret; - - rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), - dev_state_off); - - if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) { - hw_error("%s: Device in error state 0x%x", vbasedev->name, - device_state); - return rret ? rret : -EIO; + feature->argsz = sizeof(buf); + feature->flags = + VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE; + mig_state->device_state = new_state; + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + /* Try to set the device in some good state */ + ret = -errno; + + if (recover_state == VFIO_DEVICE_STATE_ERROR) { + error_report("%s: Failed setting device state to %s, err: %s. " + "Recover state is ERROR. Resetting device", + vbasedev->name, mig_state_to_str(new_state), + strerror(errno)); + + goto reset_device; } - return ret; - } - migration->device_state = device_state; - trace_vfio_migration_set_state(vbasedev->name, device_state); - return 0; -} + error_report( + "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s", + vbasedev->name, mig_state_to_str(new_state), + strerror(errno), mig_state_to_str(recover_state)); -static void *get_data_section_size(VFIORegion *region, uint64_t data_offset, - uint64_t data_size, uint64_t *size) -{ - void *ptr = NULL; - uint64_t limit = 0; - int i; + mig_state->device_state = recover_state; + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + ret = -errno; + error_report( + "%s: Failed setting device in recover state, err: %s. Resetting device", + vbasedev->name, strerror(errno)); - if (!region->mmaps) { - if (size) { - *size = MIN(data_size, region->size - data_offset); + goto reset_device; } - return ptr; - } - for (i = 0; i < region->nr_mmaps; i++) { - VFIOMmap *map = region->mmaps + i; + migration->device_state = recover_state; - if ((data_offset >= map->offset) && - (data_offset < map->offset + map->size)) { + return ret; + } - /* check if data_offset is within sparse mmap areas */ - ptr = map->mmap + data_offset - map->offset; - if (size) { - *size = MIN(data_size, map->offset + map->size - data_offset); - } - break; - } else if ((data_offset < map->offset) && - (!limit || limit > map->offset)) { + migration->device_state = new_state; + if (mig_state->data_fd != -1) { + if (migration->data_fd != -1) { /* - * data_offset is not within sparse mmap areas, find size of - * non-mapped area. Check through all list since region->mmaps list - * is not sorted. + * This can happen if the device is asynchronously reset and + * terminates a data transfer. */ - limit = map->offset; - } - } - - if (!ptr && size) { - *size = limit ? MIN(data_size, limit - data_offset) : data_size; - } - return ptr; -} + error_report("%s: data_fd out of sync", vbasedev->name); + close(mig_state->data_fd); -static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) -{ - VFIOMigration *migration = vbasedev->migration; - VFIORegion *region = &migration->region; - uint64_t data_offset = 0, data_size = 0, sz; - int ret; - - ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), - region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); - if (ret < 0) { - return ret; - } + return -EBADF; + } - ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size), - region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); - if (ret < 0) { - return ret; + migration->data_fd = mig_state->data_fd; } - trace_vfio_save_buffer(vbasedev->name, data_offset, data_size, - migration->pending_bytes); - - qemu_put_be64(f, data_size); - sz = data_size; - - while (sz) { - void *buf; - uint64_t sec_size; - bool buf_allocated = false; - - buf = get_data_section_size(region, data_offset, sz, &sec_size); - - if (!buf) { - buf = g_try_malloc(sec_size); - if (!buf) { - error_report("%s: Error allocating buffer ", __func__); - return -ENOMEM; - } - buf_allocated = true; - - ret = vfio_mig_read(vbasedev, buf, sec_size, - region->fd_offset + data_offset); - if (ret < 0) { - g_free(buf); - return ret; - } - } + trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state)); - qemu_put_buffer(f, buf, sec_size); + return 0; - if (buf_allocated) { - g_free(buf); - } - sz -= sec_size; - data_offset += sec_size; +reset_device: + if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) { + hw_error("%s: Failed resetting device, err: %s", vbasedev->name, + strerror(errno)); } - ret = qemu_file_get_error(f); + migration->device_state = VFIO_DEVICE_STATE_RUNNING; - if (!ret && size) { - *size = data_size; - } - - bytes_transferred += data_size; return ret; } static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t data_size) { - VFIORegion *region = &vbasedev->migration->region; - uint64_t data_offset = 0, size, report_size; - int ret; - - do { - ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), - region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); - if (ret < 0) { - return ret; - } - - if (data_offset + data_size > region->size) { - /* - * If data_size is greater than the data section of migration region - * then iterate the write buffer operation. This case can occur if - * size of migration region at destination is smaller than size of - * migration region at source. - */ - report_size = size = region->size - data_offset; - data_size -= size; - } else { - report_size = size = data_size; - data_size = 0; - } - - trace_vfio_load_state_device_data(vbasedev->name, data_offset, size); - - while (size) { - void *buf; - uint64_t sec_size; - bool buf_alloc = false; - - buf = get_data_section_size(region, data_offset, size, &sec_size); - - if (!buf) { - buf = g_try_malloc(sec_size); - if (!buf) { - error_report("%s: Error allocating buffer ", __func__); - return -ENOMEM; - } - buf_alloc = true; - } - - qemu_get_buffer(f, buf, sec_size); - - if (buf_alloc) { - ret = vfio_mig_write(vbasedev, buf, sec_size, - region->fd_offset + data_offset); - g_free(buf); - - if (ret < 0) { - return ret; - } - } - size -= sec_size; - data_offset += sec_size; - } - - ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size), - region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); - if (ret < 0) { - return ret; - } - } while (data_size); - - return 0; -} - -static int vfio_update_pending(VFIODevice *vbasedev) -{ VFIOMigration *migration = vbasedev->migration; - VFIORegion *region = &migration->region; - uint64_t pending_bytes = 0; int ret; - ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes), - region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes)); - if (ret < 0) { - migration->pending_bytes = 0; - return ret; - } + ret = qemu_file_get_to_fd(f, migration->data_fd, data_size); + trace_vfio_load_state_device_data(vbasedev->name, data_size, ret); - migration->pending_bytes = pending_bytes; - trace_vfio_update_pending(vbasedev->name, pending_bytes); - return 0; + return ret; } static int vfio_save_device_config_state(QEMUFile *f, void *opaque) @@ -398,180 +213,157 @@ static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; - if (migration->region.mmaps) { - vfio_region_unmap(&migration->region); - } + close(migration->data_fd); + migration->data_fd = -1; } -/* ---------------------------------------------------------------------- */ - -static int vfio_save_setup(QEMUFile *f, void *opaque) +static int vfio_query_stop_copy_size(VFIODevice *vbasedev, + uint64_t *stop_copy_size) { - VFIODevice *vbasedev = opaque; - VFIOMigration *migration = vbasedev->migration; - int ret; - - trace_vfio_save_setup(vbasedev->name); + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_mig_data_size), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_mig_data_size *mig_data_size = + (struct vfio_device_feature_mig_data_size *)feature->data; - qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); - - if (migration->region.mmaps) { - /* - * Calling vfio_region_mmap() from migration thread. Memory API called - * from this function require locking the iothread when called from - * outside the main loop thread. - */ - qemu_mutex_lock_iothread(); - ret = vfio_region_mmap(&migration->region); - qemu_mutex_unlock_iothread(); - if (ret) { - error_report("%s: Failed to mmap VFIO migration region: %s", - vbasedev->name, strerror(-ret)); - error_report("%s: Falling back to slow path", vbasedev->name); - } - } + feature->argsz = sizeof(buf); + feature->flags = + VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE; - ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, - VFIO_DEVICE_STATE_V1_SAVING); - if (ret) { - error_report("%s: Failed to set state SAVING", vbasedev->name); - return ret; + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + return -errno; } - qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); - - ret = qemu_file_get_error(f); - if (ret) { - return ret; - } + *stop_copy_size = mig_data_size->stop_copy_length; return 0; } -static void vfio_save_cleanup(void *opaque) +/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */ +static int vfio_save_block(QEMUFile *f, VFIOMigration *migration) { - VFIODevice *vbasedev = opaque; + ssize_t data_size; - vfio_migration_cleanup(vbasedev); - trace_vfio_save_cleanup(vbasedev->name); + data_size = read(migration->data_fd, migration->data_buffer, + migration->data_buffer_size); + if (data_size < 0) { + return -errno; + } + if (data_size == 0) { + return 1; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + qemu_put_be64(f, data_size); + qemu_put_buffer(f, migration->data_buffer, data_size); + bytes_transferred += data_size; + + trace_vfio_save_block(migration->vbasedev->name, data_size); + + return qemu_file_get_error(f); } -static void vfio_state_pending(void *opaque, uint64_t *must_precopy, - uint64_t *can_postcopy) +/* ---------------------------------------------------------------------- */ + +static int vfio_save_setup(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; - int ret; + uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; - ret = vfio_update_pending(vbasedev); - if (ret) { - return; + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); + + vfio_query_stop_copy_size(vbasedev, &stop_copy_size); + migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE, + stop_copy_size); + migration->data_buffer = g_try_malloc0(migration->data_buffer_size); + if (!migration->data_buffer) { + error_report("%s: Failed to allocate migration data buffer", + vbasedev->name); + return -ENOMEM; } - *must_precopy += migration->pending_bytes; + trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size); + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); - trace_vfio_state_pending(vbasedev->name, *must_precopy, *can_postcopy); + return qemu_file_get_error(f); } -static int vfio_save_iterate(QEMUFile *f, void *opaque) +static void vfio_save_cleanup(void *opaque) { VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; - uint64_t data_size; - int ret; - - qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); - - if (migration->pending_bytes == 0) { - ret = vfio_update_pending(vbasedev); - if (ret) { - return ret; - } - - if (migration->pending_bytes == 0) { - qemu_put_be64(f, 0); - qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); - /* indicates data finished, goto complete phase */ - return 1; - } - } - ret = vfio_save_buffer(f, vbasedev, &data_size); - if (ret) { - error_report("%s: vfio_save_buffer failed %s", vbasedev->name, - strerror(errno)); - return ret; - } + g_free(migration->data_buffer); + migration->data_buffer = NULL; + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); +} - qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +/* + * Migration size of VFIO devices can be as little as a few KBs or as big as + * many GBs. This value should be big enough to cover the worst case. + */ +#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB) - ret = qemu_file_get_error(f); - if (ret) { - return ret; - } +/* + * Only exact function is implemented and not estimate function. The reason is + * that during pre-copy phase of migration the estimate function is called + * repeatedly while pending RAM size is over the threshold, thus migration + * can't converge and querying the VFIO device pending data size is useless. + */ +static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, + uint64_t *can_postcopy) +{ + VFIODevice *vbasedev = opaque; + uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE; /* - * Reset pending_bytes as state_pending* are not called during - * savevm or snapshot case, in such case vfio_update_pending() at - * the start of this function updates pending_bytes. + * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is + * reported so downtime limit won't be violated. */ - migration->pending_bytes = 0; - trace_vfio_save_iterate(vbasedev->name, data_size); - return 0; + vfio_query_stop_copy_size(vbasedev, &stop_copy_size); + *must_precopy += stop_copy_size; + + trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy, + stop_copy_size); } static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; - VFIOMigration *migration = vbasedev->migration; - uint64_t data_size; int ret; - ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_V1_RUNNING, - VFIO_DEVICE_STATE_V1_SAVING); - if (ret) { - error_report("%s: Failed to set state STOP and SAVING", - vbasedev->name); - return ret; - } - - ret = vfio_update_pending(vbasedev); + /* We reach here with device state STOP only */ + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP); if (ret) { return ret; } - while (migration->pending_bytes > 0) { - qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); - ret = vfio_save_buffer(f, vbasedev, &data_size); + do { + ret = vfio_save_block(f, vbasedev->migration); if (ret < 0) { - error_report("%s: Failed to save buffer", vbasedev->name); - return ret; - } - - if (data_size == 0) { - break; - } - - ret = vfio_update_pending(vbasedev); - if (ret) { return ret; } - } + } while (!ret); qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); - ret = qemu_file_get_error(f); if (ret) { return ret; } - ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_V1_SAVING, 0); - if (ret) { - error_report("%s: Failed to set state STOPPED", vbasedev->name); - return ret; - } + /* + * If setting the device in STOP state fails, the device should be reset. + * To do so, use ERROR state as a recover state. + */ + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP, + VFIO_DEVICE_STATE_ERROR); + trace_vfio_save_complete_precopy(vbasedev->name, ret); - trace_vfio_save_complete_precopy(vbasedev->name); return ret; } @@ -591,28 +383,9 @@ static void vfio_save_state(QEMUFile *f, void *opaque) static int vfio_load_setup(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; - VFIOMigration *migration = vbasedev->migration; - int ret = 0; - if (migration->region.mmaps) { - ret = vfio_region_mmap(&migration->region); - if (ret) { - error_report("%s: Failed to mmap VFIO migration region %d: %s", - vbasedev->name, migration->region.nr, - strerror(-ret)); - error_report("%s: Falling back to slow path", vbasedev->name); - } - } - - ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK, - VFIO_DEVICE_STATE_V1_RESUMING); - if (ret) { - error_report("%s: Failed to set state RESUMING", vbasedev->name); - if (migration->region.mmaps) { - vfio_region_unmap(&migration->region); - } - } - return ret; + return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, + vbasedev->migration->device_state); } static int vfio_load_cleanup(void *opaque) @@ -621,6 +394,7 @@ static int vfio_load_cleanup(void *opaque) vfio_migration_cleanup(vbasedev); trace_vfio_load_cleanup(vbasedev->name); + return 0; } @@ -678,12 +452,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) return ret; } -static SaveVMHandlers savevm_vfio_handlers = { +static const SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, - .state_pending_exact = vfio_state_pending, - .state_pending_estimate = vfio_state_pending, - .save_live_iterate = vfio_save_iterate, + .state_pending_exact = vfio_state_pending_exact, .save_live_complete_precopy = vfio_save_complete_precopy, .save_state = vfio_save_state, .load_setup = vfio_load_setup, @@ -696,56 +468,33 @@ static SaveVMHandlers savevm_vfio_handlers = { static void vfio_vmstate_change(void *opaque, bool running, RunState state) { VFIODevice *vbasedev = opaque; - VFIOMigration *migration = vbasedev->migration; - uint32_t value, mask; + enum vfio_device_mig_state new_state; int ret; - if (vbasedev->migration->vm_running == running) { - return; - } - if (running) { - /* - * Here device state can have one of _SAVING, _RESUMING or _STOP bit. - * Transition from _SAVING to _RUNNING can happen if there is migration - * failure, in that case clear _SAVING bit. - * Transition from _RESUMING to _RUNNING occurs during resuming - * phase, in that case clear _RESUMING bit. - * In both the above cases, set _RUNNING bit. - */ - mask = ~VFIO_DEVICE_STATE_MASK; - value = VFIO_DEVICE_STATE_V1_RUNNING; + new_state = VFIO_DEVICE_STATE_RUNNING; } else { - /* - * Here device state could be either _RUNNING or _SAVING|_RUNNING. Reset - * _RUNNING bit - */ - mask = ~VFIO_DEVICE_STATE_V1_RUNNING; - - /* - * When VM state transition to stop for savevm command, device should - * start saving data. - */ - if (state == RUN_STATE_SAVE_VM) { - value = VFIO_DEVICE_STATE_V1_SAVING; - } else { - value = 0; - } + new_state = VFIO_DEVICE_STATE_STOP; } - ret = vfio_migration_set_state(vbasedev, mask, value); + /* + * If setting the device in new_state fails, the device should be reset. + * To do so, use ERROR state as a recover state. + */ + ret = vfio_migration_set_state(vbasedev, new_state, + VFIO_DEVICE_STATE_ERROR); if (ret) { /* * Migration should be aborted in this case, but vm_state_notify() * currently does not support reporting failures. */ - error_report("%s: Failed to set device state 0x%x", vbasedev->name, - (migration->device_state & mask) | value); - qemu_file_set_error(migrate_get_current()->to_dst_file, ret); + if (migrate_get_current()->to_dst_file) { + qemu_file_set_error(migrate_get_current()->to_dst_file, ret); + } } - vbasedev->migration->vm_running = running; + trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), - (migration->device_state & mask) | value); + mig_state_to_str(new_state)); } static void vfio_migration_state_notifier(Notifier *notifier, void *data) @@ -754,7 +503,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) VFIOMigration *migration = container_of(notifier, VFIOMigration, migration_state); VFIODevice *vbasedev = migration->vbasedev; - int ret; trace_vfio_migration_state_notifier(vbasedev->name, MigrationStatus_str(s->state)); @@ -764,34 +512,57 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_FAILED: bytes_transferred = 0; - ret = vfio_migration_set_state(vbasedev, - ~(VFIO_DEVICE_STATE_V1_SAVING | - VFIO_DEVICE_STATE_V1_RESUMING), - VFIO_DEVICE_STATE_V1_RUNNING); - if (ret) { - error_report("%s: Failed to set state RUNNING", vbasedev->name); - } + /* + * If setting the device in RUNNING state fails, the device should + * be reset. To do so, use ERROR state as a recover state. + */ + vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING, + VFIO_DEVICE_STATE_ERROR); } } static void vfio_migration_exit(VFIODevice *vbasedev) { - VFIOMigration *migration = vbasedev->migration; - - vfio_region_exit(&migration->region); - vfio_region_finalize(&migration->region); g_free(vbasedev->migration); vbasedev->migration = NULL; } -static int vfio_migration_init(VFIODevice *vbasedev, - struct vfio_region_info *info) +static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_migration), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_migration *mig = + (struct vfio_device_feature_migration *)feature->data; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION; + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + if (errno == ENOTTY) { + error_report("%s: VFIO migration is not supported in kernel", + vbasedev->name); + } else { + error_report("%s: Failed to query VFIO migration support, err: %s", + vbasedev->name, strerror(errno)); + } + + return -errno; + } + + *mig_flags = mig->flags; + + return 0; +} + +static int vfio_migration_init(VFIODevice *vbasedev) { int ret; Object *obj; VFIOMigration *migration; char id[256] = ""; g_autofree char *path = NULL, *oid = NULL; + uint64_t mig_flags = 0; if (!vbasedev->ops->vfio_get_object) { return -EINVAL; @@ -802,27 +573,21 @@ static int vfio_migration_init(VFIODevice *vbasedev, return -EINVAL; } - vbasedev->migration = g_new0(VFIOMigration, 1); - vbasedev->migration->device_state = VFIO_DEVICE_STATE_V1_RUNNING; - vbasedev->migration->vm_running = runstate_is_running(); - - ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region, - info->index, "migration"); + ret = vfio_migration_query_flags(vbasedev, &mig_flags); if (ret) { - error_report("%s: Failed to setup VFIO migration region %d: %s", - vbasedev->name, info->index, strerror(-ret)); - goto err; + return ret; } - if (!vbasedev->migration->region.size) { - error_report("%s: Invalid zero-sized VFIO migration region %d", - vbasedev->name, info->index); - ret = -EINVAL; - goto err; + /* Basic migration functionality must be supported */ + if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) { + return -EOPNOTSUPP; } + vbasedev->migration = g_new0(VFIOMigration, 1); migration = vbasedev->migration; migration->vbasedev = vbasedev; + migration->device_state = VFIO_DEVICE_STATE_RUNNING; + migration->data_fd = -1; oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); if (oid) { @@ -840,11 +605,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, vbasedev); migration->migration_state.notify = vfio_migration_state_notifier; add_migration_state_change_notifier(&migration->migration_state); - return 0; -err: - vfio_migration_exit(vbasedev); - return ret; + return 0; } /* ---------------------------------------------------------------------- */ @@ -856,35 +618,28 @@ int64_t vfio_mig_bytes_transferred(void) int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) { - VFIOContainer *container = vbasedev->group->container; - struct vfio_region_info *info = NULL; int ret = -ENOTSUP; - if (!vbasedev->enable_migration || !container->dirty_pages_supported) { + if (!vbasedev->enable_migration) { goto add_blocker; } - ret = vfio_get_dev_region_info(vbasedev, - VFIO_REGION_TYPE_MIGRATION_DEPRECATED, - VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED, - &info); + ret = vfio_migration_init(vbasedev); if (ret) { goto add_blocker; } - ret = vfio_migration_init(vbasedev, info); + ret = vfio_block_multiple_devices_migration(errp); if (ret) { - goto add_blocker; + return ret; } - trace_vfio_migration_probe(vbasedev->name, info->index); - g_free(info); + trace_vfio_migration_probe(vbasedev->name); return 0; add_blocker: error_setg(&vbasedev->migration_blocker, "VFIO device doesn't support migration"); - g_free(info); ret = migrate_add_blocker(vbasedev->migration_blocker, errp); if (ret < 0) { @@ -903,6 +658,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) qemu_del_vm_change_state_handler(migration->vm_state); unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); vfio_migration_exit(vbasedev); + vfio_unblock_multiple_devices_migration(); } if (vbasedev->migration_blocker) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 90a8aec..669d9fe 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -119,6 +119,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" vfio_dma_unmap_overflow_workaround(void) "" +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 # platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" @@ -148,21 +150,17 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" # migration.c -vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" -vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" -vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" +vfio_load_cleanup(const char *name) " (%s)" +vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 +vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d" +vfio_migration_probe(const char *name) " (%s)" +vfio_migration_set_state(const char *name, const char *state) " (%s) state %s" vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" -vfio_save_setup(const char *name) " (%s)" +vfio_save_block(const char *name, int data_size) " (%s) data_size %d" vfio_save_cleanup(const char *name) " (%s)" -vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64 -vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64 +vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" vfio_save_device_config_state(const char *name) " (%s)" -vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64 -vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" -vfio_save_complete_precopy(const char *name) " (%s)" -vfio_load_device_config_state(const char *name) " (%s)" -vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 -vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 -vfio_load_cleanup(const char *name) " (%s)" -vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 -vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 +vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64 +vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64 +vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" |