aboutsummaryrefslogtreecommitdiff
path: root/hw
diff options
context:
space:
mode:
Diffstat (limited to 'hw')
-rw-r--r--hw/vfio/common.c92
-rw-r--r--hw/vfio/migration.c736
-rw-r--r--hw/vfio/trace-events28
3 files changed, 341 insertions, 515 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 130e5d1..bab83c0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -40,6 +40,8 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"
+#include "migration/misc.h"
+#include "migration/blocker.h"
#include "sysemu/tpm.h"
VFIOGroupList vfio_group_list =
@@ -336,6 +338,58 @@ bool vfio_mig_active(void)
return true;
}
+static Error *multiple_devices_migration_blocker;
+
+/*
+ * Count the VFIO devices that have a migration object attached.
+ *
+ * Walks every group in vfio_group_list and every device in each group's
+ * device_list; a device is counted when its ->migration pointer is non-NULL.
+ */
+static unsigned int vfio_migratable_device_num(void)
+{
+    unsigned int count = 0;
+    VFIODevice *dev;
+    VFIOGroup *grp;
+
+    QLIST_FOREACH(grp, &vfio_group_list, next) {
+        QLIST_FOREACH(dev, &grp->device_list, next) {
+            count += dev->migration ? 1 : 0;
+        }
+    }
+
+    return count;
+}
+
+/*
+ * Register a migration blocker if more than one migratable VFIO device exists.
+ *
+ * Nothing is done (and 0 is returned) when the blocker is already registered
+ * or when vfio_migratable_device_num() reports at most one device.
+ *
+ * Otherwise returns the result of migrate_add_blocker(); if that result is
+ * negative the freshly created blocker Error is freed and the global pointer
+ * is cleared again, so a later call can retry.
+ */
+int vfio_block_multiple_devices_migration(Error **errp)
+{
+    int rc;
+
+    if (multiple_devices_migration_blocker) {
+        return 0;
+    }
+    if (vfio_migratable_device_num() <= 1) {
+        return 0;
+    }
+
+    error_setg(&multiple_devices_migration_blocker,
+               "Migration is currently not supported with multiple "
+               "VFIO devices");
+    rc = migrate_add_blocker(multiple_devices_migration_blocker, errp);
+    if (rc >= 0) {
+        return rc;
+    }
+
+    /* Registration failed: drop the blocker we just allocated. */
+    error_free(multiple_devices_migration_blocker);
+    multiple_devices_migration_blocker = NULL;
+
+    return rc;
+}
+
+/*
+ * Remove the multiple-devices migration blocker once it is no longer needed.
+ *
+ * The blocker is deleted and freed only if it is currently registered and
+ * vfio_migratable_device_num() reports at most one device; otherwise this
+ * function does nothing.
+ */
+void vfio_unblock_multiple_devices_migration(void)
+{
+    if (multiple_devices_migration_blocker &&
+        vfio_migratable_device_num() <= 1) {
+        migrate_del_blocker(multiple_devices_migration_blocker);
+        error_free(multiple_devices_migration_blocker);
+        multiple_devices_migration_blocker = NULL;
+    }
+}
+
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
VFIOGroup *group;
@@ -354,8 +408,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return false;
}
- if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
- && (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) {
+ if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
+ migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
return false;
}
}
@@ -363,13 +417,16 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return true;
}
-static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
+/*
+ * Check if all VFIO devices are running and migration is active, which is
+ * essentially equivalent to the migration being in pre-copy phase.
+ */
+static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
{
VFIOGroup *group;
VFIODevice *vbasedev;
- MigrationState *ms = migrate_get_current();
- if (!migration_is_setup_or_active(ms->state)) {
+ if (!migration_is_active(migrate_get_current())) {
return false;
}
@@ -381,8 +438,7 @@ static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
return false;
}
- if ((migration->device_state & VFIO_DEVICE_STATE_V1_SAVING) &&
- (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) {
+ if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
continue;
} else {
return false;
@@ -461,7 +517,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
};
if (iotlb && container->dirty_pages_supported &&
- vfio_devices_all_running_and_saving(container)) {
+ vfio_devices_all_running_and_mig_active(container)) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@@ -488,6 +544,12 @@ static int vfio_dma_unmap(VFIOContainer *container,
return -errno;
}
+ if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
+ cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
+ tcg_enabled() ? DIRTY_CLIENTS_ALL :
+ DIRTY_CLIENTS_NOCODE);
+ }
+
return 0;
}
@@ -1201,6 +1263,10 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
.argsz = sizeof(dirty),
};
+ if (!container->dirty_pages_supported) {
+ return;
+ }
+
if (start) {
dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
} else {
@@ -1236,6 +1302,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t pages;
int ret;
+ if (!container->dirty_pages_supported) {
+ cpu_physical_memory_set_dirty_range(ram_addr, size,
+ tcg_enabled() ? DIRTY_CLIENTS_ALL :
+ DIRTY_CLIENTS_NOCODE);
+ return 0;
+ }
+
dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
@@ -1409,8 +1482,7 @@ static void vfio_listener_log_sync(MemoryListener *listener,
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
- if (vfio_listener_skipped_section(section) ||
- !container->dirty_pages_supported) {
+ if (vfio_listener_skipped_section(section)) {
return;
}
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 83d2d44..a2c3d9b 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -10,6 +10,7 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
+#include "qemu/units.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>
@@ -44,310 +45,124 @@
#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL)
+/*
+ * This is an arbitrary size based on migration of mlx5 devices, where typically
+ * total device migration size is on the order of 100s of MB. Testing with
+ * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
+ */
+#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
+
static int64_t bytes_transferred;
-static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
- off_t off, bool iswrite)
+static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
- int ret;
-
- ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
- pread(vbasedev->fd, val, count, off);
- if (ret < count) {
- error_report("vfio_mig_%s %d byte %s: failed at offset 0x%"
- HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count,
- vbasedev->name, off, strerror(errno));
- return (ret < 0) ? ret : -EINVAL;
+ switch (state) {
+ case VFIO_DEVICE_STATE_ERROR:
+ return "ERROR";
+ case VFIO_DEVICE_STATE_STOP:
+ return "STOP";
+ case VFIO_DEVICE_STATE_RUNNING:
+ return "RUNNING";
+ case VFIO_DEVICE_STATE_STOP_COPY:
+ return "STOP_COPY";
+ case VFIO_DEVICE_STATE_RESUMING:
+ return "RESUMING";
+ default:
+ return "UNKNOWN STATE";
}
- return 0;
}
-static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
- off_t off, bool iswrite)
-{
- int ret, done = 0;
- __u8 *tbuf = buf;
-
- while (count) {
- int bytes = 0;
-
- if (count >= 8 && !(off % 8)) {
- bytes = 8;
- } else if (count >= 4 && !(off % 4)) {
- bytes = 4;
- } else if (count >= 2 && !(off % 2)) {
- bytes = 2;
- } else {
- bytes = 1;
- }
-
- ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
- if (ret) {
- return ret;
- }
-
- count -= bytes;
- done += bytes;
- off += bytes;
- tbuf += bytes;
- }
- return done;
-}
-
-#define vfio_mig_read(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, false)
-#define vfio_mig_write(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, true)
-
-#define VFIO_MIG_STRUCT_OFFSET(f) \
- offsetof(struct vfio_device_migration_info, f)
-/*
- * Change the device_state register for device @vbasedev. Bits set in @mask
- * are preserved, bits set in @value are set, and bits not set in either @mask
- * or @value are cleared in device_state. If the register cannot be accessed,
- * the resulting state would be invalid, or the device enters an error state,
- * an error is returned.
- */
-
-static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
- uint32_t value)
+static int vfio_migration_set_state(VFIODevice *vbasedev,
+ enum vfio_device_mig_state new_state,
+ enum vfio_device_mig_state recover_state)
{
VFIOMigration *migration = vbasedev->migration;
- VFIORegion *region = &migration->region;
- off_t dev_state_off = region->fd_offset +
- VFIO_MIG_STRUCT_OFFSET(device_state);
- uint32_t device_state;
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+ sizeof(struct vfio_device_feature_mig_state),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ struct vfio_device_feature_mig_state *mig_state =
+ (struct vfio_device_feature_mig_state *)feature->data;
int ret;
- ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
- dev_state_off);
- if (ret < 0) {
- return ret;
- }
-
- device_state = (device_state & mask) | value;
-
- if (!VFIO_DEVICE_STATE_VALID(device_state)) {
- return -EINVAL;
- }
-
- ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
- dev_state_off);
- if (ret < 0) {
- int rret;
-
- rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
- dev_state_off);
-
- if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) {
- hw_error("%s: Device in error state 0x%x", vbasedev->name,
- device_state);
- return rret ? rret : -EIO;
+ feature->argsz = sizeof(buf);
+ feature->flags =
+ VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
+ mig_state->device_state = new_state;
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ /* Try to set the device in some good state */
+ ret = -errno;
+
+ if (recover_state == VFIO_DEVICE_STATE_ERROR) {
+ error_report("%s: Failed setting device state to %s, err: %s. "
+ "Recover state is ERROR. Resetting device",
+ vbasedev->name, mig_state_to_str(new_state),
+ strerror(errno));
+
+ goto reset_device;
}
- return ret;
- }
- migration->device_state = device_state;
- trace_vfio_migration_set_state(vbasedev->name, device_state);
- return 0;
-}
+ error_report(
+ "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
+ vbasedev->name, mig_state_to_str(new_state),
+ strerror(errno), mig_state_to_str(recover_state));
-static void *get_data_section_size(VFIORegion *region, uint64_t data_offset,
- uint64_t data_size, uint64_t *size)
-{
- void *ptr = NULL;
- uint64_t limit = 0;
- int i;
+ mig_state->device_state = recover_state;
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ ret = -errno;
+ error_report(
+ "%s: Failed setting device in recover state, err: %s. Resetting device",
+ vbasedev->name, strerror(errno));
- if (!region->mmaps) {
- if (size) {
- *size = MIN(data_size, region->size - data_offset);
+ goto reset_device;
}
- return ptr;
- }
- for (i = 0; i < region->nr_mmaps; i++) {
- VFIOMmap *map = region->mmaps + i;
+ migration->device_state = recover_state;
- if ((data_offset >= map->offset) &&
- (data_offset < map->offset + map->size)) {
+ return ret;
+ }
- /* check if data_offset is within sparse mmap areas */
- ptr = map->mmap + data_offset - map->offset;
- if (size) {
- *size = MIN(data_size, map->offset + map->size - data_offset);
- }
- break;
- } else if ((data_offset < map->offset) &&
- (!limit || limit > map->offset)) {
+ migration->device_state = new_state;
+ if (mig_state->data_fd != -1) {
+ if (migration->data_fd != -1) {
/*
- * data_offset is not within sparse mmap areas, find size of
- * non-mapped area. Check through all list since region->mmaps list
- * is not sorted.
+ * This can happen if the device is asynchronously reset and
+ * terminates a data transfer.
*/
- limit = map->offset;
- }
- }
-
- if (!ptr && size) {
- *size = limit ? MIN(data_size, limit - data_offset) : data_size;
- }
- return ptr;
-}
+ error_report("%s: data_fd out of sync", vbasedev->name);
+ close(mig_state->data_fd);
-static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
-{
- VFIOMigration *migration = vbasedev->migration;
- VFIORegion *region = &migration->region;
- uint64_t data_offset = 0, data_size = 0, sz;
- int ret;
-
- ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
- if (ret < 0) {
- return ret;
- }
+ return -EBADF;
+ }
- ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
- if (ret < 0) {
- return ret;
+ migration->data_fd = mig_state->data_fd;
}
- trace_vfio_save_buffer(vbasedev->name, data_offset, data_size,
- migration->pending_bytes);
-
- qemu_put_be64(f, data_size);
- sz = data_size;
-
- while (sz) {
- void *buf;
- uint64_t sec_size;
- bool buf_allocated = false;
-
- buf = get_data_section_size(region, data_offset, sz, &sec_size);
-
- if (!buf) {
- buf = g_try_malloc(sec_size);
- if (!buf) {
- error_report("%s: Error allocating buffer ", __func__);
- return -ENOMEM;
- }
- buf_allocated = true;
-
- ret = vfio_mig_read(vbasedev, buf, sec_size,
- region->fd_offset + data_offset);
- if (ret < 0) {
- g_free(buf);
- return ret;
- }
- }
+ trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));
- qemu_put_buffer(f, buf, sec_size);
+ return 0;
- if (buf_allocated) {
- g_free(buf);
- }
- sz -= sec_size;
- data_offset += sec_size;
+reset_device:
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
+ hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
+ strerror(errno));
}
- ret = qemu_file_get_error(f);
+ migration->device_state = VFIO_DEVICE_STATE_RUNNING;
- if (!ret && size) {
- *size = data_size;
- }
-
- bytes_transferred += data_size;
return ret;
}
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
uint64_t data_size)
{
- VFIORegion *region = &vbasedev->migration->region;
- uint64_t data_offset = 0, size, report_size;
- int ret;
-
- do {
- ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
- if (ret < 0) {
- return ret;
- }
-
- if (data_offset + data_size > region->size) {
- /*
- * If data_size is greater than the data section of migration region
- * then iterate the write buffer operation. This case can occur if
- * size of migration region at destination is smaller than size of
- * migration region at source.
- */
- report_size = size = region->size - data_offset;
- data_size -= size;
- } else {
- report_size = size = data_size;
- data_size = 0;
- }
-
- trace_vfio_load_state_device_data(vbasedev->name, data_offset, size);
-
- while (size) {
- void *buf;
- uint64_t sec_size;
- bool buf_alloc = false;
-
- buf = get_data_section_size(region, data_offset, size, &sec_size);
-
- if (!buf) {
- buf = g_try_malloc(sec_size);
- if (!buf) {
- error_report("%s: Error allocating buffer ", __func__);
- return -ENOMEM;
- }
- buf_alloc = true;
- }
-
- qemu_get_buffer(f, buf, sec_size);
-
- if (buf_alloc) {
- ret = vfio_mig_write(vbasedev, buf, sec_size,
- region->fd_offset + data_offset);
- g_free(buf);
-
- if (ret < 0) {
- return ret;
- }
- }
- size -= sec_size;
- data_offset += sec_size;
- }
-
- ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
- if (ret < 0) {
- return ret;
- }
- } while (data_size);
-
- return 0;
-}
-
-static int vfio_update_pending(VFIODevice *vbasedev)
-{
VFIOMigration *migration = vbasedev->migration;
- VFIORegion *region = &migration->region;
- uint64_t pending_bytes = 0;
int ret;
- ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes));
- if (ret < 0) {
- migration->pending_bytes = 0;
- return ret;
- }
+ ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
+ trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);
- migration->pending_bytes = pending_bytes;
- trace_vfio_update_pending(vbasedev->name, pending_bytes);
- return 0;
+ return ret;
}
static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
@@ -398,180 +213,157 @@ static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
VFIOMigration *migration = vbasedev->migration;
- if (migration->region.mmaps) {
- vfio_region_unmap(&migration->region);
- }
+ close(migration->data_fd);
+ migration->data_fd = -1;
}
-/* ---------------------------------------------------------------------- */
-
-static int vfio_save_setup(QEMUFile *f, void *opaque)
+static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
+ uint64_t *stop_copy_size)
{
- VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- int ret;
-
- trace_vfio_save_setup(vbasedev->name);
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+ sizeof(struct vfio_device_feature_mig_data_size),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ struct vfio_device_feature_mig_data_size *mig_data_size =
+ (struct vfio_device_feature_mig_data_size *)feature->data;
- qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
-
- if (migration->region.mmaps) {
- /*
- * Calling vfio_region_mmap() from migration thread. Memory API called
- * from this function require locking the iothread when called from
- * outside the main loop thread.
- */
- qemu_mutex_lock_iothread();
- ret = vfio_region_mmap(&migration->region);
- qemu_mutex_unlock_iothread();
- if (ret) {
- error_report("%s: Failed to mmap VFIO migration region: %s",
- vbasedev->name, strerror(-ret));
- error_report("%s: Falling back to slow path", vbasedev->name);
- }
- }
+ feature->argsz = sizeof(buf);
+ feature->flags =
+ VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
- ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK,
- VFIO_DEVICE_STATE_V1_SAVING);
- if (ret) {
- error_report("%s: Failed to set state SAVING", vbasedev->name);
- return ret;
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ return -errno;
}
- qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
-
- ret = qemu_file_get_error(f);
- if (ret) {
- return ret;
- }
+ *stop_copy_size = mig_data_size->stop_copy_length;
return 0;
}
-static void vfio_save_cleanup(void *opaque)
+/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
+static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
- VFIODevice *vbasedev = opaque;
+ ssize_t data_size;
- vfio_migration_cleanup(vbasedev);
- trace_vfio_save_cleanup(vbasedev->name);
+ data_size = read(migration->data_fd, migration->data_buffer,
+ migration->data_buffer_size);
+ if (data_size < 0) {
+ return -errno;
+ }
+ if (data_size == 0) {
+ return 1;
+ }
+
+ qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
+ qemu_put_be64(f, data_size);
+ qemu_put_buffer(f, migration->data_buffer, data_size);
+ bytes_transferred += data_size;
+
+ trace_vfio_save_block(migration->vbasedev->name, data_size);
+
+ return qemu_file_get_error(f);
}
-static void vfio_state_pending(void *opaque, uint64_t *must_precopy,
- uint64_t *can_postcopy)
+/* ---------------------------------------------------------------------- */
+
+static int vfio_save_setup(QEMUFile *f, void *opaque)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
- int ret;
+ uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
- ret = vfio_update_pending(vbasedev);
- if (ret) {
- return;
+ qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
+
+ vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
+ migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
+ stop_copy_size);
+ migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
+ if (!migration->data_buffer) {
+ error_report("%s: Failed to allocate migration data buffer",
+ vbasedev->name);
+ return -ENOMEM;
}
- *must_precopy += migration->pending_bytes;
+ trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
+
+ qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
- trace_vfio_state_pending(vbasedev->name, *must_precopy, *can_postcopy);
+ return qemu_file_get_error(f);
}
-static int vfio_save_iterate(QEMUFile *f, void *opaque)
+static void vfio_save_cleanup(void *opaque)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
- uint64_t data_size;
- int ret;
-
- qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
-
- if (migration->pending_bytes == 0) {
- ret = vfio_update_pending(vbasedev);
- if (ret) {
- return ret;
- }
-
- if (migration->pending_bytes == 0) {
- qemu_put_be64(f, 0);
- qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
- /* indicates data finished, goto complete phase */
- return 1;
- }
- }
- ret = vfio_save_buffer(f, vbasedev, &data_size);
- if (ret) {
- error_report("%s: vfio_save_buffer failed %s", vbasedev->name,
- strerror(errno));
- return ret;
- }
+ g_free(migration->data_buffer);
+ migration->data_buffer = NULL;
+ vfio_migration_cleanup(vbasedev);
+ trace_vfio_save_cleanup(vbasedev->name);
+}
- qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+/*
+ * Migration size of VFIO devices can be as little as a few KBs or as big as
+ * many GBs. This value should be big enough to cover the worst case.
+ */
+#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
- ret = qemu_file_get_error(f);
- if (ret) {
- return ret;
- }
+/*
+ * Only exact function is implemented and not estimate function. The reason is
+ * that during pre-copy phase of migration the estimate function is called
+ * repeatedly while pending RAM size is over the threshold, thus migration
+ * can't converge and querying the VFIO device pending data size is useless.
+ */
+static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
+ uint64_t *can_postcopy)
+{
+ VFIODevice *vbasedev = opaque;
+ uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
/*
- * Reset pending_bytes as state_pending* are not called during
- * savevm or snapshot case, in such case vfio_update_pending() at
- * the start of this function updates pending_bytes.
+ * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
+ * reported so downtime limit won't be violated.
*/
- migration->pending_bytes = 0;
- trace_vfio_save_iterate(vbasedev->name, data_size);
- return 0;
+ vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
+ *must_precopy += stop_copy_size;
+
+ trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
+ stop_copy_size);
}
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- uint64_t data_size;
int ret;
- ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_V1_RUNNING,
- VFIO_DEVICE_STATE_V1_SAVING);
- if (ret) {
- error_report("%s: Failed to set state STOP and SAVING",
- vbasedev->name);
- return ret;
- }
-
- ret = vfio_update_pending(vbasedev);
+ /* We reach here with device state STOP only */
+ ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
+ VFIO_DEVICE_STATE_STOP);
if (ret) {
return ret;
}
- while (migration->pending_bytes > 0) {
- qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
- ret = vfio_save_buffer(f, vbasedev, &data_size);
+ do {
+ ret = vfio_save_block(f, vbasedev->migration);
if (ret < 0) {
- error_report("%s: Failed to save buffer", vbasedev->name);
- return ret;
- }
-
- if (data_size == 0) {
- break;
- }
-
- ret = vfio_update_pending(vbasedev);
- if (ret) {
return ret;
}
- }
+ } while (!ret);
qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
-
ret = qemu_file_get_error(f);
if (ret) {
return ret;
}
- ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_V1_SAVING, 0);
- if (ret) {
- error_report("%s: Failed to set state STOPPED", vbasedev->name);
- return ret;
- }
+ /*
+ * If setting the device in STOP state fails, the device should be reset.
+ * To do so, use ERROR state as a recover state.
+ */
+ ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
+ VFIO_DEVICE_STATE_ERROR);
+ trace_vfio_save_complete_precopy(vbasedev->name, ret);
- trace_vfio_save_complete_precopy(vbasedev->name);
return ret;
}
@@ -591,28 +383,9 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
static int vfio_load_setup(QEMUFile *f, void *opaque)
{
VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- int ret = 0;
- if (migration->region.mmaps) {
- ret = vfio_region_mmap(&migration->region);
- if (ret) {
- error_report("%s: Failed to mmap VFIO migration region %d: %s",
- vbasedev->name, migration->region.nr,
- strerror(-ret));
- error_report("%s: Falling back to slow path", vbasedev->name);
- }
- }
-
- ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK,
- VFIO_DEVICE_STATE_V1_RESUMING);
- if (ret) {
- error_report("%s: Failed to set state RESUMING", vbasedev->name);
- if (migration->region.mmaps) {
- vfio_region_unmap(&migration->region);
- }
- }
- return ret;
+ return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
+ vbasedev->migration->device_state);
}
static int vfio_load_cleanup(void *opaque)
@@ -621,6 +394,7 @@ static int vfio_load_cleanup(void *opaque)
vfio_migration_cleanup(vbasedev);
trace_vfio_load_cleanup(vbasedev->name);
+
return 0;
}
@@ -678,12 +452,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
return ret;
}
-static SaveVMHandlers savevm_vfio_handlers = {
+static const SaveVMHandlers savevm_vfio_handlers = {
.save_setup = vfio_save_setup,
.save_cleanup = vfio_save_cleanup,
- .state_pending_exact = vfio_state_pending,
- .state_pending_estimate = vfio_state_pending,
- .save_live_iterate = vfio_save_iterate,
+ .state_pending_exact = vfio_state_pending_exact,
.save_live_complete_precopy = vfio_save_complete_precopy,
.save_state = vfio_save_state,
.load_setup = vfio_load_setup,
@@ -696,56 +468,33 @@ static SaveVMHandlers savevm_vfio_handlers = {
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- uint32_t value, mask;
+ enum vfio_device_mig_state new_state;
int ret;
- if (vbasedev->migration->vm_running == running) {
- return;
- }
-
if (running) {
- /*
- * Here device state can have one of _SAVING, _RESUMING or _STOP bit.
- * Transition from _SAVING to _RUNNING can happen if there is migration
- * failure, in that case clear _SAVING bit.
- * Transition from _RESUMING to _RUNNING occurs during resuming
- * phase, in that case clear _RESUMING bit.
- * In both the above cases, set _RUNNING bit.
- */
- mask = ~VFIO_DEVICE_STATE_MASK;
- value = VFIO_DEVICE_STATE_V1_RUNNING;
+ new_state = VFIO_DEVICE_STATE_RUNNING;
} else {
- /*
- * Here device state could be either _RUNNING or _SAVING|_RUNNING. Reset
- * _RUNNING bit
- */
- mask = ~VFIO_DEVICE_STATE_V1_RUNNING;
-
- /*
- * When VM state transition to stop for savevm command, device should
- * start saving data.
- */
- if (state == RUN_STATE_SAVE_VM) {
- value = VFIO_DEVICE_STATE_V1_SAVING;
- } else {
- value = 0;
- }
+ new_state = VFIO_DEVICE_STATE_STOP;
}
- ret = vfio_migration_set_state(vbasedev, mask, value);
+ /*
+ * If setting the device in new_state fails, the device should be reset.
+ * To do so, use ERROR state as a recover state.
+ */
+ ret = vfio_migration_set_state(vbasedev, new_state,
+ VFIO_DEVICE_STATE_ERROR);
if (ret) {
/*
* Migration should be aborted in this case, but vm_state_notify()
* currently does not support reporting failures.
*/
- error_report("%s: Failed to set device state 0x%x", vbasedev->name,
- (migration->device_state & mask) | value);
- qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
+ if (migrate_get_current()->to_dst_file) {
+ qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
+ }
}
- vbasedev->migration->vm_running = running;
+
trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
- (migration->device_state & mask) | value);
+ mig_state_to_str(new_state));
}
static void vfio_migration_state_notifier(Notifier *notifier, void *data)
@@ -754,7 +503,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
VFIOMigration *migration = container_of(notifier, VFIOMigration,
migration_state);
VFIODevice *vbasedev = migration->vbasedev;
- int ret;
trace_vfio_migration_state_notifier(vbasedev->name,
MigrationStatus_str(s->state));
@@ -764,34 +512,57 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_FAILED:
bytes_transferred = 0;
- ret = vfio_migration_set_state(vbasedev,
- ~(VFIO_DEVICE_STATE_V1_SAVING |
- VFIO_DEVICE_STATE_V1_RESUMING),
- VFIO_DEVICE_STATE_V1_RUNNING);
- if (ret) {
- error_report("%s: Failed to set state RUNNING", vbasedev->name);
- }
+ /*
+ * If setting the device in RUNNING state fails, the device should
+ * be reset. To do so, use ERROR state as a recover state.
+ */
+ vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
+ VFIO_DEVICE_STATE_ERROR);
}
}
static void vfio_migration_exit(VFIODevice *vbasedev)
{
- VFIOMigration *migration = vbasedev->migration;
-
- vfio_region_exit(&migration->region);
- vfio_region_finalize(&migration->region);
g_free(vbasedev->migration);
vbasedev->migration = NULL;
}
-static int vfio_migration_init(VFIODevice *vbasedev,
- struct vfio_region_info *info)
+/*
+ * Query the migration feature flags of @vbasedev.
+ *
+ * Issues the VFIO_DEVICE_FEATURE ioctl with
+ * VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION on the device fd.
+ *
+ * @vbasedev:  device to query
+ * @mig_flags: on success, receives the flags reported by the ioctl;
+ *             left untouched on failure
+ *
+ * Returns 0 on success. On failure the error is reported and the negated
+ * errno value of the failed ioctl is returned.
+ */
+static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
+{
+    /*
+     * uint64_t-typed scratch buffer big enough for the feature header
+     * followed by its migration payload.
+     */
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+                              sizeof(struct vfio_device_feature_migration),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+    struct vfio_device_feature_migration *mig =
+        (struct vfio_device_feature_migration *)feature->data;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
+    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+        /*
+         * Capture errno immediately: error_report() may call library
+         * functions that overwrite it, which would make us return the
+         * wrong error code (possibly even 0).
+         */
+        int err = errno;
+
+        if (err == ENOTTY) {
+            error_report("%s: VFIO migration is not supported in kernel",
+                         vbasedev->name);
+        } else {
+            error_report("%s: Failed to query VFIO migration support, err: %s",
+                         vbasedev->name, strerror(err));
+        }
+
+        return -err;
+    }
+
+    *mig_flags = mig->flags;
+
+    return 0;
+}
+
+static int vfio_migration_init(VFIODevice *vbasedev)
{
int ret;
Object *obj;
VFIOMigration *migration;
char id[256] = "";
g_autofree char *path = NULL, *oid = NULL;
+ uint64_t mig_flags = 0;
if (!vbasedev->ops->vfio_get_object) {
return -EINVAL;
@@ -802,27 +573,21 @@ static int vfio_migration_init(VFIODevice *vbasedev,
return -EINVAL;
}
- vbasedev->migration = g_new0(VFIOMigration, 1);
- vbasedev->migration->device_state = VFIO_DEVICE_STATE_V1_RUNNING;
- vbasedev->migration->vm_running = runstate_is_running();
-
- ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
- info->index, "migration");
+ ret = vfio_migration_query_flags(vbasedev, &mig_flags);
if (ret) {
- error_report("%s: Failed to setup VFIO migration region %d: %s",
- vbasedev->name, info->index, strerror(-ret));
- goto err;
+ return ret;
}
- if (!vbasedev->migration->region.size) {
- error_report("%s: Invalid zero-sized VFIO migration region %d",
- vbasedev->name, info->index);
- ret = -EINVAL;
- goto err;
+ /* Basic migration functionality must be supported */
+ if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
+ return -EOPNOTSUPP;
}
+ vbasedev->migration = g_new0(VFIOMigration, 1);
migration = vbasedev->migration;
migration->vbasedev = vbasedev;
+ migration->device_state = VFIO_DEVICE_STATE_RUNNING;
+ migration->data_fd = -1;
oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
if (oid) {
@@ -840,11 +605,8 @@ static int vfio_migration_init(VFIODevice *vbasedev,
vbasedev);
migration->migration_state.notify = vfio_migration_state_notifier;
add_migration_state_change_notifier(&migration->migration_state);
- return 0;
-err:
- vfio_migration_exit(vbasedev);
- return ret;
+ return 0;
}
/* ---------------------------------------------------------------------- */
@@ -856,35 +618,28 @@ int64_t vfio_mig_bytes_transferred(void)
int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
{
- VFIOContainer *container = vbasedev->group->container;
- struct vfio_region_info *info = NULL;
int ret = -ENOTSUP;
- if (!vbasedev->enable_migration || !container->dirty_pages_supported) {
+ if (!vbasedev->enable_migration) {
goto add_blocker;
}
- ret = vfio_get_dev_region_info(vbasedev,
- VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
- VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
- &info);
+ ret = vfio_migration_init(vbasedev);
if (ret) {
goto add_blocker;
}
- ret = vfio_migration_init(vbasedev, info);
+ ret = vfio_block_multiple_devices_migration(errp);
if (ret) {
- goto add_blocker;
+ return ret;
}
- trace_vfio_migration_probe(vbasedev->name, info->index);
- g_free(info);
+ trace_vfio_migration_probe(vbasedev->name);
return 0;
add_blocker:
error_setg(&vbasedev->migration_blocker,
"VFIO device doesn't support migration");
- g_free(info);
ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
if (ret < 0) {
@@ -903,6 +658,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
qemu_del_vm_change_state_handler(migration->vm_state);
unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
vfio_migration_exit(vbasedev);
+ vfio_unblock_multiple_devices_migration();
}
if (vbasedev->migration_blocker) {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 90a8aec..669d9fe 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -119,6 +119,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
vfio_dma_unmap_overflow_workaround(void) ""
+vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
+vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
# platform.c
vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
@@ -148,21 +150,17 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u"
vfio_display_edid_write_error(void) ""
# migration.c
-vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
-vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
-vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
+vfio_load_cleanup(const char *name) " (%s)"
+vfio_load_device_config_state(const char *name) " (%s)"
+vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
+vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d"
+vfio_migration_probe(const char *name) " (%s)"
+vfio_migration_set_state(const char *name, const char *state) " (%s) state %s"
vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
-vfio_save_setup(const char *name) " (%s)"
+vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
vfio_save_cleanup(const char *name) " (%s)"
-vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64
-vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64
+vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
vfio_save_device_config_state(const char *name) " (%s)"
-vfio_state_pending(const char *name, uint64_t precopy, uint64_t postcopy) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64
-vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
-vfio_save_complete_precopy(const char *name) " (%s)"
-vfio_load_device_config_state(const char *name) " (%s)"
-vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
-vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
-vfio_load_cleanup(const char *name) " (%s)"
-vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
-vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
+vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64
+vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"