aboutsummaryrefslogtreecommitdiff
path: root/hw/vfio/migration-multifd.c
diff options
context:
space:
mode:
Diffstat (limited to 'hw/vfio/migration-multifd.c')
-rw-r--r--hw/vfio/migration-multifd.c685
1 files changed, 685 insertions, 0 deletions
diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
new file mode 100644
index 0000000..850a319
--- /dev/null
+++ b/hw/vfio/migration-multifd.c
@@ -0,0 +1,685 @@
+/*
+ * Multifd VFIO migration
+ *
+ * Copyright (C) 2024,2025 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/vfio/vfio-device.h"
+#include "migration/misc.h"
+#include "qapi/error.h"
+#include "qemu/bswap.h"
+#include "qemu/error-report.h"
+#include "qemu/lockable.h"
+#include "qemu/main-loop.h"
+#include "qemu/thread.h"
+#include "io/channel-buffer.h"
+#include "migration/qemu-file.h"
+#include "migration-multifd.h"
+#include "vfio-migration-internal.h"
+#include "trace.h"
+
+#define VFIO_DEVICE_STATE_CONFIG_STATE (1)
+
+#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
+
+typedef struct VFIODeviceStatePacket {
+ uint32_t version;
+ uint32_t idx;
+ uint32_t flags;
+ uint8_t data[0];
+} QEMU_PACKED VFIODeviceStatePacket;
+
+/* type safety */
+typedef struct VFIOStateBuffers {
+ GArray *array;
+} VFIOStateBuffers;
+
+typedef struct VFIOStateBuffer {
+ bool is_present;
+ char *data;
+ size_t len;
+} VFIOStateBuffer;
+
+typedef struct VFIOMultifd {
+ bool load_bufs_thread_running;
+ bool load_bufs_thread_want_exit;
+
+ VFIOStateBuffers load_bufs;
+ QemuCond load_bufs_buffer_ready_cond;
+ QemuCond load_bufs_thread_finished_cond;
+ QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
+ uint32_t load_buf_idx;
+ uint32_t load_buf_idx_last;
+} VFIOMultifd;
+
+static void vfio_state_buffer_clear(gpointer data)
+{
+ VFIOStateBuffer *lb = data;
+
+ if (!lb->is_present) {
+ return;
+ }
+
+ g_clear_pointer(&lb->data, g_free);
+ lb->is_present = false;
+}
+
+static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
+{
+ bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
+ g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
+}
+
+static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
+{
+ g_clear_pointer(&bufs->array, g_array_unref);
+}
+
+static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
+{
+ assert(bufs->array);
+}
+
+static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
+{
+ return bufs->array->len;
+}
+
+static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
+ unsigned int size)
+{
+ g_array_set_size(bufs->array, size);
+}
+
+static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
+ unsigned int idx)
+{
+ return &g_array_index(bufs->array, VFIOStateBuffer, idx);
+}
+
+/* called with load_bufs_mutex locked */
+static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
+ VFIODeviceStatePacket *packet,
+ size_t packet_total_size,
+ Error **errp)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ VFIOStateBuffer *lb;
+
+ vfio_state_buffers_assert_init(&multifd->load_bufs);
+ if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
+ vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
+ }
+
+ lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
+ if (lb->is_present) {
+ error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
+ vbasedev->name, packet->idx);
+ return false;
+ }
+
+ assert(packet->idx >= multifd->load_buf_idx);
+
+ lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
+ lb->len = packet_total_size - sizeof(*packet);
+ lb->is_present = true;
+
+ return true;
+}
+
+bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
+ Error **errp)
+{
+ VFIODevice *vbasedev = opaque;
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
+
+ if (!vfio_multifd_transfer_enabled(vbasedev)) {
+ error_setg(errp,
+ "%s: got device state packet but not doing multifd transfer",
+ vbasedev->name);
+ return false;
+ }
+
+ assert(multifd);
+
+ if (data_size < sizeof(*packet)) {
+ error_setg(errp, "%s: packet too short at %zu (min is %zu)",
+ vbasedev->name, data_size, sizeof(*packet));
+ return false;
+ }
+
+ packet->version = be32_to_cpu(packet->version);
+ if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
+ error_setg(errp, "%s: packet has unknown version %" PRIu32,
+ vbasedev->name, packet->version);
+ return false;
+ }
+
+ packet->idx = be32_to_cpu(packet->idx);
+ packet->flags = be32_to_cpu(packet->flags);
+
+ if (packet->idx == UINT32_MAX) {
+ error_setg(errp, "%s: packet index is invalid", vbasedev->name);
+ return false;
+ }
+
+ trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
+
+ /*
+ * Holding BQL here would violate the lock order and can cause
+ * a deadlock once we attempt to lock load_bufs_mutex below.
+ */
+ assert(!bql_locked());
+
+ WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
+ /* config state packet should be the last one in the stream */
+ if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
+ multifd->load_buf_idx_last = packet->idx;
+ }
+
+ if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
+ errp)) {
+ return false;
+ }
+
+ qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
+ }
+
+ return true;
+}
+
+static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
+ Error **errp)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ VFIOStateBuffer *lb;
+ g_autoptr(QIOChannelBuffer) bioc = NULL;
+ g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
+ uint64_t mig_header;
+ int ret;
+
+ assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
+ lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
+ assert(lb->is_present);
+
+ bioc = qio_channel_buffer_new(lb->len);
+ qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");
+
+ f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
+ qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);
+
+ ret = qemu_fflush(f_out);
+ if (ret) {
+ error_setg(errp, "%s: load config state flush failed: %d",
+ vbasedev->name, ret);
+ return false;
+ }
+
+ qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
+ f_in = qemu_file_new_input(QIO_CHANNEL(bioc));
+
+ mig_header = qemu_get_be64(f_in);
+ if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
+ error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
+ vbasedev->name, mig_header);
+ return false;
+ }
+
+ bql_lock();
+ ret = vfio_load_device_config_state(f_in, vbasedev);
+ bql_unlock();
+
+ if (ret < 0) {
+ error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
+ vbasedev->name, ret);
+ return false;
+ }
+
+ return true;
+}
+
+static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
+{
+ VFIOStateBuffer *lb;
+ unsigned int bufs_len;
+
+ bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
+ if (multifd->load_buf_idx >= bufs_len) {
+ assert(multifd->load_buf_idx == bufs_len);
+ return NULL;
+ }
+
+ lb = vfio_state_buffers_at(&multifd->load_bufs,
+ multifd->load_buf_idx);
+ if (!lb->is_present) {
+ return NULL;
+ }
+
+ return lb;
+}
+
+static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
+ VFIOStateBuffer *lb,
+ Error **errp)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ g_autofree char *buf = NULL;
+ char *buf_cur;
+ size_t buf_len;
+
+ if (!lb->len) {
+ return true;
+ }
+
+ trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
+ multifd->load_buf_idx);
+
+ /* lb might become re-allocated when we drop the lock */
+ buf = g_steal_pointer(&lb->data);
+ buf_cur = buf;
+ buf_len = lb->len;
+ while (buf_len > 0) {
+ ssize_t wr_ret;
+ int errno_save;
+
+ /*
+ * Loading data to the device takes a while,
+ * drop the lock during this process.
+ */
+ qemu_mutex_unlock(&multifd->load_bufs_mutex);
+ wr_ret = write(migration->data_fd, buf_cur, buf_len);
+ errno_save = errno;
+ qemu_mutex_lock(&multifd->load_bufs_mutex);
+
+ if (wr_ret < 0) {
+ error_setg(errp,
+ "%s: writing state buffer %" PRIu32 " failed: %d",
+ vbasedev->name, multifd->load_buf_idx, errno_save);
+ return false;
+ }
+
+ assert(wr_ret <= buf_len);
+ buf_len -= wr_ret;
+ buf_cur += wr_ret;
+ }
+
+ trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
+ multifd->load_buf_idx);
+
+ return true;
+}
+
+static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
+ bool *should_quit)
+{
+ return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
+}
+
+/*
+ * This thread is spawned by vfio_multifd_switchover_start() which gets
+ * called upon encountering the switchover point marker in main migration
+ * stream.
+ *
+ * It exits after either:
+ * * completing loading the remaining device state and device config, OR:
+ * * encountering some error while doing the above, OR:
+ * * being forcefully aborted by the migration core by it setting should_quit
+ * or by vfio_load_cleanup_load_bufs_thread() setting
+ * multifd->load_bufs_thread_want_exit.
+ */
+static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
+{
+ VFIODevice *vbasedev = opaque;
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ bool ret = false;
+
+ trace_vfio_load_bufs_thread_start(vbasedev->name);
+
+ assert(multifd);
+ QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
+
+ assert(multifd->load_bufs_thread_running);
+
+ while (true) {
+ VFIOStateBuffer *lb;
+
+ /*
+ * Always check cancellation first after the buffer_ready wait below in
+ * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
+ */
+ if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
+ error_setg(errp, "operation cancelled");
+ goto thread_exit;
+ }
+
+ assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
+
+ lb = vfio_load_state_buffer_get(multifd);
+ if (!lb) {
+ trace_vfio_load_state_device_buffer_starved(vbasedev->name,
+ multifd->load_buf_idx);
+ qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
+ &multifd->load_bufs_mutex);
+ continue;
+ }
+
+ if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
+ break;
+ }
+
+ if (multifd->load_buf_idx == 0) {
+ trace_vfio_load_state_device_buffer_start(vbasedev->name);
+ }
+
+ if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
+ goto thread_exit;
+ }
+
+ if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
+ trace_vfio_load_state_device_buffer_end(vbasedev->name);
+ }
+
+ multifd->load_buf_idx++;
+ }
+
+ if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
+ goto thread_exit;
+ }
+
+ ret = true;
+
+thread_exit:
+ /*
+ * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
+ * this thread is exiting.
+ */
+ multifd->load_bufs_thread_running = false;
+ qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
+
+ trace_vfio_load_bufs_thread_end(vbasedev->name);
+
+ return ret;
+}
+
+static VFIOMultifd *vfio_multifd_new(void)
+{
+ VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
+
+ vfio_state_buffers_init(&multifd->load_bufs);
+
+ qemu_mutex_init(&multifd->load_bufs_mutex);
+
+ multifd->load_buf_idx = 0;
+ multifd->load_buf_idx_last = UINT32_MAX;
+ qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
+
+ multifd->load_bufs_thread_running = false;
+ multifd->load_bufs_thread_want_exit = false;
+ qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
+
+ return multifd;
+}
+
+/*
+ * Terminates vfio_load_bufs_thread by setting
+ * multifd->load_bufs_thread_want_exit and signalling all the conditions
+ * the thread could be blocked on.
+ *
+ * Waits for the thread to signal that it had finished.
+ */
+static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
+{
+ /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
+ bql_unlock();
+ WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
+ while (multifd->load_bufs_thread_running) {
+ multifd->load_bufs_thread_want_exit = true;
+
+ qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
+ qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
+ &multifd->load_bufs_mutex);
+ }
+ }
+ bql_lock();
+}
+
+static void vfio_multifd_free(VFIOMultifd *multifd)
+{
+ vfio_load_cleanup_load_bufs_thread(multifd);
+
+ qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
+ vfio_state_buffers_destroy(&multifd->load_bufs);
+ qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
+ qemu_mutex_destroy(&multifd->load_bufs_mutex);
+
+ g_free(multifd);
+}
+
+void vfio_multifd_cleanup(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+
+ g_clear_pointer(&migration->multifd, vfio_multifd_free);
+}
+
+bool vfio_multifd_transfer_supported(void)
+{
+ return multifd_device_state_supported() &&
+ migrate_send_switchover_start();
+}
+
+bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+
+ return migration->multifd_transfer;
+}
+
+bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
+{
+ VFIOMigration *migration = vbasedev->migration;
+
+ /*
+ * Make a copy of this setting at the start in case it is changed
+ * mid-migration.
+ */
+ if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
+ migration->multifd_transfer = vfio_multifd_transfer_supported();
+ } else {
+ migration->multifd_transfer =
+ vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
+ }
+
+ if (!vfio_multifd_transfer_enabled(vbasedev)) {
+ /* Nothing further to check or do */
+ return true;
+ }
+
+ if (!vfio_multifd_transfer_supported()) {
+ error_setg(errp,
+ "%s: Multifd device transfer requested but unsupported in the current config",
+ vbasedev->name);
+ return false;
+ }
+
+ if (alloc_multifd) {
+ assert(!migration->multifd);
+ migration->multifd = vfio_multifd_new();
+ }
+
+ return true;
+}
+
+void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
+{
+ assert(vfio_multifd_transfer_enabled(vbasedev));
+
+ /*
+ * Emit dummy NOP data on the main migration channel since the actual
+ * device state transfer is done via multifd channels.
+ */
+ qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+}
+
+static bool
+vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
+ char *idstr,
+ uint32_t instance_id,
+ uint32_t idx,
+ Error **errp)
+{
+ g_autoptr(QIOChannelBuffer) bioc = NULL;
+ g_autoptr(QEMUFile) f = NULL;
+ int ret;
+ g_autofree VFIODeviceStatePacket *packet = NULL;
+ size_t packet_len;
+
+ bioc = qio_channel_buffer_new(0);
+ qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
+
+ f = qemu_file_new_output(QIO_CHANNEL(bioc));
+
+ if (vfio_save_device_config_state(f, vbasedev, errp)) {
+ return false;
+ }
+
+ ret = qemu_fflush(f);
+ if (ret) {
+ error_setg(errp, "%s: save config state flush failed: %d",
+ vbasedev->name, ret);
+ return false;
+ }
+
+ packet_len = sizeof(*packet) + bioc->usage;
+ packet = g_malloc0(packet_len);
+ packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
+ packet->idx = cpu_to_be32(idx);
+ packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
+ memcpy(&packet->data, bioc->data, bioc->usage);
+
+ if (!multifd_queue_device_state(idstr, instance_id,
+ (char *)packet, packet_len)) {
+ error_setg(errp, "%s: multifd config data queuing failed",
+ vbasedev->name);
+ return false;
+ }
+
+ vfio_migration_add_bytes_transferred(packet_len);
+
+ return true;
+}
+
+/*
+ * This thread is spawned by the migration core directly via
+ * .save_live_complete_precopy_thread SaveVMHandler.
+ *
+ * It exits after either:
+ * * completing saving the remaining device state and device config, OR:
+ * * encountering some error while doing the above, OR:
+ * * being forcefully aborted by the migration core by
+ * multifd_device_state_save_thread_should_exit() returning true.
+ */
+bool
+vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
+ Error **errp)
+{
+ VFIODevice *vbasedev = d->handler_opaque;
+ VFIOMigration *migration = vbasedev->migration;
+ bool ret = false;
+ g_autofree VFIODeviceStatePacket *packet = NULL;
+ uint32_t idx;
+
+ if (!vfio_multifd_transfer_enabled(vbasedev)) {
+ /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
+ return true;
+ }
+
+ trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
+ d->idstr, d->instance_id);
+
+ /* We reach here with device state STOP or STOP_COPY only */
+ if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
+ VFIO_DEVICE_STATE_STOP, errp)) {
+ goto thread_exit;
+ }
+
+ packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
+ packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
+
+ for (idx = 0; ; idx++) {
+ ssize_t data_size;
+ size_t packet_size;
+
+ if (multifd_device_state_save_thread_should_exit()) {
+ error_setg(errp, "operation cancelled");
+ goto thread_exit;
+ }
+
+ data_size = read(migration->data_fd, &packet->data,
+ migration->data_buffer_size);
+ if (data_size < 0) {
+ error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
+ vbasedev->name, idx, errno);
+ goto thread_exit;
+ } else if (data_size == 0) {
+ break;
+ }
+
+ packet->idx = cpu_to_be32(idx);
+ packet_size = sizeof(*packet) + data_size;
+
+ if (!multifd_queue_device_state(d->idstr, d->instance_id,
+ (char *)packet, packet_size)) {
+ error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
+ goto thread_exit;
+ }
+
+ vfio_migration_add_bytes_transferred(packet_size);
+ }
+
+ if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
+ d->idstr,
+ d->instance_id,
+ idx, errp)) {
+ goto thread_exit;
+ }
+
+ ret = true;
+
+thread_exit:
+ trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);
+
+ return ret;
+}
+
+int vfio_multifd_switchover_start(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+
+ assert(multifd);
+
+ /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
+ bql_unlock();
+ WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
+ assert(!multifd->load_bufs_thread_running);
+ multifd->load_bufs_thread_running = true;
+ }
+ bql_lock();
+
+ qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
+
+ return 0;
+}