diff options
Diffstat (limited to 'hw/vfio/migration-multifd.c')
-rw-r--r-- | hw/vfio/migration-multifd.c | 685 |
1 files changed, 685 insertions, 0 deletions
diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c new file mode 100644 index 0000000..850a319 --- /dev/null +++ b/hw/vfio/migration-multifd.c @@ -0,0 +1,685 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/vfio/vfio-device.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "qemu/bswap.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "io/channel-buffer.h" +#include "migration/qemu-file.h" +#include "migration-multifd.h" +#include "vfio-migration-internal.h" +#include "trace.h" + +#define VFIO_DEVICE_STATE_CONFIG_STATE (1) + +#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) + +typedef struct VFIODeviceStatePacket { + uint32_t version; + uint32_t idx; + uint32_t flags; + uint8_t data[0]; +} QEMU_PACKED VFIODeviceStatePacket; + +/* type safety */ +typedef struct VFIOStateBuffers { + GArray *array; +} VFIOStateBuffers; + +typedef struct VFIOStateBuffer { + bool is_present; + char *data; + size_t len; +} VFIOStateBuffer; + +typedef struct VFIOMultifd { + bool load_bufs_thread_running; + bool load_bufs_thread_want_exit; + + VFIOStateBuffers load_bufs; + QemuCond load_bufs_buffer_ready_cond; + QemuCond load_bufs_thread_finished_cond; + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ + uint32_t load_buf_idx; + uint32_t load_buf_idx_last; +} VFIOMultifd; + +static void vfio_state_buffer_clear(gpointer data) +{ + VFIOStateBuffer *lb = data; + + if (!lb->is_present) { + return; + } + + g_clear_pointer(&lb->data, g_free); + lb->is_present = false; +} + +static void vfio_state_buffers_init(VFIOStateBuffers *bufs) +{ + bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); + g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); +} + +static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) +{ + g_clear_pointer(&bufs->array, g_array_unref); +} + +static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) +{ + assert(bufs->array); +} + +static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) +{ + return bufs->array->len; +} + +static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, + unsigned int size) +{ + g_array_set_size(bufs->array, size); +} + +static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, + unsigned int idx) +{ + return &g_array_index(bufs->array, VFIOStateBuffer, idx); +} + +/* called with load_bufs_mutex locked */ +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, + VFIODeviceStatePacket *packet, + size_t packet_total_size, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + + vfio_state_buffers_assert_init(&multifd->load_bufs); + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); + if (lb->is_present) { + error_setg(errp, "%s: state buffer %" PRIu32 " already filled", + vbasedev->name, packet->idx); + return false; + } + + assert(packet->idx >= multifd->load_buf_idx); + + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); + lb->len = packet_total_size - sizeof(*packet); + lb->is_present = true; + + return true; +} + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_setg(errp, + "%s: got device state packet but not doing multifd transfer", + vbasedev->name); + return false; + } + + assert(multifd); + + if (data_size < sizeof(*packet)) { + error_setg(errp, "%s: packet too short at %zu (min is %zu)", + vbasedev->name, data_size, sizeof(*packet)); + return false; + } + + packet->version = be32_to_cpu(packet->version); + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { + error_setg(errp, "%s: packet has unknown version %" PRIu32, + vbasedev->name, packet->version); + return false; + } + + packet->idx = be32_to_cpu(packet->idx); + packet->flags = be32_to_cpu(packet->flags); + + if (packet->idx == UINT32_MAX) { + error_setg(errp, "%s: packet index is invalid", vbasedev->name); + return false; + } + + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); + + /* + * Holding BQL here would violate the lock order and can cause + * a deadlock once we attempt to lock load_bufs_mutex below. + */ + assert(!bql_locked()); + + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + /* config state packet should be the last one in the stream */ + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { + multifd->load_buf_idx_last = packet->idx; + } + + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, + errp)) { + return false; + } + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + } + + return true; +} + +static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; + uint64_t mig_header; + int ret; + + assert(multifd->load_buf_idx == multifd->load_buf_idx_last); + lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); + assert(lb->is_present); + + bioc = qio_channel_buffer_new(lb->len); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); + + f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); + qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); + + ret = qemu_fflush(f_out); + if (ret) { + error_setg(errp, "%s: load config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); + f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); + + mig_header = qemu_get_be64(f_in); + if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { + error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, + vbasedev->name, mig_header); + return false; + } + + bql_lock(); + ret = vfio_load_device_config_state(f_in, vbasedev); + bql_unlock(); + + if (ret < 0) { + error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", + vbasedev->name, ret); + return false; + } + + return true; +} + +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) +{ + VFIOStateBuffer *lb; + unsigned int bufs_len; + + bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); + if (multifd->load_buf_idx >= bufs_len) { + assert(multifd->load_buf_idx == bufs_len); + return NULL; + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, + multifd->load_buf_idx); + if (!lb->is_present) { + return NULL; + } + + return lb; +} + +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, + VFIOStateBuffer *lb, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + g_autofree char *buf = NULL; + char *buf_cur; + size_t buf_len; + + if (!lb->len) { + return true; + } + + trace_vfio_load_state_device_buffer_load_start(vbasedev->name, + multifd->load_buf_idx); + + /* lb might become re-allocated when we drop the lock */ + buf = g_steal_pointer(&lb->data); + buf_cur = buf; + buf_len = lb->len; + while (buf_len > 0) { + ssize_t wr_ret; + int errno_save; + + /* + * Loading data to the device takes a while, + * drop the lock during this process. + */ + qemu_mutex_unlock(&multifd->load_bufs_mutex); + wr_ret = write(migration->data_fd, buf_cur, buf_len); + errno_save = errno; + qemu_mutex_lock(&multifd->load_bufs_mutex); + + if (wr_ret < 0) { + error_setg(errp, + "%s: writing state buffer %" PRIu32 " failed: %d", + vbasedev->name, multifd->load_buf_idx, errno_save); + return false; + } + + assert(wr_ret <= buf_len); + buf_len -= wr_ret; + buf_cur += wr_ret; + } + + trace_vfio_load_state_device_buffer_load_end(vbasedev->name, + multifd->load_buf_idx); + + return true; +} + +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, + bool *should_quit) +{ + return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); +} + +/* + * This thread is spawned by vfio_multifd_switchover_start() which gets + * called upon encountering the switchover point marker in main migration + * stream. + * + * It exits after either: + * * completing loading the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by it setting should_quit + * or by vfio_load_cleanup_load_bufs_thread() setting + * multifd->load_bufs_thread_want_exit. + */ +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + bool ret = false; + + trace_vfio_load_bufs_thread_start(vbasedev->name); + + assert(multifd); + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); + + assert(multifd->load_bufs_thread_running); + + while (true) { + VFIOStateBuffer *lb; + + /* + * Always check cancellation first after the buffer_ready wait below in + * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); + + lb = vfio_load_state_buffer_get(multifd); + if (!lb) { + trace_vfio_load_state_device_buffer_starved(vbasedev->name, + multifd->load_buf_idx); + qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, + &multifd->load_bufs_mutex); + continue; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last) { + break; + } + + if (multifd->load_buf_idx == 0) { + trace_vfio_load_state_device_buffer_start(vbasedev->name); + } + + if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { + goto thread_exit; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { + trace_vfio_load_state_device_buffer_end(vbasedev->name); + } + + multifd->load_buf_idx++; + } + + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + /* + * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that + * this thread is exiting. + */ + multifd->load_bufs_thread_running = false; + qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); + + trace_vfio_load_bufs_thread_end(vbasedev->name); + + return ret; +} + +static VFIOMultifd *vfio_multifd_new(void) +{ + VFIOMultifd *multifd = g_new(VFIOMultifd, 1); + + vfio_state_buffers_init(&multifd->load_bufs); + + qemu_mutex_init(&multifd->load_bufs_mutex); + + multifd->load_buf_idx = 0; + multifd->load_buf_idx_last = UINT32_MAX; + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + + multifd->load_bufs_thread_running = false; + multifd->load_bufs_thread_want_exit = false; + qemu_cond_init(&multifd->load_bufs_thread_finished_cond); + + return multifd; +} + +/* + * Terminates vfio_load_bufs_thread by setting + * multifd->load_bufs_thread_want_exit and signalling all the conditions + * the thread could be blocked on. + * + * Waits for the thread to signal that it had finished. + */ +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) +{ + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + while (multifd->load_bufs_thread_running) { + multifd->load_bufs_thread_want_exit = true; + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, + &multifd->load_bufs_mutex); + } + } + bql_lock(); +} + +static void vfio_multifd_free(VFIOMultifd *multifd) +{ + vfio_load_cleanup_load_bufs_thread(multifd); + + qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); + vfio_state_buffers_destroy(&multifd->load_bufs); + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); + qemu_mutex_destroy(&multifd->load_bufs_mutex); + + g_free(multifd); +} + +void vfio_multifd_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + g_clear_pointer(&migration->multifd, vfio_multifd_free); +} + +bool vfio_multifd_transfer_supported(void) +{ + return multifd_device_state_supported() && + migrate_send_switchover_start(); +} + +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->multifd_transfer; +} + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + + /* + * Make a copy of this setting at the start in case it is changed + * mid-migration. + */ + if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { + migration->multifd_transfer = vfio_multifd_transfer_supported(); + } else { + migration->multifd_transfer = + vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; + } + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing further to check or do */ + return true; + } + + if (!vfio_multifd_transfer_supported()) { + error_setg(errp, + "%s: Multifd device transfer requested but unsupported in the current config", + vbasedev->name); + return false; + } + + if (alloc_multifd) { + assert(!migration->multifd); + migration->multifd = vfio_multifd_new(); + } + + return true; +} + +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) +{ + assert(vfio_multifd_transfer_enabled(vbasedev)); + + /* + * Emit dummy NOP data on the main migration channel since the actual + * device state transfer is done via multifd channels. + */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +} + +static bool +vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, + char *idstr, + uint32_t instance_id, + uint32_t idx, + Error **errp) +{ + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f = NULL; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + size_t packet_len; + + bioc = qio_channel_buffer_new(0); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); + + f = qemu_file_new_output(QIO_CHANNEL(bioc)); + + if (vfio_save_device_config_state(f, vbasedev, errp)) { + return false; + } + + ret = qemu_fflush(f); + if (ret) { + error_setg(errp, "%s: save config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + packet_len = sizeof(*packet) + bioc->usage; + packet = g_malloc0(packet_len); + packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); + packet->idx = cpu_to_be32(idx); + packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE); + memcpy(&packet->data, bioc->data, bioc->usage); + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_len)) { + error_setg(errp, "%s: multifd config data queuing failed", + vbasedev->name); + return false; + } + + vfio_migration_add_bytes_transferred(packet_len); + + return true; +} + +/* + * This thread is spawned by the migration core directly via + * .save_live_complete_precopy_thread SaveVMHandler. + * + * It exits after either: + * * completing saving the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by + * multifd_device_state_save_thread_should_exit() returning true. + */ +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp) +{ + VFIODevice *vbasedev = d->handler_opaque; + VFIOMigration *migration = vbasedev->migration; + bool ret = false; + g_autofree VFIODeviceStatePacket *packet = NULL; + uint32_t idx; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ + return true; + } + + trace_vfio_save_complete_precopy_thread_start(vbasedev->name, + d->idstr, d->instance_id); + + /* We reach here with device state STOP or STOP_COPY only */ + if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP, errp)) { + goto thread_exit; + } + + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); + packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); + + for (idx = 0; ; idx++) { + ssize_t data_size; + size_t packet_size; + + if (multifd_device_state_save_thread_should_exit()) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + data_size = read(migration->data_fd, &packet->data, + migration->data_buffer_size); + if (data_size < 0) { + error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", + vbasedev->name, idx, errno); + goto thread_exit; + } else if (data_size == 0) { + break; + } + + packet->idx = cpu_to_be32(idx); + packet_size = sizeof(*packet) + data_size; + + if (!multifd_queue_device_state(d->idstr, d->instance_id, + (char *)packet, packet_size)) { + error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); + goto thread_exit; + } + + vfio_migration_add_bytes_transferred(packet_size); + } + + if (!vfio_save_complete_precopy_thread_config_state(vbasedev, + d->idstr, + d->instance_id, + idx, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); + + return ret; +} + +int vfio_multifd_switchover_start(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + assert(!multifd->load_bufs_thread_running); + multifd->load_bufs_thread_running = true; + } + bql_lock(); + + qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); + + return 0; +} |