/* * Multifd VFIO migration * * Copyright (C) 2024,2025 Oracle and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. * * SPDX-License-Identifier: GPL-2.0-or-later */ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" #include "migration/misc.h" #include "qapi/error.h" #include "qemu/bswap.h" #include "qemu/error-report.h" #include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/thread.h" #include "io/channel-buffer.h" #include "migration/qemu-file.h" #include "migration-multifd.h" #include "vfio-migration-internal.h" #include "trace.h" #define VFIO_DEVICE_STATE_CONFIG_STATE (1) #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) typedef struct VFIODeviceStatePacket { uint32_t version; uint32_t idx; uint32_t flags; uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; /* type safety */ typedef struct VFIOStateBuffers { GArray *array; } VFIOStateBuffers; typedef struct VFIOStateBuffer { bool is_present; char *data; size_t len; } VFIOStateBuffer; typedef struct VFIOMultifd { bool load_bufs_thread_running; bool load_bufs_thread_want_exit; VFIOStateBuffers load_bufs; QemuCond load_bufs_buffer_ready_cond; QemuCond load_bufs_thread_finished_cond; QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ uint32_t load_buf_idx; uint32_t load_buf_idx_last; } VFIOMultifd; static void vfio_state_buffer_clear(gpointer data) { VFIOStateBuffer *lb = data; if (!lb->is_present) { return; } g_clear_pointer(&lb->data, g_free); lb->is_present = false; } static void vfio_state_buffers_init(VFIOStateBuffers *bufs) { bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); } static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) { g_clear_pointer(&bufs->array, g_array_unref); } static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) { assert(bufs->array); } static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) { return bufs->array->len; } static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, unsigned int size) { g_array_set_size(bufs->array, size); } static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, unsigned int idx) { return &g_array_index(bufs->array, VFIOStateBuffer, idx); } /* called with load_bufs_mutex locked */ static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, VFIODeviceStatePacket *packet, size_t packet_total_size, Error **errp) { VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; VFIOStateBuffer *lb; vfio_state_buffers_assert_init(&multifd->load_bufs); if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); } lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); if (lb->is_present) { error_setg(errp, "%s: state buffer %" PRIu32 " already filled", vbasedev->name, packet->idx); return false; } assert(packet->idx >= multifd->load_buf_idx); lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); lb->len = packet_total_size - sizeof(*packet); lb->is_present = true; return true; } bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, Error **errp) { VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; if (!vfio_multifd_transfer_enabled(vbasedev)) { error_setg(errp, "%s: got device state packet but not doing multifd transfer", vbasedev->name); return false; } assert(multifd); if (data_size < sizeof(*packet)) { error_setg(errp, "%s: packet too short at %zu (min is %zu)", vbasedev->name, data_size, sizeof(*packet)); return false; } packet->version = be32_to_cpu(packet->version); if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { error_setg(errp, "%s: packet has unknown version %" PRIu32, vbasedev->name, packet->version); return false; } packet->idx = be32_to_cpu(packet->idx); packet->flags = be32_to_cpu(packet->flags); if (packet->idx == UINT32_MAX) { error_setg(errp, "%s: packet index is invalid", vbasedev->name); return false; } trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); /* * Holding BQL here would violate the lock order and can cause * a deadlock once we attempt to lock load_bufs_mutex below. */ assert(!bql_locked()); WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { /* config state packet should be the last one in the stream */ if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { multifd->load_buf_idx_last = packet->idx; } if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, errp)) { return false; } qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); } return true; } static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, Error **errp) { VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; VFIOStateBuffer *lb; g_autoptr(QIOChannelBuffer) bioc = NULL; g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; uint64_t mig_header; int ret; assert(multifd->load_buf_idx == multifd->load_buf_idx_last); lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); assert(lb->is_present); bioc = qio_channel_buffer_new(lb->len); qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); ret = qemu_fflush(f_out); if (ret) { error_setg(errp, "%s: load config state flush failed: %d", vbasedev->name, ret); return false; } qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); mig_header = qemu_get_be64(f_in); if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, vbasedev->name, mig_header); return false; } bql_lock(); ret = vfio_load_device_config_state(f_in, vbasedev); bql_unlock(); if (ret < 0) { error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", vbasedev->name, ret); return false; } return true; } static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) { VFIOStateBuffer *lb; unsigned int bufs_len; bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); if (multifd->load_buf_idx >= bufs_len) { assert(multifd->load_buf_idx == bufs_len); return NULL; } lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); if (!lb->is_present) { return NULL; } return lb; } static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, VFIOStateBuffer *lb, Error **errp) { VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; g_autofree char *buf = NULL; char *buf_cur; size_t buf_len; if (!lb->len) { return true; } trace_vfio_load_state_device_buffer_load_start(vbasedev->name, multifd->load_buf_idx); /* lb might become re-allocated when we drop the lock */ buf = g_steal_pointer(&lb->data); buf_cur = buf; buf_len = lb->len; while (buf_len > 0) { ssize_t wr_ret; int errno_save; /* * Loading data to the device takes a while, * drop the lock during this process. */ qemu_mutex_unlock(&multifd->load_bufs_mutex); wr_ret = write(migration->data_fd, buf_cur, buf_len); errno_save = errno; qemu_mutex_lock(&multifd->load_bufs_mutex); if (wr_ret < 0) { error_setg(errp, "%s: writing state buffer %" PRIu32 " failed: %d", vbasedev->name, multifd->load_buf_idx, errno_save); return false; } assert(wr_ret <= buf_len); buf_len -= wr_ret; buf_cur += wr_ret; } trace_vfio_load_state_device_buffer_load_end(vbasedev->name, multifd->load_buf_idx); return true; } static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, bool *should_quit) { return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); } /* * This thread is spawned by vfio_multifd_switchover_start() which gets * called upon encountering the switchover point marker in main migration * stream. * * It exits after either: * * completing loading the remaining device state and device config, OR: * * encountering some error while doing the above, OR: * * being forcefully aborted by the migration core by it setting should_quit * or by vfio_load_cleanup_load_bufs_thread() setting * multifd->load_bufs_thread_want_exit. */ static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) { VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; bool ret = false; trace_vfio_load_bufs_thread_start(vbasedev->name); assert(multifd); QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); assert(multifd->load_bufs_thread_running); while (true) { VFIOStateBuffer *lb; /* * Always check cancellation first after the buffer_ready wait below in * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). */ if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { error_setg(errp, "operation cancelled"); goto thread_exit; } assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); lb = vfio_load_state_buffer_get(multifd); if (!lb) { trace_vfio_load_state_device_buffer_starved(vbasedev->name, multifd->load_buf_idx); qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, &multifd->load_bufs_mutex); continue; } if (multifd->load_buf_idx == multifd->load_buf_idx_last) { break; } if (multifd->load_buf_idx == 0) { trace_vfio_load_state_device_buffer_start(vbasedev->name); } if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { goto thread_exit; } if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { trace_vfio_load_state_device_buffer_end(vbasedev->name); } multifd->load_buf_idx++; } if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { goto thread_exit; } ret = true; thread_exit: /* * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that * this thread is exiting. */ multifd->load_bufs_thread_running = false; qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); trace_vfio_load_bufs_thread_end(vbasedev->name); return ret; } static VFIOMultifd *vfio_multifd_new(void) { VFIOMultifd *multifd = g_new(VFIOMultifd, 1); vfio_state_buffers_init(&multifd->load_bufs); qemu_mutex_init(&multifd->load_bufs_mutex); multifd->load_buf_idx = 0; multifd->load_buf_idx_last = UINT32_MAX; qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); multifd->load_bufs_thread_running = false; multifd->load_bufs_thread_want_exit = false; qemu_cond_init(&multifd->load_bufs_thread_finished_cond); return multifd; } /* * Terminates vfio_load_bufs_thread by setting * multifd->load_bufs_thread_want_exit and signalling all the conditions * the thread could be blocked on. * * Waits for the thread to signal that it had finished. */ static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) { /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ bql_unlock(); WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { while (multifd->load_bufs_thread_running) { multifd->load_bufs_thread_want_exit = true; qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, &multifd->load_bufs_mutex); } } bql_lock(); } static void vfio_multifd_free(VFIOMultifd *multifd) { vfio_load_cleanup_load_bufs_thread(multifd); qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); vfio_state_buffers_destroy(&multifd->load_bufs); qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); qemu_mutex_destroy(&multifd->load_bufs_mutex); g_free(multifd); } void vfio_multifd_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; g_clear_pointer(&migration->multifd, vfio_multifd_free); } bool vfio_multifd_transfer_supported(void) { return multifd_device_state_supported() && migrate_send_switchover_start(); } bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; return migration->multifd_transfer; } bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) { VFIOMigration *migration = vbasedev->migration; /* * Make a copy of this setting at the start in case it is changed * mid-migration. */ if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { migration->multifd_transfer = vfio_multifd_transfer_supported(); } else { migration->multifd_transfer = vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; } if (!vfio_multifd_transfer_enabled(vbasedev)) { /* Nothing further to check or do */ return true; } if (!vfio_multifd_transfer_supported()) { error_setg(errp, "%s: Multifd device transfer requested but unsupported in the current config", vbasedev->name); return false; } if (alloc_multifd) { assert(!migration->multifd); migration->multifd = vfio_multifd_new(); } return true; } void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) { assert(vfio_multifd_transfer_enabled(vbasedev)); /* * Emit dummy NOP data on the main migration channel since the actual * device state transfer is done via multifd channels. */ qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); } static bool vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, char *idstr, uint32_t instance_id, uint32_t idx, Error **errp) { g_autoptr(QIOChannelBuffer) bioc = NULL; g_autoptr(QEMUFile) f = NULL; int ret; g_autofree VFIODeviceStatePacket *packet = NULL; size_t packet_len; bioc = qio_channel_buffer_new(0); qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); f = qemu_file_new_output(QIO_CHANNEL(bioc)); if (vfio_save_device_config_state(f, vbasedev, errp)) { return false; } ret = qemu_fflush(f); if (ret) { error_setg(errp, "%s: save config state flush failed: %d", vbasedev->name, ret); return false; } packet_len = sizeof(*packet) + bioc->usage; packet = g_malloc0(packet_len); packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); packet->idx = cpu_to_be32(idx); packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE); memcpy(&packet->data, bioc->data, bioc->usage); if (!multifd_queue_device_state(idstr, instance_id, (char *)packet, packet_len)) { error_setg(errp, "%s: multifd config data queuing failed", vbasedev->name); return false; } vfio_migration_add_bytes_transferred(packet_len); return true; } /* * This thread is spawned by the migration core directly via * .save_live_complete_precopy_thread SaveVMHandler. * * It exits after either: * * completing saving the remaining device state and device config, OR: * * encountering some error while doing the above, OR: * * being forcefully aborted by the migration core by * multifd_device_state_save_thread_should_exit() returning true. */ bool vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, Error **errp) { VFIODevice *vbasedev = d->handler_opaque; VFIOMigration *migration = vbasedev->migration; bool ret = false; g_autofree VFIODeviceStatePacket *packet = NULL; uint32_t idx; if (!vfio_multifd_transfer_enabled(vbasedev)) { /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ return true; } trace_vfio_save_complete_precopy_thread_start(vbasedev->name, d->idstr, d->instance_id); /* We reach here with device state STOP or STOP_COPY only */ if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, VFIO_DEVICE_STATE_STOP, errp)) { goto thread_exit; } packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); for (idx = 0; ; idx++) { ssize_t data_size; size_t packet_size; if (multifd_device_state_save_thread_should_exit()) { error_setg(errp, "operation cancelled"); goto thread_exit; } data_size = read(migration->data_fd, &packet->data, migration->data_buffer_size); if (data_size < 0) { error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", vbasedev->name, idx, errno); goto thread_exit; } else if (data_size == 0) { break; } packet->idx = cpu_to_be32(idx); packet_size = sizeof(*packet) + data_size; if (!multifd_queue_device_state(d->idstr, d->instance_id, (char *)packet, packet_size)) { error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); goto thread_exit; } vfio_migration_add_bytes_transferred(packet_size); } if (!vfio_save_complete_precopy_thread_config_state(vbasedev, d->idstr, d->instance_id, idx, errp)) { goto thread_exit; } ret = true; thread_exit: trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); return ret; } int vfio_multifd_switchover_start(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; assert(multifd); /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ bql_unlock(); WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { assert(!multifd->load_bufs_thread_running); multifd->load_bufs_thread_running = true; } bql_lock(); qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); return 0; }