Diffstat (limited to 'migration')
47 files changed, 4197 insertions, 2020 deletions
diff --git a/migration/block-active.c b/migration/block-active.c new file mode 100644 index 0000000..40e986a --- /dev/null +++ b/migration/block-active.c @@ -0,0 +1,48 @@ +/* + * Block activation tracking for migration purpose + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2024 Red Hat, Inc. + */ +#include "qemu/osdep.h" +#include "block/block.h" +#include "qapi/error.h" +#include "migration/migration.h" +#include "qemu/error-report.h" +#include "trace.h" + +bool migration_block_activate(Error **errp) +{ + ERRP_GUARD(); + + assert(bql_locked()); + + trace_migration_block_activation("active"); + + bdrv_activate_all(errp); + if (*errp) { + error_report_err(error_copy(*errp)); + return false; + } + + return true; +} + +bool migration_block_inactivate(void) +{ + int ret; + + assert(bql_locked()); + + trace_migration_block_activation("inactive"); + + ret = bdrv_inactivate_all(); + if (ret) { + error_report("%s: bdrv_inactivate_all() failed: %d", + __func__, ret); + return false; + } + + return true; +} diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index a7d5504..f2c352d 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -62,8 +62,8 @@ #include "block/block.h" #include "block/block_int.h" #include "block/dirty-bitmap.h" -#include "sysemu/block-backend.h" -#include "sysemu/runstate.h" +#include "system/block-backend.h" +#include "system/runstate.h" #include "qemu/main-loop.h" #include "qemu/error-report.h" #include "migration/misc.h" diff --git a/migration/channel-block.c b/migration/channel-block.c index fff8d87..97de5a6 100644 --- a/migration/channel-block.c +++ b/migration/channel-block.c @@ -123,7 +123,7 @@ qio_channel_block_seek(QIOChannel *ioc, bioc->offset = offset; break; case SEEK_CUR: - bioc->offset += whence; + bioc->offset += offset; break; case SEEK_END: error_setg(errp, "Size of VMstate region is unknown"); @@ -170,7 +170,7 @@ qio_channel_block_set_aio_fd_handler(QIOChannel *ioc, static void qio_channel_block_class_init(ObjectClass *klass, - void *class_data G_GNUC_UNUSED) + const void *class_data G_GNUC_UNUSED) { QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); diff --git a/migration/channel.c b/migration/channel.c index f9de064..a547b1f 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -33,6 +33,7 @@ void migration_channel_process_incoming(QIOChannel *ioc) { MigrationState *s = migrate_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; trace_migration_set_incoming_channel( @@ -47,6 +48,10 @@ void migration_channel_process_incoming(QIOChannel *ioc) if (local_err) { error_report_err(local_err); + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + if (mis->exit_on_error) { + exit(EXIT_FAILURE); + } } } @@ -74,7 +79,7 @@ void migration_channel_connect(MigrationState *s, if (!error) { /* tls_channel_connect will call back to this * function after the TLS handshake, - * so we mustn't call migrate_fd_connect until then + * so we mustn't call migration_connect until then */ return; @@ -89,7 +94,7 @@ void migration_channel_connect(MigrationState *s, qemu_mutex_unlock(&s->qemu_file_lock); } } - migrate_fd_connect(s, error); + migration_connect(s, error); error_free(error); } diff --git a/migration/colo.c b/migration/colo.c index 6449490..e0f713c 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -11,7 +11,7 @@ */ #include "qemu/osdep.h" -#include "sysemu/sysemu.h" +#include "system/system.h" #include 
"qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "migration.h" @@ -30,8 +30,8 @@ #include "net/colo.h" #include "block/block.h" #include "qapi/qapi-events-migration.h" -#include "sysemu/cpus.h" -#include "sysemu/runstate.h" +#include "system/cpus.h" +#include "system/runstate.h" #include "net/filter.h" #include "options.h" @@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void) return; } /* Notify COLO incoming thread that failover work is finished */ - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); /* For Secondary VM, jump to incoming co */ if (mis->colo_incoming_co) { @@ -195,7 +195,7 @@ static void primary_vm_do_failover(void) } /* Notify COLO thread that failover work is finished */ - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); } COLOMode get_colo_mode(void) @@ -452,6 +452,9 @@ static int colo_do_checkpoint_transaction(MigrationState *s, bql_unlock(); goto out; } + + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + /* Note: device state is saved into buffer */ ret = qemu_save_device_state(fb); @@ -617,8 +620,8 @@ out: } /* Hope this not to be too long to wait here */ - qemu_sem_wait(&s->colo_exit_sem); - qemu_sem_destroy(&s->colo_exit_sem); + qemu_event_wait(&s->colo_exit_event); + qemu_event_destroy(&s->colo_exit_event); /* * It is safe to unregister notifier after failover finished. @@ -648,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s) s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, colo_checkpoint_notify_timer, NULL); - qemu_sem_init(&s->colo_exit_sem, 0); + qemu_event_init(&s->colo_exit_event, false); colo_process_checkpoint(s); bql_lock(); } @@ -805,11 +808,11 @@ void colo_shutdown(void) case COLO_MODE_PRIMARY: s = migrate_get_current(); qemu_event_set(&s->colo_checkpoint_event); - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); break; case COLO_MODE_SECONDARY: mis = migration_incoming_get_current(); - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); break; default: break; @@ -824,7 +827,7 @@ static void *colo_process_incoming_thread(void *opaque) Error *local_err = NULL; rcu_register_thread(); - qemu_sem_init(&mis->colo_incoming_sem, 0); + qemu_event_init(&mis->colo_incoming_event, false); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -836,7 +839,7 @@ static void *colo_process_incoming_thread(void *opaque) /* Make sure all file formats throw away their mutable metadata */ bql_lock(); - bdrv_activate_all(&local_err); + migration_block_activate(&local_err); bql_unlock(); if (local_err) { error_report_err(local_err); @@ -920,8 +923,8 @@ out: } /* Hope this not to be too long to loop here */ - qemu_sem_wait(&mis->colo_incoming_sem); - qemu_sem_destroy(&mis->colo_incoming_sem); + qemu_event_wait(&mis->colo_incoming_event); + qemu_event_destroy(&mis->colo_incoming_event); rcu_unregister_thread(); return NULL; @@ -935,7 +938,8 @@ void coroutine_fn colo_incoming_co(void) assert(bql_locked()); assert(migration_incoming_colo_enabled()); - qemu_thread_create(&th, "mig/dst/colo", colo_process_incoming_thread, + qemu_thread_create(&th, MIGRATION_THREAD_DST_COLO, + colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); mis->colo_incoming_co = qemu_coroutine_self(); diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c new file mode 100644 index 0000000..00371d1 --- /dev/null +++ b/migration/cpr-transfer.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, 2024 Oracle 
and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "io/channel-file.h" +#include "io/channel-socket.h" +#include "io/net-listener.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/savevm.h" +#include "migration/qemu-file.h" +#include "migration/vmstate.h" +#include "trace.h" + +QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp) +{ + MigrationAddress *addr = channel->addr; + + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET && + addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) { + + g_autoptr(QIOChannelSocket) sioc = qio_channel_socket_new(); + QIOChannel *ioc = QIO_CHANNEL(sioc); + SocketAddress *saddr = &addr->u.socket; + + if (qio_channel_socket_connect_sync(sioc, saddr, errp) < 0) { + return NULL; + } + trace_cpr_transfer_output(addr->u.socket.u.q_unix.path); + qio_channel_set_name(ioc, "cpr-out"); + return qemu_file_new_output(ioc); + + } else { + error_setg(errp, "bad cpr channel address; must be unix"); + return NULL; + } +} + +QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp) +{ + MigrationAddress *addr = channel->addr; + + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET && + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX || + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) { + + g_autoptr(QIOChannelSocket) sioc = NULL; + SocketAddress *saddr = &addr->u.socket; + g_autoptr(QIONetListener) listener = qio_net_listener_new(); + QIOChannel *ioc; + + qio_net_listener_set_name(listener, "cpr-socket-listener"); + if (qio_net_listener_open_sync(listener, saddr, 1, errp) < 0) { + return NULL; + } + + sioc = qio_net_listener_wait_client(listener); + ioc = QIO_CHANNEL(sioc); + trace_cpr_transfer_input( + addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ? + addr->u.socket.u.q_unix.path : addr->u.socket.u.fd.str); + qio_channel_set_name(ioc, "cpr-in"); + return qemu_file_new_input(ioc); + + } else { + error_setg(errp, "bad cpr channel socket type; must be unix"); + return NULL; + } +} diff --git a/migration/cpr.c b/migration/cpr.c new file mode 100644 index 0000000..a50a57e --- /dev/null +++ b/migration/cpr.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2021-2024 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/misc.h" +#include "migration/options.h" +#include "migration/qemu-file.h" +#include "migration/savevm.h" +#include "migration/vmstate.h" +#include "system/runstate.h" +#include "trace.h" + +/*************************************************************************/ +/* cpr state container for all information to be saved. 
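The reason cpr_transfer_output() above rejects anything but a unix socket (and why the CprFd records below can carry live descriptors at all) is that file descriptors travel to the new QEMU as SCM_RIGHTS ancillary data, which only AF_UNIX sockets support. A minimal standalone sketch of that underlying mechanism, in plain POSIX rather than code from this series:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one fd over an AF_UNIX socket as SCM_RIGHTS ancillary data. */
static int send_one_fd(int sock, int fd)
{
    char byte = 0;
    struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
    union {
        struct cmsghdr hdr;
        char buf[CMSG_SPACE(sizeof(int))];
    } u;
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
    };
    struct cmsghdr *cmsg;

    memset(&u, 0, sizeof(u));
    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;            /* payload is a descriptor */
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

    return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}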
*/ + +typedef QLIST_HEAD(CprFdList, CprFd) CprFdList; + +typedef struct CprState { + CprFdList fds; +} CprState; + +static CprState cpr_state; + +/****************************************************************************/ + +typedef struct CprFd { + char *name; + unsigned int namelen; + int id; + int fd; + QLIST_ENTRY(CprFd) next; +} CprFd; + +static const VMStateDescription vmstate_cpr_fd = { + .name = "cpr fd", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprFd), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), + VMSTATE_INT32(id, CprFd), + VMSTATE_FD(fd, CprFd), + VMSTATE_END_OF_LIST() + } +}; + +void cpr_save_fd(const char *name, int id, int fd) +{ + CprFd *elem = g_new0(CprFd, 1); + + trace_cpr_save_fd(name, id, fd); + elem->name = g_strdup(name); + elem->namelen = strlen(name) + 1; + elem->id = id; + elem->fd = fd; + QLIST_INSERT_HEAD(&cpr_state.fds, elem, next); +} + +static CprFd *find_fd(CprFdList *head, const char *name, int id) +{ + CprFd *elem; + + QLIST_FOREACH(elem, head, next) { + if (!strcmp(elem->name, name) && elem->id == id) { + return elem; + } + } + return NULL; +} + +void cpr_delete_fd(const char *name, int id) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } + + trace_cpr_delete_fd(name, id); +} + +int cpr_find_fd(const char *name, int id) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + int fd = elem ? elem->fd : -1; + + trace_cpr_find_fd(name, id, fd); + return fd; +} + +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + int old_fd = elem ? elem->fd : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_setg(&error_fatal, + "internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + } +} + +int cpr_open_fd(const char *path, int flags, const char *name, int id, + Error **errp) +{ + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = qemu_open(path, flags, errp); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +/*************************************************************************/ +#define CPR_STATE "CprState" + +static const VMStateDescription vmstate_cpr_state = { + .name = CPR_STATE, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next), + VMSTATE_END_OF_LIST() + } +}; +/*************************************************************************/ + +static QEMUFile *cpr_state_file; + +QIOChannel *cpr_state_ioc(void) +{ + return qemu_file_get_ioc(cpr_state_file); +} + +static MigMode incoming_mode = MIG_MODE_NONE; + +MigMode cpr_get_incoming_mode(void) +{ + return incoming_mode; +} + +void cpr_set_incoming_mode(MigMode mode) +{ + incoming_mode = mode; +} + +bool cpr_is_incoming(void) +{ + return incoming_mode != MIG_MODE_NONE; +} + +int cpr_state_save(MigrationChannel *channel, Error **errp) +{ + int ret; + QEMUFile *f; + MigMode mode = migrate_mode(); + + trace_cpr_state_save(MigMode_str(mode)); + + if (mode == MIG_MODE_CPR_TRANSFER) { + g_assert(channel); + f = cpr_transfer_output(channel, errp); + } else { + return 0; + } + if (!f) { + return -1; + } + + qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); + qemu_put_be32(f, QEMU_CPR_FILE_VERSION); + + ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0); + if (ret) { + 
error_setg(errp, "vmstate_save_state error %d", ret); + qemu_fclose(f); + return ret; + } + + /* + * Close the socket only partially so we can later detect when the other + * end closes by getting a HUP event. + */ + qemu_fflush(f); + qio_channel_shutdown(qemu_file_get_ioc(f), QIO_CHANNEL_SHUTDOWN_WRITE, + NULL); + cpr_state_file = f; + return 0; +} + +int cpr_state_load(MigrationChannel *channel, Error **errp) +{ + int ret; + uint32_t v; + QEMUFile *f; + MigMode mode = 0; + + if (channel) { + mode = MIG_MODE_CPR_TRANSFER; + cpr_set_incoming_mode(mode); + f = cpr_transfer_input(channel, errp); + } else { + return 0; + } + if (!f) { + return -1; + } + + trace_cpr_state_load(MigMode_str(mode)); + + v = qemu_get_be32(f); + if (v != QEMU_CPR_FILE_MAGIC) { + error_setg(errp, "Not a migration stream (bad magic %x)", v); + qemu_fclose(f); + return -EINVAL; + } + v = qemu_get_be32(f); + if (v != QEMU_CPR_FILE_VERSION) { + error_setg(errp, "Unsupported migration stream version %d", v); + qemu_fclose(f); + return -ENOTSUP; + } + + ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1); + if (ret) { + error_setg(errp, "vmstate_load_state error %d", ret); + qemu_fclose(f); + return ret; + } + + /* + * Let the caller decide when to close the socket (and generate a HUP event + * for the sending side). + */ + cpr_state_file = f; + + return ret; +} + +void cpr_state_close(void) +{ + if (cpr_state_file) { + qemu_fclose(cpr_state_file); + cpr_state_file = NULL; + } +} + +bool cpr_incoming_needed(void *opaque) +{ + MigMode mode = migrate_mode(); + return mode == MIG_MODE_CPR_TRANSFER; +} diff --git a/migration/cpu-throttle.c b/migration/cpu-throttle.c new file mode 100644 index 0000000..0642e6b --- /dev/null +++ b/migration/cpu-throttle.c @@ -0,0 +1,199 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
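Taken together, cpr_save_fd(), cpr_find_fd() and cpr_open_fd() in cpr.c above give backends a lookup-or-open idiom: on a cpr-transfer target the descriptor arrives pre-populated in the registry, otherwise it is opened fresh and recorded for a future transfer. A hypothetical caller, using the signature shown above (the "my-ram" name and the surrounding function are illustrative only):

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "migration/cpr.h"

static int my_backend_get_fd(const char *path, Error **errp)
{
    /*
     * Reuses the fd inherited over the cpr channel if one was saved
     * under ("my-ram", 0); otherwise opens path and registers the
     * result so a later cpr-transfer can pass it along.
     */
    return cpr_open_fd(path, O_RDWR, "my-ram", 0, errp);
}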
+ */ + +#include "qemu/osdep.h" +#include "qemu/thread.h" +#include "hw/core/cpu.h" +#include "qemu/main-loop.h" +#include "system/cpus.h" +#include "system/cpu-throttle.h" +#include "migration.h" +#include "migration-stats.h" +#include "trace.h" + +/* vcpu throttling controls */ +static QEMUTimer *throttle_timer, *throttle_dirty_sync_timer; +static unsigned int throttle_percentage; +static bool throttle_dirty_sync_timer_active; +static uint64_t throttle_dirty_sync_count_prev; + +#define CPU_THROTTLE_PCT_MIN 1 +#define CPU_THROTTLE_PCT_MAX 99 +#define CPU_THROTTLE_TIMESLICE_NS 10000000 + +/* Making sure RAMBlock dirty bitmap is synchronized every five seconds */ +#define CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS 5000 + +static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque) +{ + double pct; + double throttle_ratio; + int64_t sleeptime_ns, endtime_ns; + + if (!cpu_throttle_get_percentage()) { + return; + } + + pct = (double)cpu_throttle_get_percentage() / 100; + throttle_ratio = pct / (1 - pct); + /* Add 1ns to fix double's rounding error (like 0.9999999...) */ + sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1); + endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns; + while (sleeptime_ns > 0 && !cpu->stop) { + if (sleeptime_ns > SCALE_MS) { + qemu_cond_timedwait_bql(cpu->halt_cond, + sleeptime_ns / SCALE_MS); + } else { + bql_unlock(); + g_usleep(sleeptime_ns / SCALE_US); + bql_lock(); + } + sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + } + qatomic_set(&cpu->throttle_thread_scheduled, 0); +} + +static void cpu_throttle_timer_tick(void *opaque) +{ + CPUState *cpu; + double pct; + + /* Stop the timer if needed */ + if (!cpu_throttle_get_percentage()) { + return; + } + CPU_FOREACH(cpu) { + if (!qatomic_xchg(&cpu->throttle_thread_scheduled, 1)) { + async_run_on_cpu(cpu, cpu_throttle_thread, + RUN_ON_CPU_NULL); + } + } + + pct = (double)cpu_throttle_get_percentage() / 100; + timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) + + CPU_THROTTLE_TIMESLICE_NS / (1 - pct)); +} + +void cpu_throttle_set(int new_throttle_pct) +{ + /* + * boolean to store whether throttle is already active or not, + * before modifying throttle_percentage + */ + bool throttle_active = cpu_throttle_active(); + + trace_cpu_throttle_set(new_throttle_pct); + + /* Ensure throttle percentage is within valid range */ + new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX); + new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN); + + qatomic_set(&throttle_percentage, new_throttle_pct); + + if (!throttle_active) { + cpu_throttle_timer_tick(NULL); + } +} + +void cpu_throttle_stop(void) +{ + qatomic_set(&throttle_percentage, 0); + cpu_throttle_dirty_sync_timer(false); +} + +bool cpu_throttle_active(void) +{ + return (cpu_throttle_get_percentage() != 0); +} + +int cpu_throttle_get_percentage(void) +{ + return qatomic_read(&throttle_percentage); +} + +void cpu_throttle_dirty_sync_timer_tick(void *opaque) +{ + uint64_t sync_cnt = stat64_get(&mig_stats.dirty_sync_count); + + /* + * The first iteration copies all memory anyhow and has no + * effect on guest performance, therefore omit it to avoid + * paying extra for the sync penalty. 
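The throttle arithmetic in cpu_throttle_thread() above is easier to see with numbers: for a percentage pct, every 10 ms timeslice of vCPU run time is followed by a sleep of pct/(1-pct) timeslices, so 50% sleeps roughly 10 ms and the 99% ceiling sleeps roughly 990 ms per slice. A standalone sketch of the same computation, with the constant copied from above:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define CPU_THROTTLE_TIMESLICE_NS 10000000   /* 10 ms, as above */

int main(void)
{
    int pcts[] = { 10, 50, 99 };

    for (size_t i = 0; i < sizeof(pcts) / sizeof(pcts[0]); i++) {
        double pct = pcts[i] / 100.0;
        double ratio = pct / (1 - pct);
        /* +1 ns guards against rounding down, as in cpu_throttle_thread() */
        int64_t sleep_ns = (int64_t)(ratio * CPU_THROTTLE_TIMESLICE_NS + 1);

        printf("%2d%% -> sleep %" PRId64 " ns per 10 ms slice\n",
               pcts[i], sleep_ns);
    }
    return 0;
}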
+ */ + if (sync_cnt <= 1) { + goto end; + } + + if (sync_cnt == throttle_dirty_sync_count_prev) { + trace_cpu_throttle_dirty_sync(); + WITH_RCU_READ_LOCK_GUARD() { + migration_bitmap_sync_precopy(false); + } + } + +end: + throttle_dirty_sync_count_prev = stat64_get(&mig_stats.dirty_sync_count); + + timer_mod(throttle_dirty_sync_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + + CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS); +} + +static bool cpu_throttle_dirty_sync_active(void) +{ + return qatomic_read(&throttle_dirty_sync_timer_active); +} + +void cpu_throttle_dirty_sync_timer(bool enable) +{ + assert(throttle_dirty_sync_timer); + + if (enable) { + if (!cpu_throttle_dirty_sync_active()) { + /* + * Always reset the dirty sync count cache, in case migration + * was cancelled once. + */ + throttle_dirty_sync_count_prev = 0; + timer_mod(throttle_dirty_sync_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + + CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS); + qatomic_set(&throttle_dirty_sync_timer_active, 1); + } + } else { + if (cpu_throttle_dirty_sync_active()) { + timer_del(throttle_dirty_sync_timer); + qatomic_set(&throttle_dirty_sync_timer_active, 0); + } + } +} + +void cpu_throttle_init(void) +{ + throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, + cpu_throttle_timer_tick, NULL); + throttle_dirty_sync_timer = + timer_new_ms(QEMU_CLOCK_VIRTUAL_RT, + cpu_throttle_dirty_sync_timer_tick, NULL); +} diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c index 1d9db81..986624c 100644 --- a/migration/dirtyrate.c +++ b/migration/dirtyrate.c @@ -14,7 +14,7 @@ #include "qemu/error-report.h" #include "hw/core/cpu.h" #include "qapi/error.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qemu/rcu_queue.h" #include "qemu/main-loop.h" @@ -24,11 +24,12 @@ #include "dirtyrate.h" #include "monitor/hmp.h" #include "monitor/monitor.h" -#include "qapi/qmp/qdict.h" -#include "sysemu/kvm.h" -#include "sysemu/runstate.h" -#include "exec/memory.h" +#include "qobject/qdict.h" +#include "system/kvm.h" +#include "system/runstate.h" +#include "system/memory.h" #include "qemu/xxhash.h" +#include "migration.h" /* * total_dirty_pages is procted by BQL and is used @@ -149,12 +150,12 @@ int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms, unsigned int flag, bool one_shot) { - DirtyPageRecord *records; + DirtyPageRecord *records = NULL; int64_t init_time_ms; int64_t duration; int64_t dirtyrate; int i = 0; - unsigned int gen_id; + unsigned int gen_id = 0; retry: init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -228,8 +229,7 @@ static int time_unit_to_power(TimeUnit time_unit) case TIME_UNIT_MILLISECOND: return -3; default: - assert(false); /* unreachable */ - return 0; + g_assert_not_reached(); } } @@ -437,6 +437,7 @@ static void get_ramblock_dirty_info(RAMBlock *block, struct DirtyRateConfig *config) { uint64_t sample_pages_per_gigabytes = config->sample_pages_per_gigabytes; + gsize len; /* Right shift 30 bits to calc ramblock size in GB */ info->sample_pages_count = (qemu_ram_get_used_length(block) * @@ -445,7 +446,9 @@ static void get_ramblock_dirty_info(RAMBlock *block, info->ramblock_pages = qemu_ram_get_used_length(block) >> qemu_target_page_bits(); info->ramblock_addr = qemu_ram_get_host_addr(block); - strcpy(info->idstr, qemu_ram_get_idstr(block)); + len = g_strlcpy(info->idstr, qemu_ram_get_idstr(block), + sizeof(info->idstr)); + g_assert(len < sizeof(info->idstr)); } static void free_ramblock_dirty_info(struct RamblockDirtyInfo *infos, int count) @@ -840,8 
+843,9 @@ void qmp_calc_dirty_rate(int64_t calc_time, init_dirtyrate_stat(config); - qemu_thread_create(&thread, "get_dirtyrate", get_dirtyrate_thread, - (void *)&config, QEMU_THREAD_DETACHED); + qemu_thread_create(&thread, MIGRATION_THREAD_DIRTY_RATE, + get_dirtyrate_thread, (void *)&config, + QEMU_THREAD_DETACHED); } diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h index 869c060..35225c3 100644 --- a/migration/dirtyrate.h +++ b/migration/dirtyrate.h @@ -13,7 +13,7 @@ #ifndef QEMU_MIGRATION_DIRTYRATE_H #define QEMU_MIGRATION_DIRTYRATE_H -#include "sysemu/dirtyrate.h" +#include "system/dirtyrate.h" /* * Sample 512 pages per GB as default. diff --git a/migration/fd.c b/migration/fd.c index aab5189..9bf9be6 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -25,6 +25,29 @@ #include "io/channel-util.h" #include "trace.h" +static bool fd_is_pipe(int fd) +{ + struct stat statbuf; + + if (fstat(fd, &statbuf) == -1) { + return false; + } + + return S_ISFIFO(statbuf.st_mode); +} + +static bool migration_fd_valid(int fd) +{ + if (fd_is_socket(fd)) { + return true; + } + + if (fd_is_pipe(fd)) { + return true; + } + + return false; +} void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { @@ -34,7 +57,7 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } - if (!fd_is_socket(fd)) { + if (!migration_fd_valid(fd)) { warn_report("fd: migration to a file is deprecated." " Use file: instead."); } @@ -68,7 +91,7 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) return; } - if (!fd_is_socket(fd)) { + if (!migration_fd_valid(fd)) { warn_report("fd: migration to a file is deprecated." " Use file: instead."); } diff --git a/migration/file.c b/migration/file.c index db870f2..bb8031e 100644 --- a/migration/file.c +++ b/migration/file.c @@ -6,7 +6,7 @@ */ #include "qemu/osdep.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "qemu/cutils.h" #include "qemu/error-report.h" #include "qapi/error.h" @@ -112,7 +112,6 @@ void file_start_outgoing_migration(MigrationState *s, error_setg_errno(errp, errno, "failed to truncate migration file to offset %" PRIx64, offset); - object_unref(OBJECT(fioc)); return; } @@ -120,7 +119,6 @@ void file_start_outgoing_migration(MigrationState *s, ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { - object_unref(OBJECT(fioc)); return; } qio_channel_set_name(ioc, "migration-file-outgoing"); @@ -198,12 +196,13 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) } int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, - int niov, RAMBlock *block, Error **errp) + int niov, MultiFDPages_t *pages, Error **errp) { ssize_t ret = 0; int i, slice_idx, slice_num; uintptr_t base, next, offset; size_t len; + RAMBlock *block = pages->block; slice_idx = 0; slice_num = 1; diff --git a/migration/file.h b/migration/file.h index 9f71e87..1a1115f 100644 --- a/migration/file.h +++ b/migration/file.h @@ -21,6 +21,6 @@ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); void file_cleanup_outgoing_migration(void); bool file_send_channel_create(gpointer opaque, Error **errp); int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, - int niov, RAMBlock *block, Error **errp); + int niov, MultiFDPages_t *pages, Error **errp); int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); #endif diff --git a/migration/global_state.c b/migration/global_state.c index 
3a9796c..c1f90fc 100644 --- a/migration/global_state.c +++ b/migration/global_state.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" #include "qemu/error-report.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "qapi/error.h" #include "migration.h" #include "migration/global_state.h" diff --git a/migration/meson.build b/migration/meson.build index 5ce2acb4..9aa48b2 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -11,8 +11,12 @@ migration_files = files( system_ss.add(files( 'block-dirty-bitmap.c', + 'block-active.c', 'channel.c', 'channel-block.c', + 'cpr.c', + 'cpr-transfer.c', + 'cpu-throttle.c', 'dirtyrate.c', 'exec.c', 'fd.c', @@ -21,6 +25,8 @@ system_ss.add(files( 'migration-hmp-cmds.c', 'migration.c', 'multifd.c', + 'multifd-device-state.c', + 'multifd-nocomp.c', 'multifd-zlib.c', 'multifd-zero-page.c', 'options.c', @@ -41,6 +47,7 @@ system_ss.add(when: rdma, if_true: files('rdma.c')) system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 7d608d2..e8a563c 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -21,15 +21,15 @@ #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-visit-migration.h" -#include "qapi/qmp/qdict.h" +#include "qobject/qdict.h" #include "qapi/string-input-visitor.h" #include "qapi/string-output-visitor.h" #include "qemu/cutils.h" #include "qemu/error-report.h" #include "qemu/sockets.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "ui/qemu-spice.h" -#include "sysemu/sysemu.h" +#include "system/system.h" #include "options.h" #include "migration.h" @@ -37,27 +37,28 @@ static void migration_global_dump(Monitor *mon) { MigrationState *ms = migrate_get_current(); - monitor_printf(mon, "globals:\n"); - monitor_printf(mon, "store-global-state: %s\n", + monitor_printf(mon, "Globals:\n"); + monitor_printf(mon, " store-global-state: %s\n", ms->store_global_state ? "on" : "off"); - monitor_printf(mon, "only-migratable: %s\n", + monitor_printf(mon, " only-migratable: %s\n", only_migratable ? "on" : "off"); - monitor_printf(mon, "send-configuration: %s\n", + monitor_printf(mon, " send-configuration: %s\n", ms->send_configuration ? "on" : "off"); - monitor_printf(mon, "send-section-footer: %s\n", + monitor_printf(mon, " send-section-footer: %s\n", ms->send_section_footer ? "on" : "off"); - monitor_printf(mon, "clear-bitmap-shift: %u\n", + monitor_printf(mon, " send-switchover-start: %s\n", + ms->send_switchover_start ? 
"on" : "off"); + monitor_printf(mon, " clear-bitmap-shift: %u\n", ms->clear_bitmap_shift); } void hmp_info_migrate(Monitor *mon, const QDict *qdict) { + bool show_all = qdict_get_try_bool(qdict, "all", false); MigrationInfo *info; info = qmp_query_migrate(NULL); - migration_global_dump(mon); - if (info->blocked_reasons) { strList *reasons = info->blocked_reasons; monitor_printf(mon, "Outgoing migration blocked:\n"); @@ -68,7 +69,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) } if (info->has_status) { - monitor_printf(mon, "Migration status: %s", + monitor_printf(mon, "Status: %s", MigrationStatus_str(info->status)); if (info->status == MIGRATION_STATUS_FAILED && info->error_desc) { monitor_printf(mon, " (%s)\n", info->error_desc); @@ -76,107 +77,130 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) monitor_printf(mon, "\n"); } - monitor_printf(mon, "total time: %" PRIu64 " ms\n", - info->total_time); - if (info->has_expected_downtime) { - monitor_printf(mon, "expected downtime: %" PRIu64 " ms\n", - info->expected_downtime); - } - if (info->has_downtime) { - monitor_printf(mon, "downtime: %" PRIu64 " ms\n", - info->downtime); + if (info->total_time) { + monitor_printf(mon, "Time (ms): total=%" PRIu64, + info->total_time); + if (info->has_setup_time) { + monitor_printf(mon, ", setup=%" PRIu64, + info->setup_time); + } + if (info->has_expected_downtime) { + monitor_printf(mon, ", exp_down=%" PRIu64, + info->expected_downtime); + } + if (info->has_downtime) { + monitor_printf(mon, ", down=%" PRIu64, + info->downtime); + } + monitor_printf(mon, "\n"); } - if (info->has_setup_time) { - monitor_printf(mon, "setup: %" PRIu64 " ms\n", - info->setup_time); + } + + if (info->has_socket_address) { + SocketAddressList *addr; + + monitor_printf(mon, "Sockets: [\n"); + + for (addr = info->socket_address; addr; addr = addr->next) { + char *s = socket_uri(addr->value); + monitor_printf(mon, "\t%s\n", s); + g_free(s); } + monitor_printf(mon, "]\n"); } if (info->ram) { - monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n", - info->ram->transferred >> 10); - monitor_printf(mon, "throughput: %0.2f mbps\n", + monitor_printf(mon, "RAM info:\n"); + monitor_printf(mon, " Throughput (Mbps): %0.2f\n", info->ram->mbps); - monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n", - info->ram->remaining >> 10); - monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n", + monitor_printf(mon, " Sizes (KiB): pagesize=%" PRIu64 + ", total=%" PRIu64 ",\n", + info->ram->page_size >> 10, info->ram->total >> 10); - monitor_printf(mon, "duplicate: %" PRIu64 " pages\n", - info->ram->duplicate); - monitor_printf(mon, "normal: %" PRIu64 " pages\n", - info->ram->normal); - monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n", - info->ram->normal_bytes >> 10); - monitor_printf(mon, "dirty sync count: %" PRIu64 "\n", - info->ram->dirty_sync_count); - monitor_printf(mon, "page size: %" PRIu64 " kbytes\n", - info->ram->page_size >> 10); - monitor_printf(mon, "multifd bytes: %" PRIu64 " kbytes\n", - info->ram->multifd_bytes >> 10); - monitor_printf(mon, "pages-per-second: %" PRIu64 "\n", + monitor_printf(mon, " transferred=%" PRIu64 + ", remain=%" PRIu64 ",\n", + info->ram->transferred >> 10, + info->ram->remaining >> 10); + monitor_printf(mon, " precopy=%" PRIu64 + ", multifd=%" PRIu64 + ", postcopy=%" PRIu64, + info->ram->precopy_bytes >> 10, + info->ram->multifd_bytes >> 10, + info->ram->postcopy_bytes >> 10); + + if (info->vfio) { + monitor_printf(mon, ", vfio=%" PRIu64, + info->vfio->transferred >> 
10); + } + monitor_printf(mon, "\n"); + + monitor_printf(mon, " Pages: normal=%" PRIu64 ", zero=%" PRIu64 + ", rate_per_sec=%" PRIu64 "\n", + info->ram->normal, + info->ram->duplicate, info->ram->pages_per_second); + monitor_printf(mon, " Others: dirty_syncs=%" PRIu64, + info->ram->dirty_sync_count); if (info->ram->dirty_pages_rate) { - monitor_printf(mon, "dirty pages rate: %" PRIu64 " pages\n", + monitor_printf(mon, ", dirty_pages_rate=%" PRIu64, info->ram->dirty_pages_rate); } if (info->ram->postcopy_requests) { - monitor_printf(mon, "postcopy request count: %" PRIu64 "\n", + monitor_printf(mon, ", postcopy_req=%" PRIu64, info->ram->postcopy_requests); } - if (info->ram->precopy_bytes) { - monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n", - info->ram->precopy_bytes >> 10); - } if (info->ram->downtime_bytes) { - monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n", - info->ram->downtime_bytes >> 10); - } - if (info->ram->postcopy_bytes) { - monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n", - info->ram->postcopy_bytes >> 10); + monitor_printf(mon, ", downtime_ram=%" PRIu64, + info->ram->downtime_bytes); } if (info->ram->dirty_sync_missed_zero_copy) { - monitor_printf(mon, - "Zero-copy-send fallbacks happened: %" PRIu64 " times\n", + monitor_printf(mon, ", zerocopy_fallbacks=%" PRIu64, info->ram->dirty_sync_missed_zero_copy); } + monitor_printf(mon, "\n"); + } + + if (!show_all) { + goto out; } + migration_global_dump(mon); + if (info->xbzrle_cache) { - monitor_printf(mon, "cache size: %" PRIu64 " bytes\n", - info->xbzrle_cache->cache_size); - monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n", - info->xbzrle_cache->bytes >> 10); - monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n", - info->xbzrle_cache->pages); - monitor_printf(mon, "xbzrle cache miss: %" PRIu64 " pages\n", - info->xbzrle_cache->cache_miss); - monitor_printf(mon, "xbzrle cache miss rate: %0.2f\n", - info->xbzrle_cache->cache_miss_rate); - monitor_printf(mon, "xbzrle encoding rate: %0.2f\n", - info->xbzrle_cache->encoding_rate); - monitor_printf(mon, "xbzrle overflow: %" PRIu64 "\n", + monitor_printf(mon, "XBZRLE: size=%" PRIu64 + ", transferred=%" PRIu64 + ", pages=%" PRIu64 + ", miss=%" PRIu64 "\n" + " miss_rate=%0.2f" + ", encode_rate=%0.2f" + ", overflow=%" PRIu64 "\n", + info->xbzrle_cache->cache_size, + info->xbzrle_cache->bytes, + info->xbzrle_cache->pages, + info->xbzrle_cache->cache_miss, + info->xbzrle_cache->cache_miss_rate, + info->xbzrle_cache->encoding_rate, info->xbzrle_cache->overflow); } if (info->has_cpu_throttle_percentage) { - monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n", + monitor_printf(mon, "CPU Throttle (%%): %" PRIu64 "\n", info->cpu_throttle_percentage); } if (info->has_dirty_limit_throttle_time_per_round) { - monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Throttle (us): %" PRIu64 "\n", info->dirty_limit_throttle_time_per_round); } if (info->has_dirty_limit_ring_full_time) { - monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Ring Full (us): %" PRIu64 "\n", info->dirty_limit_ring_full_time); } if (info->has_postcopy_blocktime) { - monitor_printf(mon, "postcopy blocktime: %u\n", + monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n", info->postcopy_blocktime); } @@ -187,28 +211,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime, &error_abort); visit_complete(v, 
&str); - monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str); + monitor_printf(mon, "Postcopy vCPU Blocktime: %s\n", str); g_free(str); visit_free(v); } - if (info->has_socket_address) { - SocketAddressList *addr; - - monitor_printf(mon, "socket address: [\n"); - - for (addr = info->socket_address; addr; addr = addr->next) { - char *s = socket_uri(addr->value); - monitor_printf(mon, "\t%s\n", s); - g_free(s); - } - monitor_printf(mon, "]\n"); - } - - if (info->vfio) { - monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n", - info->vfio->transferred >> 10); - } +out: qapi_free_MigrationInfo(info); } @@ -576,6 +584,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); @@ -636,7 +648,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) visit_type_bool(v, param, &p->direct_io, &err); break; default: - assert(0); + g_assert_not_reached(); } if (err) { diff --git a/migration/migration.c b/migration/migration.c index 3dea06d..4098870 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -14,6 +14,7 @@ */ #include "qemu/osdep.h" +#include "qemu/ctype.h" #include "qemu/cutils.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -22,11 +23,12 @@ #include "fd.h" #include "file.h" #include "socket.h" -#include "sysemu/runstate.h" -#include "sysemu/sysemu.h" -#include "sysemu/cpu-throttle.h" +#include "system/runstate.h" +#include "system/system.h" +#include "system/cpu-throttle.h" #include "rdma.h" #include "ram.h" +#include "migration/cpr.h" #include "migration/global_state.h" #include "migration/misc.h" #include "migration.h" @@ -43,7 +45,7 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-events-migration.h" #include "qapi/qmp/qerror.h" -#include "qapi/qmp/qnull.h" +#include "qobject/qnull.h" #include "qemu/rcu.h" #include "postcopy-ram.h" #include "qemu/thread.h" @@ -59,13 +61,13 @@ #include "multifd.h" #include "threadinfo.h" #include "qemu/yank.h" -#include "sysemu/cpus.h" +#include "system/cpus.h" #include "yank_functions.h" -#include "sysemu/qtest.h" +#include "system/qtest.h" #include "options.h" -#include "sysemu/dirtylimit.h" +#include "system/dirtylimit.h" #include "qemu/sockets.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" #define NOTIFIER_ELEM_INIT(array, elem) \ [elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem]) @@ -75,6 +77,7 @@ static NotifierWithReturnList migration_state_notifiers[] = { NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), }; /* Messages sent on the return path from destination to source */ @@ -92,6 +95,9 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; +/* Migration channel types */ +enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY }; + /* When we add fault tolerance, we could have several migrations at once. 
For now we don't need to add dynamic creation of migration */ @@ -102,12 +108,10 @@ static MigrationIncomingState *current_incoming; static GSList *migration_blockers[MIG_MODE__MAX]; static bool migration_object_check(MigrationState *ms, Error **errp); -static int migration_maybe_pause(MigrationState *s, - int *current_active_state, - int new_state); -static void migrate_fd_cancel(MigrationState *s); +static bool migration_switchover_start(MigrationState *s, Error **errp); static bool close_return_path_on_source(MigrationState *s); static void migration_completion_end(MigrationState *s); +static void migrate_hup_delete(MigrationState *s); static void migration_downtime_start(MigrationState *s) { @@ -115,6 +119,27 @@ static void migration_downtime_start(MigrationState *s) s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); } +/* + * This is unfortunate: incoming migration actually needs the outgoing + * migration state (MigrationState) to be there too, e.g. to query + * capabilities, parameters, using locks, setup errors, etc. + * + * NOTE: when calling this, making sure current_migration exists and not + * been freed yet! Otherwise trying to access the refcount is already + * an use-after-free itself.. + * + * TODO: Move shared part of incoming / outgoing out into separate object. + * Then this is not needed. + */ +static void migrate_incoming_ref_outgoing_state(void) +{ + object_ref(migrate_get_current()); +} +static void migrate_incoming_unref_outgoing_state(void) +{ + object_unref(migrate_get_current()); +} + static void migration_downtime_end(MigrationState *s) { int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -125,9 +150,19 @@ static void migration_downtime_end(MigrationState *s) */ if (!s->downtime) { s->downtime = now - s->downtime_start; + trace_vmstate_downtime_checkpoint("src-downtime-end"); + } +} + +static void precopy_notify_complete(void) +{ + Error *local_err = NULL; + + if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) { + error_report_err(local_err); } - trace_vmstate_downtime_checkpoint("src-downtime-end"); + trace_migration_precopy_complete(); } static bool migration_needs_multiple_sockets(void) @@ -135,6 +170,21 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } +static RunState migration_get_target_runstate(void) +{ + /* + * When the global state is not migrated, it means we don't know the + * runstate of the src QEMU. We don't have much choice but assuming + * the VM is running. NOTE: this is pretty rare case, so far only Xen + * uses it. 
+ */ + if (!global_state_received()) { + return RUN_STATE_RUNNING; + } + + return global_state_get_runstate(); +} + static bool transport_supports_multi_channels(MigrationAddress *addr) { if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { @@ -203,9 +253,33 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, return false; } + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + error_setg(errp, "Migration requires streamable transport (eg unix)"); + return false; + } + + return true; +} + +static bool +migration_capabilities_and_transport_compatible(MigrationAddress *addr, + Error **errp) +{ + if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { + return migrate_rdma_caps_check(migrate_get_current()->capabilities, + errp); + } + return true; } +static bool migration_transport_compatible(MigrationAddress *addr, Error **errp) +{ + return migration_channels_and_transport_compatible(addr, errp) && + migration_capabilities_and_transport_compatible(addr, errp); +} + static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) { uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp; @@ -263,6 +337,9 @@ void migration_object_init(void) ram_mig_init(); dirty_bitmap_mig_init(); + + /* Initialize cpu throttle timers */ + cpu_throttle_init(); } typedef struct { @@ -306,17 +383,6 @@ void migration_bh_schedule(QEMUBHFunc *cb, void *opaque) qemu_bh_schedule(bh); } -void migration_cancel(const Error *error) -{ - if (error) { - migrate_set_error(current_migration, error); - } - if (migrate_dirty_limit()) { - qmp_cancel_vcpu_dirty_limit(false, -1, NULL); - } - migrate_fd_cancel(current_migration); -} - void migration_shutdown(void) { /* @@ -329,7 +395,7 @@ void migration_shutdown(void) * Cancel the current migration - that will (eventually) * stop the migration using this structure */ - migration_cancel(NULL); + migration_cancel(); object_unref(OBJECT(current_migration)); /* @@ -379,6 +445,24 @@ void migration_incoming_state_destroy(void) multifd_recv_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). + * The VFIO load_cleanup() implementation is BQL-sensitive. It requires + * BQL must NOT be taken when recycling load threads, so that it won't + * block the load threads from making progress on address space + * modification operations. + * + * To make it work, we could try to not take BQL for all load_cleanup(), + * or conditionally unlock BQL only if bql_locked() in VFIO. + * + * Since most existing call sites take BQL for load_cleanup(), make + * it simple by taking BQL always as the rule, so that VFIO can unlock + * BQL and retake unconditionally. 
+ */ + assert(bql_locked()); + qemu_loadvm_state_cleanup(mis); + if (mis->to_src_file) { /* Tell source that we are done */ migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); @@ -410,6 +494,7 @@ void migration_incoming_state_destroy(void) mis->postcopy_qemufile_dst = NULL; } + cpr_set_incoming_mode(MIG_MODE_NONE); yank_unregister_instance(MIGRATION_YANK_INSTANCE); } @@ -563,6 +648,16 @@ void migrate_add_address(SocketAddress *address) QAPI_CLONE(SocketAddress, address)); } +bool migrate_is_uri(const char *uri) +{ + while (*uri && *uri != ':') { + if (!qemu_isalpha(*uri++)) { + return false; + } + } + return *uri == ':'; +} + bool migrate_uri_parse(const char *uri, MigrationChannel **channel, Error **errp) { @@ -660,7 +755,8 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, if (channels) { /* To verify that Migrate channel list has only item */ if (channels->next) { - error_setg(errp, "Channel list has more than one entries"); + error_setg(errp, "Channel list must have only one entry, " + "for type 'main'"); return; } addr = channels->value->addr; @@ -675,7 +771,7 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } /* transport mechanism not suitable for migration? */ - if (!migration_channels_and_transport_compatible(addr, errp)) { + if (!migration_transport_compatible(addr, errp)) { return; } @@ -694,14 +790,6 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } #ifdef CONFIG_RDMA } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { - if (migrate_xbzrle()) { - error_setg(errp, "RDMA and XBZRLE can't be used together"); - return; - } - if (migrate_multifd()) { - error_setg(errp, "RDMA and multifd can't be used together"); - return; - } rdma_start_incoming_migration(&addr->u.rdma, errp); #endif } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) { @@ -711,34 +799,17 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } else { error_setg(errp, "unknown migration protocol: %s", uri); } + + /* Close cpr socket to tell source that we are listening */ + cpr_state_close(); } static void process_incoming_migration_bh(void *opaque) { - Error *local_err = NULL; MigrationIncomingState *mis = opaque; trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter"); - /* If capability late_block_activate is set: - * Only fire up the block code now if we're going to restart the - * VM, else 'cont' will do it. - * This causes file locking to happen; so we don't want it to happen - * unless we really are starting the VM. - */ - if (!migrate_late_block_activate() || - (autostart && (!global_state_received() || - runstate_is_live(global_state_get_runstate())))) { - /* Make sure all file formats throw away their mutable metadata. - * If we get an error here, just don't restart the VM yet. */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - local_err = NULL; - autostart = false; - } - } - /* * This must happen after all error conditions are dealt with and * we're sure the VM is going to be running on this host. 
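The new migrate_is_uri() helper above classifies a string as a URI when everything before the first ':' is alphabetic. A few illustrative classifications (the inputs are invented for the example, and the header path is an assumption about where the declaration lives):

#include <assert.h>
#include "migration/misc.h"   /* assumed to declare migrate_is_uri() */

static void demo_migrate_is_uri(void)
{
    assert(migrate_is_uri("tcp:localhost:4444"));  /* "tcp" scheme */
    assert(migrate_is_uri("unix:/tmp/mig.sock"));  /* "unix" scheme */
    assert(!migrate_is_uri("/dev/fdset/1"));       /* '/' before any ':' */
    assert(!migrate_is_uri("file.bin"));           /* no ':' at all */
}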
@@ -751,10 +822,23 @@ static void process_incoming_migration_bh(void *opaque) dirty_bitmap_mig_before_vm_start(); - if (!global_state_received() || - runstate_is_live(global_state_get_runstate())) { + if (runstate_is_live(migration_get_target_runstate())) { if (autostart) { - vm_start(); + /* + * Block activation is always delayed until VM starts, either + * here (which means we need to start the dest VM right now..), + * or until qmp_cont() later. + * + * We used to have cap 'late-block-activate' but now we do this + * unconditionally, as it has no harm but only benefit. E.g., + * it's not part of migration ABI on the time of disk activation. + * + * Make sure all file formats throw away their mutable + * metadata. If error, don't restart the VM yet. + */ + if (migration_block_activate(NULL)) { + vm_start(); + } } else { runstate_set(RUN_STATE_PAUSED); } @@ -813,7 +897,7 @@ process_incoming_migration_co(void *opaque) * postcopy thread. */ trace_process_incoming_migration_co_postcopy_end_main(); - return; + goto out; } /* Else if something went wrong then just fall out of the normal exit */ } @@ -829,7 +913,8 @@ process_incoming_migration_co(void *opaque) } migration_bh_schedule(process_incoming_migration_bh, mis); - return; + goto out; + fail: migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED); @@ -846,6 +931,9 @@ fail: exit(EXIT_FAILURE); } +out: + /* Pairs with the refcount taken in qmp_migrate_incoming() */ + migrate_incoming_unref_outgoing_state(); } /** @@ -856,9 +944,8 @@ static void migration_incoming_setup(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); - if (!mis->from_src_file) { - mis->from_src_file = f; - } + assert(!mis->from_src_file); + mis->from_src_file = f; qemu_file_set_blocking(f, false); } @@ -910,28 +997,19 @@ void migration_fd_process_incoming(QEMUFile *f) migration_incoming_process(); } -/* - * Returns true when we want to start a new incoming migration process, - * false otherwise. - */ -static bool migration_should_start_incoming(bool main_channel) +static bool migration_has_main_and_multifd_channels(void) { - /* Multifd doesn't start unless all channels are established */ - if (migrate_multifd()) { - return migration_has_all_channels(); + MigrationIncomingState *mis = migration_incoming_get_current(); + if (!mis->from_src_file) { + /* main channel not established */ + return false; } - /* Preempt channel only starts when the main channel is created */ - if (migrate_postcopy_preempt()) { - return main_channel; + if (migrate_multifd() && !multifd_recv_all_channels_created()) { + return false; } - /* - * For all the rest types of migration, we should only reach here when - * it's the main channel that's being created, and we should always - * proceed with this channel. - */ - assert(main_channel); + /* main and all multifd channels are established */ return true; } @@ -940,59 +1018,81 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; QEMUFile *f; - bool default_channel = true; + uint8_t channel; uint32_t channel_magic = 0; int ret = 0; - if (migrate_multifd() && !migrate_mapped_ram() && - !migrate_postcopy_ram() && - qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { - /* - * With multiple channels, it is possible that we receive channels - * out of order on destination side, causing incorrect mapping of - * source channels on destination side. 
Check channel MAGIC to - * decide type of channel. Please note this is best effort, postcopy - * preempt channel does not send any magic number so avoid it for - * postcopy live migration. Also tls live migration already does - * tls handshake while initializing main channel so with tls this - * issue is not possible. - */ - ret = migration_channel_read_peek(ioc, (void *)&channel_magic, - sizeof(channel_magic), errp); + if (!migration_has_main_and_multifd_channels()) { + if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { + /* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, + * postcopy preempt channel does not send any magic number so + * avoid it for postcopy live migration. Also tls live migration + * already does tls handshake while initializing main channel so + * with tls this issue is not possible. + */ + ret = migration_channel_read_peek(ioc, (void *)&channel_magic, + sizeof(channel_magic), errp); + if (ret != 0) { + return; + } - if (ret != 0) { + channel_magic = be32_to_cpu(channel_magic); + if (channel_magic == QEMU_VM_FILE_MAGIC) { + channel = CH_MAIN; + } else if (channel_magic == MULTIFD_MAGIC) { + assert(migrate_multifd()); + channel = CH_MULTIFD; + } else if (!mis->from_src_file && + mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { + /* reconnect main channel for postcopy recovery */ + channel = CH_MAIN; + } else { + error_setg(errp, "unknown channel magic: %u", channel_magic); + return; + } + } else if (mis->from_src_file && migrate_multifd()) { + /* + * Non-peekable channels like tls/file are processed as + * multifd channels when multifd is enabled. 
+ */ + channel = CH_MULTIFD; + } else if (!mis->from_src_file) { + channel = CH_MAIN; + } else { + error_setg(errp, "non-peekable channel used without multifd"); return; } - - default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); } else { - default_channel = !mis->from_src_file; + assert(migrate_postcopy_preempt()); + channel = CH_POSTCOPY; } if (multifd_recv_setup(errp) != 0) { return; } - if (default_channel) { + if (channel == CH_MAIN) { f = qemu_file_new_input(ioc); migration_incoming_setup(f); - } else { + } else if (channel == CH_MULTIFD) { /* Multiple connections */ - assert(migration_needs_multiple_sockets()); - if (migrate_multifd()) { - multifd_recv_new_channel(ioc, &local_err); - } else { - assert(migrate_postcopy_preempt()); - f = qemu_file_new_input(ioc); - postcopy_preempt_new_channel(mis, f); - } + multifd_recv_new_channel(ioc, &local_err); if (local_err) { error_propagate(errp, local_err); return; } + } else if (channel == CH_POSTCOPY) { + assert(!mis->postcopy_qemufile_dst); + f = qemu_file_new_input(ioc); + postcopy_preempt_new_channel(mis, f); + return; } - if (migration_should_start_incoming(default_channel)) { + if (migration_has_main_and_multifd_channels()) { /* If it's a recovery, we're done */ if (postcopy_try_recover()) { return; @@ -1009,18 +1109,13 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) */ bool migration_has_all_channels(void) { - MigrationIncomingState *mis = migration_incoming_get_current(); - - if (!mis->from_src_file) { + if (!migration_has_main_and_multifd_channels()) { return false; } - if (migrate_multifd()) { - return multifd_recv_all_channels_created(); - } - - if (migrate_postcopy_preempt()) { - return mis->postcopy_qemufile_dst != NULL; + MigrationIncomingState *mis = migration_incoming_get_current(); + if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) { + return false; } return true; @@ -1105,14 +1200,14 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value) migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf); } -/* - * Return true if we're already in the middle of a migration - * (i.e. 
any of the active or setup states) - */ -bool migration_is_setup_or_active(void) +bool migration_is_running(void) { MigrationState *s = current_migration; + if (!s) { + return false; + } + switch (s->state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: @@ -1123,36 +1218,20 @@ bool migration_is_setup_or_active(void) case MIGRATION_STATUS_PRE_SWITCHOVER: case MIGRATION_STATUS_DEVICE: case MIGRATION_STATUS_WAIT_UNPLUG: + case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_COLO: return true; - default: return false; - } } -bool migration_is_running(void) +static bool migration_is_active(void) { MigrationState *s = current_migration; - switch (s->state) { - case MIGRATION_STATUS_ACTIVE: - case MIGRATION_STATUS_POSTCOPY_ACTIVE: - case MIGRATION_STATUS_POSTCOPY_PAUSED: - case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: - case MIGRATION_STATUS_POSTCOPY_RECOVER: - case MIGRATION_STATUS_SETUP: - case MIGRATION_STATUS_PRE_SWITCHOVER: - case MIGRATION_STATUS_DEVICE: - case MIGRATION_STATUS_WAIT_UNPLUG: - case MIGRATION_STATUS_CANCELLING: - return true; - - default: - return false; - - } + return (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } static bool migrate_show_downtime(MigrationState *s) @@ -1397,39 +1476,52 @@ void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, } } -static void migrate_fd_cleanup(MigrationState *s) +static void migration_cleanup_json_writer(MigrationState *s) +{ + g_clear_pointer(&s->vmdesc, json_writer_free); +} + +static void migration_cleanup(MigrationState *s) { MigrationEventType type; + QEMUFile *tmp = NULL; + + trace_migration_cleanup(); + + migration_cleanup_json_writer(s); g_free(s->hostname); s->hostname = NULL; - json_writer_free(s->vmdesc); - s->vmdesc = NULL; qemu_savevm_state_cleanup(); + cpr_state_close(); + migrate_hup_delete(s); close_return_path_on_source(s); - if (s->to_dst_file) { - QEMUFile *tmp; - - trace_migrate_fd_cleanup(); + if (s->migration_thread_running) { bql_unlock(); - if (s->migration_thread_running) { - qemu_thread_join(&s->thread); - s->migration_thread_running = false; - } + qemu_thread_join(&s->thread); + s->migration_thread_running = false; bql_lock(); + } - multifd_send_shutdown(); - qemu_mutex_lock(&s->qemu_file_lock); + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + /* + * Close the file handle without the lock to make sure the critical + * section won't block for long. + */ tmp = s->to_dst_file; s->to_dst_file = NULL; - qemu_mutex_unlock(&s->qemu_file_lock); + } + + if (tmp) { /* - * Close the file handle without the lock to make sure the - * critical section won't block for long. + * We only need to shutdown multifd if tmp!=NULL, because if + * tmp==NULL, it means the main channel isn't established, while + * multifd is only setup after that (in migration_thread()). 
*/ + multifd_send_shutdown(); migration_ioc_unregister_yank_from_file(tmp); qemu_fclose(tmp); } @@ -1451,9 +1543,9 @@ static void migrate_fd_cleanup(MigrationState *s) yank_unregister_instance(MIGRATION_YANK_INSTANCE); } -static void migrate_fd_cleanup_bh(void *opaque) +static void migration_cleanup_bh(void *opaque) { - migrate_fd_cleanup(opaque); + migration_cleanup(opaque); } void migrate_set_error(MigrationState *s, const Error *error) @@ -1483,7 +1575,7 @@ static void migrate_error_free(MigrationState *s) } } -static void migrate_fd_error(MigrationState *s, const Error *error) +static void migration_connect_set_error(MigrationState *s, const Error *error) { MigrationStatus current = s->state; MigrationStatus next; @@ -1512,11 +1604,17 @@ static void migrate_fd_error(MigrationState *s, const Error *error) migrate_set_error(s, error); } -static void migrate_fd_cancel(MigrationState *s) +void migration_cancel(void) { + MigrationState *s = migrate_get_current(); int old_state ; + bool setup = (s->state == MIGRATION_STATUS_SETUP); + + trace_migration_cancel(); - trace_migrate_fd_cancel(); + if (migrate_dirty_limit()) { + qmp_cancel_vcpu_dirty_limit(false, -1, NULL); + } WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { if (s->rp_state.from_dst_file) { @@ -1532,7 +1630,7 @@ static void migrate_fd_cancel(MigrationState *s) } /* If the migration is paused, kick it out of the pause */ if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); } while (s->state != MIGRATION_STATUS_CANCELLING); @@ -1549,15 +1647,16 @@ static void migrate_fd_cancel(MigrationState *s) } } } - if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) { - Error *local_err = NULL; - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } else { - s->block_inactive = false; - } + /* + * If qmp_migrate_finish has not been called, then there is no path that + * will complete the cancellation. Do it now. 
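One subtlety in migration_cancel() above: a migration paused in PRE_SWITCHOVER is kicked via qemu_event_set() on pause_event. Unlike the old semaphore, a QemuEvent is level-triggered and idempotent, so repeated kicks collapse into one and no drain loop is needed. A minimal sketch of the pair, assuming an already-initialized event:

/* Waiter (switchover path): eat any stale set, then block until kicked. */
qemu_event_reset(&s->pause_event);
qemu_event_wait(&s->pause_event);

/* Kicker (migrate_continue or cancel): safe to call any number of times. */
qemu_event_set(&s->pause_event);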
+ */
+    if (setup && !s->to_dst_file) {
+        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
+                          MIGRATION_STATUS_CANCELLED);
+        cpr_state_close();
+        migrate_hup_delete(s);
+    }
 }
 
@@ -1644,42 +1743,7 @@ bool migration_incoming_postcopy_advised(void)
 
 bool migration_in_bg_snapshot(void)
 {
-    return migrate_background_snapshot() &&
-        migration_is_setup_or_active();
-}
-
-bool migration_is_idle(void)
-{
-    MigrationState *s = current_migration;
-
-    if (!s) {
-        return true;
-    }
-
-    switch (s->state) {
-    case MIGRATION_STATUS_NONE:
-    case MIGRATION_STATUS_CANCELLED:
-    case MIGRATION_STATUS_COMPLETED:
-    case MIGRATION_STATUS_FAILED:
-        return true;
-    default:
-        return false;
-    }
-}
-
-bool migration_is_active(void)
-{
-    MigrationState *s = current_migration;
-
-    return (s->state == MIGRATION_STATUS_ACTIVE ||
-            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
-}
-
-bool migration_is_device(void)
-{
-    MigrationState *s = current_migration;
-
-    return s->state == MIGRATION_STATUS_DEVICE;
+    return migrate_background_snapshot() && migration_is_running();
 }
 
 bool migration_thread_is_self(void)
@@ -1691,7 +1755,9 @@ bool migration_thread_is_self(void)
 
 bool migrate_mode_is_cpr(MigrationState *s)
 {
-    return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+    MigMode mode = s->parameters.mode;
+    return mode == MIG_MODE_CPR_REBOOT ||
+           mode == MIG_MODE_CPR_TRANSFER;
 }
 
 int migrate_init(MigrationState *s, Error **errp)
@@ -1720,7 +1786,10 @@ int migrate_init(MigrationState *s, Error **errp)
     s->migration_thread_running = false;
     error_free(s->error);
     s->error = NULL;
-    s->vmdesc = NULL;
+
+    if (should_send_vmdesc()) {
+        s->vmdesc = json_writer_new(false);
+    }
 
     migrate_set_state(&s->state, MIGRATION_STATUS_NONE,
                       MIGRATION_STATUS_SETUP);
@@ -1745,7 +1814,7 @@ static bool is_busy(Error **reasonp, Error **errp)
     ERRP_GUARD();
 
     /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
-    if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
+    if (runstate_check(RUN_STATE_SAVE_VM) || migration_is_running()) {
         error_propagate_prepend(errp, *reasonp,
                                 "disallowing migration blocker "
                                 "(migration/snapshot in progress) for: ");
@@ -1877,6 +1946,17 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
         return;
     }
 
+    /*
+     * Make sure MigrationState is available until the incoming migration
+     * completes.
+     *
+     * NOTE: QEMU _might_ leak this refcount in some failure paths, but
+     * that's OK. This is the minimum change we need to at least make sure
+     * the success case is clean on the refcount. We can try harder to
+     * make it accurate for any kind of failure, but it might be overkill
+     * and doesn't bring us much benefit.
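The refcount the comment above describes is plain QOM reference counting; a sketch of the intended pairing (the exact release point on the incoming side is an assumption here, not shown in this hunk):

/* Pin the MigrationState for the lifetime of the incoming migration. */
object_ref(OBJECT(migrate_get_current()));

/* ... incoming migration runs ... */

/* Assumed counterpart: drop the pin once incoming processing is done. */
object_unref(OBJECT(migrate_get_current()));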
+ */ + migrate_incoming_ref_outgoing_state(); once = false; } @@ -2066,6 +2146,40 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) return true; } +static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, + Error **errp); + +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, + void *opaque) +{ + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); + g_source_set_callback(s->hup_source, cb, opaque, NULL); + g_source_attach(s->hup_source, NULL); +} + +static void migrate_hup_delete(MigrationState *s) +{ + if (s->hup_source) { + g_source_destroy(s->hup_source); + g_source_unref(s->hup_source); + s->hup_source = NULL; + } +} + +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, + GIOCondition cond, + void *opaque) +{ + MigrationAddress *addr = opaque; + + qmp_migrate_finish(addr, false, NULL); + + cpr_state_close(); + migrate_hup_delete(migrate_get_current()); + qapi_free_MigrationAddress(addr); + return G_SOURCE_REMOVE; +} + void qmp_migrate(const char *uri, bool has_channels, MigrationChannelList *channels, bool has_detach, bool detach, bool has_resume, bool resume, Error **errp) @@ -2075,6 +2189,8 @@ void qmp_migrate(const char *uri, bool has_channels, MigrationState *s = migrate_get_current(); g_autoptr(MigrationChannel) channel = NULL; MigrationAddress *addr = NULL; + MigrationChannel *channelv[MIGRATION_CHANNEL_TYPE__MAX] = { NULL }; + MigrationChannel *cpr_channel = NULL; /* * Having preliminary checks for uri and channel @@ -2085,12 +2201,22 @@ void qmp_migrate(const char *uri, bool has_channels, } if (channels) { - /* To verify that Migrate channel list has only item */ - if (channels->next) { - error_setg(errp, "Channel list has more than one entries"); + for ( ; channels; channels = channels->next) { + MigrationChannelType type = channels->value->channel_type; + + if (channelv[type]) { + error_setg(errp, "Channel list has more than one %s entry", + MigrationChannelType_str(type)); + return; + } + channelv[type] = channels->value; + } + cpr_channel = channelv[MIGRATION_CHANNEL_TYPE_CPR]; + addr = channelv[MIGRATION_CHANNEL_TYPE_MAIN]->addr; + if (!addr) { + error_setg(errp, "Channel list has no main entry"); return; } - addr = channels->value->addr; } if (uri) { @@ -2102,7 +2228,12 @@ void qmp_migrate(const char *uri, bool has_channels, } /* transport mechanism not suitable for migration? */ - if (!migration_channels_and_transport_compatible(addr, errp)) { + if (!migration_transport_compatible(addr, errp)) { + return; + } + + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER && !cpr_channel) { + error_setg(errp, "missing 'cpr' migration channel"); return; } @@ -2112,6 +2243,41 @@ void qmp_migrate(const char *uri, bool has_channels, return; } + if (cpr_state_save(cpr_channel, &local_err)) { + goto out; + } + + /* + * For cpr-transfer, the target may not be listening yet on the migration + * channel, because first it must finish cpr_load_state. The target tells + * us it is listening by closing the cpr-state socket. Wait for that HUP + * event before connecting in qmp_migrate_finish. + * + * The HUP could occur because the target fails while reading CPR state, + * in which case the target will not listen for the incoming migration + * connection, so qmp_migrate_finish will fail to connect, and then recover. 
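migrate_hup_add() above is a thin wrapper over GLib's source machinery; stripped of migration specifics, a one-shot hangup watch on a QIOChannel looks like this (function names hypothetical):

/* Sketch: run a callback once when the peer closes the channel. */
static gboolean on_channel_hup(QIOChannel *ioc, GIOCondition cond,
                               void *opaque)
{
    /* ... react to the hangup (e.g. start the deferred connect) ... */
    return G_SOURCE_REMOVE;            /* one-shot: detach the source */
}

static GSource *watch_channel_hup(QIOChannel *ioc, void *opaque)
{
    GSource *src = qio_channel_create_watch(ioc, G_IO_HUP);

    g_source_set_callback(src, (GSourceFunc)on_channel_hup, opaque, NULL);
    g_source_attach(src, NULL);        /* NULL: default main context */
    return src;                        /* caller destroys/unrefs it later */
}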
+ */ + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, + QAPI_CLONE(MigrationAddress, addr)); + + } else { + qmp_migrate_finish(addr, resume_requested, errp); + } + +out: + if (local_err) { + migration_connect_set_error(s, local_err); + error_propagate(errp, local_err); + } +} + +static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + Error *local_err = NULL; + if (!resume_requested) { if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { return; @@ -2146,7 +2312,7 @@ void qmp_migrate(const char *uri, bool has_channels, if (!resume_requested) { yank_unregister_instance(MIGRATION_YANK_INSTANCE); } - migrate_fd_error(s, local_err); + migration_connect_set_error(s, local_err); error_propagate(errp, local_err); return; } @@ -2154,7 +2320,18 @@ void qmp_migrate(const char *uri, bool has_channels, void qmp_migrate_cancel(Error **errp) { - migration_cancel(NULL); + /* + * After postcopy migration has started, the source machine is not + * recoverable in case of a migration error. This also means the + * cancel command cannot be used as cancel should allow the + * machine to continue operation. + */ + if (migration_in_postcopy()) { + error_setg(errp, "Postcopy migration in progress, cannot cancel."); + return; + } + + migration_cancel(); } void qmp_migrate_continue(MigrationStatus state, Error **errp) @@ -2165,7 +2342,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) MigrationStatus_str(s->state)); return; } - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } int migration_rp_wait(MigrationState *s) @@ -2273,7 +2450,7 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s, */ static void migration_release_dst_files(MigrationState *ms) { - QEMUFile *file; + QEMUFile *file = NULL; WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { /* @@ -2318,7 +2495,7 @@ static void *source_return_path_thread(void *opaque) trace_source_return_path_thread_entry(); rcu_register_thread(); - while (migration_is_setup_or_active()) { + while (migration_is_running()) { trace_source_return_path_thread_loop_top(); header_type = qemu_get_be16(rp); @@ -2473,7 +2650,7 @@ static int open_return_path_on_source(MigrationState *ms) trace_open_return_path_on_source(); - qemu_thread_create(&ms->rp_state.rp_thread, "mig/src/rp-thr", + qemu_thread_create(&ms->rp_state.rp_thread, MIGRATION_THREAD_SRC_RETURN, source_return_path_thread, ms, QEMU_THREAD_JOINABLE); ms->rp_state.rp_thread_created = true; @@ -2528,23 +2705,30 @@ static int postcopy_start(MigrationState *ms, Error **errp) int ret; QIOChannelBuffer *bioc; QEMUFile *fb; - uint64_t bandwidth = migrate_max_postcopy_bandwidth(); - bool restart_block = false; - int cur_state = MIGRATION_STATUS_ACTIVE; + + /* + * Now we're 100% sure to switch to postcopy, so JSON writer won't be + * useful anymore. Free the resources early if it is there. Clearing + * the vmdesc also means any follow up vmstate_save()s will start to + * skip all JSON operations, which can shrink postcopy downtime. 
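Taken together with the migrate_init() hunk earlier, the vmdesc JSON writer now has a tight lifecycle: allocated only when it will actually be sent, and freed idempotently from whichever path gets there first. In outline, using only calls shown in this patch:

/* Allocation (migrate_init): only if a vmdesc will be sent at all. */
if (should_send_vmdesc()) {
    s->vmdesc = json_writer_new(false);     /* false: compact output */
}

/* Teardown (cleanup, or early in postcopy): safe to call repeatedly. */
g_clear_pointer(&s->vmdesc, json_writer_free);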
+ */ + migration_cleanup_json_writer(ms); if (migrate_postcopy_preempt()) { migration_wait_main_channel(ms); if (postcopy_preempt_establish_channel(ms)) { - migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); + if (ms->state != MIGRATION_STATUS_CANCELLING) { + migrate_set_state(&ms->state, ms->state, + MIGRATION_STATUS_FAILED); + } error_setg(errp, "%s: Failed to establish preempt channel", __func__); return -1; } } - if (!migrate_pause_before_switchover()) { - migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); + if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) { + return -1; } trace_postcopy_start(); @@ -2557,27 +2741,19 @@ static int postcopy_start(MigrationState *ms, Error **errp) goto fail; } - ret = migration_maybe_pause(ms, &cur_state, - MIGRATION_STATUS_POSTCOPY_ACTIVE); - if (ret < 0) { - error_setg_errno(errp, -ret, "%s: Failed in migration_maybe_pause()", - __func__); + if (!migration_switchover_start(ms, errp)) { goto fail; } - ret = bdrv_inactivate_all(); - if (ret < 0) { - error_setg_errno(errp, -ret, "%s: Failed in bdrv_inactivate_all()", - __func__); - goto fail; - } - restart_block = true; - /* * Cause any non-postcopiable, but iterative devices to * send out their final data. */ - qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false); + ret = qemu_savevm_state_complete_precopy_iterable(ms->to_dst_file, true); + if (ret) { + error_setg(errp, "Postcopy save non-postcopiable iterables failed"); + goto fail; + } /* * in Finish migrate and with the io-lock held everything should @@ -2589,12 +2765,6 @@ static int postcopy_start(MigrationState *ms, Error **errp) ram_postcopy_send_discard_bitmap(ms); } - /* - * send rest of state - note things that are doing postcopy - * will notice we're in POSTCOPY_ACTIVE and not actually - * wrap their state up here - */ - migration_rate_set(bandwidth); if (migrate_postcopy_ram()) { /* Ping just for debugging, helps line traces up */ qemu_savevm_send_ping(ms->to_dst_file, 2); @@ -2622,7 +2792,12 @@ static int postcopy_start(MigrationState *ms, Error **errp) */ qemu_savevm_send_postcopy_listen(fb); - qemu_savevm_state_complete_precopy(fb, false, false); + ret = qemu_savevm_state_complete_precopy_non_iterable(fb, true); + if (ret) { + error_setg(errp, "Postcopy save non-iterable device states failed"); + goto fail_closefb; + } + if (migrate_postcopy_ram()) { qemu_savevm_send_ping(fb, 3); } @@ -2641,8 +2816,6 @@ static int postcopy_start(MigrationState *ms, Error **errp) goto fail_closefb; } - restart_block = false; - /* Now send that blob */ if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) { error_setg(errp, "%s: Failed to send packaged data", __func__); @@ -2658,8 +2831,6 @@ static int postcopy_start(MigrationState *ms, Error **errp) migration_downtime_end(ms); - bql_unlock(); - if (migrate_postcopy_ram()) { /* * Although this ping is just for debug, it could potentially be @@ -2675,11 +2846,22 @@ static int postcopy_start(MigrationState *ms, Error **errp) ret = qemu_file_get_error(ms->to_dst_file); if (ret) { error_setg_errno(errp, -ret, "postcopy_start: Migration stream error"); - bql_lock(); goto fail; } trace_postcopy_preempt_enabled(migrate_postcopy_preempt()); + /* + * Now postcopy officially started, switch to postcopy bandwidth that + * user specified. 
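The fb/bioc dance in postcopy_start() above stages the non-iterable device state in memory so it can cross the wire as a single packaged command. Reduced to its skeleton (error handling elided; 4096 is just an illustrative initial capacity):

/* Stage state into a memory-backed channel... */
QIOChannelBuffer *bioc = qio_channel_buffer_new(4096);
QEMUFile *fb = qemu_file_new_output(QIO_CHANNEL(bioc));

/* ... write the listen command and device state into fb ... */
qemu_fflush(fb);

/* ...then ship the whole buffer as one packaged command. */
qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage);
qemu_fclose(fb);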
+ */
+    migration_rate_set(migrate_max_postcopy_bandwidth());
+
+    /* Now, switchover looks all fine, switching to postcopy-active */
+    migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE,
+                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+    bql_unlock();
+
     return ret;
 
 fail_closefb:
@@ -2687,67 +2869,104 @@ fail_closefb:
 fail:
     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                       MIGRATION_STATUS_FAILED);
-    if (restart_block) {
-        /* A failure happened early enough that we know the destination hasn't
-         * accessed block devices, so we're safe to recover.
-         */
-        Error *local_err = NULL;
-
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-        }
-    }
+    migration_block_activate(NULL);
     migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
     bql_unlock();
     return -1;
 }
 
 /**
- * migration_maybe_pause: Pause if required to by
- * migrate_pause_before_switchover called with the BQL locked
- * Returns: 0 on success
+ * @migration_switchover_prepare: Start VM switchover procedure
+ *
+ * @s: The migration state object pointer
+ *
+ * Prepares for the switchover, depending on "pause-before-switchover"
+ * capability.
+ *
+ * If cap set, state machine goes like:
+ *       [postcopy-]active -> pre-switchover -> device
+ *
+ * If cap not set:
+ *       [postcopy-]active -> device
+ *
+ * Returns: true on success, false on interruption.
 */
-static int migration_maybe_pause(MigrationState *s,
-                                 int *current_active_state,
-                                 int new_state)
+static bool migration_switchover_prepare(MigrationState *s)
 {
+    /* Concurrent cancellation?  Quit */
+    if (s->state == MIGRATION_STATUS_CANCELLING) {
+        return false;
+    }
+
+    /*
+     * No matter precopy or postcopy, since we still hold the BQL it must
+     * not change concurrently to CANCELLING, so it must be either ACTIVE
+     * or POSTCOPY_ACTIVE.
+     */
+    assert(migration_is_active());
+
+    /* If the pre-switchover stage is not requested, switch directly to DEVICE */
    if (!migrate_pause_before_switchover()) {
-        return 0;
+        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_DEVICE);
+        return true;
    }
 
-    /* Since leaving this state is not atomic with posting the semaphore
+    /*
+     * Since leaving this state is not atomic with setting the event,
     * it's possible that someone could have issued multiple migrate_continue
-     * and the semaphore is incorrectly positive at this point;
-     * the docs say it's undefined to reinit a semaphore that's already
-     * init'd, so use timedwait to eat up any existing posts.
+     * and the event is incorrectly set at this point, so reset it.
     */
-    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
-        /* This block intentionally left blank */
-    }
+    qemu_event_reset(&s->pause_event);
+
+    /* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */
+    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER);
+
+    bql_unlock();
+
+    qemu_event_wait(&s->pause_event);
+
+    bql_lock();
 
     /*
-     * If the migration is cancelled when it is in the completion phase,
-     * the migration state is set to MIGRATION_STATUS_CANCELLING.
-     * So we don't need to wait a semaphore, otherwise we would always
-     * wait for the 'pause_sem' semaphore.
+     * After the BQL is released and retaken, the state can be CANCELLING
+     * if it happened during qemu_event_wait().  Only change the state if
+     * it's still pre-switchover.
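The PRE_SWITCHOVER -> DEVICE transition after the wait relies on migrate_set_state() behaving as a guarded compare-and-swap: it only updates when the current state matches the expected old state, so a concurrent move to CANCELLING makes it a silent no-op that the caller detects by re-reading the state. A simplified sketch of that contract (the real helper also traces and emits QMP events):

/* Simplified: transition only if the state is still what we expect. */
static void set_state_if(MigrationStatus *state, MigrationStatus old,
                         MigrationStatus new)
{
    /* qatomic_cmpxchg returns the previous value; on mismatch, no-op. */
    if (qatomic_cmpxchg(state, old, new) == old) {
        /* transition happened; notify observers here in the real code */
    }
}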
*/ - if (s->state != MIGRATION_STATUS_CANCELLING) { - bql_unlock(); - migrate_set_state(&s->state, *current_active_state, - MIGRATION_STATUS_PRE_SWITCHOVER); - qemu_sem_wait(&s->pause_sem); - migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, - new_state); - *current_active_state = new_state; - bql_lock(); + migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, + MIGRATION_STATUS_DEVICE); + + return s->state == MIGRATION_STATUS_DEVICE; +} + +static bool migration_switchover_start(MigrationState *s, Error **errp) +{ + ERRP_GUARD(); + + if (!migration_switchover_prepare(s)) { + error_setg(errp, "Switchover is interrupted"); + return false; } - return s->state == new_state ? 0 : -EINVAL; + /* Inactivate disks except in COLO */ + if (!migrate_colo()) { + /* + * Inactivate before sending QEMU_VM_EOF so that the + * bdrv_activate_all() on the other end won't fail. + */ + if (!migration_block_inactivate()) { + error_setg(errp, "Block inactivate failed during switchover"); + return false; + } + } + + migration_rate_set(RATE_LIMIT_DISABLED); + + precopy_notify_complete(); + + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + + return true; } -static int migration_completion_precopy(MigrationState *s, - int *current_active_state) +static int migration_completion_precopy(MigrationState *s) { int ret; @@ -2760,20 +2979,12 @@ static int migration_completion_precopy(MigrationState *s, } } - ret = migration_maybe_pause(s, current_active_state, - MIGRATION_STATUS_DEVICE); - if (ret < 0) { + if (!migration_switchover_start(s, NULL)) { + ret = -EFAULT; goto out_unlock; } - /* - * Inactivate disks except in COLO, and track that we have done so in order - * to remember to reactivate them if migration fails or is cancelled. - */ - s->block_inactive = !migrate_colo(); - migration_rate_set(RATE_LIMIT_DISABLED); - ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - s->block_inactive); + ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false); out_unlock: bql_unlock(); return ret; @@ -2798,31 +3009,6 @@ static void migration_completion_postcopy(MigrationState *s) trace_migration_completion_postcopy_end_after_complete(); } -static void migration_completion_failed(MigrationState *s, - int current_active_state) -{ - if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_DEVICE)) { - /* - * If not doing postcopy, vm_start() will be called: let's - * regain control on images. - */ - Error *local_err = NULL; - - bql_lock(); - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } else { - s->block_inactive = false; - } - bql_unlock(); - } - - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); -} - /** * migration_completion: Used by migration_thread when there's not much left. * The caller 'breaks' the loop when this returns. 
@@ -2832,11 +3018,10 @@ static void migration_completion_failed(MigrationState *s, static void migration_completion(MigrationState *s) { int ret = 0; - int current_active_state = s->state; Error *local_err = NULL; if (s->state == MIGRATION_STATUS_ACTIVE) { - ret = migration_completion_precopy(s, ¤t_active_state); + ret = migration_completion_precopy(s); } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { migration_completion_postcopy(s); } else { @@ -2876,7 +3061,9 @@ fail: error_free(local_err); } - migration_completion_failed(s, current_active_state); + if (s->state != MIGRATION_STATUS_CANCELLING) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + } } /** @@ -2899,7 +3086,7 @@ static void bg_migration_completion(MigrationState *s) qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); qemu_fflush(s->to_dst_file); } else if (s->state == MIGRATION_STATUS_CANCELLING) { - goto fail; + return; } if (qemu_file_get_error(s->to_dst_file)) { @@ -3283,10 +3470,17 @@ static MigIterateState migration_iteration_run(MigrationState *s) static void migration_iteration_finish(MigrationState *s) { - /* If we enabled cpu throttling for auto-converge, turn it off. */ - cpu_throttle_stop(); - bql_lock(); + + /* + * If we enabled cpu throttling for auto-converge, turn it off. + * Stopping CPU throttle should be serialized by BQL to avoid + * racing for the throttle_dirty_sync_timer. + */ + if (migrate_auto_converge()) { + cpu_throttle_stop(); + } + switch (s->state) { case MIGRATION_STATUS_COMPLETED: runstate_set(RUN_STATE_POSTMIGRATE); @@ -3299,6 +3493,11 @@ static void migration_iteration_finish(MigrationState *s) case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLING: + /* + * Re-activate the block drives if they're inactivated. Note, COLO + * shouldn't use block_active at all, so it should be no-op there. 
+ */ + migration_block_activate(NULL); if (runstate_is_live(s->vm_old_state)) { if (!runstate_check(RUN_STATE_SHUTDOWN)) { vm_start(); @@ -3316,7 +3515,7 @@ static void migration_iteration_finish(MigrationState *s) break; } - migration_bh_schedule(migrate_fd_cleanup_bh, s); + migration_bh_schedule(migration_cleanup_bh, s); bql_unlock(); } @@ -3344,7 +3543,7 @@ static void bg_migration_iteration_finish(MigrationState *s) break; } - migration_bh_schedule(migrate_fd_cleanup_bh, s); + migration_bh_schedule(migration_cleanup_bh, s); bql_unlock(); } @@ -3462,11 +3661,11 @@ static void *migration_thread(void *opaque) Error *local_err = NULL; int ret; - thread = migration_threads_add("live_migration", qemu_get_thread_id()); + thread = migration_threads_add(MIGRATION_THREAD_SRC_MAIN, + qemu_get_thread_id()); rcu_register_thread(); - object_ref(OBJECT(s)); update_iteration_initial_status(s); if (!multifd_send_setup()) { @@ -3503,6 +3702,11 @@ static void *migration_thread(void *opaque) qemu_savevm_send_colo_enable(s->to_dst_file); } + if (migrate_auto_converge()) { + /* Start RAMBlock dirty bitmap sync timer */ + cpu_throttle_dirty_sync_timer(true); + } + bql_lock(); ret = qemu_savevm_state_setup(s->to_dst_file, &local_err); bql_unlock(); @@ -3599,7 +3803,6 @@ static void *bg_migration_thread(void *opaque) int ret; rcu_register_thread(); - object_ref(OBJECT(s)); migration_rate_set(RATE_LIMIT_DISABLED); @@ -3657,12 +3860,8 @@ static void *bg_migration_thread(void *opaque) if (migration_stop_vm(s, RUN_STATE_PAUSED)) { goto fail; } - /* - * Put vCPUs in sync with shadow context structures, then - * save their state to channel-buffer along with devices. - */ - cpu_synchronize_all_states(); - if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false)) { goto fail; } /* @@ -3726,7 +3925,7 @@ fail_setup: return NULL; } -void migrate_fd_connect(MigrationState *s, Error *error_in) +void migration_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; uint64_t rate_limit; @@ -3736,24 +3935,24 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) /* * If there's a previous error, free it and prepare for another one. * Meanwhile if migration completes successfully, there won't have an error - * dumped when calling migrate_fd_cleanup(). + * dumped when calling migration_cleanup(). */ migrate_error_free(s); s->expected_downtime = migrate_downtime_limit(); if (error_in) { - migrate_fd_error(s, error_in); + migration_connect_set_error(s, error_in); if (resume) { /* * Don't do cleanup for resume if channel is invalid, but only dump * the error. We wait for another channel connect from the user. * The error_report still gives HMP user a hint on what failed. - * It's normally done in migrate_fd_cleanup(), but call it here + * It's normally done in migration_cleanup(), but call it here * explicitly. */ error_report_err(error_copy(s->error)); } else { - migrate_fd_cleanup(s); + migration_cleanup(s); } return; } @@ -3811,11 +4010,19 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) } } + /* + * Take a refcount to make sure the migration object won't get freed by + * the main thread already in migration_shutdown(). + * + * The refcount will be released at the end of the thread function. 
+ */ + object_ref(OBJECT(s)); + if (migrate_background_snapshot()) { - qemu_thread_create(&s->thread, "mig/snapshot", + qemu_thread_create(&s->thread, MIGRATION_THREAD_SNAPSHOT, bg_migration_thread, s, QEMU_THREAD_JOINABLE); } else { - qemu_thread_create(&s->thread, "mig/src/main", + qemu_thread_create(&s->thread, MIGRATION_THREAD_SRC_MAIN, migration_thread, s, QEMU_THREAD_JOINABLE); } s->migration_thread_running = true; @@ -3823,17 +4030,20 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) fail: migrate_set_error(s, local_err); - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + if (s->state != MIGRATION_STATUS_CANCELLING) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + } error_report_err(local_err); - migrate_fd_cleanup(s); + migration_cleanup(s); } -static void migration_class_init(ObjectClass *klass, void *data) +static void migration_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->user_creatable = false; - device_class_set_props(dc, migration_properties); + device_class_set_props_n(dc, migration_properties, + migration_properties_count); } static void migration_instance_finalize(Object *obj) @@ -3844,7 +4054,7 @@ static void migration_instance_finalize(Object *obj) qemu_mutex_destroy(&ms->qemu_file_lock); qemu_sem_destroy(&ms->wait_unplug_sem); qemu_sem_destroy(&ms->rate_limit_sem); - qemu_sem_destroy(&ms->pause_sem); + qemu_event_destroy(&ms->pause_event); qemu_sem_destroy(&ms->postcopy_pause_sem); qemu_sem_destroy(&ms->rp_state.rp_sem); qemu_sem_destroy(&ms->rp_state.rp_pong_acks); @@ -3859,7 +4069,7 @@ static void migration_instance_init(Object *obj) ms->state = MIGRATION_STATUS_NONE; ms->mbps = -1; ms->pages_per_second = -1; - qemu_sem_init(&ms->pause_sem, 0); + qemu_event_init(&ms->pause_event, false); qemu_mutex_init(&ms->error_mutex); migrate_params_init(&ms->parameters); diff --git a/migration/migration.h b/migration/migration.h index 38aa140..739289d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -17,7 +17,7 @@ #include "exec/cpu-common.h" #include "hw/qdev-core.h" #include "qapi/qapi-types-migration.h" -#include "qapi/qmp/json-writer.h" +#include "qobject/json-writer.h" #include "qemu/thread.h" #include "qemu/coroutine.h" #include "io/channel.h" @@ -25,10 +25,25 @@ #include "net/announce.h" #include "qom/object.h" #include "postcopy-ram.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "migration/misc.h" +#define MIGRATION_THREAD_SNAPSHOT "mig/snapshot" +#define MIGRATION_THREAD_DIRTY_RATE "mig/dirtyrate" + +#define MIGRATION_THREAD_SRC_MAIN "mig/src/main" +#define MIGRATION_THREAD_SRC_MULTIFD "mig/src/send_%d" +#define MIGRATION_THREAD_SRC_RETURN "mig/src/return" +#define MIGRATION_THREAD_SRC_TLS "mig/src/tls" + +#define MIGRATION_THREAD_DST_COLO "mig/dst/colo" +#define MIGRATION_THREAD_DST_MULTIFD "mig/dst/recv_%d" +#define MIGRATION_THREAD_DST_FAULT "mig/dst/fault" +#define MIGRATION_THREAD_DST_LISTEN "mig/dst/listen" +#define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt" + struct PostcopyBlocktimeContext; +typedef struct ThreadPool ThreadPool; #define MIGRATION_RESUME_ACK_VALUE (1) @@ -83,9 +98,9 @@ struct MigrationIncomingState { void (*transport_cleanup)(void *data); /* * Used to sync thread creations. Note that we can't create threads in - * parallel with this sem. + * parallel with this event. 
*/ - QemuSemaphore thread_sync_sem; + QemuEvent thread_sync_event; /* * Free at the start of the main state load, set as the main thread finishes * loading state. @@ -171,7 +186,11 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *colo_incoming_co; - QemuSemaphore colo_incoming_sem; + QemuEvent colo_incoming_event; + + /* Optional load threads pool and its thread exit request flag */ + ThreadPool *load_threads; + bool load_threads_abort; /* * PostcopyBlocktimeContext to keep information for postcopy @@ -356,17 +375,14 @@ struct MigrationState { /* Flag set once the migration thread is running (and needs joining) */ bool migration_thread_running; - /* Flag set once the migration thread called bdrv_inactivate_all */ - bool block_inactive; - /* Migration is waiting for guest to unplug device */ QemuSemaphore wait_unplug_sem; /* Migration is paused due to pause-before-switchover */ - QemuSemaphore pause_sem; + QemuEvent pause_event; - /* The semaphore is used to notify COLO thread that failover is finished */ - QemuSemaphore colo_exit_sem; + /* The event is used to notify COLO thread that failover is finished */ + QemuEvent colo_exit_event; /* The event is used to notify COLO thread to do checkpoint */ QemuEvent colo_checkpoint_event; @@ -389,6 +405,8 @@ struct MigrationState { bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; + /* Whether we send switchover start notification during migration */ + bool send_switchover_start; /* Needed by postcopy-pause state */ QemuSemaphore postcopy_pause_sem; @@ -432,6 +450,39 @@ struct MigrationState { * Default value is false. (since 8.1) */ bool multifd_flush_after_each_section; + + /* + * This variable only makes sense when set on the machine that is + * the destination of a multifd migration with TLS enabled. It + * affects the behavior of the last send->recv iteration with + * regards to termination of the TLS session. + * + * When set: + * + * - the destination QEMU instance can expect to never get a + * GNUTLS_E_PREMATURE_TERMINATION error. Manifested as the error + * message: "The TLS connection was non-properly terminated". + * + * When clear: + * + * - the destination QEMU instance can expect to see a + * GNUTLS_E_PREMATURE_TERMINATION error in any multifd channel + * whenever the last recv() call of that channel happens after + * the source QEMU instance has already issued shutdown() on the + * channel. + * + * Commit 637280aeb2 (since 9.1) introduced a side effect that + * causes the destination instance to not be affected by the + * premature termination, while commit 1d457daf86 (since 10.0) + * causes the premature termination condition to be once again + * reachable. + * + * NOTE: Regardless of the state of this option, a premature + * termination of the TLS connection might happen due to error at + * any moment prior to the last send->recv iteration. + */ + bool multifd_clean_tls_termination; + /* * This decides the size of guest memory chunk that will be used * to track dirty bitmap clearing. 
The size of memory chunk will @@ -457,6 +508,8 @@ struct MigrationState { bool switchover_acked; /* Is this a rdma migration */ bool rdma_migration; + + GSource *hup_source; }; void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, @@ -471,7 +524,7 @@ bool migration_has_all_channels(void); void migrate_set_error(MigrationState *s, const Error *error); bool migrate_has_error(MigrationState *s); -void migrate_fd_connect(MigrationState *s, Error *error_in); +void migration_connect(MigrationState *s, Error *error_in); int migration_call_notifiers(MigrationState *s, MigrationEventType type, Error **errp); @@ -508,8 +561,6 @@ bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm, Error **errp); void migrate_add_address(SocketAddress *address); -bool migrate_uri_parse(const char *uri, MigrationChannel **channel, - Error **errp); int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); #define qemu_ram_foreach_block \ @@ -519,7 +570,7 @@ void migration_make_urgent_request(void); void migration_consume_urgent_request(void); bool migration_rate_limit(void); void migration_bh_schedule(QEMUBHFunc *cb, void *opaque); -void migration_cancel(const Error *error); +void migration_cancel(void); void migration_populate_vfio_info(MigrationInfo *info); void migration_reset_vfio_bytes_transferred(void); @@ -537,4 +588,10 @@ int migration_rp_wait(MigrationState *s); */ void migration_rp_kick(MigrationState *s); +void migration_bitmap_sync_precopy(bool last_stage); + +/* migration/block-dirty-bitmap.c */ +void dirty_bitmap_mig_init(void); +bool should_send_vmdesc(void); + #endif diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c new file mode 100644 index 0000000..94222d0 --- /dev/null +++ b/migration/multifd-device-state.c @@ -0,0 +1,212 @@ +/* + * Multifd device state migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/lockable.h" +#include "block/thread-pool.h" +#include "migration.h" +#include "migration/misc.h" +#include "multifd.h" +#include "options.h" + +static struct { + QemuMutex queue_job_mutex; + + MultiFDSendData *send_data; + + ThreadPool *threads; + bool threads_abort; +} *multifd_send_device_state; + +void multifd_device_state_send_setup(void) +{ + assert(!multifd_send_device_state); + multifd_send_device_state = g_malloc(sizeof(*multifd_send_device_state)); + + qemu_mutex_init(&multifd_send_device_state->queue_job_mutex); + + multifd_send_device_state->send_data = multifd_send_data_alloc(); + + multifd_send_device_state->threads = thread_pool_new(); + multifd_send_device_state->threads_abort = false; +} + +void multifd_device_state_send_cleanup(void) +{ + g_clear_pointer(&multifd_send_device_state->threads, thread_pool_free); + g_clear_pointer(&multifd_send_device_state->send_data, + multifd_send_data_free); + + qemu_mutex_destroy(&multifd_send_device_state->queue_job_mutex); + + g_clear_pointer(&multifd_send_device_state, g_free); +} + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state) +{ + g_clear_pointer(&device_state->idstr, g_free); + g_clear_pointer(&device_state->buf, g_free); +} + +static void multifd_device_state_fill_packet(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + MultiFDPacketDeviceState_t *packet = p->packet_device_state; + + packet->hdr.flags = cpu_to_be32(p->flags); + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr) - 1); + packet->idstr[sizeof(packet->idstr) - 1] = 0; + packet->instance_id = cpu_to_be32(device_state->instance_id); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); +} + +static void multifd_prepare_header_device_state(MultiFDSendParams *p) +{ + p->iov[0].iov_len = sizeof(*p->packet_device_state); + p->iov[0].iov_base = p->packet_device_state; + p->iovs_num++; +} + +void multifd_device_state_send_prepare(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + + assert(multifd_payload_device_state(p->data)); + + multifd_prepare_header_device_state(p); + + assert(!(p->flags & MULTIFD_FLAG_SYNC)); + + p->next_packet_size = device_state->buf_len; + if (p->next_packet_size > 0) { + p->iov[p->iovs_num].iov_base = device_state->buf; + p->iov[p->iovs_num].iov_len = p->next_packet_size; + p->iovs_num++; + } + + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; + + multifd_device_state_fill_packet(p); +} + +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len) +{ + /* Device state submissions can come from multiple threads */ + QEMU_LOCK_GUARD(&multifd_send_device_state->queue_job_mutex); + MultiFDDeviceState_t *device_state; + + assert(multifd_payload_empty(multifd_send_device_state->send_data)); + + multifd_set_payload_type(multifd_send_device_state->send_data, + MULTIFD_PAYLOAD_DEVICE_STATE); + device_state = &multifd_send_device_state->send_data->u.device_state; + device_state->idstr = g_strdup(idstr); + device_state->instance_id = instance_id; + device_state->buf = g_memdup2(data, len); + device_state->buf_len = len; + + if (!multifd_send(&multifd_send_device_state->send_data)) { + multifd_send_data_clear(multifd_send_device_state->send_data); + return false; + } + + return true; +} + +bool multifd_device_state_supported(void) +{ + return 
migrate_multifd() && !migrate_mapped_ram() && + migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE; +} + +static void multifd_device_state_save_thread_data_free(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + + g_clear_pointer(&data->idstr, g_free); + g_free(data); +} + +static int multifd_device_state_save_thread(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + g_autoptr(Error) local_err = NULL; + + if (!data->hdlr(data, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't call abort_device_state_save_threads() here since new + * save threads could still be in process of being launched + * (if, for example, the very first save thread launched exited + * with an error very quickly). + */ + + assert(local_err); + + /* + * In case of multiple save threads failing which thread error + * return we end setting is purely arbitrary. + */ + migrate_set_error(s, local_err); + } + + return 0; +} + +bool multifd_device_state_save_thread_should_exit(void) +{ + return qatomic_read(&multifd_send_device_state->threads_abort); +} + +void +multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, + char *idstr, uint32_t instance_id, + void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data; + + assert(multifd_device_state_supported()); + assert(multifd_send_device_state); + + assert(!qatomic_read(&multifd_send_device_state->threads_abort)); + + data = g_new(SaveLiveCompletePrecopyThreadData, 1); + data->hdlr = hdlr; + data->idstr = g_strdup(idstr); + data->instance_id = instance_id; + data->handler_opaque = opaque; + + thread_pool_submit_immediate(multifd_send_device_state->threads, + multifd_device_state_save_thread, + data, + multifd_device_state_save_thread_data_free); +} + +void multifd_abort_device_state_save_threads(void) +{ + assert(multifd_device_state_supported()); + + qatomic_set(&multifd_send_device_state->threads_abort, true); +} + +bool multifd_join_device_state_save_threads(void) +{ + MigrationState *s = migrate_get_current(); + + assert(multifd_device_state_supported()); + + thread_pool_wait(multifd_send_device_state->threads); + + return !migrate_has_error(s); +} diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c new file mode 100644 index 0000000..b48eae3 --- /dev/null +++ b/migration/multifd-nocomp.c @@ -0,0 +1,468 @@ +/* + * Multifd RAM migration without compression + * + * Copyright (c) 2019-2020 Red Hat Inc + * + * Authors: + * Juan Quintela <quintela@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "system/ramblock.h" +#include "exec/target_page.h" +#include "file.h" +#include "migration-stats.h" +#include "multifd.h" +#include "options.h" +#include "migration.h" +#include "qapi/error.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qemu-file.h" + +static MultiFDSendData *multifd_ram_send; + +void multifd_ram_payload_alloc(MultiFDPages_t *pages) +{ + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); +} + +void multifd_ram_payload_free(MultiFDPages_t *pages) +{ + g_clear_pointer(&pages->offset, g_free); +} + +void multifd_ram_save_setup(void) +{ + multifd_ram_send = multifd_send_data_alloc(); +} + +void multifd_ram_save_cleanup(void) +{ + g_clear_pointer(&multifd_ram_send, multifd_send_data_free); +} + +static void multifd_set_file_bitmap(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = &p->data->u.ram; + + assert(pages->block); + + for (int i = 0; i < pages->normal_num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); + } + + for (int i = pages->normal_num; i < pages->num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false); + } +} + +static int multifd_nocomp_send_setup(MultiFDSendParams *p, Error **errp) +{ + uint32_t page_count = multifd_ram_page_count(); + + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + + if (!migrate_mapped_ram()) { + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } + + return 0; +} + +static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + g_free(p->iov); + p->iov = NULL; +} + +static void multifd_ram_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = &p->data->u.ram; + uint32_t page_size = multifd_ram_page_size(); + + for (int i = 0; i < pages->normal_num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->normal_num * page_size; +} + +static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) +{ + bool use_zero_copy_send = migrate_zero_copy_send(); + int ret; + + multifd_send_zero_page_detect(p); + + if (migrate_mapped_ram()) { + multifd_send_prepare_iovs(p); + multifd_set_file_bitmap(p); + + return 0; + } + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. 
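The zerocopy branch described above splits one logical write in two on purpose: the tiny packet header goes out as an ordinary copying write, because its buffer is reused immediately, and only the page payload is queued with the zero-copy flag. The two halves in isolation (the flag is actually set once at channel setup; shown together here for clarity):

/* Header: copying write; the packet buffer is recycled right away. */
if (qio_channel_write_all(p->c, (void *)p->packet, p->packet_len, errp)) {
    return -1;
}

/* Payload: page iovs only, sent with MSG_ZEROCOPY-style semantics. */
p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;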
+ */ + multifd_ram_prepare_header(p); + } + + multifd_send_prepare_iovs(p); + p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + } + + return 0; +} + +static int multifd_nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + p->iov = g_new0(struct iovec, multifd_ram_page_count()); + return 0; +} + +static void multifd_nocomp_recv_cleanup(MultiFDRecvParams *p) +{ + g_free(p->iov); + p->iov = NULL; +} + +static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp) +{ + uint32_t flags; + + if (migrate_mapped_ram()) { + return multifd_file_recv_data(p, errp); + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (flags != MULTIFD_FLAG_NOCOMP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + + for (int i = 0; i < p->normal_num; i++) { + p->iov[i].iov_base = p->host + p->normal[i]; + p->iov[i].iov_len = multifd_ram_page_size(); + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); +} + +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. + */ + pages->num = 0; + pages->normal_num = 0; + pages->block = NULL; +} + +void multifd_ram_fill_packet(MultiFDSendParams *p) +{ + MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = &p->data->u.ram; + uint32_t zero_num = pages->num - pages->normal_num; + + packet->pages_alloc = cpu_to_be32(multifd_ram_page_count()); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); + + if (pages->block) { + pstrcpy(packet->ramblock, sizeof(packet->ramblock), + pages->block->idstr); + } + + for (int i = 0; i < pages->num; i++) { + /* there are architectures where ram_addr_t is 32 bit */ + uint64_t temp = pages->offset[i]; + + packet->offset[i] = cpu_to_be64(temp); + } + + trace_multifd_send_ram_fill(p->id, pages->normal_num, + zero_num); +} + +int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + MultiFDPacket_t *packet = p->packet; + uint32_t page_count = multifd_ram_page_count(); + uint32_t page_size = multifd_ram_page_size(); + uint32_t pages_per_packet = be32_to_cpu(packet->pages_alloc); + int i; + + if (pages_per_packet > page_count) { + error_setg(errp, "multifd: received packet with %u pages, expected %u", + pages_per_packet, page_count); + return -1; + } + + p->normal_num = be32_to_cpu(packet->normal_pages); + if (p->normal_num > pages_per_packet) { + error_setg(errp, "multifd: received packet with %u non-zero pages, " + "which exceeds maximum expected pages %u", + p->normal_num, pages_per_packet); + return -1; + } + + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > pages_per_packet - p->normal_num) { + error_setg(errp, + "multifd: received packet with %u zero pages, expected maximum %u", + p->zero_num, pages_per_packet - p->normal_num); + return -1; + } + + if (p->normal_num == 0 && p->zero_num == 0) { + return 0; + } + + /* make sure that ramblock is 0 terminated */ + packet->ramblock[255] = 0; + p->block = qemu_ram_block_by_name(packet->ramblock); + if 
(!p->block) { + error_setg(errp, "multifd: unknown ram block %s", + packet->ramblock); + return -1; + } + + p->host = p->block->host; + for (i = 0; i < p->normal_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[i]); + + if (offset > (p->block->used_length - page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->normal[i] = offset; + } + + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + + return 0; +} + +static inline bool multifd_queue_empty(MultiFDPages_t *pages) +{ + return pages->num == 0; +} + +static inline bool multifd_queue_full(MultiFDPages_t *pages) +{ + return pages->num == multifd_ram_page_count(); +} + +static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) +{ + pages->offset[pages->num++] = offset; +} + +/* Returns true if enqueue successful, false otherwise */ +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) +{ + MultiFDPages_t *pages; + +retry: + pages = &multifd_ram_send->u.ram; + + if (multifd_payload_empty(multifd_ram_send)) { + multifd_pages_reset(pages); + multifd_set_payload_type(multifd_ram_send, MULTIFD_PAYLOAD_RAM); + } + + /* If the queue is empty, we can already enqueue now */ + if (multifd_queue_empty(pages)) { + pages->block = block; + multifd_enqueue(pages, offset); + return true; + } + + /* + * Not empty, meanwhile we need a flush. It can because of either: + * + * (1) The page is not on the same ramblock of previous ones, or, + * (2) The queue is full. + * + * After flush, always retry. + */ + if (pages->block != block || multifd_queue_full(pages)) { + if (!multifd_send(&multifd_ram_send)) { + return false; + } + goto retry; + } + + /* Not empty, and we still have space, do it! */ + multifd_enqueue(pages, offset); + return true; +} + +/* + * We have two modes for multifd flushes: + * + * - Per-section mode: this is the legacy way to flush, it requires one + * MULTIFD_FLAG_SYNC message for each RAM_SAVE_FLAG_EOS. + * + * - Per-round mode: this is the modern way to flush, it requires one + * MULTIFD_FLAG_SYNC message only for each round of RAM scan. Normally + * it's paired with a new RAM_SAVE_FLAG_MULTIFD_FLUSH message in network + * based migrations. + * + * One thing to mention is mapped-ram always use the modern way to sync. + */ + +/* Do we need a per-section multifd flush (legacy way)? */ +bool multifd_ram_sync_per_section(void) +{ + if (!migrate_multifd()) { + return false; + } + + if (migrate_mapped_ram()) { + return false; + } + + return migrate_multifd_flush_after_each_section(); +} + +/* Do we need a per-round multifd flush (modern way)? */ +bool multifd_ram_sync_per_round(void) +{ + if (!migrate_multifd()) { + return false; + } + + if (migrate_mapped_ram()) { + return true; + } + + return !migrate_multifd_flush_after_each_section(); +} + +int multifd_ram_flush_and_sync(QEMUFile *f) +{ + MultiFDSyncReq req; + int ret; + + if (!migrate_multifd() || migration_in_postcopy()) { + return 0; + } + + if (!multifd_payload_empty(multifd_ram_send)) { + if (!multifd_send(&multifd_ram_send)) { + error_report("%s: multifd_send fail", __func__); + return -1; + } + } + + /* File migrations only need to sync with threads */ + req = migrate_mapped_ram() ? 
MULTIFD_SYNC_LOCAL : MULTIFD_SYNC_ALL; + + ret = multifd_send_sync_main(req); + if (ret) { + return ret; + } + + /* If we don't need to sync with remote at all, nothing else to do */ + if (req == MULTIFD_SYNC_LOCAL) { + return 0; + } + + /* + * Old QEMUs don't understand RAM_SAVE_FLAG_MULTIFD_FLUSH, it relies + * on RAM_SAVE_FLAG_EOS instead. + */ + if (migrate_multifd_flush_after_each_section()) { + return 0; + } + + qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); + qemu_fflush(f); + + return 0; +} + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = &p->data->u.ram; + multifd_ram_prepare_header(p); + multifd_send_zero_page_detect(p); + + if (!pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + return true; +} + +static const MultiFDMethods multifd_nocomp_ops = { + .send_setup = multifd_nocomp_send_setup, + .send_cleanup = multifd_nocomp_send_cleanup, + .send_prepare = multifd_nocomp_send_prepare, + .recv_setup = multifd_nocomp_recv_setup, + .recv_cleanup = multifd_nocomp_recv_cleanup, + .recv = multifd_nocomp_recv +}; + +static void multifd_nocomp_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_NONE, &multifd_nocomp_ops); +} + +migration_init(multifd_nocomp_register); diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 0000000..7419e5d --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,395 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang <bryan.zhang@bytedance.com> + * Hao Xiang <hao.xiang@bytedance.com> + * Yichen Wang <yichen.wang@bytedance.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "system/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include <qatzip.h> + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. 
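The setup path just above follows the canonical QATzip bring-up order: initialize with software fallback allowed, fetch the deflate defaults, adjust them, then create the session. Condensed to the calls shown in this patch (QZ_DUPLICATE means the step was already done and is not an error):

QzSessionParamsDeflate_T params;
int ret;

ret = qzInit(&q->sess, true);               /* true: allow CPU fallback */
if (ret != QZ_OK && ret != QZ_DUPLICATE) {
    /* hard failure: neither QAT hardware nor a software path */
}
qzGetDefaultsDeflate(&params);
params.common_params.comp_lvl = migrate_multifd_qatzip_level();
ret = qzSetupSessionDeflate(&q->sess, &params);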
*/ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + uint32_t page_size = multifd_ram_page_size(); + MultiFDPages_t *pages = &p->data->u.ram; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. 
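As the comment above explains, the whole batch is compressed with a single non-streaming call; the essential shape, using the session and buffers set up earlier (error handling abbreviated):

unsigned int in_len = pages->normal_num * page_size;  /* gathered bytes */
unsigned int out_len = q->out_len;                    /* capacity, in/out */

/* last = 1: this is the entire input, flush and finish the deflate block. */
int ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1);
if (ret != QZ_OK || in_len != pages->normal_num * page_size) {
    /* partial consumption of the input is treated as a hard error here */
}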
+ */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * page_size), + pages->block->host + pages->offset[i], + page_size); + } + + in_len = pages->normal_num * page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. 
+ * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t page_size = multifd_ram_page_size(); + uint32_t expected_size = p->normal_num * page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. */ + for (int i = 0; i < p->normal_num; i++) { + memcpy(p->host + p->normal[i], q->out_buf + page_size * i, page_size); + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + return 0; +} + +static MultiFDMethods multifd_qatzip_ops = { + .send_setup = qatzip_send_setup, + .send_cleanup = qatzip_send_cleanup, + .send_prepare = qatzip_send_prepare, + .recv_setup = qatzip_recv_setup, + .recv_cleanup = qatzip_recv_cleanup, + .recv = qatzip_recv +}; + +static void multifd_qatzip_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops); +} + +migration_init(multifd_qatzip_register); diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 9265098..52902eb 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -14,7 +14,7 @@ #include "qemu/module.h" #include "qapi/error.h" #include "qapi/qapi-types-migration.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "multifd.h" #include "qpl/qpl.h" @@ -220,21 +220,13 @@ static void multifd_qpl_deinit(QplData *qpl) } } -/** - * multifd_qpl_send_setup: set up send side - * - * Set up the channel with QPL compression. - * - * Returns 0 on success or -1 on error - * - * @p: Params for the channel being used - * @errp: pointer to an error - */ static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) { QplData *qpl; + uint32_t page_size = multifd_ram_page_size(); + uint32_t page_count = multifd_ram_page_count(); - qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + qpl = multifd_qpl_init(page_count, page_size, errp); if (!qpl) { return -1; } @@ -245,18 +237,10 @@ static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) * additional two IOVs are used to store packet header and compressed data * length */ - p->iov = g_new0(struct iovec, p->page_count + 2); + p->iov = g_new0(struct iovec, page_count + 2); return 0; } -/** - * multifd_qpl_send_cleanup: clean up send side - * - * Close the channel and free memory. 
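The ops table and migration_init() call above are the whole integration surface: a compression backend only has to fill a MultiFDMethods table and register it for its MULTIFD_COMPRESSION_* value at startup. A skeleton for some hypothetical backend (all foo_* names and MULTIFD_COMPRESSION_FOO are invented for illustration):

    /* Skeleton of a multifd compression backend registration. */
    static const MultiFDMethods multifd_foo_ops = {
        .send_setup = foo_send_setup,
        .send_cleanup = foo_send_cleanup,
        .send_prepare = foo_send_prepare,
        .recv_setup = foo_recv_setup,
        .recv_cleanup = foo_recv_cleanup,
        .recv = foo_recv,
    };

    static void multifd_foo_register(void)
    {
        multifd_register_ops(MULTIFD_COMPRESSION_FOO, &multifd_foo_ops);
    }
    migration_init(multifd_foo_register);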
- * - * @p: Params for the channel being used - * @errp: pointer to an error - */ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) { multifd_qpl_deinit(p->compress_data); @@ -404,13 +388,14 @@ retry: static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) { QplData *qpl = p->compress_data; - uint32_t size = p->page_size; + MultiFDPages_t *pages = &p->data->u.ram; + uint32_t size = multifd_ram_page_size(); qpl_job *job = qpl->sw_job; uint8_t *zbuf = qpl->zbuf; uint8_t *buf; - for (int i = 0; i < p->pages->normal_num; i++) { - buf = p->pages->block->host + p->pages->offset[i]; + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; multifd_qpl_prepare_comp_job(job, buf, zbuf, size); if (qpl_execute_job(job) == QPL_STS_OK) { multifd_qpl_fill_packet(i, p, zbuf, job->total_out); @@ -434,8 +419,8 @@ static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) static void multifd_qpl_compress_pages(MultiFDSendParams *p) { QplData *qpl = p->compress_data; - MultiFDPages_t *pages = p->pages; - uint32_t size = p->page_size; + MultiFDPages_t *pages = &p->data->u.ram; + uint32_t size = multifd_ram_page_size(); QplHwJob *hw_job; uint8_t *buf; uint8_t *zbuf; @@ -484,20 +469,10 @@ static void multifd_qpl_compress_pages(MultiFDSendParams *p) } } -/** - * multifd_qpl_send_prepare: prepare data to be able to send - * - * Create a compressed buffer with all the pages that we are going to - * send. - * - * Returns 0 on success or -1 on error - * - * @p: Params for the channel being used - * @errp: pointer to an error - */ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) { QplData *qpl = p->compress_data; + MultiFDPages_t *pages = &p->data->u.ram; uint32_t len = 0; if (!multifd_send_prepare_common(p)) { @@ -505,7 +480,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) } /* The first IOV is used to store the compressed page lengths */ - len = p->pages->normal_num * sizeof(uint32_t); + len = pages->normal_num * sizeof(uint32_t); multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); if (qpl->hw_avail) { multifd_qpl_compress_pages(p); @@ -519,21 +494,13 @@ out: return 0; } -/** - * multifd_qpl_recv_setup: set up receive side - * - * Create the compressed channel and buffer. - * - * Returns 0 on success or -1 on error - * - * @p: Params for the channel being used - * @errp: pointer to an error - */ static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) { QplData *qpl; + uint32_t page_size = multifd_ram_page_size(); + uint32_t page_count = multifd_ram_page_count(); - qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + qpl = multifd_qpl_init(page_count, page_size, errp); if (!qpl) { return -1; } @@ -541,13 +508,6 @@ static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) return 0; } -/** - * multifd_qpl_recv_cleanup: set up receive side - * - * Close the channel and free memory. 
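The removed p->page_size and p->page_count fields are replaced everywhere by global accessors. Judging from the setup code later in this diff (page_count used to be computed as MULTIFD_PACKET_SIZE / qemu_target_page_size()), they are presumably equivalent to:

    /* Presumed behaviour of the new accessors; the real definitions
     * live outside the hunks shown here. */
    uint32_t multifd_ram_page_size(void)
    {
        return qemu_target_page_size();
    }

    uint32_t multifd_ram_page_count(void)
    {
        return MULTIFD_PACKET_SIZE / qemu_target_page_size();
    }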
- * - * @p: Params for the channel being used - */ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) { multifd_qpl_deinit(p->compress_data); @@ -600,7 +560,7 @@ static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, Error **errp) { QplData *qpl = p->compress_data; - uint32_t size = p->page_size; + uint32_t size = multifd_ram_page_size(); qpl_job *job = qpl->sw_job; uint8_t *zbuf = qpl->zbuf; uint8_t *addr; @@ -638,7 +598,7 @@ static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) { QplData *qpl = p->compress_data; - uint32_t size = p->page_size; + uint32_t size = multifd_ram_page_size(); uint8_t *zbuf = qpl->zbuf; uint8_t *addr; uint32_t len; @@ -688,17 +648,6 @@ static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) } return 0; } -/** - * multifd_qpl_recv: read the data from the channel into actual pages - * - * Read the compressed buffer, and uncompress it into the actual - * pages. - * - * Returns 0 on success or -1 on error - * - * @p: Params for the channel being used - * @errp: pointer to an error - */ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) { QplData *qpl = p->compress_data; @@ -728,8 +677,9 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) } for (int i = 0; i < p->normal_num; i++) { qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); - assert(qpl->zlen[i] <= p->page_size); + assert(qpl->zlen[i] <= multifd_ram_page_size()); zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } /* read compressed pages */ @@ -745,7 +695,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) return multifd_qpl_decompress_pages_slow_path(p, errp); } -static MultiFDMethods multifd_qpl_ops = { +static const MultiFDMethods multifd_qpl_ops = { .send_setup = multifd_qpl_send_setup, .send_cleanup = multifd_qpl_send_cleanup, .send_prepare = multifd_qpl_send_prepare, diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index d12353f..fd7cd9b 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "migration.h" #include "multifd.h" #include "options.h" @@ -103,19 +103,13 @@ static void multifd_uadk_uninit_sess(struct wd_data *wd) g_free(wd); } -/** - * multifd_uadk_send_setup: setup send side - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp) { struct wd_data *wd; + uint32_t page_size = multifd_ram_page_size(); + uint32_t page_count = multifd_ram_page_count(); - wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp); + wd = multifd_uadk_init_sess(page_count, page_size, true, errp); if (!wd) { return -1; } @@ -128,24 +122,18 @@ static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp) * length */ - p->iov = g_new0(struct iovec, p->page_count + 2); + p->iov = g_new0(struct iovec, page_count + 2); return 0; } -/** - * multifd_uadk_send_cleanup: cleanup send side - * - * Close the channel and return memory. 
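Both QPL (zlen) and UADK (buf_hdr) put an array of big-endian, per-page compressed lengths in front of the payload so the receiver can size each read. A stand-alone sketch of walking that header, assuming the lengths have already been read into memory:

    /* Sketch: convert and sanity-check a per-page length header.
     * 'lens' holds normal_num big-endian 32-bit values off the wire. */
    static int parse_page_len_header(uint32_t *lens, uint32_t normal_num,
                                     uint32_t page_size, uint32_t *total)
    {
        *total = 0;
        for (uint32_t i = 0; i < normal_num; i++) {
            lens[i] = be32_to_cpu(lens[i]);
            if (lens[i] > page_size) {
                return -1;   /* the real code asserts this instead */
            }
            *total += lens[i];
        }
        return 0;
    }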
- * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) { struct wd_data *wd = p->compress_data; multifd_uadk_uninit_sess(wd); p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; } static inline void prepare_next_iov(MultiFDSendParams *p, void *base, @@ -157,40 +145,31 @@ static inline void prepare_next_iov(MultiFDSendParams *p, void *base, p->iovs_num++; } -/** - * multifd_uadk_send_prepare: prepare data to be able to send - * - * Create a compressed buffer with all the pages that we are going to - * send. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) { struct wd_data *uadk_data = p->compress_data; uint32_t hdr_size; + uint32_t page_size = multifd_ram_page_size(); uint8_t *buf = uadk_data->buf; int ret = 0; + MultiFDPages_t *pages = &p->data->u.ram; if (!multifd_send_prepare_common(p)) { goto out; } - hdr_size = p->pages->normal_num * sizeof(uint32_t); + hdr_size = pages->normal_num * sizeof(uint32_t); /* prepare the header that stores the lengths of all compressed data */ prepare_next_iov(p, uadk_data->buf_hdr, hdr_size); - for (int i = 0; i < p->pages->normal_num; i++) { + for (int i = 0; i < pages->normal_num; i++) { struct wd_comp_req creq = { .op_type = WD_DIR_COMPRESS, - .src = p->pages->block->host + p->pages->offset[i], - .src_len = p->page_size, + .src = pages->block->host + pages->offset[i], + .src_len = page_size, .dst = buf, /* Set dst_len to double the src in case compressed out >= page_size */ - .dst_len = p->page_size * 2, + .dst_len = page_size * 2, }; if (uadk_data->handle) { @@ -200,7 +179,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) p->id, ret, creq.status); return -1; } - if (creq.dst_len < p->page_size) { + if (creq.dst_len < page_size) { uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); prepare_next_iov(p, buf, creq.dst_len); buf += creq.dst_len; @@ -212,11 +191,11 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) * than page_size as well because at the receive end we can skip the * decompression. But it is tricky to find the right number here. */ - if (!uadk_data->handle || creq.dst_len >= p->page_size) { - uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); - prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], - p->page_size); - buf += p->page_size; + if (!uadk_data->handle || creq.dst_len >= page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(page_size); + prepare_next_iov(p, pages->block->host + pages->offset[i], + page_size); + buf += page_size; } } out: @@ -225,21 +204,13 @@ out: return 0; } -/** - * multifd_uadk_recv_setup: setup receive side - * - * Create the compressed channel and buffer. 
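The UADK sender stores a page uncompressed whenever compression gains nothing (or no hardware handle exists) and signals that by writing page_size into the length header; the receiver treats a length equal to page_size as a plain copy. The decision pair, reduced to a sketch (variable names illustrative):

    /* Sender: record whether this slot carries compressed or raw bytes. */
    if (handle && dst_len < page_size) {
        hdr[i] = cpu_to_be32(dst_len);        /* compressed bytes follow */
    } else {
        hdr[i] = cpu_to_be32(page_size);      /* raw page follows */
    }

    /* Receiver: a full-page length means decompression can be skipped. */
    if (be32_to_cpu(hdr[i]) == page_size) {
        memcpy(dst_page, src, page_size);
    }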
- * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) { struct wd_data *wd; + uint32_t page_size = multifd_ram_page_size(); + uint32_t page_count = multifd_ram_page_count(); - wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp); + wd = multifd_uadk_init_sess(page_count, page_size, false, errp); if (!wd) { return -1; } @@ -247,13 +218,6 @@ static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) return 0; } -/** - * multifd_uadk_recv_cleanup: cleanup receive side - * - * Close the channel and return memory. - * - * @p: Params for the channel that we are using - */ static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) { struct wd_data *wd = p->compress_data; @@ -262,17 +226,6 @@ static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } -/** - * multifd_uadk_recv: read the data from the channel into actual pages - * - * Read the compressed buffer, and uncompress it into the actual - * pages. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) { struct wd_data *uadk_data = p->compress_data; @@ -280,6 +233,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; uint32_t hdr_len = p->normal_num * sizeof(uint32_t); uint32_t data_len = 0; + uint32_t page_size = multifd_ram_page_size(); uint8_t *buf = uadk_data->buf; int ret = 0; @@ -306,7 +260,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) for (int i = 0; i < p->normal_num; i++) { uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]); data_len += uadk_data->buf_hdr[i]; - assert(uadk_data->buf_hdr[i] <= p->page_size); + assert(uadk_data->buf_hdr[i] <= page_size); } /* read compressed data */ @@ -322,12 +276,12 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) .src = buf, .src_len = uadk_data->buf_hdr[i], .dst = p->host + p->normal[i], - .dst_len = p->page_size, + .dst_len = page_size, }; - if (uadk_data->buf_hdr[i] == p->page_size) { - memcpy(p->host + p->normal[i], buf, p->page_size); - buf += p->page_size; + if (uadk_data->buf_hdr[i] == page_size) { + memcpy(p->host + p->normal[i], buf, page_size); + buf += page_size; continue; } @@ -343,7 +297,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) p->id, ret, creq.status); return -1; } - if (creq.dst_len != p->page_size) { + if (creq.dst_len != page_size) { error_setg(errp, "multifd %u: decompressed length error", p->id); return -1; } @@ -353,7 +307,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) return 0; } -static MultiFDMethods multifd_uadk_ops = { +static const MultiFDMethods multifd_uadk_ops = { .send_setup = multifd_uadk_send_setup, .send_cleanup = multifd_uadk_send_cleanup, .send_prepare = multifd_uadk_send_prepare, diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index e1b8370..4cde868 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -12,8 +12,9 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "migration.h" +#include "migration-stats.h" #include "multifd.h" #include "options.h" #include "ram.h" @@ -46,14 +47,14 @@ static void 
swap_page_offset(ram_addr_t *pages_offset, int a, int b) */ void multifd_send_zero_page_detect(MultiFDSendParams *p) { - MultiFDPages_t *pages = p->pages; + MultiFDPages_t *pages = &p->data->u.ram; RAMBlock *rb = pages->block; int i = 0; int j = pages->num - 1; if (!multifd_zero_page_enabled()) { pages->normal_num = pages->num; - return; + goto out; } /* @@ -63,7 +64,7 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p) while (i <= j) { uint64_t offset = pages->offset[i]; - if (!buffer_is_zero(rb->host + offset, p->page_size)) { + if (!buffer_is_zero(rb->host + offset, multifd_ram_page_size())) { i++; continue; } @@ -74,15 +75,37 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p) } pages->normal_num = i; + +out: + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); } void multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { - memset(page, 0, p->page_size); - } else { + bool received = + ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]); + + /* + * During multifd migration zero page is written to the memory + * only if it is migrated more than once. + * + * It becomes a problem when both multifd & postcopy options are + * enabled. If the zero page which was skipped during multifd phase, + * is accessed during the postcopy phase of the migration, a page + * fault occurs. But this page fault is not served because the + * 'receivedmap' says the zero page is already received. Thus the + * thread accessing that page may hang. + * + * When postcopy is enabled, always write the zero page as and when + * it is migrated. + */ + if (migrate_postcopy_ram() || received) { + memset(page, 0, multifd_ram_page_size()); + } + if (!received) { ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 2ced694..8820b2a 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include <zlib.h> #include "qemu/rcu.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qapi/error.h" #include "migration.h" @@ -34,17 +34,7 @@ struct zlib_data { /* Multifd zlib compression */ -/** - * zlib_send_setup: setup send side - * - * Setup each channel with zlib compression. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zlib_send_setup(MultiFDSendParams *p, Error **errp) +static int multifd_zlib_send_setup(MultiFDSendParams *p, Error **errp) { struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; @@ -86,15 +76,7 @@ err_free_z: return -1; } -/** - * zlib_send_cleanup: cleanup send side - * - * Close the channel and return memory. - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) +static void multifd_zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { struct zlib_data *z = p->compress_data; @@ -110,23 +92,13 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) p->iov = NULL; } -/** - * zlib_send_prepare: prepare date to be able to send - * - * Create a compressed buffer with all the pages that we are going to - * send. 
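The zero-page detector above is an in-place two-pointer partition: i walks from the front, j from the back, and zero pages are swapped to the tail so that offsets [0, normal_num) are the non-zero pages. The same idea on a plain array:

    /* Stand-alone version of the partition; returns the number of
     * non-zero entries, which end up at the front of the array. */
    static int partition_nonzero_first(uint64_t *off, int n,
                                       bool (*is_zero_page)(uint64_t))
    {
        int i = 0, j = n - 1;

        while (i <= j) {
            if (!is_zero_page(off[i])) {
                i++;                    /* already on the correct side */
                continue;
            }
            uint64_t tmp = off[i];      /* swap zero page to the tail */
            off[i] = off[j];
            off[j] = tmp;
            j--;                        /* re-examine the swapped-in entry */
        }
        return i;
    }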
- * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) +static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp) { - MultiFDPages_t *pages = p->pages; + MultiFDPages_t *pages = &p->data->u.ram; struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; + uint32_t page_size = multifd_ram_page_size(); int ret; uint32_t i; @@ -147,8 +119,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). */ - memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); - zs->avail_in = p->page_size; + memcpy(z->buf, pages->block->host + pages->offset[i], page_size); + zs->avail_in = page_size; zs->next_in = z->buf; zs->avail_out = available; @@ -188,17 +160,7 @@ out: return 0; } -/** - * zlib_recv_setup: setup receive side - * - * Create the compressed channel and buffer. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) +static int multifd_zlib_recv_setup(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; @@ -224,14 +186,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) return 0; } -/** - * zlib_recv_cleanup: setup receive side - * - * For no compression this function does nothing. - * - * @p: Params for the channel that we are using - */ -static void zlib_recv_cleanup(MultiFDRecvParams *p) +static void multifd_zlib_recv_cleanup(MultiFDRecvParams *p) { struct zlib_data *z = p->compress_data; @@ -242,25 +197,15 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } -/** - * zlib_recv: read the data from the channel into actual pages - * - * Read the compressed buffer, and uncompress it into the actual - * pages. 
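Two details of the zlib sender are easy to miss: the page is copied into z->buf first because the guest may keep dirtying it while deflate() reads it, and (in the unchanged parts of this loop) only the last page of a packet is flushed with Z_SYNC_FLUSH so packets end on byte-aligned deflate boundaries. Schematically, under those assumptions:

    /* Sketch of the per-page deflate loop (checks elided). */
    for (int i = 0; i < normal_num; i++) {
        int flush = (i == normal_num - 1) ? Z_SYNC_FLUSH : Z_NO_FLUSH;

        memcpy(zbuf, host + offset[i], page_size);  /* stable input copy */
        zs->next_in = zbuf;
        zs->avail_in = page_size;
        do {
            ret = deflate(zs, flush);
        } while (ret == Z_OK && zs->avail_in > 0);
    }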
- * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zlib_recv(MultiFDRecvParams *p, Error **errp) +static int multifd_zlib_recv(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ uint32_t out_size = zs->total_out; - uint32_t expected_size = p->normal_num * p->page_size; + uint32_t page_size = multifd_ram_page_size(); + uint32_t expected_size = p->normal_num * page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; int ret; int i; @@ -296,7 +241,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) flush = Z_SYNC_FLUSH; } - zs->avail_out = p->page_size; + zs->avail_out = page_size; zs->next_out = p->host + p->normal[i]; /* @@ -310,8 +255,8 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) do { ret = inflate(zs, flush); } while (ret == Z_OK && zs->avail_in - && (zs->total_out - start) < p->page_size); - if (ret == Z_OK && (zs->total_out - start) < p->page_size) { + && (zs->total_out - start) < page_size); + if (ret == Z_OK && (zs->total_out - start) < page_size) { error_setg(errp, "multifd %u: inflate generated too few output", p->id); return -1; @@ -332,13 +277,13 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) return 0; } -static MultiFDMethods multifd_zlib_ops = { - .send_setup = zlib_send_setup, - .send_cleanup = zlib_send_cleanup, - .send_prepare = zlib_send_prepare, - .recv_setup = zlib_recv_setup, - .recv_cleanup = zlib_recv_cleanup, - .recv = zlib_recv +static const MultiFDMethods multifd_zlib_ops = { + .send_setup = multifd_zlib_send_setup, + .send_cleanup = multifd_zlib_send_cleanup, + .send_prepare = multifd_zlib_send_prepare, + .recv_setup = multifd_zlib_recv_setup, + .recv_cleanup = multifd_zlib_recv_cleanup, + .recv = multifd_zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index ca17b7e..3c2dcf7 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include <zstd.h> #include "qemu/rcu.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qapi/error.h" #include "migration.h" @@ -37,17 +37,7 @@ struct zstd_data { /* Multifd zstd compression */ -/** - * zstd_send_setup: setup send side - * - * Setup each channel with zstd compression. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zstd_send_setup(MultiFDSendParams *p, Error **errp) +static int multifd_zstd_send_setup(MultiFDSendParams *p, Error **errp) { struct zstd_data *z = g_new0(struct zstd_data, 1); int res; @@ -83,15 +73,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) return 0; } -/** - * zstd_send_cleanup: cleanup send side - * - * Close the channel and return memory. 
- * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) +static void multifd_zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { struct zstd_data *z = p->compress_data; @@ -106,20 +88,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) p->iov = NULL; } -/** - * zstd_send_prepare: prepare date to be able to send - * - * Create a compressed buffer with all the pages that we are going to - * send. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) +static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp) { - MultiFDPages_t *pages = p->pages; + MultiFDPages_t *pages = &p->data->u.ram; struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -138,8 +109,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + pages->offset[i]; - z->in.size = p->page_size; + z->in.src = pages->block->host + pages->offset[i]; + z->in.size = multifd_ram_page_size(); z->in.pos = 0; /* @@ -152,9 +123,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) */ do { ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.size - z->out.pos > 0)); - if (ret > 0 && (z->in.size - z->in.pos > 0)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.size > z->out.pos)); + if (ret > 0 && (z->in.size > z->in.pos)) { error_setg(errp, "multifd %u: compressStream buffer too small", p->id); return -1; @@ -176,17 +147,7 @@ out: return 0; } -/** - * zstd_recv_setup: setup receive side - * - * Create the compressed channel and buffer. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) +static int multifd_zstd_recv_setup(MultiFDRecvParams *p, Error **errp) { struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; @@ -220,14 +181,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) return 0; } -/** - * zstd_recv_cleanup: setup receive side - * - * For no compression this function does nothing. - * - * @p: Params for the channel that we are using - */ -static void zstd_recv_cleanup(MultiFDRecvParams *p) +static void multifd_zstd_recv_cleanup(MultiFDRecvParams *p) { struct zstd_data *z = p->compress_data; @@ -239,22 +193,12 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } -/** - * zstd_recv: read the data from the channel into actual pages - * - * Read the compressed buffer, and uncompress it into the actual - * pages. 
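The zstd sender keeps a single long-lived stream per channel and only requests ZSTD_e_flush on the final page of each packet; ZSTD_compressStream2() is retried while it reports pending work and both buffers still have room (the rewritten loop conditions above state exactly that, just more readably). In isolation:

    /* Sketch of the per-packet zstd flush policy (errors elided). */
    for (int i = 0; i < normal_num; i++) {
        ZSTD_EndDirective flush =
            (i == normal_num - 1) ? ZSTD_e_flush : ZSTD_e_continue;

        z->in.src = host + offset[i];
        z->in.size = page_size;
        z->in.pos = 0;
        do {
            ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush);
        } while (ret > 0 && z->in.size > z->in.pos
                         && z->out.size > z->out.pos);
    }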
- * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int zstd_recv(MultiFDRecvParams *p, Error **errp) +static int multifd_zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; - uint32_t expected_size = p->normal_num * p->page_size; + uint32_t page_size = multifd_ram_page_size(); + uint32_t expected_size = p->normal_num * page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; struct zstd_data *z = p->compress_data; int ret; @@ -286,7 +230,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) for (i = 0; i < p->normal_num; i++) { ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); z->out.dst = p->host + p->normal[i]; - z->out.size = p->page_size; + z->out.size = page_size; z->out.pos = 0; /* @@ -299,9 +243,9 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) */ do { ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.pos < p->page_size)); - if (ret > 0 && (z->out.pos < p->page_size)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.pos < page_size)); + if (ret > 0 && (z->out.pos < page_size)) { error_setg(errp, "multifd %u: decompressStream buffer too small", p->id); return -1; @@ -321,13 +265,13 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) return 0; } -static MultiFDMethods multifd_zstd_ops = { - .send_setup = zstd_send_setup, - .send_cleanup = zstd_send_cleanup, - .send_prepare = zstd_send_prepare, - .recv_setup = zstd_recv_setup, - .recv_cleanup = zstd_recv_cleanup, - .recv = zstd_recv +static const MultiFDMethods multifd_zstd_ops = { + .send_setup = multifd_zstd_send_setup, + .send_cleanup = multifd_zstd_send_cleanup, + .send_prepare = multifd_zstd_send_prepare, + .recv_setup = multifd_zstd_recv_setup, + .recv_cleanup = multifd_zstd_recv_cleanup, + .recv = multifd_zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index 0b4cbad..b255778 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -12,15 +12,18 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/iov.h" #include "qemu/rcu.h" #include "exec/target_page.h" -#include "sysemu/sysemu.h" -#include "exec/ramblock.h" +#include "system/system.h" +#include "system/ramblock.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "file.h" +#include "migration/misc.h" #include "migration.h" #include "migration-stats.h" +#include "savevm.h" #include "socket.h" #include "tls.h" #include "qemu-file.h" @@ -33,11 +36,6 @@ #include "io/channel-socket.h" #include "yank_functions.h" -/* Multiple fd's */ - -#define MULTIFD_MAGIC 0x11223344U -#define MULTIFD_VERSION 1 - typedef struct { uint32_t magic; uint32_t version; @@ -49,8 +47,10 @@ typedef struct { struct { MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; + + /* multifd_send() body is not thread safe, needs serialization */ + QemuMutex multifd_send_mutex; + /* * Global number of generated multifd packets. 
* @@ -78,7 +78,7 @@ struct { */ int exiting; /* multifd ops */ - MultiFDMethods *ops; + const MultiFDMethods *ops; } *multifd_send_state; struct { @@ -95,236 +95,70 @@ struct { uint64_t packet_num; int exiting; /* multifd ops */ - MultiFDMethods *ops; + const MultiFDMethods *ops; } *multifd_recv_state; -static bool multifd_use_packets(void) -{ - return !migrate_mapped_ram(); -} - -void multifd_send_channel_created(void) -{ - qemu_sem_post(&multifd_send_state->channels_created); -} - -static void multifd_set_file_bitmap(MultiFDSendParams *p) +MultiFDSendData *multifd_send_data_alloc(void) { - MultiFDPages_t *pages = p->pages; + MultiFDSendData *new = g_new0(MultiFDSendData, 1); - assert(pages->block); + multifd_ram_payload_alloc(&new->u.ram); + /* Device state allocates its payload on-demand */ - for (int i = 0; i < p->pages->normal_num; i++) { - ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); - } - - for (int i = p->pages->normal_num; i < p->pages->num; i++) { - ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false); - } + return new; } -/* Multifd without compression */ - -/** - * nocomp_send_setup: setup send side - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) +void multifd_send_data_clear(MultiFDSendData *data) { - if (migrate_zero_copy_send()) { - p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + if (multifd_payload_empty(data)) { + return; } - if (multifd_use_packets()) { - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, p->page_count + 1); - } else { - p->iov = g_new0(struct iovec, p->page_count); + switch (data->type) { + case MULTIFD_PAYLOAD_DEVICE_STATE: + multifd_send_data_clear_device_state(&data->u.device_state); + break; + default: + /* Nothing to do */ + break; } - return 0; -} - -/** - * nocomp_send_cleanup: cleanup send side - * - * For no compression this function does nothing. - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) -{ - g_free(p->iov); - p->iov = NULL; - return; + data->type = MULTIFD_PAYLOAD_NONE; } -static void multifd_send_prepare_iovs(MultiFDSendParams *p) +void multifd_send_data_free(MultiFDSendData *data) { - MultiFDPages_t *pages = p->pages; - - for (int i = 0; i < pages->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - p->next_packet_size = pages->normal_num * p->page_size; -} - -/** - * nocomp_send_prepare: prepare date to be able to send - * - * For no compression we just have to calculate the size of the - * packet. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) -{ - bool use_zero_copy_send = migrate_zero_copy_send(); - int ret; - - multifd_send_zero_page_detect(p); - - if (!multifd_use_packets()) { - multifd_send_prepare_iovs(p); - multifd_set_file_bitmap(p); - - return 0; - } - - if (!use_zero_copy_send) { - /* - * Only !zerocopy needs the header in IOV; zerocopy will - * send it separately. 
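From the accessors used here (data->type, data->u.ram, data->u.device_state, multifd_payload_empty()), MultiFDSendData is presumably a tagged union along these lines; the exact definition lives in multifd.h, outside the hunks shown:

    /* Presumed shape of the payload object that replaces p->pages. */
    typedef enum {
        MULTIFD_PAYLOAD_NONE,
        MULTIFD_PAYLOAD_RAM,
        MULTIFD_PAYLOAD_DEVICE_STATE,
    } MultiFDPayloadType;

    typedef struct {
        MultiFDPayloadType type;
        union {
            MultiFDPages_t ram;
            MultiFDDeviceState_t device_state;
        } u;
    } MultiFDSendData;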
- */ - multifd_send_prepare_header(p); + if (!data) { + return; } - multifd_send_prepare_iovs(p); - p->flags |= MULTIFD_FLAG_NOCOMP; + /* This also free's device state payload */ + multifd_send_data_clear(data); - multifd_send_fill_packet(p); - - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, errp); - if (ret != 0) { - return -1; - } - } + multifd_ram_payload_free(&data->u.ram); - return 0; -} - -/** - * nocomp_recv_setup: setup receive side - * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) -{ - p->iov = g_new0(struct iovec, p->page_count); - return 0; + g_free(data); } -/** - * nocomp_recv_cleanup: setup receive side - * - * For no compression this function does nothing. - * - * @p: Params for the channel that we are using - */ -static void nocomp_recv_cleanup(MultiFDRecvParams *p) +static bool multifd_use_packets(void) { - g_free(p->iov); - p->iov = NULL; + return !migrate_mapped_ram(); } -/** - * nocomp_recv: read the data from the channel - * - * For no compression we just need to read things into the correct place. - * - * Returns 0 for success or -1 for error - * - * @p: Params for the channel that we are using - * @errp: pointer to an error - */ -static int nocomp_recv(MultiFDRecvParams *p, Error **errp) +void multifd_send_channel_created(void) { - uint32_t flags; - - if (!multifd_use_packets()) { - return multifd_file_recv_data(p, errp); - } - - flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - - if (flags != MULTIFD_FLAG_NOCOMP) { - error_setg(errp, "multifd %u: flags received %x flags expected %x", - p->id, flags, MULTIFD_FLAG_NOCOMP); - return -1; - } - - multifd_recv_zero_page_process(p); - - if (!p->normal_num) { - return 0; - } - - for (int i = 0; i < p->normal_num; i++) { - p->iov[i].iov_base = p->host + p->normal[i]; - p->iov[i].iov_len = p->page_size; - ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); - } - return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); + qemu_sem_post(&multifd_send_state->channels_created); } -static MultiFDMethods multifd_nocomp_ops = { - .send_setup = nocomp_send_setup, - .send_cleanup = nocomp_send_cleanup, - .send_prepare = nocomp_send_prepare, - .recv_setup = nocomp_recv_setup, - .recv_cleanup = nocomp_recv_cleanup, - .recv = nocomp_recv -}; +static const MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {}; -static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { - [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops, -}; - -void multifd_register_ops(int method, MultiFDMethods *ops) +void multifd_register_ops(int method, const MultiFDMethods *ops) { - assert(0 < method && method < MULTIFD_COMPRESSION__MAX); + assert(0 <= method && method < MULTIFD_COMPRESSION__MAX); + assert(!multifd_ops[method]); multifd_ops[method] = ops; } -/* Reset a MultiFDPages_t* object for the next use */ -static void multifd_pages_reset(MultiFDPages_t *pages) -{ - /* - * We don't need to touch offset[] array, because it will be - * overwritten later when reused. 
- */ - pages->num = 0; - pages->normal_num = 0; - pages->block = NULL; -} - static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -389,160 +223,95 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } -static MultiFDPages_t *multifd_pages_init(uint32_t n) -{ - MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - - pages->allocated = n; - pages->offset = g_new0(ram_addr_t, n); - - return pages; -} - -static void multifd_pages_clear(MultiFDPages_t *pages) -{ - multifd_pages_reset(pages); - pages->allocated = 0; - g_free(pages->offset); - pages->offset = NULL; - g_free(pages); -} - +/* Fills a RAM multifd packet */ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; - MultiFDPages_t *pages = p->pages; uint64_t packet_num; - uint32_t zero_num = pages->num - pages->normal_num; - int i; + bool sync_packet = p->flags & MULTIFD_FLAG_SYNC; + + memset(packet, 0, p->packet_len); - packet->flags = cpu_to_be32(p->flags); - packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->normal_num); - packet->zero_pages = cpu_to_be32(zero_num); + packet->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + packet->hdr.version = cpu_to_be32(MULTIFD_VERSION); + + packet->hdr.flags = cpu_to_be32(p->flags); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); packet->packet_num = cpu_to_be64(packet_num); - if (pages->block) { - strncpy(packet->ramblock, pages->block->idstr, 256); - } - - for (i = 0; i < pages->num; i++) { - /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = pages->offset[i]; + p->packets_sent++; - packet->offset[i] = cpu_to_be64(temp); + if (!sync_packet) { + multifd_ram_fill_packet(p); } - p->packets_sent++; - p->total_normal_pages += pages->normal_num; - p->total_zero_pages += zero_num; - - trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, - p->flags, p->next_packet_size); + trace_multifd_send_fill(p->id, packet_num, + p->flags, p->next_packet_size); } -static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p, + const MultiFDPacketHdr_t *hdr, + Error **errp) { - MultiFDPacket_t *packet = p->packet; - int i; + uint32_t magic = be32_to_cpu(hdr->magic); + uint32_t version = be32_to_cpu(hdr->version); - packet->magic = be32_to_cpu(packet->magic); - if (packet->magic != MULTIFD_MAGIC) { - error_setg(errp, "multifd: received packet " - "magic %x and expected magic %x", - packet->magic, MULTIFD_MAGIC); + if (magic != MULTIFD_MAGIC) { + error_setg(errp, "multifd: received packet magic %x, expected %x", + magic, MULTIFD_MAGIC); return -1; } - packet->version = be32_to_cpu(packet->version); - if (packet->version != MULTIFD_VERSION) { - error_setg(errp, "multifd: received packet " - "version %u and expected version %u", - packet->version, MULTIFD_VERSION); + if (version != MULTIFD_VERSION) { + error_setg(errp, "multifd: received packet version %u, expected %u", + version, MULTIFD_VERSION); return -1; } - p->flags = be32_to_cpu(packet->flags); - - packet->pages_alloc = be32_to_cpu(packet->pages_alloc); - /* - * If we received a packet that is 100 times bigger than expected - * just stop migration. It is a magic number. 
- */ - if (packet->pages_alloc > p->page_count) { - error_setg(errp, "multifd: received packet " - "with size %u and expected a size of %u", - packet->pages_alloc, p->page_count) ; - return -1; - } + p->flags = be32_to_cpu(hdr->flags); - p->normal_num = be32_to_cpu(packet->normal_pages); - if (p->normal_num > packet->pages_alloc) { - error_setg(errp, "multifd: received packet " - "with %u normal pages and expected maximum pages are %u", - p->normal_num, packet->pages_alloc) ; - return -1; - } + return 0; +} - p->zero_num = be32_to_cpu(packet->zero_pages); - if (p->zero_num > packet->pages_alloc - p->normal_num) { - error_setg(errp, "multifd: received packet " - "with %u zero pages and expected maximum zero pages are %u", - p->zero_num, packet->pages_alloc - p->normal_num) ; - return -1; - } +static int multifd_recv_unfill_packet_device_state(MultiFDRecvParams *p, + Error **errp) +{ + MultiFDPacketDeviceState_t *packet = p->packet_dev_state; + packet->instance_id = be32_to_cpu(packet->instance_id); p->next_packet_size = be32_to_cpu(packet->next_packet_size); - p->packet_num = be64_to_cpu(packet->packet_num); - p->packets_recved++; - p->total_normal_pages += p->normal_num; - p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, - p->flags, p->next_packet_size); + return 0; +} - if (p->normal_num == 0 && p->zero_num == 0) { - return 0; - } +static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp) +{ + const MultiFDPacket_t *packet = p->packet; + int ret = 0; - /* make sure that ramblock is 0 terminated */ - packet->ramblock[255] = 0; - p->block = qemu_ram_block_by_name(packet->ramblock); - if (!p->block) { - error_setg(errp, "multifd: unknown ram block %s", - packet->ramblock); - return -1; - } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); - p->host = p->block->host; - for (i = 0; i < p->normal_num; i++) { - uint64_t offset = be64_to_cpu(packet->offset[i]); + /* Always unfill, old QEMUs (<9.0) send data along with SYNC */ + ret = multifd_ram_unfill_packet(p, errp); - if (offset > (p->block->used_length - p->page_size)) { - error_setg(errp, "multifd: offset too long %" PRIu64 - " (max " RAM_ADDR_FMT ")", - offset, p->block->used_length); - return -1; - } - p->normal[i] = offset; - } + trace_multifd_recv_unfill(p->id, p->packet_num, p->flags, + p->next_packet_size); - for (i = 0; i < p->zero_num; i++) { - uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + return ret; +} - if (offset > (p->block->used_length - p->page_size)) { - error_setg(errp, "multifd: offset too long %" PRIu64 - " (max " RAM_ADDR_FMT ")", - offset, p->block->used_length); - return -1; - } - p->zero[i] = offset; +static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + p->packets_recved++; + + if (p->flags & MULTIFD_FLAG_DEVICE_STATE) { + return multifd_recv_unfill_packet_device_state(p, errp); } - return 0; + return multifd_recv_unfill_packet_ram(p, errp); } static bool multifd_send_should_exit(void) @@ -568,35 +337,32 @@ static void multifd_send_kick_main(MultiFDSendParams *p) } /* - * How we use multifd_send_state->pages and channel->pages? + * multifd_send() works by exchanging the MultiFDSendData object + * provided by the caller with an unused MultiFDSendData object from + * the next channel that is found to be idle. * - * We create a pages for each channel, and a main one. 
Each time that - * we need to send a batch of pages we interchange the ones between - * multifd_send_state and the channel that is sending it. There are - * two reasons for that: - * - to not have to do so many mallocs during migration - * - to make easier to know what to free at the end of migration + * The channel owns the data until it finishes transmitting and the + * caller owns the empty object until it fills it with data and calls + * this function again. No locking necessary. * - * This way we always know who is the owner of each "pages" struct, - * and we don't need any locking. It belongs to the migration thread - * or to the channel thread. Switching is safe because the migration - * thread is using the channel mutex when changing it, and the channel - * have to had finish with its own, otherwise pending_job can't be - * false. + * Switching is safe because both the migration thread and the channel + * thread have barriers in place to serialize access. * * Returns true if succeed, false otherwise. */ -static bool multifd_send_pages(void) +bool multifd_send(MultiFDSendData **send_data) { int i; static int next_channel; MultiFDSendParams *p = NULL; /* make happy gcc */ - MultiFDPages_t *pages = multifd_send_state->pages; + MultiFDSendData *tmp; if (multifd_send_should_exit()) { return false; } + QEMU_LOCK_GUARD(&multifd_send_state->multifd_send_mutex); + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); @@ -626,66 +392,24 @@ static bool multifd_send_pages(void) * qatomic_store_release() in multifd_send_thread(). */ smp_mb_acquire(); - assert(!p->pages->num); - multifd_send_state->pages = p->pages; - p->pages = pages; - /* - * Making sure p->pages is setup before marking pending_job=true. Pairs - * with the qatomic_load_acquire() in multifd_send_thread(). - */ - qatomic_store_release(&p->pending_job, true); - qemu_sem_post(&p->sem); - - return true; -} - -static inline bool multifd_queue_empty(MultiFDPages_t *pages) -{ - return pages->num == 0; -} -static inline bool multifd_queue_full(MultiFDPages_t *pages) -{ - return pages->num == pages->allocated; -} - -static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) -{ - pages->offset[pages->num++] = offset; -} + assert(multifd_payload_empty(p->data)); -/* Returns true if enqueue successful, false otherwise */ -bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) -{ - MultiFDPages_t *pages; - -retry: - pages = multifd_send_state->pages; - - /* If the queue is empty, we can already enqueue now */ - if (multifd_queue_empty(pages)) { - pages->block = block; - multifd_enqueue(pages, offset); - return true; - } + /* + * Swap the pointers. The channel gets the client data for + * transferring and the client gets back an unused data slot. + */ + tmp = *send_data; + *send_data = p->data; + p->data = tmp; /* - * Not empty, meanwhile we need a flush. It can because of either: - * - * (1) The page is not on the same ramblock of previous ones, or, - * (2) The queue is full. - * - * After flush, always retry. + * Making sure p->data is setup before marking pending_job=true. Pairs + * with the qatomic_load_acquire() in multifd_send_thread(). */ - if (pages->block != block || multifd_queue_full(pages)) { - if (!multifd_send_pages()) { - return false; - } - goto retry; - } + qatomic_store_release(&p->pending_job, true); + qemu_sem_post(&p->sem); - /* Not empty, and we still have space, do it! 
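The comment rewrite above describes the core of the new design: multifd_send() swaps whole payload objects instead of copying pages, so ownership is always unambiguous. Stripped to its synchronization skeleton (both halves appear in full elsewhere in this diff):

    /* Producer side, channel p known to be idle: */
    smp_mb_acquire();               /* pairs with store_release below */
    tmp = *send_data;               /* hand the full object to the channel, */
    *send_data = p->data;           /* take its empty one back */
    p->data = tmp;
    qatomic_store_release(&p->pending_job, true);
    qemu_sem_post(&p->sem);

    /* Consumer side, in multifd_send_thread(): */
    if (qatomic_load_acquire(&p->pending_job)) {
        /* ... transmit p->data ... */
        multifd_send_data_clear(p->data);
        qatomic_store_release(&p->pending_job, false);
    }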
*/ - multifd_enqueue(pages, offset); return true; } @@ -775,7 +499,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) * channels have no I/O handler callback registered when reaching * here, because migration thread will wait for all multifd channel * establishments to complete during setup. Since - * migrate_fd_cleanup() will be scheduled in main thread too, all + * migration_cleanup() will be scheduled in main thread too, all * previous callbacks should guarantee to be completed when * reaching here. See multifd_send_state.channels_created and its * usage. In the future, we could replace this with an assert @@ -790,12 +514,13 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) qemu_sem_destroy(&p->sem_sync); g_free(p->name); p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; + g_clear_pointer(&p->data, multifd_send_data_free); p->packet_len = 0; + g_clear_pointer(&p->packet_device_state, g_free); g_free(p->packet); p->packet = NULL; multifd_send_state->ops->send_cleanup(p, errp); + assert(!p->iov); return *errp == NULL; } @@ -804,12 +529,12 @@ static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); + multifd_device_state_send_cleanup(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); + qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex); g_free(multifd_send_state->params); multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; g_free(multifd_send_state); multifd_send_state = NULL; } @@ -822,6 +547,36 @@ void multifd_send_shutdown(void) return; } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + /* thread_created implies the TLS handshake has succeeded */ + if (p->tls_thread_created && p->thread_created) { + Error *local_err = NULL; + /* + * The destination expects the TLS session to always be + * properly terminated. This helps to detect a premature + * termination in the middle of the stream. Note that + * older QEMUs always break the connection on the source + * and the destination always sees + * GNUTLS_E_PREMATURE_TERMINATION. + */ + migration_tls_channel_end(p->c, &local_err); + + /* + * The above can return an error in case the migration has + * already failed. If the migration succeeded, errors are + * not expected but there's no need to kill the source. + */ + if (local_err && !migration_has_failed(migrate_get_current())) { + warn_report( + "multifd_send_%d: Failed to terminate TLS connection: %s", + p->id, error_get_pretty(local_err)); + break; + } + } + } + multifd_send_terminate_threads(); for (i = 0; i < migrate_multifd_channels(); i++) { @@ -854,20 +609,12 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(void) +int multifd_send_sync_main(MultiFDSyncReq req) { int i; bool flush_zero_copy; - if (!migrate_multifd()) { - return 0; - } - if (multifd_send_state->pages->num) { - if (!multifd_send_pages()) { - error_report("%s: multifd_send_pages fail", __func__); - return -1; - } - } + assert(req != MULTIFD_SYNC_NONE); flush_zero_copy = migrate_zero_copy_send(); @@ -884,8 +631,8 @@ int multifd_send_sync_main(void) * We should be the only user so far, so not possible to be set by * others concurrently. 
*/ - assert(qatomic_read(&p->pending_sync) == false); - qatomic_set(&p->pending_sync, true); + assert(qatomic_read(&p->pending_sync) == MULTIFD_SYNC_NONE); + qatomic_set(&p->pending_sync, req); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -937,26 +684,46 @@ static void *multifd_send_thread(void *opaque) } /* - * Read pending_job flag before p->pages. Pairs with the - * qatomic_store_release() in multifd_send_pages(). + * Read pending_job flag before p->data. Pairs with the + * qatomic_store_release() in multifd_send(). */ if (qatomic_load_acquire(&p->pending_job)) { - MultiFDPages_t *pages = p->pages; + bool is_device_state = multifd_payload_device_state(p->data); + size_t total_size; + int write_flags_masked = 0; + p->flags = 0; p->iovs_num = 0; - assert(pages->num); + assert(!multifd_payload_empty(p->data)); - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - break; + if (is_device_state) { + multifd_device_state_send_prepare(p); + + /* Device state packets cannot be sent via zerocopy */ + write_flags_masked |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } else { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; + } } + /* + * The packet header in the zerocopy RAM case is accounted for + * in multifd_nocomp_send_prepare() - where it is actually + * being sent. + */ + total_size = iov_size(p->iov, p->iovs_num); + if (migrate_mapped_ram()) { + assert(!is_device_state); + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, - p->pages->block, &local_err); + &p->data->u.ram, &local_err); } else { ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, - NULL, 0, p->write_flags, + NULL, 0, + p->write_flags & ~write_flags_masked, &local_err); } @@ -964,29 +731,29 @@ static void *multifd_send_thread(void *opaque) break; } - stat64_add(&mig_stats.multifd_bytes, - p->next_packet_size + p->packet_len); - stat64_add(&mig_stats.normal_pages, pages->normal_num); - stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); + stat64_add(&mig_stats.multifd_bytes, total_size); - multifd_pages_reset(p->pages); p->next_packet_size = 0; + multifd_send_data_clear(p->data); /* - * Making sure p->pages is published before saying "we're + * Making sure p->data is published before saying "we're * free". Pairs with the smp_mb_acquire() in - * multifd_send_pages(). + * multifd_send(). */ qatomic_store_release(&p->pending_job, false); } else { + MultiFDSyncReq req = qatomic_read(&p->pending_sync); + /* * If not a normal job, must be a sync request. Note that * pending_sync is a standalone flag (unlike pending_job), so * it doesn't require explicit memory barriers. 
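Only MULTIFD_SYNC_NONE and MULTIFD_SYNC_ALL appear in these hunks, but the "Only push the SYNC message if it involves a remote sync" check implies at least one level that drains the local channels without emitting a SYNC packet. The enum is presumably shaped like:

    /* Presumed sync request levels (definition not shown in this diff). */
    typedef enum {
        MULTIFD_SYNC_NONE = 0,
        MULTIFD_SYNC_LOCAL,     /* drain local channels only */
        MULTIFD_SYNC_ALL,       /* also send MULTIFD_FLAG_SYNC to the dest */
    } MultiFDSyncReq;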
*/ - assert(qatomic_read(&p->pending_sync)); + assert(req != MULTIFD_SYNC_NONE); - if (use_packets) { + /* Only push the SYNC message if it involves a remote sync */ + if (req == MULTIFD_SYNC_ALL) { p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, @@ -996,10 +763,9 @@ static void *multifd_send_thread(void *opaque) } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; } - qatomic_set(&p->pending_sync, false); + qatomic_set(&p->pending_sync, MULTIFD_SYNC_NONE); qemu_sem_post(&p->sem_sync); } } @@ -1015,8 +781,7 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, - p->total_zero_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent); return NULL; } @@ -1069,7 +834,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, args->p = p; p->tls_thread_created = true; - qemu_thread_create(&p->tls_thread, "mig/src/tls", + qemu_thread_create(&p->tls_thread, MIGRATION_THREAD_SRC_TLS, multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; @@ -1156,9 +921,8 @@ static bool multifd_new_send_channel_create(gpointer opaque, Error **errp) bool multifd_send_setup(void) { MigrationState *s = migrate_get_current(); - Error *local_err = NULL; int thread_count, ret = 0; - uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + uint32_t page_count = multifd_ram_page_count(); bool use_packets = multifd_use_packets(); uint8_t i; @@ -1169,7 +933,7 @@ bool multifd_send_setup(void) thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); - multifd_send_state->pages = multifd_pages_init(page_count); + qemu_mutex_init(&multifd_send_state->multifd_send_mutex); qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); @@ -1177,26 +941,27 @@ bool multifd_send_setup(void) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); p->id = i; - p->pages = multifd_pages_init(page_count); + p->data = multifd_send_data_alloc(); if (use_packets) { p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); } - p->name = g_strdup_printf("mig/src/send_%d", i); - p->page_size = qemu_target_page_size(); - p->page_count = page_count; + p->name = g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, i); p->write_flags = 0; if (!multifd_new_send_channel_create(p, &local_err)) { - return false; + migrate_set_error(s, local_err); + ret = -1; } } @@ -1209,24 +974,30 @@ bool multifd_send_setup(void) qemu_sem_wait(&multifd_send_state->channels_created); } + if (ret) { + goto err; + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - break; + 
migrate_set_error(s, local_err); + goto err; } + assert(p->iov); } - if (ret) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - return false; - } + multifd_device_state_send_setup(); return true; + +err: + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; } bool multifd_recv(void) @@ -1353,11 +1124,14 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem_sync); qemu_sem_destroy(&p->sem); + g_free(p->data); + p->data = NULL; g_free(p->name); p->name = NULL; p->packet_len = 0; g_free(p->packet); p->packet = NULL; + g_clear_pointer(&p->packet_dev_state, g_free); g_free(p->normal); p->normal = NULL; g_free(p->zero); @@ -1459,8 +1233,37 @@ void multifd_recv_sync_main(void) trace_multifd_recv_sync_main(multifd_recv_state->packet_num); } +static int multifd_device_state_recv(MultiFDRecvParams *p, Error **errp) +{ + g_autofree char *dev_state_buf = NULL; + int ret; + + dev_state_buf = g_malloc(p->next_packet_size); + + ret = qio_channel_read_all(p->c, dev_state_buf, p->next_packet_size, errp); + if (ret != 0) { + return ret; + } + + if (p->packet_dev_state->idstr[sizeof(p->packet_dev_state->idstr) - 1] + != 0) { + error_setg(errp, "unterminated multifd device state idstr"); + return -1; + } + + if (!qemu_loadvm_load_state_buffer(p->packet_dev_state->idstr, + p->packet_dev_state->instance_id, + dev_state_buf, p->next_packet_size, + errp)) { + ret = -1; + } + + return ret; +} + static void *multifd_recv_thread(void *opaque) { + MigrationState *s = migrate_get_current(); MultiFDRecvParams *p = opaque; Error *local_err = NULL; bool use_packets = multifd_use_packets(); @@ -1469,19 +1272,65 @@ static void *multifd_recv_thread(void *opaque) trace_multifd_recv_thread_start(p->id); rcu_register_thread(); + if (!s->multifd_clean_tls_termination) { + p->read_flags = QIO_CHANNEL_READ_FLAG_RELAXED_EOF; + } + while (true) { + MultiFDPacketHdr_t hdr; uint32_t flags = 0; + bool is_device_state = false; bool has_data = false; + uint8_t *pkt_buf; + size_t pkt_len; + p->normal_num = 0; if (use_packets) { + struct iovec iov = { + .iov_base = (void *)&hdr, + .iov_len = sizeof(hdr) + }; + if (multifd_recv_should_exit()) { break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + ret = qio_channel_readv_full_all_eof(p->c, &iov, 1, NULL, NULL, + p->read_flags, &local_err); + if (!ret) { + /* EOF */ + assert(!local_err); + break; + } + + if (ret == -1) { + break; + } + + ret = multifd_recv_unfill_packet_header(p, &hdr, &local_err); + if (ret) { + break; + } + + is_device_state = p->flags & MULTIFD_FLAG_DEVICE_STATE; + if (is_device_state) { + pkt_buf = (uint8_t *)p->packet_dev_state + sizeof(hdr); + pkt_len = sizeof(*p->packet_dev_state) - sizeof(hdr); + } else { + pkt_buf = (uint8_t *)p->packet + sizeof(hdr); + pkt_len = p->packet_len - sizeof(hdr); + } + + ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len, + &local_err); + if (!ret) { + /* EOF */ + error_setg(&local_err, "multifd: unexpected EOF after packet header"); + break; + } + + if (ret == -1) { break; } @@ -1495,7 +1344,18 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = p->normal_num || p->zero_num; + + if (is_device_state) 
{ + has_data = p->next_packet_size > 0; + } else { + /* + * Even if it's a SYNC packet, this needs to be set + * because older QEMUs (<9.0) still send data along with + * the SYNC packet. + */ + has_data = p->normal_num || p->zero_num; + } + qemu_mutex_unlock(&p->mutex); } else { /* @@ -1524,19 +1384,40 @@ static void *multifd_recv_thread(void *opaque) } if (has_data) { - ret = multifd_recv_state->ops->recv(p, &local_err); + /* + * multifd thread should not be active and receive data + * when migration is in the Postcopy phase. Two threads + * writing the same memory area could easily corrupt + * the guest state. + */ + assert(!migration_in_postcopy()); + if (is_device_state) { + assert(use_packets); + ret = multifd_device_state_recv(p, &local_err); + } else { + ret = multifd_recv_state->ops->recv(p, &local_err); + } if (ret != 0) { break; } + } else if (is_device_state) { + error_setg(&local_err, + "multifd: received empty device state packet"); + break; } if (use_packets) { if (flags & MULTIFD_FLAG_SYNC) { + if (is_device_state) { + error_setg(&local_err, + "multifd: received SYNC device state packet"); + break; + } + qemu_sem_post(&multifd_recv_state->sem_sync); qemu_sem_wait(&p->sem_sync); } } else { - p->total_normal_pages += p->data->size / qemu_target_page_size(); p->data->size = 0; /* * Order data->size update before clearing @@ -1553,9 +1434,7 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, - p->total_normal_pages, - p->total_zero_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved); return NULL; } @@ -1563,7 +1442,7 @@ static void *multifd_recv_thread(void *opaque) int multifd_recv_setup(Error **errp) { int thread_count; - uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + uint32_t page_count = multifd_ram_page_count(); bool use_packets = multifd_use_packets(); uint8_t i; @@ -1603,12 +1482,11 @@ int multifd_recv_setup(Error **errp) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_dev_state = g_malloc0(sizeof(*p->packet_dev_state)); } - p->name = g_strdup_printf("mig/dst/recv_%d", i); + p->name = g_strdup_printf(MIGRATION_THREAD_DST_MULTIFD, i); p->normal = g_new0(ram_addr_t, page_count); p->zero = g_new0(ram_addr_t, page_count); - p->page_count = page_count; - p->page_size = qemu_target_page_size(); } for (i = 0; i < thread_count; i++) { @@ -1681,17 +1559,3 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } - -bool multifd_send_prepare_common(MultiFDSendParams *p) -{ - multifd_send_zero_page_detect(p); - - if (!p->pages->normal_num) { - p->next_packet_size = 0; - return false; - } - - multifd_send_prepare_header(p); - - return true; -} diff --git a/migration/multifd.h b/migration/multifd.h index 0ecd6f4..9b6d81e 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,9 +13,27 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "exec/target_page.h" #include "ram.h" typedef struct MultiFDRecvData MultiFDRecvData; +typedef struct MultiFDSendData MultiFDSendData; + +typedef enum { + /* No sync request */ + MULTIFD_SYNC_NONE = 0, + /* Sync locally on the sender threads without pushing messages */ + MULTIFD_SYNC_LOCAL, + /* + * Sync not only on the sender threads, but also push MULTIFD_FLAG_SYNC + * message to the wire for each iochannel (which is for a remote sync). 
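+ *
+ * Caller-side usage, as a hedged sketch (not verbatim from this
+ * patch):
+ *
+ *   multifd_send_sync_main(MULTIFD_SYNC_LOCAL); // drain sender threads only
+ *   multifd_send_sync_main(MULTIFD_SYNC_ALL);   // drain, plus MULTIFD_FLAG_SYNC
+ *                                               // on every channel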
+ * + * When remote sync is used, it needs to be paired with a follow-up + * RAM_SAVE_FLAG_EOS / RAM_SAVE_FLAG_MULTIFD_FLUSH message on the main + * channel. + */ + MULTIFD_SYNC_ALL, +} MultiFDSyncReq; bool multifd_send_setup(void); void multifd_send_shutdown(void); @@ -26,22 +44,34 @@ void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(void); +int multifd_send_sync_main(MultiFDSyncReq req); bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); bool multifd_recv(void); MultiFDRecvData *multifd_get_recv_data(void); +/* Multiple fd's */ + +#define MULTIFD_MAGIC 0x11223344U +#define MULTIFD_VERSION 1 + /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 4 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) #define MULTIFD_FLAG_UADK (8 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) + +/* + * If set it means that this packet contains device state + * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t). + */ +#define MULTIFD_FLAG_DEVICE_STATE (32 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) @@ -50,6 +80,11 @@ typedef struct { uint32_t magic; uint32_t version; uint32_t flags; +} __attribute__((packed)) MultiFDPacketHdr_t; + +typedef struct { + MultiFDPacketHdr_t hdr; + /* maximum number of allocated pages */ uint32_t pages_alloc; /* non zero pages */ @@ -71,15 +106,27 @@ } __attribute__((packed)) MultiFDPacket_t; typedef struct { + MultiFDPacketHdr_t hdr; + + char idstr[256]; + uint32_t instance_id; + + /* size of the next packet that contains the actual data */ + uint32_t next_packet_size; +} __attribute__((packed)) MultiFDPacketDeviceState_t; + +typedef struct { /* number of used pages */ uint32_t num; /* number of normal pages */ uint32_t normal_num; - /* number of allocated pages */ - uint32_t allocated; - /* offset of each page */ - ram_addr_t *offset; + /* + * Pointer to the ramblock. NOTE: it's the caller's responsibility to make + * sure the pointer is always valid!
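+ * (In the RAM path this contract holds naturally: the field is set to
+ * the RAMBlock currently being migrated, and ramblocks may not vanish
+ * or resize while migration is running.)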
+ */ RAMBlock *block; + /* offset array of each page, managed by multifd */ + ram_addr_t *offset; } MultiFDPages_t; struct MultiFDRecvData { @@ -90,6 +137,48 @@ }; typedef struct { + char *idstr; + uint32_t instance_id; + char *buf; + size_t buf_len; +} MultiFDDeviceState_t; + +typedef enum { + MULTIFD_PAYLOAD_NONE, + MULTIFD_PAYLOAD_RAM, + MULTIFD_PAYLOAD_DEVICE_STATE, +} MultiFDPayloadType; + +typedef struct MultiFDPayload { + MultiFDPages_t ram; + MultiFDDeviceState_t device_state; +} MultiFDPayload; + +struct MultiFDSendData { + MultiFDPayloadType type; + MultiFDPayload u; +}; + +static inline bool multifd_payload_empty(MultiFDSendData *data) +{ + return data->type == MULTIFD_PAYLOAD_NONE; +} + +static inline bool multifd_payload_device_state(MultiFDSendData *data) +{ + return data->type == MULTIFD_PAYLOAD_DEVICE_STATE; +} + +static inline void multifd_set_payload_type(MultiFDSendData *data, + MultiFDPayloadType type) +{ + assert(multifd_payload_empty(data)); + assert(type != MULTIFD_PAYLOAD_NONE); + + data->type = type; +} + +typedef struct { /* Fields are only written at creating/deletion time */ /* No lock required for them, they are read only */ /* channel number */ uint8_t id; /* channel thread name */ char *name; /* channel thread id */ QemuThread thread; bool tls_thread_created; QemuThread tls_thread; /* communication channel */ QIOChannel *c; /* packet allocated len */ uint32_t packet_len; - /* guest page size */ - uint32_t page_size; - /* number of pages in a full packet */ - uint32_t page_count; /* multifd flags for sending ram */ int write_flags; @@ -121,7 +206,7 @@ /* multifd flags for each packet */ uint32_t flags; /* - * The sender thread has work to do if either of below boolean is set. + * The sender thread has work to do if either of the fields below is set. * * @pending_job: a job is pending * @pending_sync: a sync request is pending * * For both of these fields, they're only set by the requesters, and * cleared by the multifd sender threads. */ bool pending_job; - bool pending_sync; - /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. * pending_job != 0 -> multifd_channel can use it. - */ - MultiFDPages_t *pages; + MultiFDSyncReq pending_sync; + + MultiFDSendData *data; /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_device_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ uint64_t packets_sent; - /* non zero pages sent through this channel */ - uint64_t total_normal_pages; - /* zero pages sent through this channel */ - uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ @@ -173,10 +251,6 @@ QIOChannel *c; /* packet allocated len */ uint32_t packet_len; - /* guest page size */ - uint32_t page_size; - /* number of pages in a full packet */ - uint32_t page_count; /* syncs main thread and channels */ QemuSemaphore sem_sync; @@ -196,8 +270,9 @@ /* thread local variables. 
No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_dev_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets received through this channel */ @@ -206,10 +281,6 @@ typedef struct { RAMBlock *block; /* ramblock host address */ uint8_t *host; - /* non zero pages recv through this channel */ - uint64_t total_normal_pages; - /* zero pages recv through this channel */ - uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ @@ -222,36 +293,126 @@ typedef struct { uint32_t zero_num; /* used for de-compression methods */ void *compress_data; + /* Flags for the QIOChannel */ + int read_flags; } MultiFDRecvParams; typedef struct { - /* Setup for sending side */ + /* + * The send_setup, send_cleanup, send_prepare are only called on + * the QEMU instance at the migration source. + */ + + /* + * Setup for sending side. Called once per channel during channel + * setup phase. + * + * Must allocate p->iov. If packets are in use (default), one + * extra iovec must be allocated for the packet header. Any memory + * allocated in this hook must be released at send_cleanup. + * + * p->write_flags may be used for passing flags to the QIOChannel. + * + * p->compression_data may be used by compression methods to store + * compression data. + */ int (*send_setup)(MultiFDSendParams *p, Error **errp); - /* Cleanup for sending side */ + + /* + * Cleanup for sending side. Called once per channel during + * channel cleanup phase. + */ void (*send_cleanup)(MultiFDSendParams *p, Error **errp); - /* Prepare the send packet */ + + /* + * Prepare the send packet. Called as a result of multifd_send() + * on the client side, with p pointing to the MultiFDSendParams of + * a channel that is currently idle. + * + * Must populate p->iov with the data to be sent, increment + * p->iovs_num to match the amount of iovecs used and set + * p->next_packet_size with the amount of data currently present + * in p->iov. + * + * Must indicate whether this is a compression packet by setting + * p->flags. + * + * As a last step, if packets are in use (default), must prepare + * the packet by calling multifd_send_fill_packet(). + */ int (*send_prepare)(MultiFDSendParams *p, Error **errp); - /* Setup for receiving side */ + + /* + * The recv_setup, recv_cleanup, recv are only called on the QEMU + * instance at the migration destination. + */ + + /* + * Setup for receiving side. Called once per channel during + * channel setup phase. May be empty. + * + * May allocate data structures for the receiving of data. May use + * p->iov. Compression methods may use p->compress_data. + */ int (*recv_setup)(MultiFDRecvParams *p, Error **errp); - /* Cleanup for receiving side */ + + /* + * Cleanup for receiving side. Called once per channel during + * channel cleanup phase. May be empty. + */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all data */ + + /* + * Data receive method. Called as a result of multifd_recv() on + * the client side, with p pointing to the MultiFDRecvParams of a + * channel that is currently idle. Only called if there is data + * available to receive. + * + * Must validate p->flags according to what was set at + * send_prepare. + * + * Must read the data from the QIOChannel p->c. 
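+ *
+ * A skeleton backend registration, as a sketch (the "mycomp" names
+ * are hypothetical and purely illustrative):
+ *
+ *   static const MultiFDMethods multifd_mycomp_ops = {
+ *       .send_setup   = mycomp_send_setup,
+ *       .send_cleanup = mycomp_send_cleanup,
+ *       .send_prepare = mycomp_send_prepare,
+ *       .recv_setup   = mycomp_recv_setup,
+ *       .recv_cleanup = mycomp_recv_cleanup,
+ *       .recv         = mycomp_recv,
+ *   };
+ *
+ *   multifd_register_ops(MULTIFD_COMPRESSION_MYCOMP, &multifd_mycomp_ops);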
+ */ int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; -void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_register_ops(int method, const MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); bool multifd_send_prepare_common(MultiFDSendParams *p); void multifd_send_zero_page_detect(MultiFDSendParams *p); void multifd_recv_zero_page_process(MultiFDRecvParams *p); -static inline void multifd_send_prepare_header(MultiFDSendParams *p) +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); +bool multifd_send(MultiFDSendData **send_data); +MultiFDSendData *multifd_send_data_alloc(void); +void multifd_send_data_clear(MultiFDSendData *data); +void multifd_send_data_free(MultiFDSendData *data); + +static inline uint32_t multifd_ram_page_size(void) { - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; - p->iovs_num++; + return qemu_target_page_size(); } -void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); +static inline uint32_t multifd_ram_page_count(void) +{ + return MULTIFD_PACKET_SIZE / qemu_target_page_size(); +} + +void multifd_ram_save_setup(void); +void multifd_ram_save_cleanup(void); +int multifd_ram_flush_and_sync(QEMUFile *f); +bool multifd_ram_sync_per_round(void); +bool multifd_ram_sync_per_section(void); +void multifd_ram_payload_alloc(MultiFDPages_t *pages); +void multifd_ram_payload_free(MultiFDPages_t *pages); +void multifd_ram_fill_packet(MultiFDSendParams *p); +int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state); + +void multifd_device_state_send_setup(void); +void multifd_device_state_send_cleanup(void); + +void multifd_device_state_send_prepare(MultiFDSendParams *p); #endif diff --git a/migration/options.c b/migration/options.c index 645f550..162c72c 100644 --- a/migration/options.c +++ b/migration/options.c @@ -19,16 +19,17 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-visit-migration.h" #include "qapi/qmp/qerror.h" -#include "qapi/qmp/qnull.h" -#include "sysemu/runstate.h" +#include "qobject/qnull.h" +#include "system/runstate.h" #include "migration/colo.h" +#include "migration/cpr.h" #include "migration/misc.h" #include "migration.h" #include "migration-stats.h" #include "qemu-file.h" #include "ram.h" #include "options.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" /* Maximum migrate downtime set to 2000 seconds */ #define MAX_MIGRATE_DOWNTIME_SECONDS 2000 @@ -55,6 +56,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 
20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -78,19 +86,23 @@ #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000 /* milliseconds */ #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT 1 /* MB/s */ -Property migration_properties[] = { +const Property migration_properties[] = { DEFINE_PROP_BOOL("store-global-state", MigrationState, store_global_state, true), DEFINE_PROP_BOOL("send-configuration", MigrationState, send_configuration, true), DEFINE_PROP_BOOL("send-section-footer", MigrationState, send_section_footer, true), + DEFINE_PROP_BOOL("send-switchover-start", MigrationState, + send_switchover_start, true), DEFINE_PROP_BOOL("multifd-flush-after-each-section", MigrationState, multifd_flush_after_each_section, false), DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState, clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT), DEFINE_PROP_BOOL("x-preempt-pre-7-2", MigrationState, preempt_pre_7_2, false), + DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState, + multifd_clean_tls_termination, true), /* Migration parameters */ DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, @@ -123,6 +135,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -186,8 +201,8 @@ Property migration_properties[] = { MIGRATION_CAPABILITY_SWITCHOVER_ACK), DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT), DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM), - DEFINE_PROP_END_OF_LIST(), }; +const size_t migration_properties_count = ARRAY_SIZE(migration_properties); bool migrate_auto_converge(void) { @@ -196,6 +211,13 @@ bool migrate_auto_converge(void) return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; } +bool migrate_send_switchover_start(void) +{ + MigrationState *s = migrate_get_current(); + + return s->send_switchover_start; +} + bool migrate_background_snapshot(void) { MigrationState *s = migrate_get_current(); @@ -329,13 +351,6 @@ bool migrate_xbzrle(void) return s->capabilities[MIGRATION_CAPABILITY_XBZRLE]; } -bool migrate_zero_blocks(void) -{ - MigrationState *s = migrate_get_current(); - - return s->capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; -} - bool migrate_zero_copy_send(void) { MigrationState *s = migrate_get_current(); @@ -433,6 +448,24 @@ static bool migrate_incoming_started(void) return !!migration_incoming_get_current()->transport_data; } +bool migrate_rdma_caps_check(bool *caps, Error **errp) +{ + if (caps[MIGRATION_CAPABILITY_XBZRLE]) { + error_setg(errp, "RDMA and XBZRLE can't be used together"); + return false; + } + if (caps[MIGRATION_CAPABILITY_MULTIFD]) { + error_setg(errp, "RDMA and multifd can't be used together"); + return false; + } + if (caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, "RDMA and postcopy-ram can't be used together"); + return false; + } + + return true; +} + /** * @migration_caps_check - check capability compatibility * @@ -447,6 +480,10 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) ERRP_GUARD(); MigrationIncomingState *mis = migration_incoming_get_current(); + if (new_caps[MIGRATION_CAPABILITY_ZERO_BLOCKS]) { + warn_report("zero-blocks capability is deprecated"); + } + 
#ifndef CONFIG_REPLICATION if (new_caps[MIGRATION_CAPABILITY_X_COLO]) { error_setg(errp, "QEMU compiled without replication module" @@ -472,11 +509,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) error_setg(errp, "Postcopy is not compatible with ignore-shared"); return false; } - - if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - error_setg(errp, "Postcopy is not yet compatible with multifd"); - return false; - } } if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { @@ -536,7 +568,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) return false; } - if (migrate_incoming_started()) { + if (!migrate_postcopy_preempt() && migrate_incoming_started()) { error_setg(errp, "Postcopy preempt must be set before incoming starts"); return false; @@ -544,7 +576,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - if (migrate_incoming_started()) { + if (!migrate_multifd() && migrate_incoming_started()) { error_setg(errp, "Multifd must be set before incoming starts"); return false; } @@ -592,26 +624,13 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } } - return true; -} - -bool migrate_cap_set(int cap, bool value, Error **errp) -{ - MigrationState *s = migrate_get_current(); - bool new_caps[MIGRATION_CAPABILITY__MAX]; - - if (migration_is_running()) { - error_setg(errp, "There's a migration process in progress"); - return false; - } - - memcpy(new_caps, s->capabilities, sizeof(new_caps)); - new_caps[cap] = value; - - if (!migrate_caps_check(s->capabilities, new_caps, errp)) { + /* + * On destination side, check the cases that capability is being set + * after incoming thread has started. + */ + if (migrate_rdma() && !migrate_rdma_caps_check(new_caps, errp)) { return false; } - s->capabilities[cap] = value; return true; } @@ -758,8 +777,11 @@ uint64_t migrate_max_postcopy_bandwidth(void) MigMode migrate_mode(void) { - MigrationState *s = migrate_get_current(); - MigMode mode = s->parameters.mode; + MigMode mode = cpr_get_incoming_mode(); + + if (mode == MIG_MODE_NONE) { + mode = migrate_get_current()->parameters.mode; + } assert(mode >= 0 && mode < MIG_MODE__MAX); return mode; @@ -787,6 +809,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -892,6 +921,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -946,6 +977,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1038,6 +1070,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if 
(params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1173,6 +1213,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->tls_hostname = params->tls_hostname->u.s; } + if (params->tls_authz) { + assert(params->tls_authz->type == QTYPE_QSTRING); + dest->tls_authz = params->tls_authz->u.s; + } + if (params->has_max_bandwidth) { dest->max_bandwidth = params->max_bandwidth; } @@ -1195,6 +1240,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { dest->multifd_zlib_level = params->multifd_zlib_level; } @@ -1315,6 +1363,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { s->parameters.multifd_zlib_level = params->multifd_zlib_level; } diff --git a/migration/options.h b/migration/options.h index a239702..82d8397 100644 --- a/migration/options.h +++ b/migration/options.h @@ -20,7 +20,8 @@ /* migration properties */ -extern Property migration_properties[]; +extern const Property migration_properties[]; +extern const size_t migration_properties_count; /* capabilities */ @@ -40,7 +41,6 @@ bool migrate_release_ram(void); bool migrate_return_path(void); bool migrate_validate_uuid(void); bool migrate_xbzrle(void); -bool migrate_zero_blocks(void); bool migrate_zero_copy_send(void); /* @@ -57,8 +57,8 @@ bool migrate_tls(void); /* capabilities helpers */ +bool migrate_rdma_caps_check(bool *caps, Error **errp); bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp); -bool migrate_cap_set(int cap, bool value, Error **errp); /* parameters */ @@ -78,6 +78,7 @@ uint64_t migrate_max_postcopy_bandwidth(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 1c374b7..75fd310 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -27,11 +27,11 @@ #include "qapi/error.h" #include "qemu/notify.h" #include "qemu/rcu.h" -#include "sysemu/sysemu.h" +#include "system/system.h" #include "qemu/error-report.h" #include "trace.h" #include "hw/boards.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "socket.h" #include "yank_functions.h" #include "tls.h" @@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis, QemuThread *thread, const char *name, void *(*fn)(void *), int joinable) { - qemu_sem_init(&mis->thread_sync_sem, 0); + qemu_event_init(&mis->thread_sync_event, false); qemu_thread_create(thread, name, fn, mis, joinable); - 
qemu_sem_wait(&mis->thread_sync_sem); - qemu_sem_destroy(&mis->thread_sync_sem); + qemu_event_wait(&mis->thread_sync_event); + qemu_event_destroy(&mis->thread_sync_event); } /* Postcopy needs to detect accesses to pages that haven't yet been copied @@ -651,8 +651,8 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) mis->have_fault_thread = false; } - if (enable_mlock) { - if (os_mlock() < 0) { + if (should_mlock(mlock_state)) { + if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) { error_report("mlock: %s", strerror(errno)); /* * It doesn't feel right to fail at this point, we have a valid @@ -746,18 +746,10 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd, RAMBlock *rb) { size_t pagesize = qemu_ram_pagesize(rb); - struct uffdio_range range; - int ret; trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb)); - range.start = ROUND_DOWN(client_addr, pagesize); - range.len = pagesize; - ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range); - if (ret) { - error_report("%s: Failed to wake: %zx in %s (%s)", - __func__, (size_t)client_addr, qemu_ram_get_idstr(rb), - strerror(errno)); - } - return ret; + return uffd_wakeup(pcfd->fd, + (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize), + pagesize); } static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb, @@ -972,7 +964,7 @@ static void *postcopy_ram_fault_thread(void *opaque) trace_postcopy_ram_fault_thread_entry(); rcu_register_thread(); mis->last_rb = NULL; /* last RAMBlock we sent part of */ - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); struct pollfd *pfd; size_t pfd_len = 2 + mis->postcopy_remote_fds->len; @@ -1238,7 +1230,8 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis) return -1; } - postcopy_thread_create(mis, &mis->fault_thread, "mig/dst/fault", + postcopy_thread_create(mis, &mis->fault_thread, + MIGRATION_THREAD_DST_FAULT, postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE); mis->have_fault_thread = true; @@ -1258,7 +1251,8 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis) * This thread needs to be created after the temp pages because * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately. 
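 *
 * (For orientation, the one-shot startup handshake that
 *  postcopy_thread_create() uses after this change, sketched:
 *
 *    parent:                           child thread fn:
 *      qemu_event_init(&ev, false);
 *      qemu_thread_create(...);          qemu_event_set(&ev);
 *      qemu_event_wait(&ev);             ... thread main loop ...
 *      qemu_event_destroy(&ev);
 *
 *  i.e. a one-shot QemuEvent replaces the old one-shot semaphore.)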
*/ - postcopy_thread_create(mis, &mis->postcopy_prio_thread, "mig/dst/preempt", + postcopy_thread_create(mis, &mis->postcopy_prio_thread, + MIGRATION_THREAD_DST_PREEMPT, postcopy_preempt_thread, QEMU_THREAD_JOINABLE); mis->preempt_thread_status = PREEMPT_THREAD_CREATED; } @@ -1275,18 +1269,10 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr, int ret; if (from_addr) { - struct uffdio_copy copy_struct; - copy_struct.dst = (uint64_t)(uintptr_t)host_addr; - copy_struct.src = (uint64_t)(uintptr_t)from_addr; - copy_struct.len = pagesize; - copy_struct.mode = 0; - ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct); + ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize, + false); } else { - struct uffdio_zeropage zero_struct; - zero_struct.range.start = (uint64_t)(uintptr_t)host_addr; - zero_struct.range.len = pagesize; - zero_struct.mode = 0; - ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct); + ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false); } if (!ret) { qemu_mutex_lock(&mis->page_request_mutex); @@ -1343,18 +1329,16 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, RAMBlock *rb) { size_t pagesize = qemu_ram_pagesize(rb); + int e; /* copy also acks to the kernel waking the stalled thread up * TODO: We can inhibit that ack and only do it if it was requested * which would be slightly cheaper, but we'd have to be careful * of the order of updating our page state. */ - if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) { - int e = errno; - error_report("%s: %s copy host: %p from: %p (size: %zd)", - __func__, strerror(e), host, from, pagesize); - - return -e; + e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb); + if (e) { + return e; } trace_postcopy_place_page(host); @@ -1376,12 +1360,10 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, * but it's not available for everything (e.g. 
hugetlbpages) */ if (qemu_ram_is_uf_zeroable(rb)) { - if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) { - int e = errno; - error_report("%s: %s zero host: %p", - __func__, strerror(e), host); - - return -e; + int e; + e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb); + if (e) { + return e; } return postcopy_notify_shared_wake(rb, qemu_ram_block_host_offset(rb, @@ -1411,49 +1393,42 @@ int postcopy_ram_incoming_init(MigrationIncomingState *mis) int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_ram_prepare_discard(MigrationIncomingState *mis) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, uint64_t client_addr, uint64_t rb_offset) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_ram_incoming_setup(MigrationIncomingState *mis) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, RAMBlock *rb) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, RAMBlock *rb) { - assert(0); - return -1; + g_assert_not_reached(); } int postcopy_wake_shared(struct PostCopyFD *pcfd, uint64_t client_addr, RAMBlock *rb) { - assert(0); - return -1; + g_assert_not_reached(); } #endif @@ -1741,7 +1716,7 @@ void *postcopy_preempt_thread(void *opaque) rcu_register_thread(); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); /* * The preempt channel is established in asynchronous way. Wait diff --git a/migration/qemu-file.c b/migration/qemu-file.c index b6d2f58..b6ac190 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -37,6 +37,11 @@ #define IO_BUF_SIZE 32768 #define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64) +typedef struct FdEntry { + QTAILQ_ENTRY(FdEntry) entry; + int fd; +} FdEntry; + struct QEMUFile { QIOChannel *ioc; bool is_writable; @@ -51,6 +56,9 @@ struct QEMUFile { int last_error; Error *last_error_obj; + + bool can_pass_fd; + QTAILQ_HEAD(, FdEntry) fds; }; /* @@ -109,6 +117,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable) object_ref(ioc); f->ioc = ioc; f->is_writable = is_writable; + f->can_pass_fd = qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS); + QTAILQ_INIT(&f->fds); return f; } @@ -310,6 +320,10 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) int len; int pending; Error *local_error = NULL; + g_autofree int *fds = NULL; + size_t nfd = 0; + int **pfds = f->can_pass_fd ? &fds : NULL; + size_t *pnfd = f->can_pass_fd ? 
&nfd : NULL; assert(!qemu_file_is_writable(f)); @@ -325,10 +339,9 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) } do { - len = qio_channel_read(f->ioc, - (char *)f->buf + pending, - IO_BUF_SIZE - pending, - &local_error); + struct iovec iov = { f->buf + pending, IO_BUF_SIZE - pending }; + len = qio_channel_readv_full(f->ioc, &iov, 1, pfds, pnfd, 0, + &local_error); if (len == QIO_CHANNEL_ERR_BLOCK) { if (qemu_in_coroutine()) { qio_channel_yield(f->ioc, G_IO_IN); @@ -348,9 +361,66 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) qemu_file_set_error_obj(f, len, local_error); } + for (int i = 0; i < nfd; i++) { + FdEntry *fde = g_new0(FdEntry, 1); + fde->fd = fds[i]; + QTAILQ_INSERT_TAIL(&f->fds, fde, entry); + } + return len; } +int qemu_file_put_fd(QEMUFile *f, int fd) +{ + int ret = 0; + QIOChannel *ioc = qemu_file_get_ioc(f); + Error *err = NULL; + struct iovec iov = { (void *)" ", 1 }; + + /* + * Send a dummy byte so qemu_fill_buffer on the receiving side does not + * fail with a len=0 error. Flush first to maintain ordering wrt other + * data. + */ + + qemu_fflush(f); + if (qio_channel_writev_full(ioc, &iov, 1, &fd, 1, 0, &err) < 1) { + error_report_err(error_copy(err)); + qemu_file_set_error_obj(f, -EIO, err); + ret = -1; + } + trace_qemu_file_put_fd(f->ioc->name, fd, ret); + return ret; +} + +int qemu_file_get_fd(QEMUFile *f) +{ + int fd = -1; + FdEntry *fde; + + if (!f->can_pass_fd) { + Error *err = NULL; + error_setg(&err, "%s does not support fd passing", f->ioc->name); + error_report_err(error_copy(err)); + qemu_file_set_error_obj(f, -EIO, err); + goto out; + } + + /* Force the dummy byte and its fd passenger to appear. */ + qemu_peek_byte(f, 0); + + fde = QTAILQ_FIRST(&f->fds); + if (fde) { + qemu_get_byte(f); /* Drop the dummy byte */ + fd = fde->fd; + QTAILQ_REMOVE(&f->fds, fde, entry); + g_free(fde); + } +out: + trace_qemu_file_get_fd(f->ioc->name, fd); + return fd; +} + /** Closes the file * * Returns negative error value if any error happened on previous operations or @@ -361,11 +431,17 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) */ int qemu_fclose(QEMUFile *f) { + FdEntry *fde, *next; int ret = qemu_fflush(f); int ret2 = qio_channel_close(f->ioc, NULL); if (ret >= 0) { ret = ret2; } + QTAILQ_FOREACH_SAFE(fde, &f->fds, entry, next) { + warn_report("qemu_fclose: received fd %d was never claimed", fde->fd); + close(fde->fd); + g_free(fde); + } g_clear_pointer(&f->ioc, object_unref); error_free(f->last_error_obj); g_free(f); @@ -485,8 +561,6 @@ void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, } stat64_add(&mig_stats.qemu_file_transferred, buflen); - - return; } diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 11c2120..f5b9f43 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -33,6 +33,8 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc); QEMUFile *qemu_file_new_output(QIOChannel *ioc); int qemu_fclose(QEMUFile *f); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(QEMUFile, qemu_fclose) + /* * qemu_file_transferred: * @@ -79,5 +81,7 @@ size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, off_t pos); QIOChannel *qemu_file_get_ioc(QEMUFile *file); +int qemu_file_put_fd(QEMUFile *f, int fd); +int qemu_file_get_fd(QEMUFile *f); #endif diff --git a/migration/ram.c b/migration/ram.c index edec1a2..2140785 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -48,19 +48,19 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qmp/qerror.h" #include "trace.h" 
-#include "exec/ram_addr.h" +#include "system/ram_addr.h" #include "exec/target_page.h" #include "qemu/rcu_queue.h" #include "migration/colo.h" -#include "sysemu/cpu-throttle.h" +#include "system/cpu-throttle.h" #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "rdma.h" #include "options.h" -#include "sysemu/dirtylimit.h" -#include "sysemu/kvm.h" +#include "system/dirtylimit.h" +#include "system/kvm.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -72,27 +72,6 @@ /* ram save/restore */ /* - * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it - * worked for pages that were filled with the same char. We switched - * it to only search for the zero value. And to avoid confusion with - * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. - * - * RAM_SAVE_FLAG_FULL was obsoleted in 2009. - * - * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1. - */ -#define RAM_SAVE_FLAG_FULL 0x01 -#define RAM_SAVE_FLAG_ZERO 0x02 -#define RAM_SAVE_FLAG_MEM_SIZE 0x04 -#define RAM_SAVE_FLAG_PAGE 0x08 -#define RAM_SAVE_FLAG_EOS 0x10 -#define RAM_SAVE_FLAG_CONTINUE 0x20 -#define RAM_SAVE_FLAG_XBZRLE 0x40 -/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ -#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 -/* We can't use any flag that is bigger than 0x200 */ - -/* * mapped-ram migration supports O_DIRECT, so we need to make sure the * userspace buffer, the IO operation size and the file offset are * aligned according to the underlying device's block size. The first @@ -112,6 +91,36 @@ XBZRLECacheStats xbzrle_counters; +/* + * This structure locates a specific location of a guest page. In QEMU, + * it's described in a tuple of (ramblock, offset). + */ +struct PageLocation { + RAMBlock *block; + unsigned long offset; +}; +typedef struct PageLocation PageLocation; + +/** + * PageLocationHint: describes a hint to a page location + * + * @valid set if the hint is vaild and to be consumed + * @location: the hint content + * + * In postcopy preempt mode, the urgent channel may provide hints to the + * background channel, so that QEMU source can try to migrate whatever is + * right after the requested urgent pages. + * + * This is based on the assumption that the VM (already running on the + * destination side) tends to access the memory with spatial locality. + * This is also the default behavior of vanilla postcopy (preempt off). + */ +struct PageLocationHint { + bool valid; + PageLocation location; +}; +typedef struct PageLocationHint PageLocationHint; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -216,7 +225,9 @@ static bool postcopy_preempt_active(void) bool migrate_ram_is_ignored(RAMBlock *block) { + MigMode mode = migrate_mode(); return !qemu_ram_is_migratable(block) || + mode == MIG_MODE_CPR_TRANSFER || (migrate_ignore_shared() && qemu_ram_is_shared(block) && qemu_ram_is_named_file(block)); } @@ -414,6 +425,13 @@ struct RAMState { * RAM migration. */ unsigned int postcopy_bmap_sync_requested; + /* + * Page hint during postcopy when preempt mode is on. Return path + * thread sets it, while background migration thread consumes it. + * + * Protected by @bitmap_mutex. 
+ */ + PageLocationHint page_hint; }; typedef struct RAMState RAMState; @@ -467,13 +485,6 @@ void ram_transferred_add(uint64_t bytes) } } -struct MigrationOps { - int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); -}; -typedef struct MigrationOps MigrationOps; - -MigrationOps *migration_ops; - static int ram_save_host_page_urgent(PageSearchStatus *pss); /* NOTE: page is the PFN not real ram_addr_t. */ @@ -820,14 +831,22 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, bool ret; /* - * Clear dirty bitmap if needed. This _must_ be called before we - * send any of the page in the chunk because we need to make sure - * we can capture further page content changes when we sync dirty - * log the next time. So as long as we are going to send any of - * the page in the chunk we clear the remote dirty bitmap for all. - * Clearing it earlier won't be a problem, but too late will. + * During the last stage (after source VM stopped), resetting the write + * protections isn't needed as we know there will be either (1) no + * further writes if migration will complete, or (2) migration fails + * at last then tracking isn't needed either. */ - migration_clear_memory_region_dirty_bitmap(rb, page); + if (!rs->last_stage) { + /* + * Clear dirty bitmap if needed. This _must_ be called before we + * send any of the page in the chunk because we need to make sure + * we can capture further page content changes when we sync dirty + * log the next time. So as long as we are going to send any of + * the page in the chunk we clear the remote dirty bitmap for all. + * Clearing it earlier won't be a problem, but too late will. + */ + migration_clear_memory_region_dirty_bitmap(rb, page); + } ret = test_and_clear_bit(page, rb->bmap); if (ret) { @@ -837,8 +856,8 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, return ret; } -static void dirty_bitmap_clear_section(MemoryRegionSection *section, - void *opaque) +static int dirty_bitmap_clear_section(MemoryRegionSection *section, + void *opaque) { const hwaddr offset = section->offset_within_region; const hwaddr size = int128_get64(section->size); @@ -857,6 +876,7 @@ static void dirty_bitmap_clear_section(MemoryRegionSection *section, } *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); bitmap_clear(rb->bmap, start, npages); + return 0; } /* @@ -1088,9 +1108,10 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage) } } -static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) +void migration_bitmap_sync_precopy(bool last_stage) { Error *local_err = NULL; + assert(ram_state); /* * The current notifier usage is just an optimization to migration, so we @@ -1101,7 +1122,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) local_err = NULL; } - migration_bitmap_sync(rs, last_stage); + migration_bitmap_sync(ram_state, last_stage); if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { error_report_err(local_err); @@ -1169,32 +1190,6 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, } /* - * @pages: the number of pages written by the control path, - * < 0 - error - * > 0 - number of pages written - * - * Return true if the pages has been saved, otherwise false is returned. 
- */ -static bool control_save_page(PageSearchStatus *pss, - ram_addr_t offset, int *pages) -{ - int ret; - - ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset, - TARGET_PAGE_SIZE); - if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { - return false; - } - - if (ret == RAM_SAVE_CONTROL_DELAYED) { - *pages = 1; - return true; - } - *pages = ret; - return true; -} - -/* * directly send the page to the stream * * Returns the number of pages written. @@ -1322,19 +1317,12 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->page = 0; pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { - if (migrate_multifd() && - (!migrate_multifd_flush_after_each_section() || - migrate_mapped_ram())) { + if (multifd_ram_sync_per_round()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(); + int ret = multifd_ram_flush_and_sync(f); if (ret < 0) { return ret; } - - if (!migrate_mapped_ram()) { - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - qemu_fflush(f); - } } /* Hit the end of the list */ @@ -1765,19 +1753,17 @@ bool ram_write_tracking_available(void) bool ram_write_tracking_compatible(void) { - assert(0); - return false; + g_assert_not_reached(); } int ram_write_tracking_start(void) { - assert(0); - return -1; + g_assert_not_reached(); } void ram_write_tracking_stop(void) { - assert(0); + g_assert_not_reached(); } #endif /* defined(__linux__) */ @@ -1795,7 +1781,7 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) { RAMBlock *block; ram_addr_t offset; - bool dirty; + bool dirty = false; do { block = unqueue_page(rs, &offset); @@ -1987,53 +1973,40 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, } /** - * ram_save_target_page_legacy: save one target page - * - * Returns the number of pages written + * ram_save_target_page: save one target page to the precopy thread + * OR to multifd workers. * * @rs: current RAM state * @pss: data about the page we want to send */ -static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) +static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) { ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; - if (control_save_page(pss, offset, &res)) { - return res; - } + /* Hand over to RDMA first */ + if (migrate_rdma()) { + res = rdma_control_save_page(pss->pss_channel, pss->block->offset, + offset, TARGET_PAGE_SIZE); - if (save_zero_page(rs, pss, offset)) { - return 1; + if (res == RAM_SAVE_CONTROL_DELAYED) { + res = 1; + } + return res; } - return ram_save_page(rs, pss); -} - -/** - * ram_save_target_page_multifd: send one target page to multifd workers - * - * Returns 1 if the page was queued, -1 otherwise. - * - * @rs: current RAM state - * @pss: data about the page we want to send - */ -static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) -{ - RAMBlock *block = pss->block; - ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; - - /* - * While using multifd live migration, we still need to handle zero - * page checking on the migration main thread. 
- */ - if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (!migrate_multifd() + || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { if (save_zero_page(rs, pss, offset)) { return 1; } } - return ram_save_multifd_page(block, offset); + if (migrate_multifd() && !migration_in_postcopy()) { + return ram_save_multifd_page(pss->block, offset); + } + + return ram_save_page(rs, pss); } /* Should be called before sending a host page */ @@ -2091,6 +2064,21 @@ static void pss_host_page_finish(PageSearchStatus *pss) pss->host_page_start = pss->host_page_end = 0; } +static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) +{ + PageLocationHint *hint = &rs->page_hint; + + /* If there's a pending hint not consumed, don't bother */ + if (hint->valid) { + return; + } + + /* Provide a hint to the background stream otherwise */ + hint->location.block = pss->block; + hint->location.offset = pss->page; + hint->valid = true; +} + /* * Send an urgent host page specified by `pss'. Need to be called with * bitmap_mutex held. @@ -2122,7 +2110,7 @@ static int ram_save_host_page_urgent(PageSearchStatus *pss) if (page_dirty) { /* Be strict to return code; it must be 1, or what else? */ - if (migration_ops->ram_save_target_page(rs, pss) != 1) { + if (ram_save_target_page(rs, pss) != 1) { error_report_once("%s: ram_save_target_page failed", __func__); ret = -1; goto out; @@ -2136,6 +2124,7 @@ out: /* For urgent requests, flush immediately if sent */ if (sent) { qemu_fflush(pss->pss_channel); + ram_page_hint_update(rs, pss); } return ret; } @@ -2191,7 +2180,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) if (preempt_active) { qemu_mutex_unlock(&rs->bitmap_mutex); } - tmppages = migration_ops->ram_save_target_page(rs, pss); + tmppages = ram_save_target_page(rs, pss); if (tmppages >= 0) { pages += tmppages; /* @@ -2223,6 +2212,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return (res < 0 ? 
res : pages); } +static bool ram_page_hint_valid(RAMState *rs) +{ + /* There's only page hint during postcopy preempt mode */ + if (!postcopy_preempt_active()) { + return false; + } + + return rs->page_hint.valid; +} + +static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, + unsigned long *page) +{ + PageLocationHint *hint = &rs->page_hint; + + assert(hint->valid); + + *block = hint->location.block; + *page = hint->location.offset; + + /* Mark the hint consumed */ + hint->valid = false; +} + /** * ram_find_and_save_block: finds a dirty page and sends it to f * @@ -2239,6 +2252,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) static int ram_find_and_save_block(RAMState *rs) { PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + unsigned long next_page; + RAMBlock *next_block; int pages = 0; /* No dirty page as there is zero RAM */ @@ -2258,7 +2273,14 @@ static int ram_find_and_save_block(RAMState *rs) rs->last_page = 0; } - pss_init(pss, rs->last_seen_block, rs->last_page); + if (ram_page_hint_valid(rs)) { + ram_page_hint_collect(rs, &next_block, &next_page); + } else { + next_block = rs->last_seen_block; + next_page = rs->last_page; + } + + pss_init(pss, next_block, next_page); while (true){ if (!get_queued_page(rs, pss)) { @@ -2387,9 +2409,15 @@ static void ram_save_cleanup(void *opaque) ram_bitmaps_destroy(); xbzrle_cleanup(); + multifd_ram_save_cleanup(); ram_state_cleanup(rsp); - g_free(migration_ops); - migration_ops = NULL; +} + +static void ram_page_hint_reset(PageLocationHint *hint) +{ + hint->location.block = NULL; + hint->location.offset = 0; + hint->valid = false; } static void ram_state_reset(RAMState *rs) @@ -2404,6 +2432,8 @@ static void ram_state_reset(RAMState *rs) rs->last_page = 0; rs->last_version = ram_list.version; rs->xbzrle_started = false; + + ram_page_hint_reset(&rs->page_hint); } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -2783,7 +2813,7 @@ static bool ram_init_bitmaps(RAMState *rs, Error **errp) if (!ret) { goto out_unlock; } - migration_bitmap_sync_precopy(rs, false); + migration_bitmap_sync_precopy(false); } } out_unlock: @@ -2860,7 +2890,7 @@ void qemu_guest_free_page_hint(void *addr, size_t len) size_t used_len, start, npages; /* This function is currently expected to be used during live migration */ - if (!migration_is_setup_or_active()) { + if (!migration_is_running()) { return; } @@ -3055,27 +3085,43 @@ static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) return ret; } - migration_ops = g_malloc0(sizeof(MigrationOps)); - if (migrate_multifd()) { - migration_ops->ram_save_target_page = ram_save_target_page_multifd; - } else { - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + multifd_ram_save_setup(); } + /* + * This operation is unfortunate.. + * + * For legacy QEMUs using per-section sync + * ======================================= + * + * This must exist because the EOS below requires the SYNC messages + * per-channel to work. + * + * For modern QEMUs using per-round sync + * ===================================== + * + * Logically such sync is not needed, and recv threads should not run + * until setup ready (using things like channels_ready on src). Then + * we should be all fine. + * + * However even if we add channels_ready to recv side in new QEMUs, old + * QEMU won't have them so this sync will still be needed to make sure + * multifd recv threads won't start processing guest pages early before + * ram_load_setup() is properly done. + * + * Let's stick with this. 
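+ *
+ * (Sketched from the rest of this series, the call below boils down
+ * to: flush any pages still queued to the channels, run
+ * multifd_send_sync_main(MULTIFD_SYNC_ALL), and pair that with the
+ * RAM_SAVE_FLAG_MULTIFD_FLUSH, or per-section EOS, on the main
+ * channel.)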
Fortunately the overhead is low to sync + * during setup because the VM is running, so at least it's not + * accounted as part of downtime. + */ bql_unlock(); - ret = multifd_send_sync_main(); + ret = multifd_ram_flush_and_sync(f); bql_lock(); if (ret < 0) { error_setg(errp, "%s: multifd synchronization failed", __func__); return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section() - && !migrate_mapped_ram()) { - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - } - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ret = qemu_fflush(f); if (ret < 0) { @@ -3207,11 +3253,9 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) } out: - if (ret >= 0 - && migration_is_setup_or_active()) { - if (migrate_multifd() && migrate_multifd_flush_after_each_section() && - !migrate_mapped_ram()) { - ret = multifd_send_sync_main(); + if (ret >= 0 && migration_is_running()) { + if (multifd_ram_sync_per_section()) { + ret = multifd_ram_flush_and_sync(f); if (ret < 0) { return ret; } @@ -3248,7 +3292,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) WITH_RCU_READ_LOCK_GUARD() { if (!migration_in_postcopy()) { - migration_bitmap_sync_precopy(rs, true); + migration_bitmap_sync_precopy(true); } ret = rdma_registration_start(f, RAM_CONTROL_FINISH); @@ -3283,9 +3327,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - ret = multifd_send_sync_main(); - if (ret < 0) { - return ret; + if (multifd_ram_sync_per_section()) { + /* + * Only the old dest QEMU will need this sync, because each EOS + * will require one SYNC message on each channel. + */ + ret = multifd_ram_flush_and_sync(f); + if (ret < 0) { + return ret; + } } if (migrate_mapped_ram()) { @@ -3330,7 +3380,7 @@ static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, if (!migration_in_postcopy()) { bql_lock(); WITH_RCU_READ_LOCK_GUARD() { - migration_bitmap_sync_precopy(rs, false); + migration_bitmap_sync_precopy(false); } bql_unlock(); } @@ -3631,7 +3681,9 @@ static int ram_load_cleanup(void *opaque) RAMBlock *rb; RAMBLOCK_FOREACH_NOT_IGNORED(rb) { - qemu_ram_block_writeback(rb); + if (memory_region_is_nonvolatile(rb->mr)) { + qemu_ram_block_writeback(rb); + } } xbzrle_load_cleanup(); @@ -3796,15 +3848,7 @@ int ram_load_postcopy(QEMUFile *f, int channel) TARGET_PAGE_SIZE); } break; - case RAM_SAVE_FLAG_MULTIFD_FLUSH: - multifd_recv_sync_main(); - break; case RAM_SAVE_FLAG_EOS: - /* normal exit */ - if (migrate_multifd() && - migrate_multifd_flush_after_each_section()) { - multifd_recv_sync_main(); - } break; default: error_report("Unknown combination of migration flags: 0x%x" @@ -4004,8 +4048,6 @@ static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, /* Skip pages array */ qemu_set_offset(f, block->pages_offset + length, SEEK_SET); - - return; } static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) @@ -4294,6 +4336,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) * it will be necessary to reduce the granularity of this * critical section. */ + trace_ram_load_start(); WITH_RCU_READ_LOCK_GUARD() { if (postcopy_running) { /* @@ -4460,6 +4503,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) return 0; } +static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp) +{ + int ret; + + if (migrate_multifd()) { + /* + * When multifd is enabled, source QEMU needs to make sure all the + * pages queued before postcopy starts have been flushed. + * + * The load of these pages must happen before switching to postcopy. 
+ * It's because loading of guest pages (so far) in multifd recv + * threads is still non-atomic, so the load cannot happen with vCPUs + * running on the destination side. + * + * This flush and sync will guarantee that those pages are loaded + * _before_ postcopy starts on the destination. The rationale is, + * this happens before VM stops (and before source QEMU sends all + * the rest of the postcopy messages). So when the destination QEMU + * receives the postcopy messages, it must have received the sync + * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH, + * or RAM_SAVE_FLAG_EOS), and such a message would guarantee that + * all previous guest pages queued in the multifd channels are + * completely loaded. + */ + ret = multifd_ram_flush_and_sync(f); + if (ret < 0) { + error_setg(errp, "%s: multifd flush and sync failed", __func__); + return false; + } + } + + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + + return true; +} + void postcopy_preempt_shutdown_file(MigrationState *s) { qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); @@ -4479,6 +4558,7 @@ static SaveVMHandlers savevm_ram_handlers = { .load_setup = ram_load_setup, .load_cleanup = ram_load_cleanup, .resume_prepare = ram_resume_prepare, + .save_postcopy_prepare = ram_save_postcopy_prepare, }; static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, @@ -4498,7 +4578,7 @@ static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, return; } - if (!migration_is_idle()) { + if (migration_is_running()) { /* * Precopy code on the source cannot deal with the size of RAM blocks * changing at random points in time - especially after sending the * Abort and indicate a proper reason. */ error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); - migration_cancel(err); + migrate_set_error(migrate_get_current(), err); error_free(err); + + migration_cancel(); } switch (ps) { diff --git a/migration/ram.h b/migration/ram.h index bc0318b..921c39a 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -33,6 +33,34 @@ #include "exec/cpu-common.h" #include "io/channel.h" +/* + * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it + * worked for pages that were filled with the same char. We switched + * it to only search for the zero value. And to avoid confusion with + * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. + * + * RAM_SAVE_FLAG_FULL (0x01) was obsoleted in 2009. + * + * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1. + * + * RAM_SAVE_FLAG_HOOK is only used in RDMA. Whenever this is found in the + * data stream, the flags will be passed to rdma functions in the + * incoming-migration side. + * + * We can't use any flag that is bigger than 0x200, because the flags are + * always assumed to be encoded in a ramblock address offset, which is + * multiple of PAGE_SIZE. Here it means QEMU supports migration with any + * architecture that has PAGE_SIZE>=1K (0x400). + */ +#define RAM_SAVE_FLAG_ZERO 0x002 +#define RAM_SAVE_FLAG_MEM_SIZE 0x004 +#define RAM_SAVE_FLAG_PAGE 0x008 +#define RAM_SAVE_FLAG_EOS 0x010 +#define RAM_SAVE_FLAG_CONTINUE 0x020 +#define RAM_SAVE_FLAG_XBZRLE 0x040 +#define RAM_SAVE_FLAG_HOOK 0x080 +#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 + extern XBZRLECacheStats xbzrle_counters; /* Should be holding either ram_list.mutex, or the RCU lock. */
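The ram.h comment above spells out why no flag above 0x200 can be used: flags travel in the same 64-bit value as a page-aligned RAMBlock offset, so they must fit below the smallest page size any supported target uses (1 KiB). A minimal sketch of that pack/unpack arithmetic, assuming a 4 KiB page for illustration (the names here are hypothetical, not QEMU API):

#include <assert.h>
#include <stdint.h>

#define MIG_PAGE_SIZE 4096ULL                   /* assumed target page size */
#define MIG_PAGE_MASK (~(MIG_PAGE_SIZE - 1))

/* Pack a page-aligned RAMBlock offset and a flag set into one word. */
static uint64_t pack_page_addr(uint64_t offset, uint64_t flags)
{
    assert((offset & ~MIG_PAGE_MASK) == 0);     /* offset is page aligned */
    assert((flags & MIG_PAGE_MASK) == 0);       /* flags fit below page size */
    return offset | flags;
}

/* Split the combined word again on the receiving side. */
static void unpack_page_addr(uint64_t addr, uint64_t *offset, uint64_t *flags)
{
    *offset = addr & MIG_PAGE_MASK;
    *flags = addr & ~MIG_PAGE_MASK;
}

Masking with the page mask recovers the offset; everything below it is flag space, which is why RAM_SAVE_FLAG_MULTIFD_FLUSH (0x200) is the last bit usable on a 1 KiB-page target.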
@@ -44,6 +72,7 @@ extern XBZRLECacheStats xbzrle_counters; INTERNAL_RAMBLOCK_FOREACH(block) \ if (!qemu_ram_is_migratable(block)) {} else +void ram_mig_init(void); int xbzrle_cache_resize(uint64_t new_size, Error **errp); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_total(void); diff --git a/migration/rdma.c b/migration/rdma.c index 855753c..2d839fc 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -30,7 +30,7 @@ #include "qemu/sockets.h" #include "qemu/bitmap.h" #include "qemu/coroutine.h" -#include "exec/memory.h" +#include "system/memory.h" #include <sys/socket.h> #include <netdb.h> #include <arpa/inet.h> @@ -768,156 +768,12 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) } /* - * As of now, IPv6 over RoCE / iWARP is not supported by linux. - * We will try the next addrinfo struct, and fail if there are - * no other valid addresses to bind against. - * - * If user is listening on '[::]', then we will not have a opened a device - * yet and have no way of verifying if the device is RoCE or not. - * - * In this case, the source VM will throw an error for ALL types of - * connections (both IPv4 and IPv6) if the destination machine does not have - * a regular infiniband network available for use. - * - * The only way to guarantee that an error is thrown for broken kernels is - * for the management software to choose a *specific* interface at bind time - * and validate what time of hardware it is. - * - * Unfortunately, this puts the user in a fix: - * - * If the source VM connects with an IPv4 address without knowing that the - * destination has bound to '[::]' the migration will unconditionally fail - * unless the management software is explicitly listening on the IPv4 - * address while using a RoCE-based device. - * - * If the source VM connects with an IPv6 address, then we're OK because we can - * throw an error on the source (and similarly on the destination). - * - * But in mixed environments, this will be broken for a while until it is fixed - * inside linux. - * - * We do provide a *tiny* bit of help in this function: We can list all of the - * devices in the system and check to see if all the devices are RoCE or - * Infiniband. - * - * If we detect that we have a *pure* RoCE environment, then we can safely - * thrown an error even if the management software has specified '[::]' as the - * bind address. - * - * However, if there is are multiple hetergeneous devices, then we cannot make - * this assumption and the user just has to be sure they know what they are - * doing. - * - * Patches are being reviewed on linux-rdma. - */ -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) -{ - /* This bug only exists in linux, to our knowledge. */ -#ifdef CONFIG_LINUX - struct ibv_port_attr port_attr; - - /* - * Verbs are only NULL if management has bound to '[::]'. - * - * Let's iterate through all the devices and see if there any pure IB - * devices (non-ethernet). - * - * If not, then we can safely proceed with the migration. - * Otherwise, there are no guarantees until the bug is fixed in linux. - */ - if (!verbs) { - int num_devices; - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); - bool roce_found = false; - bool ib_found = false; - - for (int x = 0; x < num_devices; x++) { - verbs = ibv_open_device(dev_list[x]); - /* - * ibv_open_device() is not documented to set errno. If - * it does, it's somebody else's doc bug. If it doesn't, - * the use of errno below is wrong.
- * TODO Find out whether ibv_open_device() sets errno. - */ - if (!verbs) { - if (errno == EPERM) { - continue; - } else { - error_setg_errno(errp, errno, - "could not open RDMA device context"); - return -1; - } - } - - if (ibv_query_port(verbs, 1, &port_attr)) { - ibv_close_device(verbs); - error_setg(errp, - "RDMA ERROR: Could not query initial IB port"); - return -1; - } - - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { - ib_found = true; - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - roce_found = true; - } - - ibv_close_device(verbs); - - } - - if (roce_found) { - if (ib_found) { - warn_report("migrations may fail:" - " IPv6 over RoCE / iWARP in linux" - " is broken. But since you appear to have a" - " mixed RoCE / IB environment, be sure to only" - " migrate over the IB fabric until the kernel " - " fixes the bug."); - } else { - error_setg(errp, "RDMA ERROR: " - "You only have RoCE / iWARP devices in your systems" - " and your management software has specified '[::]'" - ", but IPv6 over RoCE / iWARP is not supported in Linux."); - return -1; - } - } - - return 0; - } - - /* - * If we have a verbs context, that means that some other than '[::]' was - * used by the management software for binding. In which case we can - * actually warn the user about a potentially broken kernel. - */ - - /* IB ports start with 1, not 0 */ - if (ibv_query_port(verbs, 1, &port_attr)) { - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); - return -1; - } - - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - error_setg(errp, "RDMA ERROR: " - "Linux kernel's RoCE / iWARP does not support IPv6 " - "(but patches on linux-rdma in progress)"); - return -1; - } - -#endif - - return 0; -} - -/* * Figure out which RDMA device corresponds to the requested IP hostname * Also create the initial connection manager identifiers for opening * the connection. */ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) { - Error *err = NULL; int ret; struct rdma_addrinfo *res; char port_str[16]; @@ -953,9 +809,8 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) goto err_resolve_get_addr; } - /* Try all addresses, saving the first error in @err */ + /* Try all addresses, exit loop on first success of resolving address */ for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) { - Error **local_errp = err ? 
NULL : &err; inet_ntop(e->ai_family, &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); @@ -964,25 +819,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, RDMA_RESOLVE_TIMEOUT_MS); if (ret >= 0) { - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, - local_errp); - if (ret < 0) { - continue; - } - } - error_free(err); goto route; } } rdma_freeaddrinfo(res); - if (err) { - error_propagate(errp, err); - } else { - error_setg(errp, "RDMA ERROR: could not resolve address %s", - rdma->host); - } + error_setg(errp, "RDMA ERROR: could not resolve address %s", rdma->host); goto err_resolve_get_addr; route: @@ -2611,7 +2453,6 @@ err_rdma_source_connect: static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) { - Error *err = NULL; int ret; struct rdma_cm_id *listen_id; char ip[40] = "unknown"; @@ -2661,9 +2502,8 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) goto err_dest_init_bind_addr; } - /* Try all addresses, saving the first error in @err */ + /* Try all addresses */ for (e = res; e != NULL; e = e->ai_next) { - Error **local_errp = err ? NULL : &err; inet_ntop(e->ai_family, &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); @@ -2672,24 +2512,12 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) if (ret < 0) { continue; } - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, - local_errp); - if (ret < 0) { - continue; - } - } - error_free(err); break; } rdma_freeaddrinfo(res); if (!e) { - if (err) { - error_propagate(errp, err); - } else { - error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!"); - } + error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!"); goto err_dest_init_bind_addr; } @@ -3284,14 +3112,11 @@ err: int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size) { - if (!migrate_rdma() || migration_in_postcopy()) { - return RAM_SAVE_CONTROL_NOT_SUPP; - } + assert(migrate_rdma()); int ret = qemu_rdma_save_page(f, block_offset, offset, size); - if (ret != RAM_SAVE_CONTROL_DELAYED && - ret != RAM_SAVE_CONTROL_NOT_SUPP) { + if (ret != RAM_SAVE_CONTROL_DELAYED) { if (ret < 0) { qemu_file_set_error(f, ret); } @@ -3829,7 +3654,7 @@ int rdma_block_notification_handle(QEMUFile *f, const char *name) int rdma_registration_start(QEMUFile *f, uint64_t flags) { - if (!migrate_rdma() || migration_in_postcopy()) { + if (!migrate_rdma()) { return 0; } @@ -3861,7 +3686,7 @@ int rdma_registration_stop(QEMUFile *f, uint64_t flags) RDMAControlHeader head = { .len = 0, .repeat = 1 }; int ret; - if (!migrate_rdma() || migration_in_postcopy()) { + if (!migrate_rdma()) { return 0; } @@ -3985,7 +3810,7 @@ static void qio_channel_rdma_finalize(Object *obj) } static void qio_channel_rdma_class_init(ObjectClass *klass, - void *class_data G_GNUC_UNUSED) + const void *class_data G_GNUC_UNUSED) { QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); @@ -4174,7 +3999,7 @@ void rdma_start_outgoing_migration(void *opaque, s->to_dst_file = rdma_new_output(rdma); s->rdma_migration = true; - migrate_fd_connect(s, NULL); + migration_connect(s, NULL); return; return_path_err: qemu_rdma_cleanup(rdma); diff --git a/migration/rdma.h b/migration/rdma.h index a8d27f3..f74f16a 100644 --- a/migration/rdma.h +++ b/migration/rdma.h @@ -19,7 +19,7 @@ #ifndef QEMU_MIGRATION_RDMA_H #define QEMU_MIGRATION_RDMA_H -#include "exec/memory.h" +#include 
"system/memory.h" void rdma_start_outgoing_migration(void *opaque, InetSocketAddress *host_port, Error **errp); @@ -33,14 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp); #define RAM_CONTROL_ROUND 1 #define RAM_CONTROL_FINISH 3 -/* - * Whenever this is found in the data stream, the flags - * will be passed to rdma functions in the incoming-migration - * side. - */ -#define RAM_SAVE_FLAG_HOOK 0x80 - -#define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 #ifdef CONFIG_RDMA @@ -63,7 +55,7 @@ static inline int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size) { - return RAM_SAVE_CONTROL_NOT_SUPP; + g_assert_not_reached(); } #endif #endif diff --git a/migration/savevm.c b/migration/savevm.c index deb5783..bb04a45 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -37,6 +37,7 @@ #include "migration/register.h" #include "migration/global_state.h" #include "migration/channel-block.h" +#include "multifd.h" #include "ram.h" #include "qemu-file.h" #include "savevm.h" @@ -46,27 +47,29 @@ #include "qapi/clone-visitor.h" #include "qapi/qapi-builtin-visit.h" #include "qemu/error-report.h" -#include "sysemu/cpus.h" -#include "exec/memory.h" +#include "system/cpus.h" +#include "system/memory.h" #include "exec/target_page.h" +#include "exec/page-vary.h" #include "trace.h" #include "qemu/iov.h" #include "qemu/job.h" #include "qemu/main-loop.h" #include "block/snapshot.h" +#include "block/thread-pool.h" #include "qemu/cutils.h" #include "io/channel-buffer.h" #include "io/channel-file.h" -#include "sysemu/replay.h" -#include "sysemu/runstate.h" -#include "sysemu/sysemu.h" -#include "sysemu/xen.h" +#include "system/replay.h" +#include "system/runstate.h" +#include "system/system.h" +#include "system/xen.h" #include "migration/colo.h" #include "qemu/bitmap.h" #include "net/announce.h" #include "qemu/yank.h" #include "yank_functions.h" -#include "sysemu/qtest.h" +#include "system/qtest.h" #include "options.h" const unsigned int postcopy_ram_discard_version; @@ -90,6 +93,7 @@ enum qemu_vm_cmd { MIG_CMD_ENABLE_COLO, /* Enable COLO */ MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ + MIG_CMD_SWITCHOVER_START, /* Switchover start notification */ MIG_CMD_MAX }; @@ -109,6 +113,7 @@ static struct mig_cmd_args { [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_CMD_SWITCHOVER_START] = { .len = 0, .name = "SWITCHOVER_START" }, [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, }; @@ -130,6 +135,35 @@ static struct mig_cmd_args { */ /***********************************************************/ +/* Optional load threads pool support */ + +static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis) +{ + assert(!mis->load_threads); + mis->load_threads = thread_pool_new(); + mis->load_threads_abort = false; +} + +static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis) +{ + qatomic_set(&mis->load_threads_abort, true); + + bql_unlock(); /* Load threads might be waiting for BQL */ + g_clear_pointer(&mis->load_threads, thread_pool_free); + bql_lock(); +} + +static bool qemu_loadvm_thread_pool_wait(MigrationState *s, + MigrationIncomingState *mis) +{ + bql_unlock(); /* Let load threads do work requiring BQL */ + thread_pool_wait(mis->load_threads); + bql_lock(); + + return 
!migrate_has_error(s); +} + +/***********************************************************/ /* savevm/loadvm support */ static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable) @@ -232,7 +266,7 @@ typedef struct SaveState { static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), - .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL }, + .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL }, .global_section_id = 0, }; @@ -306,7 +340,7 @@ static int configuration_pre_load(void *opaque) * predates the variable-target-page-bits support and is using the * minimum possible value for this CPU. */ - state->target_page_bits = qemu_target_page_bits_min(); + state->target_page_bits = migration_legacy_page_bits(); return 0; } @@ -429,8 +463,7 @@ static const VMStateInfo vmstate_info_capability = { */ static bool vmstate_target_page_bits_needed(void *opaque) { - return qemu_target_page_bits() - > qemu_target_page_bits_min(); + return qemu_target_page_bits() > migration_legacy_page_bits(); } static const VMStateDescription vmstate_target_page_bits = { @@ -704,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr) static inline MigrationPriority save_state_priority(SaveStateEntry *se) { - if (se->vmsd) { + if (se->vmsd && se->vmsd->priority) { return se->vmsd->priority; } return MIG_PRI_DEFAULT; @@ -860,23 +893,6 @@ static void vmstate_check(const VMStateDescription *vmsd) } } -/* - * See comment in hw/intc/xics.c:icp_realize() - * - * This function can be removed when - * pre_2_10_vmstate_register_dummy_icp() is removed. - */ -int vmstate_replace_hack_for_ppc(VMStateIf *obj, int instance_id, - const VMStateDescription *vmsd, - void *opaque) -{ - SaveStateEntry *se = find_se(vmsd->name, instance_id); - - if (se) { - savevm_state_handler_remove(se); - } - return vmstate_register(obj, instance_id, vmsd, opaque); -} int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, const VMStateDescription *vmsd, @@ -1218,6 +1234,19 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); } +static void qemu_savevm_send_switchover_start(QEMUFile *f) +{ + trace_savevm_send_switchover_start(); + qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL); +} + +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f) +{ + if (migrate_send_switchover_start()) { + qemu_savevm_send_switchover_start(f); + } +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1248,8 +1277,7 @@ void qemu_savevm_non_migratable_list(strList **reasons) void qemu_savevm_state_header(QEMUFile *f) { MigrationState *s = migrate_get_current(); - - s->vmdesc = json_writer_new(false); + JSONWriter *vmdesc = s->vmdesc; trace_savevm_state_header(); qemu_put_be32(f, QEMU_VM_FILE_MAGIC); @@ -1258,16 +1286,21 @@ void qemu_savevm_state_header(QEMUFile *f) if (s->send_configuration) { qemu_put_byte(f, QEMU_VM_CONFIGURATION); - /* - * This starts the main json object and is paired with the - * json_writer_end_object in - * qemu_savevm_state_complete_precopy_non_iterable - */ - json_writer_start_object(s->vmdesc, NULL); + if (vmdesc) { + /* + * This starts the main json object and is paired with the + * json_writer_end_object in + * qemu_savevm_state_complete_precopy_non_iterable + */ + json_writer_start_object(vmdesc, NULL); + json_writer_start_object(vmdesc, "configuration"); + } - json_writer_start_object(s->vmdesc, "configuration"); - 
vmstate_save_state(f, &vmstate_configuration, &savevm_state, s->vmdesc); - json_writer_end_object(s->vmdesc); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, vmdesc); + + if (vmdesc) { + json_writer_end_object(vmdesc); + } } } @@ -1313,16 +1346,19 @@ int qemu_savevm_state_setup(QEMUFile *f, Error **errp) { ERRP_GUARD(); MigrationState *ms = migrate_get_current(); + JSONWriter *vmdesc = ms->vmdesc; SaveStateEntry *se; int ret = 0; - json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size()); - json_writer_start_array(ms->vmdesc, "devices"); + if (vmdesc) { + json_writer_int64(vmdesc, "page_size", qemu_target_page_size()); + json_writer_start_array(vmdesc, "devices"); + } trace_savevm_state_setup(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (se->vmsd && se->vmsd->early_setup) { - ret = vmstate_save(f, se, ms->vmdesc, errp); + ret = vmstate_save(f, se, vmdesc, errp); if (ret) { migrate_set_error(ms, *errp); qemu_file_set_error(f, ret); @@ -1441,11 +1477,11 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) return all_finished; } -static bool should_send_vmdesc(void) +bool should_send_vmdesc(void) { MachineState *machine = MACHINE(qdev_get_machine()); - bool in_postcopy = migration_in_postcopy(); - return !machine->suppress_vmdesc && !in_postcopy; + + return !machine->suppress_vmdesc; } /* @@ -1487,12 +1523,62 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f) qemu_fflush(f); } -static +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp) +{ + SaveStateEntry *se; + bool ret; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->save_postcopy_prepare) { + continue; + } + + if (se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + + trace_savevm_section_start(se->idstr, se->section_id); + + save_section_header(f, se, QEMU_VM_SECTION_PART); + ret = se->ops->save_postcopy_prepare(f, se->opaque, errp); + save_section_footer(f, se); + + trace_savevm_section_end(se->idstr, se->section_id, ret); + + if (!ret) { + assert(*errp); + return false; + } + } + + return true; +} + int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) { int64_t start_ts_each, end_ts_each; SaveStateEntry *se; int ret; + bool multifd_device_state = multifd_device_state_supported(); + + if (multifd_device_state) { + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + SaveLiveCompletePrecopyThreadHandler hdlr; + + if (!se->ops || (in_postcopy && se->ops->has_postcopy && + se->ops->has_postcopy(se->opaque)) || + !se->ops->save_live_complete_precopy_thread) { + continue; + } + + hdlr = se->ops->save_live_complete_precopy_thread; + multifd_spawn_device_state_save_thread(hdlr, + se->idstr, se->instance_id, + se->opaque); + } + } QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || @@ -1518,21 +1604,39 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) save_section_footer(f, se); if (ret < 0) { qemu_file_set_error(f, ret); - return -1; + goto ret_fail_abort_threads; } end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, end_ts_each - start_ts_each); } + if (multifd_device_state) { + if (migrate_has_error(migrate_get_current())) { + multifd_abort_device_state_save_threads(); + } + + if (!multifd_join_device_state_save_threads()) { + qemu_file_set_error(f, -EINVAL); + return -1; + } + } + trace_vmstate_downtime_checkpoint("src-iterable-saved"); return 0; + 
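The hunk above makes qemu_savevm_state_complete_precopy_iterable() fan device-state saving out to worker threads: save_live_complete_precopy_thread handlers are spawned via multifd_spawn_device_state_save_thread() before the usual per-device loop, and the ret_fail_abort_threads error path that follows aborts and joins them before failing. A compact sketch of that spawn/abort/join shape, using plain GLib threads and hypothetical hook names rather than QEMU's multifd helpers:

#include <glib.h>

/* Hypothetical per-device save hook: returns 0 on success. */
typedef int (*SaveHook)(void *opaque, volatile gboolean *abort_req);

typedef struct {
    SaveHook hook;
    void *opaque;
} SaveJob;

static volatile gboolean abort_req;     /* set on the first failure */

static gpointer save_worker(gpointer data)
{
    SaveJob *job = data;

    /* Skip the work if another worker already failed */
    if (!abort_req && job->hook(job->opaque, &abort_req) < 0) {
        abort_req = TRUE;               /* ask the remaining workers to bail out */
    }
    return NULL;
}

/* Spawn one worker per device, then join them all before deciding. */
static int save_devices_parallel(SaveJob *jobs, unsigned n)
{
    GThread **threads = g_new0(GThread *, n);

    for (unsigned i = 0; i < n; i++) {
        threads[i] = g_thread_new("dev-save", save_worker, &jobs[i]);
    }
    for (unsigned i = 0; i < n; i++) {
        g_thread_join(threads[i]);      /* always join, even after an abort */
    }
    g_free(threads);
    return abort_req ? -1 : 0;
}

The important property, mirrored in the error path below, is that the threads are always joined, whether or not an abort was requested, so no worker outlives the migration stream it writes to.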
+ret_fail_abort_threads: + if (multifd_device_state) { + multifd_abort_device_state_save_threads(); + multifd_join_device_state_save_threads(); + } + + return -1; } int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, - bool in_postcopy, - bool inactivate_disks) + bool in_postcopy) { MigrationState *ms = migrate_get_current(); int64_t start_ts_each, end_ts_each; @@ -1542,6 +1646,9 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, Error *local_err = NULL; int ret; + /* Making sure cpu states are synchronized before saving non-iterable */ + cpu_synchronize_all_states(); + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (se->vmsd && se->vmsd->early_setup) { /* Already saved during qemu_savevm_state_setup(). */ @@ -1563,76 +1670,42 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, end_ts_each - start_ts_each); } - if (inactivate_disks) { - /* Inactivate before sending QEMU_VM_EOF so that the - * bdrv_activate_all() on the other end won't fail. */ - ret = bdrv_inactivate_all(); - if (ret) { - error_setg(&local_err, "%s: bdrv_inactivate_all() failed (%d)", - __func__, ret); - migrate_set_error(ms, local_err); - error_report_err(local_err); - qemu_file_set_error(f, ret); - return ret; - } - } if (!in_postcopy) { /* Postcopy stream will still be going */ qemu_put_byte(f, QEMU_VM_EOF); - } - json_writer_end_array(vmdesc); - json_writer_end_object(vmdesc); - vmdesc_len = strlen(json_writer_get(vmdesc)); + if (vmdesc) { + json_writer_end_array(vmdesc); + json_writer_end_object(vmdesc); + vmdesc_len = strlen(json_writer_get(vmdesc)); - if (should_send_vmdesc()) { - qemu_put_byte(f, QEMU_VM_VMDESCRIPTION); - qemu_put_be32(f, vmdesc_len); - qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len); + qemu_put_byte(f, QEMU_VM_VMDESCRIPTION); + qemu_put_be32(f, vmdesc_len); + qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len); + } } - /* Free it now to detect any inconsistencies. 
*/ - json_writer_free(vmdesc); - ms->vmdesc = NULL; - trace_vmstate_downtime_checkpoint("src-non-iterable-saved"); return 0; } -int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, - bool inactivate_disks) +int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) { int ret; - Error *local_err = NULL; - bool in_postcopy = migration_in_postcopy(); - if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) { - error_report_err(local_err); + ret = qemu_savevm_state_complete_precopy_iterable(f, false); + if (ret) { + return ret; } - trace_savevm_state_complete_precopy(); - - cpu_synchronize_all_states(); - - if (!in_postcopy || iterable_only) { - ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy); + if (!iterable_only) { + ret = qemu_savevm_state_complete_precopy_non_iterable(f, false); if (ret) { return ret; } } - if (iterable_only) { - goto flush; - } - - ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy, - inactivate_disks); - if (ret) { - return ret; - } - -flush: return qemu_fflush(f); } @@ -1730,7 +1803,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) ret = qemu_file_get_error(f); if (ret == 0) { - qemu_savevm_state_complete_precopy(f, false, false); + qemu_savevm_maybe_send_switchover_start(f); + qemu_savevm_state_complete_precopy(f, false); ret = qemu_file_get_error(f); } if (ret != 0) { @@ -1756,7 +1830,7 @@ cleanup: void qemu_savevm_live_state(QEMUFile *f) { /* save QEMU_VM_SECTION_END section */ - qemu_savevm_state_complete_precopy(f, true, false); + qemu_savevm_state_complete_precopy(f, true); qemu_put_byte(f, QEMU_VM_EOF); } @@ -2004,7 +2078,7 @@ static void *postcopy_ram_listen_thread(void *opaque) migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); trace_postcopy_ram_listen_thread_start(); rcu_register_thread(); @@ -2013,6 +2087,8 @@ static void *postcopy_ram_listen_thread(void *opaque) * in qemu_file, and thus we must be blocking now. */ qemu_file_set_blocking(f, true); + + /* TODO: sanity check that only postcopiable data will be loaded here */ load_res = qemu_loadvm_state_main(f, mis); /* @@ -2073,8 +2149,9 @@ static void *postcopy_ram_listen_thread(void *opaque) * (If something broke then qemu will have to exit anyway since it's * got a bad migration state). */ + bql_lock(); migration_incoming_state_destroy(); - qemu_loadvm_state_cleanup(); + bql_unlock(); rcu_unregister_thread(); mis->have_listen_thread = false; @@ -2129,7 +2206,8 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) } mis->have_listen_thread = true; - postcopy_thread_create(mis, &mis->listen_thread, "mig/dst/listen", + postcopy_thread_create(mis, &mis->listen_thread, + MIGRATION_THREAD_DST_LISTEN, postcopy_ram_listen_thread, QEMU_THREAD_DETACHED); trace_loadvm_postcopy_handle_listen("return"); @@ -2138,7 +2216,6 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) static void loadvm_postcopy_handle_run_bh(void *opaque) { - Error *local_err = NULL; MigrationIncomingState *mis = opaque; trace_vmstate_downtime_checkpoint("dst-postcopy-bh-enter"); @@ -2154,22 +2231,20 @@ static void loadvm_postcopy_handle_run_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-postcopy-bh-announced"); - /* Make sure all file formats throw away their mutable metadata. - * If we get an error here, just don't restart the VM yet. 
*/ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - local_err = NULL; - autostart = false; - } - - trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated"); - dirty_bitmap_mig_before_vm_start(); if (autostart) { - /* Hold onto your hats, starting the CPU */ - vm_start(); + /* + * Make sure all file formats throw away their mutable metadata. + * If we get an error here, just don't restart the VM yet. + */ + bool success = migration_block_activate(NULL); + + trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated"); + + if (success) { + vm_start(); + } } else { /* leave it paused and let management decide when to start the CPU */ runstate_set(RUN_STATE_PAUSED); } @@ -2429,6 +2504,26 @@ static int loadvm_process_enable_colo(MigrationIncomingState *mis) return ret; } +static int loadvm_postcopy_handle_switchover_start(void) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + int ret; + + if (!se->ops || !se->ops->switchover_start) { + continue; + } + + ret = se->ops->switchover_start(se->opaque); + if (ret < 0) { + return ret; + } + } + + return 0; +} + /* * Process an incoming 'QEMU_VM_COMMAND' * 0 just a normal return @@ -2527,6 +2622,9 @@ static int loadvm_process_command(QEMUFile *f) case MIG_CMD_ENABLE_COLO: return loadvm_process_enable_colo(mis); + + case MIG_CMD_SWITCHOVER_START: + return loadvm_postcopy_handle_switchover_start(); } return 0; @@ -2576,8 +2674,7 @@ static bool check_section_footer(QEMUFile *f, SaveStateEntry *se) } static int -qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, - uint8_t type) +qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type) { bool trace_downtime = (type == QEMU_VM_SECTION_FULL); uint32_t instance_id, version_id, section_id; @@ -2655,8 +2752,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, } static int -qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, - uint8_t type) +qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) { bool trace_downtime = (type == QEMU_VM_SECTION_END); int64_t start_ts, end_ts; @@ -2732,13 +2828,11 @@ static int qemu_loadvm_state_header(QEMUFile *f) if (migrate_get_current()->send_configuration) { if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { error_report("Configuration section missing"); - qemu_loadvm_state_cleanup(); return -EINVAL; } ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); if (ret) { - qemu_loadvm_state_cleanup(); return ret; } } @@ -2790,16 +2884,68 @@ static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp) return 0; } -void qemu_loadvm_state_cleanup(void) +struct LoadThreadData { + MigrationLoadThread function; + void *opaque; +}; + +static int qemu_loadvm_load_thread(void *thread_opaque) +{ + struct LoadThreadData *data = thread_opaque; + MigrationIncomingState *mis = migration_incoming_get_current(); + g_autoptr(Error) local_err = NULL; + + if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't set load_threads_abort here since processing of main migration + * channel data could still be happening, resulting in launching of new + * load threads. + */ + + assert(local_err); + + /* + * In case of multiple load threads failing, which thread's error + * we end up setting is purely arbitrary.
+ */ + migrate_set_error(s, local_err); + } + + return 0; +} + +void qemu_loadvm_start_load_thread(MigrationLoadThread function, + void *opaque) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + struct LoadThreadData *data; + + /* We only set it from this thread so it's okay to read it directly */ + assert(!mis->load_threads_abort); + + data = g_new(struct LoadThreadData, 1); + data->function = function; + data->opaque = opaque; + + thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread, + data, g_free); +}
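qemu_loadvm_start_load_thread() above is the entry point device code uses to push expensive load work onto the incoming thread pool; each worker receives the shared load_threads_abort flag and reports failure through an Error that qemu_loadvm_load_thread() forwards with migrate_set_error(). A sketch of what such a worker might look like; only the signature is taken from the code above, while the device type and helpers are hypothetical:

/* Hypothetical worker with the MigrationLoadThread signature: return
 * false and set *errp on failure; poll *should_quit so an abort
 * requested by the migration core is honoured promptly. */
static bool mydev_load_worker(void *opaque, bool *should_quit, Error **errp)
{
    MyDevState *dev = opaque;                 /* hypothetical device state */

    while (!qatomic_read(should_quit)) {
        if (mydev_load_one_chunk(dev) < 0) {  /* hypothetical helper */
            error_setg(errp, "mydev: failed to load device state chunk");
            return false;
        }
        if (mydev_load_complete(dev)) {       /* hypothetical helper */
            return true;
        }
    }

    /* Aborted by the migration core; it already tracks the real error. */
    return true;
}

A device would submit this from its load path with qemu_loadvm_start_load_thread(mydev_load_worker, dev); the pool is drained by qemu_loadvm_thread_pool_wait() before precopy load completes and torn down via qemu_loadvm_state_cleanup() below.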
+ +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis) { SaveStateEntry *se; trace_loadvm_state_cleanup(); + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (se->ops && se->ops->load_cleanup) { se->ops->load_cleanup(se->opaque); } } + + qemu_loadvm_thread_pool_destroy(mis); } /* Return true if we should continue the migration, or false. */ @@ -2891,14 +3037,14 @@ retry: switch (section_type) { case QEMU_VM_SECTION_START: case QEMU_VM_SECTION_FULL: - ret = qemu_loadvm_section_start_full(f, mis, section_type); + ret = qemu_loadvm_section_start_full(f, section_type); if (ret < 0) { goto out; } break; case QEMU_VM_SECTION_PART: case QEMU_VM_SECTION_END: - ret = qemu_loadvm_section_part_end(f, mis, section_type); + ret = qemu_loadvm_section_part_end(f, section_type); if (ret < 0) { goto out; } break; @@ -2950,6 +3096,7 @@ out: int qemu_loadvm_state(QEMUFile *f) { + MigrationState *s = migrate_get_current(); MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; int ret; @@ -2959,6 +3106,8 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } + qemu_loadvm_thread_pool_create(mis); + ret = qemu_loadvm_state_header(f); if (ret) { return ret; } @@ -2981,13 +3130,27 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } + /* When reaching here, it must be precopy */ if (ret == 0) { - ret = qemu_file_get_error(f); + if (migrate_has_error(migrate_get_current()) || + !qemu_loadvm_thread_pool_wait(s, mis)) { + ret = -EINVAL; + } else { + ret = qemu_file_get_error(f); + } } + /* + * Set this flag unconditionally so we'll catch further attempts to + * start additional threads via an appropriate assert() + */ + qatomic_set(&mis->load_threads_abort, true); /* * Try to read in the VMDESC section as well, so that dumping tools that @@ -3024,7 +3187,6 @@ } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; @@ -3064,6 +3226,29 @@ int qemu_loadvm_approve_switchover(void) return migrate_send_rp_switchover_ack(mis); } +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp) +{ + SaveStateEntry *se; + + se = find_se(idstr, instance_id); + if (!se) { + error_setg(errp, + "Unknown idstr %s or instance id %u for load state buffer", + idstr, instance_id); + return false; + } + + if (!se->ops || !se->ops->load_state_buffer) { + error_setg(errp, + "idstr %s / instance %u has no load state buffer operation", + idstr, instance_id); + return false; + } + + return se->ops->load_state_buffer(se->opaque, buf, len, errp); +} + bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { @@ -3211,11 +3396,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, * side of the migration take control of the images. */ if (live && !saved_vm_running) { - ret = bdrv_inactivate_all(); - if (ret) { - error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)", - __func__, ret); - } + migration_block_inactivate(); } } @@ -3286,6 +3467,7 @@ bool load_snapshot(const char *name, const char *vmstate, /* Don't even try to load empty VM states */ ret = bdrv_snapshot_find(bs_vm_state, &sn, name); if (ret < 0) { + error_setg(errp, "Snapshot cannot be found"); return false; } else if (sn.vm_state_size == 0) { error_setg(errp, "This is a disk-only snapshot.
Revert to it " @@ -3365,12 +3547,14 @@ void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) qemu_ram_set_idstr(mr->ram_block, memory_region_name(mr), dev); qemu_ram_set_migratable(mr->ram_block); + ram_block_add_cpr_blocker(mr->ram_block, &error_fatal); } void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev) { qemu_ram_unset_idstr(mr->ram_block); qemu_ram_unset_migratable(mr->ram_block); + ram_block_del_cpr_blocker(mr->ram_block); } void vmstate_register_ram_global(MemoryRegion *mr) diff --git a/migration/savevm.h b/migration/savevm.h index 9ec96a9..2d5e9c7 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -39,12 +39,13 @@ void qemu_savevm_state_header(QEMUFile *f); int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy); void qemu_savevm_state_cleanup(void); void qemu_savevm_state_complete_postcopy(QEMUFile *f); -int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, - bool inactivate_disks); +int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only); void qemu_savevm_state_pending_exact(uint64_t *must_precopy, uint64_t *can_postcopy); void qemu_savevm_state_pending_estimate(uint64_t *must_precopy, uint64_t *can_postcopy); +int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy); +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp); void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); void qemu_savevm_send_open_return_path(QEMUFile *f); int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); @@ -53,6 +54,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); void qemu_savevm_send_postcopy_resume(QEMUFile *f); void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f); void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, uint16_t len, @@ -63,11 +65,14 @@ void qemu_savevm_live_state(QEMUFile *f); int qemu_save_device_state(QEMUFile *f); int qemu_loadvm_state(QEMUFile *f); -void qemu_loadvm_state_cleanup(void); +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); int qemu_loadvm_approve_switchover(void); int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, - bool in_postcopy, bool inactivate_disks); + bool in_postcopy); + +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp); #endif diff --git a/migration/socket.c b/migration/socket.c index 9ab89b1..5ec65b8 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -42,24 +42,6 @@ void socket_send_channel_create(QIOTaskFunc f, void *data) f, data, NULL, NULL); } -QIOChannel *socket_send_channel_create_sync(Error **errp) -{ - QIOChannelSocket *sioc = qio_channel_socket_new(); - - if (!outgoing_args.saddr) { - object_unref(OBJECT(sioc)); - error_setg(errp, "Initial sock address not set!"); - return NULL; - } - - if (qio_channel_socket_connect_sync(sioc, outgoing_args.saddr, errp) < 0) { - object_unref(OBJECT(sioc)); - return NULL; - } - - return QIO_CHANNEL(sioc); -} - struct SocketConnectData { MigrationState *s; char *hostname; diff --git a/migration/socket.h b/migration/socket.h index 46c233e..04ebbe9 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -22,7 +22,6 @@ #include "qemu/sockets.h" void socket_send_channel_create(QIOTaskFunc f, void *data); -QIOChannel 
*socket_send_channel_create_sync(Error **errp); void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); diff --git a/migration/target.c b/migration/target.c index a6ffa9a..12fd399 100644 --- a/migration/target.c +++ b/migration/target.c @@ -11,21 +11,21 @@ #include CONFIG_DEVICES #ifdef CONFIG_VFIO -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-migration.h" #endif #ifdef CONFIG_VFIO void migration_populate_vfio_info(MigrationInfo *info) { - if (vfio_mig_active()) { + if (vfio_migration_active()) { info->vfio = g_malloc0(sizeof(*info->vfio)); - info->vfio->transferred = vfio_mig_bytes_transferred(); + info->vfio->transferred = vfio_migration_bytes_transferred(); } } void migration_reset_vfio_bytes_transferred(void) { - vfio_reset_bytes_transferred(); + vfio_migration_reset_bytes_transferred(); } #else void migration_populate_vfio_info(MigrationInfo *info) diff --git a/migration/tls.c b/migration/tls.c index fa03d91..5cbf952 100644 --- a/migration/tls.c +++ b/migration/tls.c @@ -156,6 +156,11 @@ void migration_tls_channel_connect(MigrationState *s, NULL); } +void migration_tls_channel_end(QIOChannel *ioc, Error **errp) +{ + qio_channel_tls_bye(QIO_CHANNEL_TLS(ioc), errp); +} + bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc) { if (!migrate_tls()) { diff --git a/migration/tls.h b/migration/tls.h index 5797d15..58b25e1 100644 --- a/migration/tls.h +++ b/migration/tls.h @@ -36,7 +36,7 @@ void migration_tls_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, Error **errp); - +void migration_tls_channel_end(QIOChannel *ioc, Error **errp); /* Whether the QIO channel requires further TLS handshake? */ bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc); diff --git a/migration/trace-events b/migration/trace-events index 0b7c332..c506e11 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -39,12 +39,12 @@ savevm_send_postcopy_run(void) "" savevm_send_postcopy_resume(void) "" savevm_send_colo_enable(void) "" savevm_send_recv_bitmap(char *name) "%s" +savevm_send_switchover_start(void) "" savevm_state_setup(void) "" savevm_state_resume_prepare(void) "" savevm_state_header(void) "" savevm_state_iterate(void) "" savevm_state_cleanup(void) "" -savevm_state_complete_precopy(void) "" vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s" vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s" vmstate_downtime_save(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64 @@ -88,6 +88,8 @@ put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)" # qemu-file.c qemu_file_fclose(void) "" +qemu_file_put_fd(const char *name, int fd, int ret) "ioc %s, fd %d -> status %d" +qemu_file_get_fd(const char *name, int fd) "ioc %s -> fd %d" # ram.c get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx" @@ -115,6 +117,7 @@ colo_flush_ram_cache_end(void) "" save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" +ram_load_start(void) "" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t 
length) "%s: page_size: %zu addr: %p length: %zu" @@ -128,21 +131,22 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" +multifd_recv_unfill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" +multifd_send_fill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u" +multifd_send_ram_fill(uint8_t id, uint32_t normal, uint32_t zero) "channel %u normal pages %u zero pages %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 +multifd_send_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" @@ -151,9 +155,9 @@ multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostnam # migration.c migrate_set_state(const char *new_state) "new state %s" -migrate_fd_cleanup(void) "" +migration_cleanup(void) "" migrate_error(const char *error_desc) "error=%s" -migrate_fd_cancel(void) "" +migration_cancel(void) "" migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx" migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")" migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")" @@ -191,6 +195,7 @@ migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidt process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" process_incoming_migration_co_postcopy_end_main(void) "" postcopy_preempt_enabled(bool value) "%d" +migration_precopy_complete(void) "" # migration-stats migration_transferred_bytes(uint64_t qemu_file, 
uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64 @@ -340,6 +345,15 @@ colo_receive_message(const char *msg) "Receive '%s' message" # colo-failover.c colo_failover_set_state(const char *new_state) "new state %s" +# cpr.c +cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d" +cpr_delete_fd(const char *name, int id) "%s, id %d" +cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d" +cpr_state_save(const char *mode) "%s mode" +cpr_state_load(const char *mode) "%s mode" +cpr_transfer_input(const char *path) "%s" +cpr_transfer_output(const char *path) "%s" + # block-dirty-bitmap.c send_bitmap_header_enter(void) "" send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64 @@ -377,3 +391,10 @@ migration_block_progression(unsigned percent) "Completed %u%%" # page_cache.c migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64 migration_pagecache_insert(void) "Error allocating page" + +# cpu-throttle.c +cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%" +cpu_throttle_dirty_sync(void) "" + +# block-active.c +migration_block_activation(const char *name) "%s" diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c index e83bfcc..741a588 100644 --- a/migration/vmstate-types.c +++ b/migration/vmstate-types.c @@ -15,6 +15,7 @@ #include "qemu-file.h" #include "migration.h" #include "migration/vmstate.h" +#include "migration/client-options.h" #include "qemu/error-report.h" #include "qemu/queue.h" #include "trace.h" @@ -314,6 +315,29 @@ const VMStateInfo vmstate_info_uint64 = { .put = put_uint64, }; +/* File descriptor communicated via SCM_RIGHTS */ + +static int get_fd(QEMUFile *f, void *pv, size_t size, + const VMStateField *field) +{ + int32_t *v = pv; + *v = qemu_file_get_fd(f); + return 0; +} + +static int put_fd(QEMUFile *f, void *pv, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + int32_t *v = pv; + return qemu_file_put_fd(f, *v); +} + +const VMStateInfo vmstate_info_fd = { + .name = "fd", + .get = get_fd, + .put = put_fd, +}; + static int get_nullptr(QEMUFile *f, void *pv, size_t size, const VMStateField *field) @@ -338,7 +362,7 @@ static int put_nullptr(QEMUFile *f, void *pv, size_t size, } const VMStateInfo vmstate_info_nullptr = { - .name = "uint64", + .name = "nullptr", .get = get_nullptr, .put = put_nullptr, }; diff --git a/migration/vmstate.c b/migration/vmstate.c index ff5d589..5feaa32 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -15,14 +15,15 @@ #include "migration/vmstate.h" #include "savevm.h" #include "qapi/error.h" -#include "qapi/qmp/json-writer.h" +#include "qobject/json-writer.h" #include "qemu-file.h" #include "qemu/bitops.h" #include "qemu/error-report.h" #include "trace.h" static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc); + void *opaque, JSONWriter *vmdesc, + Error **errp); static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, void *opaque); @@ -50,6 +51,36 @@ vmstate_field_exists(const VMStateDescription *vmsd, const VMStateField *field, return result; } +/* + * Create a fake nullptr field when there's a NULL pointer detected in the + * array of a VMS_ARRAY_OF_POINTER VMSD field. It's needed because we + * can't dereference the NULL pointer. 
+ */ +static const VMStateField * +vmsd_create_fake_nullptr_field(const VMStateField *field) +{ + VMStateField *fake = g_new0(VMStateField, 1); + + /* It can only happen on an array of pointers! */ + assert(field->flags & VMS_ARRAY_OF_POINTER); + + /* Some of fake's properties should match the original's */ + fake->name = field->name; + fake->version_id = field->version_id; + + /* Do not need "field_exists" check as it always exists (which is null) */ + fake->field_exists = NULL; + + /* See vmstate_info_nullptr - use 1 byte to represent nullptr */ + fake->size = 1; + fake->info = &vmstate_info_nullptr; + fake->flags = VMS_SINGLE; + + /* All the rest fields shouldn't matter.. */ + + return (const VMStateField *)fake; +} + static int vmstate_n_elems(void *opaque, const VMStateField *field) { int n_elems = 1; @@ -142,23 +173,39 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, } for (i = 0; i < n_elems; i++) { void *curr_elem = first_elem + size * i; + const VMStateField *inner_field; if (field->flags & VMS_ARRAY_OF_POINTER) { curr_elem = *(void **)curr_elem; } + if (!curr_elem && size) { - /* if null pointer check placeholder and do not follow */ - assert(field->flags & VMS_ARRAY_OF_POINTER); - ret = vmstate_info_nullptr.get(f, curr_elem, size, NULL); - } else if (field->flags & VMS_STRUCT) { - ret = vmstate_load_state(f, field->vmsd, curr_elem, - field->vmsd->version_id); - } else if (field->flags & VMS_VSTRUCT) { - ret = vmstate_load_state(f, field->vmsd, curr_elem, - field->struct_version_id); + /* + * If null pointer found (which should only happen in + * an array of pointers), use null placeholder and do + * not follow. + */ + inner_field = vmsd_create_fake_nullptr_field(field); + } else { + inner_field = field; + } + + if (inner_field->flags & VMS_STRUCT) { + ret = vmstate_load_state(f, inner_field->vmsd, curr_elem, + inner_field->vmsd->version_id); + } else if (inner_field->flags & VMS_VSTRUCT) { + ret = vmstate_load_state(f, inner_field->vmsd, curr_elem, + inner_field->struct_version_id); } else { - ret = field->info->get(f, curr_elem, size, field); + ret = inner_field->info->get(f, curr_elem, size, + inner_field); + } + + /* If we used a fake temp field.. 
free it now */ + if (inner_field != field) { + g_clear_pointer((gpointer *)&inner_field, g_free); } + if (ret >= 0) { ret = qemu_file_get_error(f); } @@ -310,7 +357,7 @@ static void vmsd_desc_field_start(const VMStateDescription *vmsd, static void vmsd_desc_field_end(const VMStateDescription *vmsd, JSONWriter *vmdesc, - const VMStateField *field, size_t size, int i) + const VMStateField *field, size_t size) { if (!vmdesc) { return; @@ -378,37 +425,91 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, int size = vmstate_size(opaque, field); uint64_t old_offset, written_bytes; JSONWriter *vmdesc_loop = vmdesc; + bool is_prev_null = false; trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems); if (field->flags & VMS_POINTER) { first_elem = *(void **)first_elem; assert(first_elem || !n_elems || !size); } + for (i = 0; i < n_elems; i++) { void *curr_elem = first_elem + size * i; + const VMStateField *inner_field; + bool is_null; + int max_elems = n_elems - i; - vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems); old_offset = qemu_file_transferred(f); if (field->flags & VMS_ARRAY_OF_POINTER) { assert(curr_elem); curr_elem = *(void **)curr_elem; } + if (!curr_elem && size) { - /* if null pointer write placeholder and do not follow */ - assert(field->flags & VMS_ARRAY_OF_POINTER); - ret = vmstate_info_nullptr.put(f, curr_elem, size, NULL, - NULL); - } else if (field->flags & VMS_STRUCT) { - ret = vmstate_save_state(f, field->vmsd, curr_elem, - vmdesc_loop); - } else if (field->flags & VMS_VSTRUCT) { - ret = vmstate_save_state_v(f, field->vmsd, curr_elem, - vmdesc_loop, - field->struct_version_id, errp); + /* + * If null pointer found (which should only happen in + * an array of pointers), use null placeholder and do + * not follow. + */ + inner_field = vmsd_create_fake_nullptr_field(field); + is_null = true; + } else { + inner_field = field; + is_null = false; + } + + /* + * This logic only matters when dumping VM Desc. + * + * Due to the fake nullptr handling above, if there's mixed + * null/non-null data, it doesn't make sense to emit a + * compressed array representation spanning the entire array + * because the field types will be different (e.g. struct + * vs. nullptr). Search ahead for the next null/non-null element + * and start a new compressed array if found. + */ + if (vmdesc && (field->flags & VMS_ARRAY_OF_POINTER) && + is_null != is_prev_null) { + + is_prev_null = is_null; + vmdesc_loop = vmdesc; + + for (int j = i + 1; j < n_elems; j++) { + void *elem = *(void **)(first_elem + size * j); + bool elem_is_null = !elem && size; + + if (is_null != elem_is_null) { + max_elems = j - i; + break; + } + } + } + + vmsd_desc_field_start(vmsd, vmdesc_loop, inner_field, + i, max_elems); + + if (inner_field->flags & VMS_STRUCT) { + ret = vmstate_save_state(f, inner_field->vmsd, + curr_elem, vmdesc_loop); + } else if (inner_field->flags & VMS_VSTRUCT) { + ret = vmstate_save_state_v(f, inner_field->vmsd, + curr_elem, vmdesc_loop, + inner_field->struct_version_id, + errp); } else { - ret = field->info->put(f, curr_elem, size, field, - vmdesc_loop); + ret = inner_field->info->put(f, curr_elem, size, + inner_field, vmdesc_loop); + } + + written_bytes = qemu_file_transferred(f) - old_offset; + vmsd_desc_field_end(vmsd, vmdesc_loop, inner_field, + written_bytes); + + /* If we used a fake temp field.. 
free it now */ + if (is_null) { + g_clear_pointer((gpointer *)&inner_field, g_free); } + if (ret) { error_setg(errp, "Save of field %s/%s failed", vmsd->name, field->name); @@ -418,9 +519,6 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, return ret; } - written_bytes = qemu_file_transferred(f) - old_offset; - vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i); - /* Compressed arrays only care about the first element */ if (vmdesc_loop && vmsd_can_compress(field)) { vmdesc_loop = NULL; } @@ -441,12 +539,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, json_writer_end_array(vmdesc); } - ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc); + ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp); if (vmsd->post_save) { int ps_ret = vmsd->post_save(opaque); - if (!ret) { + if (!ret && ps_ret) { ret = ps_ret; + error_setg(errp, "post-save failed: %s", vmsd->name); } } return ret; } @@ -518,7 +617,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, } static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc) + void *opaque, JSONWriter *vmdesc, + Error **errp) { const VMStateDescription * const *sub = vmsd->subsections; bool vmdesc_has_subsections = false; @@ -546,7 +646,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len); qemu_put_be32(f, vmsdsub->version_id); - ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc); + ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp); if (ret) { return ret; }
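The vmstate.c changes above funnel every NULL entry of an array-of-pointers field through a one-byte placeholder (the fake field is sized 1 and wired to vmstate_info_nullptr) so the pointer is never dereferenced on either side. For illustration, such get/put callbacks boil down to the following sketch; the marker value is an assumption here, and the real accessors live in migration/vmstate-types.c:

/* Sketch against QEMU's qemu-file.h / vmstate.h APIs. */
#define NULLPTR_MARKER '0'   /* assumed one-byte placeholder value */

/* put: never dereference pv, just emit the marker byte. */
static int put_nullptr_sketch(QEMUFile *f, void *pv, size_t size,
                              const VMStateField *field, JSONWriter *vmdesc)
{
    assert(pv == NULL);      /* only reachable for NULL array entries */
    qemu_put_byte(f, NULLPTR_MARKER);
    return 0;
}

/* get: consume the marker and leave the pointer NULL on the load side. */
static int get_nullptr_sketch(QEMUFile *f, void *pv, size_t size,
                              const VMStateField *field)
{
    if (qemu_get_byte(f) == NULLPTR_MARKER) {
        return 0;
    }
    return -EINVAL;          /* stream carried a real object here */
}

On load, a non-marker byte means the stream was written with a real object where the loader holds a NULL pointer, which is exactly the mismatch the -EINVAL path rejects.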