Diffstat (limited to 'migration')
-rw-r--r--  migration/block-dirty-bitmap.c                      |   3
-rw-r--r--  migration/colo.c                                    |  35
-rw-r--r--  migration/cpr-exec.c                                | 194
-rw-r--r--  migration/cpr-transfer.c                            |   7
-rw-r--r--  migration/cpr.c                                     | 129
-rw-r--r--  migration/meson.build                               |   9
-rw-r--r--  migration/migration-hmp-cmds.c                      | 335
-rw-r--r--  migration/migration.c                               | 232
-rw-r--r--  migration/migration.h                               |  14
-rw-r--r--  migration/multifd-device-state.c                    |  10
-rw-r--r--  migration/multifd-nocomp.c                          |   3
-rw-r--r--  migration/multifd-zero-page.c                       |  22
-rw-r--r--  migration/multifd.c                                 |  79
-rw-r--r--  migration/options.c                                 |  25
-rw-r--r--  migration/postcopy-ram.c                            | 584
-rw-r--r--  migration/postcopy-ram.h                            |   4
-rw-r--r--  migration/qemu-file.c                               |  14
-rw-r--r--  migration/qemu-file.h                               |   2
-rw-r--r--  migration/ram.c                                     | 164
-rw-r--r--  migration/ram.h                                     |   4
-rw-r--r--  migration/savevm.c                                  | 428
-rw-r--r--  migration/savevm.h                                  |   7
-rw-r--r--  migration/tls.c                                     |   9
-rw-r--r--  migration/trace-events                              |  10
-rw-r--r--  migration/vfio-stub.c                               |  16
-rw-r--r--  migration/vfio.c (renamed from migration/target.c)  |  16
-rw-r--r--  migration/vmstate-types.c                           |  61
-rw-r--r--  migration/vmstate.c                                 | 103
28 files changed, 1805 insertions(+), 714 deletions(-)
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index f2c352d..a061aad 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -1248,8 +1248,7 @@ static bool dirty_bitmap_has_postcopy(void *opaque) static SaveVMHandlers savevm_dirty_bitmap_handlers = { .save_setup = dirty_bitmap_save_setup, - .save_live_complete_postcopy = dirty_bitmap_save_complete, - .save_live_complete_precopy = dirty_bitmap_save_complete, + .save_complete = dirty_bitmap_save_complete, .has_postcopy = dirty_bitmap_has_postcopy, .state_pending_exact = dirty_bitmap_state_pending, .state_pending_estimate = dirty_bitmap_state_pending, diff --git a/migration/colo.c b/migration/colo.c index c976b3f..db783f6 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void) return; } /* Notify COLO incoming thread that failover work is finished */ - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); /* For Secondary VM, jump to incoming co */ if (mis->colo_incoming_co) { @@ -195,7 +195,7 @@ static void primary_vm_do_failover(void) } /* Notify COLO thread that failover work is finished */ - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); } COLOMode get_colo_mode(void) @@ -620,8 +620,8 @@ out: } /* Hope this not to be too long to wait here */ - qemu_sem_wait(&s->colo_exit_sem); - qemu_sem_destroy(&s->colo_exit_sem); + qemu_event_wait(&s->colo_exit_event); + qemu_event_destroy(&s->colo_exit_event); /* * It is safe to unregister notifier after failover finished. @@ -651,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s) s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, colo_checkpoint_notify_timer, NULL); - qemu_sem_init(&s->colo_exit_sem, 0); + qemu_event_init(&s->colo_exit_event, false); colo_process_checkpoint(s); bql_lock(); } @@ -686,11 +686,10 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis, bql_lock(); cpu_synchronize_all_states(); - ret = qemu_loadvm_state_main(mis->from_src_file, mis); + ret = qemu_loadvm_state_main(mis->from_src_file, mis, errp); bql_unlock(); if (ret < 0) { - error_setg(errp, "Load VM's live state (ram) error"); return; } @@ -729,9 +728,8 @@ static void colo_incoming_process_checkpoint(MigrationIncomingState *mis, bql_lock(); vmstate_loading = true; colo_flush_ram_cache(); - ret = qemu_load_device_state(fb); + ret = qemu_load_device_state(fb, errp); if (ret < 0) { - error_setg(errp, "COLO: load device state failed"); vmstate_loading = false; bql_unlock(); return; @@ -808,11 +806,11 @@ void colo_shutdown(void) case COLO_MODE_PRIMARY: s = migrate_get_current(); qemu_event_set(&s->colo_checkpoint_event); - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); break; case COLO_MODE_SECONDARY: mis = migration_incoming_get_current(); - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); break; default: break; @@ -827,7 +825,7 @@ static void *colo_process_incoming_thread(void *opaque) Error *local_err = NULL; rcu_register_thread(); - qemu_sem_init(&mis->colo_incoming_sem, 0); + qemu_event_init(&mis->colo_incoming_event, false); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -849,17 +847,16 @@ static void *colo_process_incoming_thread(void *opaque) failover_init_state(); mis->to_src_file = qemu_file_get_return_path(mis->from_src_file); - if (!mis->to_src_file) { - error_report("COLO incoming thread: Open QEMUFile 
to_src_file failed"); - goto out; - } /* * Note: the communication between Primary side and Secondary side * should be sequential, we set the fd to unblocked in migration incoming * coroutine, and here we are in the COLO incoming thread, so it is ok to * set the fd back to blocked. */ - qemu_file_set_blocking(mis->from_src_file, true); + if (!qemu_file_set_blocking(mis->from_src_file, true, &local_err)) { + error_report_err(local_err); + goto out; + } colo_incoming_start_dirty_log(); @@ -923,8 +920,8 @@ out: } /* Hope this not to be too long to loop here */ - qemu_sem_wait(&mis->colo_incoming_sem); - qemu_sem_destroy(&mis->colo_incoming_sem); + qemu_event_wait(&mis->colo_incoming_event); + qemu_event_destroy(&mis->colo_incoming_event); rcu_unregister_thread(); return NULL; diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c new file mode 100644 index 0000000..d57714b --- /dev/null +++ b/migration/cpr-exec.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" +#include "qemu/memfd.h" +#include "qapi/error.h" +#include "qapi/type-helpers.h" +#include "io/channel-file.h" +#include "io/channel-socket.h" +#include "block/block-global-state.h" +#include "qemu/main-loop.h" +#include "migration/cpr.h" +#include "migration/qemu-file.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/vmstate.h" +#include "system/runstate.h" +#include "trace.h" + +#define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE" + +static QEMUFile *qemu_file_new_fd_input(int fd, const char *name) +{ + g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); + QIOChannel *ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(ioc, name); + return qemu_file_new_input(ioc); +} + +static QEMUFile *qemu_file_new_fd_output(int fd, const char *name) +{ + g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); + QIOChannel *ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(ioc, name); + return qemu_file_new_output(ioc); +} + +void cpr_exec_persist_state(QEMUFile *f) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(qemu_file_get_ioc(f)); + int mfd = dup(fioc->fd); + char val[16]; + + /* Remember mfd in environment for post-exec load */ + qemu_clear_cloexec(mfd); + snprintf(val, sizeof(val), "%d", mfd); + g_setenv(CPR_EXEC_STATE_NAME, val, 1); +} + +static int cpr_exec_find_state(void) +{ + const char *val = g_getenv(CPR_EXEC_STATE_NAME); + int mfd; + + assert(val); + g_unsetenv(CPR_EXEC_STATE_NAME); + assert(!qemu_strtoi(val, NULL, 10, &mfd)); + return mfd; +} + +bool cpr_exec_has_state(void) +{ + return g_getenv(CPR_EXEC_STATE_NAME) != NULL; +} + +void cpr_exec_unpersist_state(void) +{ + int mfd; + const char *val = g_getenv(CPR_EXEC_STATE_NAME); + + g_unsetenv(CPR_EXEC_STATE_NAME); + assert(val); + assert(!qemu_strtoi(val, NULL, 10, &mfd)); + close(mfd); +} + +QEMUFile *cpr_exec_output(Error **errp) +{ + int mfd; + +#ifdef CONFIG_LINUX + mfd = qemu_memfd_create(CPR_EXEC_STATE_NAME, 0, false, 0, 0, errp); +#else + mfd = -1; +#endif + + if (mfd < 0) { + return NULL; + } + + return qemu_file_new_fd_output(mfd, CPR_EXEC_STATE_NAME); +} + +QEMUFile *cpr_exec_input(Error **errp) +{ + int mfd = cpr_exec_find_state(); + + lseek(mfd, 0, SEEK_SET); + return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME); +} + +static bool preserve_fd(int fd) +{ + qemu_clear_cloexec(fd); + return true; +} + +static bool unpreserve_fd(int fd) +{ + 
qemu_set_cloexec(fd); + return true; +} + +static void cpr_exec_preserve_fds(void) +{ + cpr_walk_fd(preserve_fd); +} + +void cpr_exec_unpreserve_fds(void) +{ + cpr_walk_fd(unpreserve_fd); +} + +static void cpr_exec_cb(void *opaque) +{ + MigrationState *s = migrate_get_current(); + char **argv = strv_from_str_list(s->parameters.cpr_exec_command); + Error *err = NULL; + + /* + * Clear the close-on-exec flag for all preserved fd's. We cannot do so + * earlier because they should not persist across miscellaneous fork and + * exec calls that are performed during normal operation. + */ + cpr_exec_preserve_fds(); + + trace_cpr_exec(); + execvp(argv[0], argv); + + /* + * exec should only fail if argv[0] is bogus, or has a permissions problem, + * or the system is very short on resources. + */ + g_strfreev(argv); + cpr_exec_unpreserve_fds(); + + error_setg_errno(&err, errno, "execvp %s failed", argv[0]); + error_report_err(error_copy(err)); + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_set_error(s, err); + + /* Note, we can go from state COMPLETED to FAILED */ + migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL); + + err = NULL; + if (!migration_block_activate(&err)) { + /* error was already reported */ + error_free(err); + return; + } + + if (runstate_is_live(s->vm_old_state)) { + vm_start(); + } +} + +static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + + if (e->type == MIG_EVENT_PRECOPY_DONE) { + QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL); + assert(s->state == MIGRATION_STATUS_COMPLETED); + qemu_bh_schedule(cpr_exec_bh); + qemu_notify_event(); + } else if (e->type == MIG_EVENT_PRECOPY_FAILED) { + cpr_exec_unpersist_state(); + } + return 0; +} + +void cpr_exec_init(void) +{ + static NotifierWithReturn exec_notifier; + + migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier, + MIG_MODE_CPR_EXEC); +} diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c index e1f1403..00371d1 100644 --- a/migration/cpr-transfer.c +++ b/migration/cpr-transfer.c @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp) MigrationAddress *addr = channel->addr; if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET && - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) { + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX || + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) { g_autoptr(QIOChannelSocket) sioc = NULL; SocketAddress *saddr = &addr->u.socket; @@ -60,7 +61,9 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp) sioc = qio_net_listener_wait_client(listener); ioc = QIO_CHANNEL(sioc); - trace_cpr_transfer_input(addr->u.socket.u.q_unix.path); + trace_cpr_transfer_input( + addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ? 
+ addr->u.socket.u.q_unix.path : addr->u.socket.u.fd.str); qio_channel_set_name(ioc, "cpr-in"); return qemu_file_new_input(ioc); diff --git a/migration/cpr.c b/migration/cpr.c index 42c4656..22dbac7 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -6,26 +6,24 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" #include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/vfio/vfio-device.h" #include "migration/cpr.h" #include "migration/misc.h" #include "migration/options.h" #include "migration/qemu-file.h" #include "migration/savevm.h" #include "migration/vmstate.h" +#include "monitor/monitor.h" #include "system/runstate.h" #include "trace.h" /*************************************************************************/ /* cpr state container for all information to be saved. */ -typedef QLIST_HEAD(CprFdList, CprFd) CprFdList; - -typedef struct CprState { - CprFdList fds; -} CprState; - -static CprState cpr_state; +CprState cpr_state; /****************************************************************************/ @@ -95,9 +93,50 @@ int cpr_find_fd(const char *name, int id) trace_cpr_find_fd(name, id, fd); return fd; } -/*************************************************************************/ -#define CPR_STATE "CprState" +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + int old_fd = elem ? elem->fd : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_report("internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + g_assert_not_reached(); + } +} + +int cpr_open_fd(const char *path, int flags, const char *name, int id, + Error **errp) +{ + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = qemu_open(path, flags, errp); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +bool cpr_walk_fd(cpr_walk_fd_cb cb) +{ + CprFd *elem; + + QLIST_FOREACH(elem, &cpr_state.fds, next) { + g_assert(elem->fd >= 0); + if (!cb(elem->fd)) { + return false; + } + } + return true; +} + +/*************************************************************************/ static const VMStateDescription vmstate_cpr_state = { .name = CPR_STATE, .version_id = 1, @@ -105,6 +144,10 @@ static const VMStateDescription vmstate_cpr_state = { .fields = (VMStateField[]) { VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_cpr_vfio_devices, + NULL } }; /*************************************************************************/ @@ -144,6 +187,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp) if (mode == MIG_MODE_CPR_TRANSFER) { g_assert(channel); f = cpr_transfer_output(channel, errp); + } else if (mode == MIG_MODE_CPR_EXEC) { + f = cpr_exec_output(errp); } else { return 0; } @@ -154,13 +199,16 @@ int cpr_state_save(MigrationChannel *channel, Error **errp) qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); qemu_put_be32(f, QEMU_CPR_FILE_VERSION); - ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0); + ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0, errp); if (ret) { - error_setg(errp, "vmstate_save_state error %d", ret); qemu_fclose(f); return ret; } + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + cpr_exec_persist_state(f); + } + /* * Close the socket only partially so we can later detect when the other * end closes by getting a HUP event. 
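The cpr-exec hunks above persist the CPR state blob in an anonymous memfd and publish the descriptor number through an environment variable (QEMU_CPR_EXEC_STATE), so the open file survives exec() and the replacement QEMU image can locate and re-read it. A minimal standalone sketch of that underlying Linux pattern, in plain POSIX/glibc C; every name here is illustrative, not QEMU's:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/mman.h>       /* memfd_create(), glibc >= 2.27 */

    #define STATE_ENV "DEMO_EXEC_STATE"   /* stand-in for QEMU_CPR_EXEC_STATE */

    int main(int argc, char **argv)
    {
        const char *val = getenv(STATE_ENV);

        if (val) {                        /* post-exec: load inherited state */
            int mfd = atoi(val);
            char buf[32] = "";

            unsetenv(STATE_ENV);
            lseek(mfd, 0, SEEK_SET);      /* rewind, as cpr_exec_input() does */
            if (read(mfd, buf, sizeof(buf) - 1) < 0) {
                return 1;
            }
            close(mfd);
            printf("restored state: %s\n", buf);
            return 0;
        }

        /* pre-exec: save state into an anonymous memfd */
        int mfd = memfd_create("demo-state", 0); /* no MFD_CLOEXEC: survives exec */
        char num[16];

        if (mfd < 0 || write(mfd, "hello", 6) != 6) {
            return 1;
        }
        snprintf(num, sizeof(num), "%d", mfd);
        setenv(STATE_ENV, num, 1);        /* publish fd number to the next image */
        execv("/proc/self/exe", argv);    /* open memfd is inherited across exec */
        return 1;                         /* reached only if exec failed */
    }

The QEMU code layers fd-leak hygiene on top of this: preserved descriptors keep close-on-exec set during normal operation, cpr_exec_preserve_fds() clears it only immediately before the exec, and cpr_exec_unpreserve_fds() restores it if the exec fails or once the state has been loaded on the other side.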
@@ -179,7 +227,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp) QEMUFile *f; MigMode mode = 0; - if (channel) { + if (cpr_exec_has_state()) { + mode = MIG_MODE_CPR_EXEC; + f = cpr_exec_input(errp); + if (channel) { + warn_report("ignoring cpr channel for migration mode cpr-exec"); + } + } else if (channel) { mode = MIG_MODE_CPR_TRANSFER; cpr_set_incoming_mode(mode); f = cpr_transfer_input(channel, errp); @@ -191,6 +245,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp) } trace_cpr_state_load(MigMode_str(mode)); + cpr_set_incoming_mode(mode); v = qemu_get_be32(f); if (v != QEMU_CPR_FILE_MAGIC) { @@ -205,13 +260,17 @@ int cpr_state_load(MigrationChannel *channel, Error **errp) return -ENOTSUP; } - ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1); + ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1, errp); if (ret) { - error_setg(errp, "vmstate_load_state error %d", ret); qemu_fclose(f); return ret; } + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */ + cpr_exec_unpreserve_fds(); + } + /* * Let the caller decide when to close the socket (and generate a HUP event * for the sending side). @@ -228,3 +287,45 @@ void cpr_state_close(void) cpr_state_file = NULL; } } + +bool cpr_incoming_needed(void *opaque) +{ + MigMode mode = migrate_mode(); + return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC; +} + +/* + * cpr_get_fd_param: find a descriptor and return its value. + * + * @name: CPR name for the descriptor + * @fdname: An integer-valued string, or a name passed to a getfd command + * @index: CPR index of the descriptor + * @errp: returned error message + * + * If CPR is not being performed, then use @fdname to find the fd. + * If CPR is being performed, then ignore @fdname, and look for @name + * and @index in CPR state. + * + * On success returns the fd value, else returns -1. 
+ */ +int cpr_get_fd_param(const char *name, const char *fdname, int index, + Error **errp) +{ + ERRP_GUARD(); + int fd; + + if (cpr_is_incoming()) { + fd = cpr_find_fd(name, index); + if (fd < 0) { + error_setg(errp, "cannot find saved value for fd %s", fdname); + } + } else { + fd = monitor_fd_param(monitor_cur(), fdname, errp); + if (fd >= 0) { + cpr_save_fd(name, index, fd); + } else { + error_prepend(errp, "Could not parse object fd %s:", fdname); + } + } + return fd; +} diff --git a/migration/meson.build b/migration/meson.build index 9aa48b2..16909d5 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -16,6 +16,7 @@ system_ss.add(files( 'channel-block.c', 'cpr.c', 'cpr-transfer.c', + 'cpr-exec.c', 'cpu-throttle.c', 'dirtyrate.c', 'exec.c', @@ -31,6 +32,7 @@ system_ss.add(files( 'multifd-zero-page.c', 'options.c', 'postcopy-ram.c', + 'ram.c', 'savevm.c', 'socket.c', 'tls.c', @@ -48,7 +50,6 @@ system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) - -specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', - if_true: files('ram.c', - 'target.c')) +system_ss.add(when: 'CONFIG_VFIO', + if_true: files('vfio.c'), + if_false: files('vfio-stub.c')) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 49c26da..847d18f 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -37,29 +37,108 @@ static void migration_global_dump(Monitor *mon) { MigrationState *ms = migrate_get_current(); - monitor_printf(mon, "globals:\n"); - monitor_printf(mon, "store-global-state: %s\n", + monitor_printf(mon, "Globals:\n"); + monitor_printf(mon, " store-global-state: %s\n", ms->store_global_state ? "on" : "off"); - monitor_printf(mon, "only-migratable: %s\n", + monitor_printf(mon, " only-migratable: %s\n", only_migratable ? "on" : "off"); - monitor_printf(mon, "send-configuration: %s\n", + monitor_printf(mon, " send-configuration: %s\n", ms->send_configuration ? "on" : "off"); - monitor_printf(mon, "send-section-footer: %s\n", + monitor_printf(mon, " send-section-footer: %s\n", ms->send_section_footer ? "on" : "off"); - monitor_printf(mon, "send-switchover-start: %s\n", + monitor_printf(mon, " send-switchover-start: %s\n", ms->send_switchover_start ? "on" : "off"); - monitor_printf(mon, "clear-bitmap-shift: %u\n", + monitor_printf(mon, " clear-bitmap-shift: %u\n", ms->clear_bitmap_shift); } +static const gchar *format_time_str(uint64_t us) +{ + const char *units[] = {"us", "ms", "sec"}; + int index = 0; + + while (us >= 1000 && index + 1 < ARRAY_SIZE(units)) { + us /= 1000; + index++; + } + + return g_strdup_printf("%"PRIu64" %s", us, units[index]); +} + +static void migration_dump_blocktime(Monitor *mon, MigrationInfo *info) +{ + if (info->has_postcopy_blocktime) { + monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n", + info->postcopy_blocktime); + } + + if (info->has_postcopy_vcpu_blocktime) { + uint32List *item = info->postcopy_vcpu_blocktime; + const char *sep = ""; + int count = 0; + + monitor_printf(mon, "Postcopy vCPU Blocktime (ms):\n ["); + + while (item) { + monitor_printf(mon, "%s%"PRIu32, sep, item->value); + item = item->next; + /* Each line 10 vcpu results, newline if there's more */ + sep = ((++count % 10 == 0) && item) ? 
",\n " : ", "; + } + monitor_printf(mon, "]\n"); + } + + if (info->has_postcopy_latency) { + monitor_printf(mon, "Postcopy Latency (ns): %" PRIu64 "\n", + info->postcopy_latency); + } + + if (info->has_postcopy_non_vcpu_latency) { + monitor_printf(mon, "Postcopy non-vCPU Latencies (ns): %" PRIu64 "\n", + info->postcopy_non_vcpu_latency); + } + + if (info->has_postcopy_vcpu_latency) { + uint64List *item = info->postcopy_vcpu_latency; + const char *sep = ""; + int count = 0; + + monitor_printf(mon, "Postcopy vCPU Latencies (ns):\n ["); + + while (item) { + monitor_printf(mon, "%s%"PRIu64, sep, item->value); + item = item->next; + /* Each line 10 vcpu results, newline if there's more */ + sep = ((++count % 10 == 0) && item) ? ",\n " : ", "; + } + monitor_printf(mon, "]\n"); + } + + if (info->has_postcopy_latency_dist) { + uint64List *item = info->postcopy_latency_dist; + int count = 0; + + monitor_printf(mon, "Postcopy Latency Distribution:\n"); + + while (item) { + g_autofree const gchar *from = format_time_str(1UL << count); + g_autofree const gchar *to = format_time_str(1UL << (count + 1)); + + monitor_printf(mon, " [ %8s - %8s ]: %10"PRIu64"\n", + from, to, item->value); + item = item->next; + count++; + } + } +} + void hmp_info_migrate(Monitor *mon, const QDict *qdict) { + bool show_all = qdict_get_try_bool(qdict, "all", false); MigrationInfo *info; info = qmp_query_migrate(NULL); - migration_global_dump(mon); - if (info->blocked_reasons) { strList *reasons = info->blocked_reasons; monitor_printf(mon, "Outgoing migration blocked:\n"); @@ -70,147 +149,143 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) } if (info->has_status) { - monitor_printf(mon, "Migration status: %s", + monitor_printf(mon, "Status: \t\t%s", MigrationStatus_str(info->status)); - if (info->status == MIGRATION_STATUS_FAILED && info->error_desc) { + if ((info->status == MIGRATION_STATUS_FAILED || + info->status == MIGRATION_STATUS_POSTCOPY_PAUSED) && + info->error_desc) { monitor_printf(mon, " (%s)\n", info->error_desc); } else { monitor_printf(mon, "\n"); } - monitor_printf(mon, "total time: %" PRIu64 " ms\n", - info->total_time); - if (info->has_expected_downtime) { - monitor_printf(mon, "expected downtime: %" PRIu64 " ms\n", - info->expected_downtime); - } - if (info->has_downtime) { - monitor_printf(mon, "downtime: %" PRIu64 " ms\n", - info->downtime); + if (info->total_time) { + monitor_printf(mon, "Time (ms): \t\ttotal=%" PRIu64, + info->total_time); + if (info->has_setup_time) { + monitor_printf(mon, ", setup=%" PRIu64, + info->setup_time); + } + if (info->has_expected_downtime) { + monitor_printf(mon, ", exp_down=%" PRIu64, + info->expected_downtime); + } + if (info->has_downtime) { + monitor_printf(mon, ", down=%" PRIu64, + info->downtime); + } + monitor_printf(mon, "\n"); } - if (info->has_setup_time) { - monitor_printf(mon, "setup: %" PRIu64 " ms\n", - info->setup_time); + } + + if (info->has_socket_address) { + SocketAddressList *addr; + + monitor_printf(mon, "Sockets: [\n"); + + for (addr = info->socket_address; addr; addr = addr->next) { + char *s = socket_uri(addr->value); + monitor_printf(mon, "\t%s\n", s); + g_free(s); } + monitor_printf(mon, "]\n"); } if (info->ram) { - monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n", - info->ram->transferred >> 10); - monitor_printf(mon, "throughput: %0.2f mbps\n", + g_autofree char *str_psize = size_to_str(info->ram->page_size); + g_autofree char *str_total = size_to_str(info->ram->total); + g_autofree char *str_transferred = 
size_to_str(info->ram->transferred); + g_autofree char *str_remaining = size_to_str(info->ram->remaining); + g_autofree char *str_precopy = size_to_str(info->ram->precopy_bytes); + g_autofree char *str_multifd = size_to_str(info->ram->multifd_bytes); + g_autofree char *str_postcopy = size_to_str(info->ram->postcopy_bytes); + + monitor_printf(mon, "RAM info:\n"); + monitor_printf(mon, " Throughput (Mbps): \t%0.2f\n", info->ram->mbps); - monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n", - info->ram->remaining >> 10); - monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n", - info->ram->total >> 10); - monitor_printf(mon, "duplicate: %" PRIu64 " pages\n", - info->ram->duplicate); - monitor_printf(mon, "normal: %" PRIu64 " pages\n", - info->ram->normal); - monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n", - info->ram->normal_bytes >> 10); - monitor_printf(mon, "dirty sync count: %" PRIu64 "\n", - info->ram->dirty_sync_count); - monitor_printf(mon, "page size: %" PRIu64 " kbytes\n", - info->ram->page_size >> 10); - monitor_printf(mon, "multifd bytes: %" PRIu64 " kbytes\n", - info->ram->multifd_bytes >> 10); - monitor_printf(mon, "pages-per-second: %" PRIu64 "\n", - info->ram->pages_per_second); + monitor_printf(mon, " Sizes: \t\tpagesize=%s, total=%s\n", + str_psize, str_total); + monitor_printf(mon, " Transfers: \t\ttransferred=%s, remain=%s\n", + str_transferred, str_remaining); + monitor_printf(mon, " Channels: \t\tprecopy=%s, " + "multifd=%s, postcopy=%s", + str_precopy, str_multifd, str_postcopy); + + if (info->vfio) { + g_autofree char *str_vfio = size_to_str(info->vfio->transferred); + + monitor_printf(mon, ", vfio=%s", str_vfio); + } + monitor_printf(mon, "\n"); + monitor_printf(mon, " Page Types: \tnormal=%" PRIu64 + ", zero=%" PRIu64 "\n", + info->ram->normal, info->ram->duplicate); + monitor_printf(mon, " Page Rates (pps): \ttransfer=%" PRIu64, + info->ram->pages_per_second); if (info->ram->dirty_pages_rate) { - monitor_printf(mon, "dirty pages rate: %" PRIu64 " pages\n", + monitor_printf(mon, ", dirty=%" PRIu64, info->ram->dirty_pages_rate); } + monitor_printf(mon, "\n"); + + monitor_printf(mon, " Others: \t\tdirty_syncs=%" PRIu64, + info->ram->dirty_sync_count); if (info->ram->postcopy_requests) { - monitor_printf(mon, "postcopy request count: %" PRIu64 "\n", + monitor_printf(mon, ", postcopy_req=%" PRIu64, info->ram->postcopy_requests); } - if (info->ram->precopy_bytes) { - monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n", - info->ram->precopy_bytes >> 10); - } if (info->ram->downtime_bytes) { - monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n", - info->ram->downtime_bytes >> 10); - } - if (info->ram->postcopy_bytes) { - monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n", - info->ram->postcopy_bytes >> 10); + monitor_printf(mon, ", downtime_bytes=%" PRIu64, + info->ram->downtime_bytes); } if (info->ram->dirty_sync_missed_zero_copy) { - monitor_printf(mon, - "Zero-copy-send fallbacks happened: %" PRIu64 " times\n", + monitor_printf(mon, ", zerocopy_fallbacks=%" PRIu64, info->ram->dirty_sync_missed_zero_copy); } + monitor_printf(mon, "\n"); + } + + if (!show_all) { + goto out; } + migration_global_dump(mon); + if (info->xbzrle_cache) { - monitor_printf(mon, "cache size: %" PRIu64 " bytes\n", - info->xbzrle_cache->cache_size); - monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n", - info->xbzrle_cache->bytes >> 10); - monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n", - info->xbzrle_cache->pages); - monitor_printf(mon, "xbzrle 
cache miss: %" PRIu64 " pages\n", - info->xbzrle_cache->cache_miss); - monitor_printf(mon, "xbzrle cache miss rate: %0.2f\n", - info->xbzrle_cache->cache_miss_rate); - monitor_printf(mon, "xbzrle encoding rate: %0.2f\n", - info->xbzrle_cache->encoding_rate); - monitor_printf(mon, "xbzrle overflow: %" PRIu64 "\n", + monitor_printf(mon, "XBZRLE: size=%" PRIu64 + ", transferred=%" PRIu64 + ", pages=%" PRIu64 + ", miss=%" PRIu64 "\n" + " miss_rate=%0.2f" + ", encode_rate=%0.2f" + ", overflow=%" PRIu64 "\n", + info->xbzrle_cache->cache_size, + info->xbzrle_cache->bytes, + info->xbzrle_cache->pages, + info->xbzrle_cache->cache_miss, + info->xbzrle_cache->cache_miss_rate, + info->xbzrle_cache->encoding_rate, info->xbzrle_cache->overflow); } if (info->has_cpu_throttle_percentage) { - monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n", + monitor_printf(mon, "CPU Throttle (%%): %" PRIu64 "\n", info->cpu_throttle_percentage); } if (info->has_dirty_limit_throttle_time_per_round) { - monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Throttle (us): %" PRIu64 "\n", info->dirty_limit_throttle_time_per_round); } if (info->has_dirty_limit_ring_full_time) { - monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Ring Full (us): %" PRIu64 "\n", info->dirty_limit_ring_full_time); } - if (info->has_postcopy_blocktime) { - monitor_printf(mon, "postcopy blocktime: %u\n", - info->postcopy_blocktime); - } - - if (info->has_postcopy_vcpu_blocktime) { - Visitor *v; - char *str; - v = string_output_visitor_new(false, &str); - visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime, - &error_abort); - visit_complete(v, &str); - monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str); - g_free(str); - visit_free(v); - } - if (info->has_socket_address) { - SocketAddressList *addr; - - monitor_printf(mon, "socket address: [\n"); - - for (addr = info->socket_address; addr; addr = addr->next) { - char *s = socket_uri(addr->value); - monitor_printf(mon, "\t%s\n", s); - g_free(s); - } - monitor_printf(mon, "]\n"); - } - - if (info->vfio) { - monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n", - info->vfio->transferred >> 10); - } - + migration_dump_blocktime(mon, info); +out: qapi_free_MigrationInfo(info); } @@ -231,6 +306,18 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) qapi_free_MigrationCapabilityStatusList(caps); } +static void monitor_print_cpr_exec_command(Monitor *mon, strList *args) +{ + monitor_printf(mon, "%s:", + MigrationParameter_str(MIGRATION_PARAMETER_CPR_EXEC_COMMAND)); + + while (args) { + monitor_printf(mon, " %s", args->value); + args = args->next; + } + monitor_printf(mon, "\n"); +} + void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) { MigrationParameters *params; @@ -278,6 +365,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: '%s'\n", MigrationParameter_str(MIGRATION_PARAMETER_TLS_HOSTNAME), params->tls_hostname); + assert(params->tls_authz); + monitor_printf(mon, "%s: '%s'\n", + MigrationParameter_str(MIGRATION_PARAMETER_TLS_AUTHZ), + params->tls_authz); assert(params->has_max_bandwidth); monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_BANDWIDTH), @@ -286,6 +377,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", 
MigrationParameter_str(MIGRATION_PARAMETER_AVAIL_SWITCHOVER_BANDWIDTH), params->avail_switchover_bandwidth); + assert(params->has_max_postcopy_bandwidth); + monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", + MigrationParameter_str(MIGRATION_PARAMETER_MAX_POSTCOPY_BANDWIDTH), + params->max_postcopy_bandwidth); assert(params->has_downtime_limit); monitor_printf(mon, "%s: %" PRIu64 " ms\n", MigrationParameter_str(MIGRATION_PARAMETER_DOWNTIME_LIMIT), @@ -308,12 +403,6 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); - monitor_printf(mon, "%s: %" PRIu64 "\n", - MigrationParameter_str(MIGRATION_PARAMETER_MAX_POSTCOPY_BANDWIDTH), - params->max_postcopy_bandwidth); - monitor_printf(mon, "%s: '%s'\n", - MigrationParameter_str(MIGRATION_PARAMETER_TLS_AUTHZ), - params->tls_authz); if (params->has_block_bitmap_mapping) { const BitmapMigrationNodeAliasList *bmnal; @@ -360,6 +449,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MIGRATION_PARAMETER_DIRECT_IO), params->direct_io ? "on" : "off"); } + + assert(params->has_cpr_exec_command); + monitor_print_cpr_exec_command(mon, params->cpr_exec_command); } qapi_free_MigrationParameters(params); @@ -641,6 +733,21 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_direct_io = true; visit_type_bool(v, param, &p->direct_io, &err); break; + case MIGRATION_PARAMETER_CPR_EXEC_COMMAND: { + g_autofree char **strv = NULL; + g_autoptr(GError) gerr = NULL; + strList **tail = &p->cpr_exec_command; + + if (!g_shell_parse_argv(valuestr, NULL, &strv, &gerr)) { + error_setg(&err, "%s", gerr->message); + break; + } + for (int i = 0; strv[i]; i++) { + QAPI_LIST_APPEND(tail, strv[i]); + } + p->has_cpr_exec_command = true; + break; + } default: g_assert_not_reached(); } diff --git a/migration/migration.c b/migration/migration.c index 4697732..a63b46b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -74,11 +74,7 @@ #define INMIGRATE_DEFAULT_EXIT_ON_ERROR true -static NotifierWithReturnList migration_state_notifiers[] = { - NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), - NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), - NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), -}; +static GSList *migration_state_notifiers[MIG_MODE__MAX]; /* Messages sent on the return path from destination to source */ enum mig_rp_message_type { @@ -337,6 +333,7 @@ void migration_object_init(void) ram_mig_init(); dirty_bitmap_mig_init(); + cpr_exec_init(); /* Initialize cpu throttle timers */ cpu_throttle_init(); @@ -576,22 +573,27 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis, } int migrate_send_rp_req_pages(MigrationIncomingState *mis, - RAMBlock *rb, ram_addr_t start, uint64_t haddr) + RAMBlock *rb, ram_addr_t start, uint64_t haddr, + uint32_t tid) { void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb)); bool received = false; WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) { received = ramblock_recv_bitmap_test_byte_offset(rb, start); - if (!received && !g_tree_lookup(mis->page_requested, aligned)) { - /* - * The page has not been received, and it's not yet in the page - * request list. Queue it. Set the value of element to 1, so that - * things like g_tree_lookup() will return TRUE (1) when found. 
- */ - g_tree_insert(mis->page_requested, aligned, (gpointer)1); - qatomic_inc(&mis->page_requested_count); - trace_postcopy_page_req_add(aligned, mis->page_requested_count); + if (!received) { + if (!g_tree_lookup(mis->page_requested, aligned)) { + /* + * The page has not been received, and it's not yet in the + * page request list. Queue it. Set the value of element + * to 1, so that things like g_tree_lookup() will return + * TRUE (1) when found. + */ + g_tree_insert(mis->page_requested, aligned, (gpointer)1); + qatomic_inc(&mis->page_requested_count); + trace_postcopy_page_req_add(aligned, mis->page_requested_count); + } + mark_postcopy_blocktime_begin(haddr, tid, rb); } } @@ -618,22 +620,22 @@ void migration_incoming_disable_colo(void) migration_colo_enabled = false; } -int migration_incoming_enable_colo(void) +int migration_incoming_enable_colo(Error **errp) { #ifndef CONFIG_REPLICATION - error_report("ENABLE_COLO command come in migration stream, but the " - "replication module is not built in"); + error_setg(errp, "ENABLE_COLO command come in migration stream, but the " + "replication module is not built in"); return -ENOTSUP; #endif if (!migrate_colo()) { - error_report("ENABLE_COLO command come in migration stream, but x-colo " - "capability is not set"); + error_setg(errp, "ENABLE_COLO command come in migration stream" + ", but x-colo capability is not set"); return -EINVAL; } if (ram_block_discard_disable(true)) { - error_report("COLO: cannot disable RAM discard"); + error_setg(errp, "COLO: cannot disable RAM discard"); return -EBUSY; } migration_colo_enabled = true; @@ -876,7 +878,7 @@ process_incoming_migration_co(void *opaque) MIGRATION_STATUS_ACTIVE); mis->loadvm_co = qemu_coroutine_self(); - ret = qemu_loadvm_state(mis->from_src_file); + ret = qemu_loadvm_state(mis->from_src_file, &local_err); mis->loadvm_co = NULL; trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed"); @@ -903,7 +905,8 @@ process_incoming_migration_co(void *opaque) } if (ret < 0) { - error_setg(&local_err, "load of migration failed: %s", strerror(-ret)); + error_prepend(&local_err, "load of migration failed: %s: ", + strerror(-ret)); goto fail; } @@ -930,6 +933,15 @@ fail: } exit(EXIT_FAILURE); + } else { + /* + * Report the error here in case that QEMU abruptly exits + * when postcopy is enabled. 
+ */ + WITH_QEMU_LOCK_GUARD(&s->error_mutex) { + error_report_err(s->error); + s->error = NULL; + } } out: /* Pairs with the refcount taken in qmp_migrate_incoming() */ @@ -946,7 +958,7 @@ static void migration_incoming_setup(QEMUFile *f) assert(!mis->from_src_file); mis->from_src_file = f; - qemu_file_set_blocking(f, false); + qemu_file_set_blocking(f, false, &error_abort); } void migration_incoming_process(void) @@ -966,7 +978,7 @@ static bool postcopy_try_recover(void) /* This should be set already in migration_incoming_setup() */ assert(mis->from_src_file); /* Postcopy has standalone thread to do vm load */ - qemu_file_set_blocking(mis->from_src_file, true); + qemu_file_set_blocking(mis->from_src_file, true, &error_abort); /* Re-configure the return path */ mis->to_src_file = qemu_file_get_return_path(mis->from_src_file); @@ -1630,7 +1642,7 @@ void migration_cancel(void) } /* If the migration is paused, kick it out of the pause */ if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); } while (s->state != MIGRATION_STATUS_CANCELLING); @@ -1660,23 +1672,51 @@ void migration_cancel(void) } } +static int get_modes(MigMode mode, va_list ap); + +static void add_notifiers(NotifierWithReturn *notify, int modes) +{ + for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { + if (modes & BIT(mode)) { + migration_state_notifiers[mode] = + g_slist_prepend(migration_state_notifiers[mode], notify); + } + } +} + +void migration_add_notifier_modes(NotifierWithReturn *notify, + MigrationNotifyFunc func, MigMode mode, ...) +{ + int modes; + va_list ap; + + va_start(ap, mode); + modes = get_modes(mode, ap); + va_end(ap); + + notify->notify = (NotifierWithReturnFunc)func; + add_notifiers(notify, modes); +} + void migration_add_notifier_mode(NotifierWithReturn *notify, MigrationNotifyFunc func, MigMode mode) { - notify->notify = (NotifierWithReturnFunc)func; - notifier_with_return_list_add(&migration_state_notifiers[mode], notify); + migration_add_notifier_modes(notify, func, mode, -1); } void migration_add_notifier(NotifierWithReturn *notify, MigrationNotifyFunc func) { - migration_add_notifier_mode(notify, func, MIG_MODE_NORMAL); + migration_add_notifier_modes(notify, func, MIG_MODE_NORMAL, -1); } void migration_remove_notifier(NotifierWithReturn *notify) { if (notify->notify) { - notifier_with_return_remove(notify); + for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { + migration_blockers[mode] = + g_slist_remove(migration_state_notifiers[mode], notify); + } notify->notify = NULL; } } @@ -1686,18 +1726,29 @@ int migration_call_notifiers(MigrationState *s, MigrationEventType type, { MigMode mode = s->parameters.mode; MigrationEvent e; + NotifierWithReturn *notifier; + GSList *elem, *next; int ret; e.type = type; - ret = notifier_with_return_list_notify(&migration_state_notifiers[mode], - &e, errp); - assert(!ret || type == MIG_EVENT_PRECOPY_SETUP); - return ret; + + for (elem = migration_state_notifiers[mode]; elem; elem = next) { + next = elem->next; + notifier = (NotifierWithReturn *)elem->data; + ret = notifier->notify(notifier, &e, errp); + if (ret) { + assert(type == MIG_EVENT_PRECOPY_SETUP); + return ret; + } + } + + return 0; } bool migration_has_failed(MigrationState *s) { - return (s->state == MIGRATION_STATUS_CANCELLED || + return (s->state == MIGRATION_STATUS_CANCELLING || + s->state == MIGRATION_STATUS_CANCELLED || s->state == MIGRATION_STATUS_FAILED); } 
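migration_add_notifier_modes() above takes a -1-terminated vararg list of modes, but get_modes(), which folds that list into the bitmask consumed by add_notifiers(), is only forward-declared in this hunk. A plausible sketch of its body and of a caller, assuming the usual QEMU helpers (BIT(), assert()) from the qemu/osdep.h environment; the committed body may differ:

    /* Hypothetical body for the forward-declared get_modes(). */
    static int get_modes(MigMode mode, va_list ap)
    {
        int modes = 0;

        /* Accumulate BIT(mode) for each argument until the -1 terminator. */
        while (mode != -1) {
            assert(mode >= 0 && mode < MIG_MODE__MAX);
            modes |= BIT(mode);
            mode = va_arg(ap, int);   /* enums promote to int in varargs */
        }
        return modes;
    }

    /* Illustrative caller: one notifier registered for both CPR flavors. */
    static void my_cpr_init(void)
    {
        static NotifierWithReturn notifier;

        migration_add_notifier_modes(&notifier, my_cpr_notify_func,
                                     MIG_MODE_CPR_TRANSFER,
                                     MIG_MODE_CPR_EXEC, -1);
    }

This is also why cpr_exec_init() in the new cpr-exec.c can keep using the single-mode wrapper: migration_add_notifier_mode() now simply forwards to the vararg version with an immediate -1 terminator.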
@@ -1757,7 +1808,8 @@ bool migrate_mode_is_cpr(MigrationState *s) { MigMode mode = s->parameters.mode; return mode == MIG_MODE_CPR_REBOOT || - mode == MIG_MODE_CPR_TRANSFER; + mode == MIG_MODE_CPR_TRANSFER || + mode == MIG_MODE_CPR_EXEC; } int migrate_init(MigrationState *s, Error **errp) @@ -2106,6 +2158,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) return false; } + if (migrate_mode() == MIG_MODE_CPR_EXEC && + !s->parameters.has_cpr_exec_command) { + error_setg(errp, "cpr-exec mode requires setting cpr-exec-command"); + return false; + } + if (migration_is_blocked(errp)) { return false; } @@ -2342,7 +2400,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) MigrationStatus_str(s->state)); return; } - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } int migration_rp_wait(MigrationState *s) @@ -2641,12 +2699,9 @@ out: return NULL; } -static int open_return_path_on_source(MigrationState *ms) +static void open_return_path_on_source(MigrationState *ms) { ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file); - if (!ms->rp_state.from_dst_file) { - return -1; - } trace_open_return_path_on_source(); @@ -2655,8 +2710,6 @@ static int open_return_path_on_source(MigrationState *ms) ms->rp_state.rp_thread_created = true; trace_open_return_path_on_source_continue(); - - return 0; } /* Return true if error detected, or false otherwise */ @@ -2867,8 +2920,9 @@ static int postcopy_start(MigrationState *ms, Error **errp) fail_closefb: qemu_fclose(fb); fail: - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); + if (ms->state != MIGRATION_STATUS_CANCELLING) { + migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); + } migration_block_activate(NULL); migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL); bql_unlock(); @@ -2911,21 +2965,18 @@ static bool migration_switchover_prepare(MigrationState *s) return true; } - /* Since leaving this state is not atomic with posting the semaphore + /* + * Since leaving this state is not atomic with setting the event * it's possible that someone could have issued multiple migrate_continue - * and the semaphore is incorrectly positive at this point; - * the docs say it's undefined to reinit a semaphore that's already - * init'd, so use timedwait to eat up any existing posts. + * and the event is incorrectly set at this point so reset it. 
*/ - while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) { - /* This block intentionally left blank */ - } + qemu_event_reset(&s->pause_event); /* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER); bql_unlock(); - qemu_sem_wait(&s->pause_sem); + qemu_event_wait(&s->pause_event); bql_lock(); /* @@ -3439,33 +3490,60 @@ static MigIterateState migration_iteration_run(MigrationState *s) Error *local_err = NULL; bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; bool can_switchover = migration_can_switchover(s); + bool complete_ready; + /* Fast path - get the estimated amount of pending data */ qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy); pending_size = must_precopy + can_postcopy; trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy); - if (pending_size < s->threshold_size) { - qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy); - pending_size = must_precopy + can_postcopy; - trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy); + if (in_postcopy) { + /* + * Iterate in postcopy until all pending data flushed. Note that + * postcopy completion doesn't rely on can_switchover, because when + * POSTCOPY_ACTIVE it means switchover already happened. + */ + complete_ready = !pending_size; + } else { + /* + * Exact pending reporting is only needed for precopy. Taking RAM + * as example, there'll be no extra dirty information after + * postcopy started, so ESTIMATE should always match with EXACT + * during postcopy phase. + */ + if (pending_size < s->threshold_size) { + qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy); + pending_size = must_precopy + can_postcopy; + trace_migrate_pending_exact(pending_size, must_precopy, + can_postcopy); + } + + /* Should we switch to postcopy now? 
*/ + if (must_precopy <= s->threshold_size && + can_switchover && qatomic_read(&s->start_postcopy)) { + if (postcopy_start(s, &local_err)) { + migrate_set_error(s, local_err); + error_report_err(local_err); + } + return MIG_ITERATE_SKIP; + } + + /* + * For precopy, migration can complete only if: + * + * (1) Switchover is acknowledged by destination + * (2) Pending size is no more than the threshold specified + * (which was calculated from expected downtime) + */ + complete_ready = can_switchover && (pending_size <= s->threshold_size); } - if ((!pending_size || pending_size < s->threshold_size) && can_switchover) { + if (complete_ready) { trace_migration_thread_low_pending(pending_size); migration_completion(s); return MIG_ITERATE_BREAK; } - /* Still a significant amount to transfer */ - if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover && - qatomic_read(&s->start_postcopy)) { - if (postcopy_start(s, &local_err)) { - migrate_set_error(s, local_err); - error_report_err(local_err); - } - return MIG_ITERATE_SKIP; - } - /* Just another iteration step */ qemu_savevm_state_iterate(s->to_dst_file, in_postcopy); return MIG_ITERATE_RESUME; @@ -3890,9 +3968,8 @@ static void *bg_migration_thread(void *opaque) while (migration_is_active()) { MigIterateState iter_state = bg_migration_iteration_run(s); - if (iter_state == MIG_ITERATE_SKIP) { - continue; - } else if (iter_state == MIG_ITERATE_BREAK) { + + if (iter_state == MIG_ITERATE_BREAK) { break; } @@ -3974,7 +4051,9 @@ void migration_connect(MigrationState *s, Error *error_in) } migration_rate_set(rate_limit); - qemu_file_set_blocking(s->to_dst_file, true); + if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) { + goto fail; + } /* * Open the return path. For postcopy, it is used exclusively. For @@ -3982,10 +4061,7 @@ void migration_connect(MigrationState *s, Error *error_in) * QEMU uses the return path. */ if (migrate_postcopy_ram() || migrate_return_path()) { - if (open_return_path_on_source(s)) { - error_setg(&local_err, "Unable to open return-path for postcopy"); - goto fail; - } + open_return_path_on_source(s); } /* @@ -4057,7 +4133,7 @@ static void migration_instance_finalize(Object *obj) qemu_mutex_destroy(&ms->qemu_file_lock); qemu_sem_destroy(&ms->wait_unplug_sem); qemu_sem_destroy(&ms->rate_limit_sem); - qemu_sem_destroy(&ms->pause_sem); + qemu_event_destroy(&ms->pause_event); qemu_sem_destroy(&ms->postcopy_pause_sem); qemu_sem_destroy(&ms->rp_state.rp_sem); qemu_sem_destroy(&ms->rp_state.rp_pong_acks); @@ -4072,7 +4148,7 @@ static void migration_instance_init(Object *obj) ms->state = MIGRATION_STATUS_NONE; ms->mbps = -1; ms->pages_per_second = -1; - qemu_sem_init(&ms->pause_sem, 0); + qemu_event_init(&ms->pause_event, false); qemu_mutex_init(&ms->error_mutex); migrate_params_init(&ms->parameters); diff --git a/migration/migration.h b/migration/migration.h index d53f7ca..01329bf 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -98,9 +98,9 @@ struct MigrationIncomingState { void (*transport_cleanup)(void *data); /* * Used to sync thread creations. Note that we can't create threads in - * parallel with this sem. + * parallel with this event. */ - QemuSemaphore thread_sync_sem; + QemuEvent thread_sync_event; /* * Free at the start of the main state load, set as the main thread finishes * loading state. 
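A recurring theme in these migration.c/migration.h hunks is replacing binary-semaphore signalling with QemuEvent (pause_sem to pause_event, colo_exit_sem to colo_exit_event, thread_sync_sem to thread_sync_event). The practical difference: qemu_event_set() is idempotent, so repeated migrate_continue commands no longer accumulate semaphore counts that had to be drained with the old qemu_sem_timedwait() loop; one reset before waiting discards any stale wakeup. Condensed from the calls the diff already uses, no new API assumed:

    /* One-time setup (migration_instance_init): start with the event clear. */
    qemu_event_init(&s->pause_event, false);

    /* Migration thread, entering pause-before-switchover: */
    qemu_event_reset(&s->pause_event);  /* discard any stale migrate_continue */
    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER);
    bql_unlock();
    qemu_event_wait(&s->pause_event);   /* sleep until continued or cancelled */
    bql_lock();

    /* qmp_migrate_continue() and migration_cancel() both wake it with: */
    qemu_event_set(&s->pause_event);    /* idempotent; double-set is harmless */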
@@ -186,7 +186,7 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *colo_incoming_co; - QemuSemaphore colo_incoming_sem; + QemuEvent colo_incoming_event; /* Optional load threads pool and its thread exit request flag */ ThreadPool *load_threads; @@ -379,10 +379,10 @@ struct MigrationState { QemuSemaphore wait_unplug_sem; /* Migration is paused due to pause-before-switchover */ - QemuSemaphore pause_sem; + QemuEvent pause_event; - /* The semaphore is used to notify COLO thread that failover is finished */ - QemuSemaphore colo_exit_sem; + /* The event is used to notify COLO thread that failover is finished */ + QemuEvent colo_exit_event; /* The event is used to notify COLO thread to do checkpoint */ QemuEvent colo_checkpoint_event; @@ -546,7 +546,7 @@ void migrate_send_rp_shut(MigrationIncomingState *mis, void migrate_send_rp_pong(MigrationIncomingState *mis, uint32_t value); int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb, - ram_addr_t start, uint64_t haddr); + ram_addr_t start, uint64_t haddr, uint32_t tid); int migrate_send_rp_message_req_pages(MigrationIncomingState *mis, RAMBlock *rb, ram_addr_t start); void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis, diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c index 94222d0..fce64f0 100644 --- a/migration/multifd-device-state.c +++ b/migration/multifd-device-state.c @@ -131,7 +131,7 @@ bool multifd_device_state_supported(void) static void multifd_device_state_save_thread_data_free(void *opaque) { - SaveLiveCompletePrecopyThreadData *data = opaque; + SaveCompletePrecopyThreadData *data = opaque; g_clear_pointer(&data->idstr, g_free); g_free(data); @@ -139,7 +139,7 @@ static void multifd_device_state_save_thread_data_free(void *opaque) static int multifd_device_state_save_thread(void *opaque) { - SaveLiveCompletePrecopyThreadData *data = opaque; + SaveCompletePrecopyThreadData *data = opaque; g_autoptr(Error) local_err = NULL; if (!data->hdlr(data, &local_err)) { @@ -170,18 +170,18 @@ bool multifd_device_state_save_thread_should_exit(void) } void -multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, +multifd_spawn_device_state_save_thread(SaveCompletePrecopyThreadHandler hdlr, char *idstr, uint32_t instance_id, void *opaque) { - SaveLiveCompletePrecopyThreadData *data; + SaveCompletePrecopyThreadData *data; assert(multifd_device_state_supported()); assert(multifd_send_device_state); assert(!qatomic_read(&multifd_send_device_state->threads_abort)); - data = g_new(SaveLiveCompletePrecopyThreadData, 1); + data = g_new(SaveCompletePrecopyThreadData, 1); data->hdlr = hdlr; data->idstr = g_strdup(idstr); data->instance_id = instance_id; diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 88fe0f9..b48eae3 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -17,6 +17,7 @@ #include "migration-stats.h" #include "multifd.h" #include "options.h" +#include "migration.h" #include "qapi/error.h" #include "qemu/cutils.h" #include "qemu/error-report.h" @@ -398,7 +399,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f) MultiFDSyncReq req; int ret; - if (!migrate_multifd()) { + if (!migrate_multifd() || migration_in_postcopy()) { return 0; } diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index dbc1184..4cde868 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -85,9 +85,27 @@ void 
multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { + bool received = + ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]); + + /* + * During multifd migration zero page is written to the memory + * only if it is migrated more than once. + * + * It becomes a problem when both multifd & postcopy options are + * enabled. If the zero page which was skipped during multifd phase, + * is accessed during the postcopy phase of the migration, a page + * fault occurs. But this page fault is not served because the + * 'receivedmap' says the zero page is already received. Thus the + * thread accessing that page may hang. + * + * When postcopy is enabled, always write the zero page as and when + * it is migrated. + */ + if (migrate_postcopy_ram() || received) { memset(page, 0, multifd_ram_page_size()); - } else { + } + if (!received) { ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } diff --git a/migration/multifd.c b/migration/multifd.c index ec108af..98873ce 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -439,6 +439,39 @@ static void multifd_send_set_error(Error *err) } } +/* + * Gracefully shutdown IOChannels. Only needed for successful migrations on + * top of TLS channels. Otherwise it is same to qio_channel_shutdown(). + * + * A successful migration also guarantees multifd sender threads are + * properly flushed and halted. It is only safe to send BYE in the + * migration thread here when we know there's no other thread writting to + * the channel, because GnuTLS doesn't support concurrent writers. + */ +static void migration_ioc_shutdown_gracefully(QIOChannel *ioc) +{ + g_autoptr(Error) local_err = NULL; + + if (!migration_has_failed(migrate_get_current()) && + object_dynamic_cast((Object *)ioc, TYPE_QIO_CHANNEL_TLS)) { + + /* + * The destination expects the TLS session to always be properly + * terminated. This helps to detect a premature termination in the + * middle of the stream. Note that older QEMUs always break the + * connection on the source and the destination always sees + * GNUTLS_E_PREMATURE_TERMINATION. + */ + migration_tls_channel_end(ioc, &local_err); + if (local_err) { + warn_report("Failed to gracefully terminate TLS connection: %s", + error_get_pretty(local_err)); + } + } + + qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); +} + static void multifd_send_terminate_threads(void) { int i; @@ -460,7 +493,7 @@ static void multifd_send_terminate_threads(void) qemu_sem_post(&p->sem); if (p->c) { - qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + migration_ioc_shutdown_gracefully(p->c); } } @@ -547,36 +580,6 @@ void multifd_send_shutdown(void) return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - - /* thread_created implies the TLS handshake has succeeded */ - if (p->tls_thread_created && p->thread_created) { - Error *local_err = NULL; - /* - * The destination expects the TLS session to always be - * properly terminated. This helps to detect a premature - * termination in the middle of the stream. Note that - * older QEMUs always break the connection on the source - * and the destination always sees - * GNUTLS_E_PREMATURE_TERMINATION. - */ - migration_tls_channel_end(p->c, &local_err); - - /* - * The above can return an error in case the migration has - * already failed. 
If the migration succeeded, errors are - * not expected but there's no need to kill the source. - */ - if (local_err && !migration_has_failed(migrate_get_current())) { - warn_report( - "multifd_send_%d: Failed to terminate TLS connection: %s", - p->id, error_get_pretty(local_err)); - break; - } - } - } - multifd_send_terminate_threads(); for (i = 0; i < migrate_multifd_channels(); i++) { @@ -690,6 +693,7 @@ static void *multifd_send_thread(void *opaque) if (qatomic_load_acquire(&p->pending_job)) { bool is_device_state = multifd_payload_device_state(p->data); size_t total_size; + int write_flags_masked = 0; p->flags = 0; p->iovs_num = 0; @@ -697,6 +701,9 @@ static void *multifd_send_thread(void *opaque) if (is_device_state) { multifd_device_state_send_prepare(p); + + /* Device state packets cannot be sent via zerocopy */ + write_flags_masked |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } else { ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { @@ -718,7 +725,8 @@ static void *multifd_send_thread(void *opaque) &p->data->u.ram, &local_err); } else { ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, - NULL, 0, p->write_flags, + NULL, 0, + p->write_flags & ~write_flags_masked, &local_err); } @@ -1379,6 +1387,13 @@ static void *multifd_recv_thread(void *opaque) } if (has_data) { + /* + * multifd thread should not be active and receive data + * when migration is in the Postcopy phase. Two threads + * writing the same memory area could easily corrupt + * the guest state. + */ + assert(!migration_in_postcopy()); if (is_device_state) { assert(use_packets); ret = multifd_device_state_recv(p, &local_err); diff --git a/migration/options.c b/migration/options.c index b6ae953..5183112 100644 --- a/migration/options.c +++ b/migration/options.c @@ -187,6 +187,8 @@ const Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM), DEFINE_PROP_MIG_CAP("x-postcopy-preempt", MIGRATION_CAPABILITY_POSTCOPY_PREEMPT), + DEFINE_PROP_MIG_CAP("postcopy-blocktime", + MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME), DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO), DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM), DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), @@ -509,11 +511,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) error_setg(errp, "Postcopy is not compatible with ignore-shared"); return false; } - - if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - error_setg(errp, "Postcopy is not yet compatible with multifd"); - return false; - } } if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { @@ -573,7 +570,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) return false; } - if (migrate_incoming_started()) { + if (!migrate_postcopy_preempt() && migrate_incoming_started()) { error_setg(errp, "Postcopy preempt must be set before incoming starts"); return false; @@ -581,7 +578,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - if (migrate_incoming_started()) { + if (!migrate_multifd() && migrate_incoming_started()) { error_setg(errp, "Multifd must be set before incoming starts"); return false; } @@ -962,6 +959,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->zero_page_detection = s->parameters.zero_page_detection; params->has_direct_io = true; params->direct_io = s->parameters.direct_io; + params->has_cpr_exec_command = true; + 
params->cpr_exec_command = QAPI_CLONE(strList, + s->parameters.cpr_exec_command); return params; } @@ -996,6 +996,7 @@ void migrate_params_init(MigrationParameters *params) params->has_mode = true; params->has_zero_page_detection = true; params->has_direct_io = true; + params->has_cpr_exec_command = true; } /* @@ -1300,6 +1301,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_direct_io) { dest->direct_io = params->direct_io; } + + if (params->has_cpr_exec_command) { + dest->cpr_exec_command = params->cpr_exec_command; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1432,6 +1437,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_direct_io) { s->parameters.direct_io = params->direct_io; } + + if (params->has_cpr_exec_command) { + qapi_free_strList(s->parameters.cpr_exec_command); + s->parameters.cpr_exec_command = + QAPI_CLONE(strList, params->cpr_exec_command); + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 995614b..5471efb 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis, QemuThread *thread, const char *name, void *(*fn)(void *), int joinable) { - qemu_sem_init(&mis->thread_sync_sem, 0); + qemu_event_init(&mis->thread_sync_event, false); qemu_thread_create(thread, name, fn, mis, joinable); - qemu_sem_wait(&mis->thread_sync_sem); - qemu_sem_destroy(&mis->thread_sync_sem); + qemu_event_wait(&mis->thread_sync_event); + qemu_event_destroy(&mis->thread_sync_event); } /* Postcopy needs to detect accesses to pages that haven't yet been copied @@ -110,19 +110,104 @@ void postcopy_thread_create(MigrationIncomingState *mis, #include <sys/eventfd.h> #include <linux/userfaultfd.h> +/* + * Here we use 24 buckets, which means the last bucket will cover [2^24 us, + * 2^25 us) ~= [16, 32) seconds. It should be far enough to record even + * extreme (perf-wise broken) 1G pages moving over, which can sometimes + * take a few seconds due to various reasons. Anything more than that + * might be unsensible to account anymore. + */ +#define BLOCKTIME_LATENCY_BUCKET_N (24) + +/* All the time records are in unit of nanoseconds */ typedef struct PostcopyBlocktimeContext { - /* time when page fault initiated per vCPU */ - uint32_t *page_fault_vcpu_time; - /* page address per vCPU */ - uintptr_t *vcpu_addr; - uint32_t total_blocktime; /* blocktime per vCPU */ - uint32_t *vcpu_blocktime; + uint64_t *vcpu_blocktime_total; + /* count of faults per vCPU */ + uint64_t *vcpu_faults_count; + /* + * count of currently blocked faults per vCPU. + * + * NOTE: Normally there should only be one fault in-progress per vCPU + * thread, so logically it _seems_ vcpu_faults_count[] for any vCPU + * should be either zero or one. However, there can be reasons we see + * >1 faults on the same vCPU thread. + * + * CASE (1): since the process to resolve faults (ioctl(UFFDIO_COPY), + * for example) is done before taking the mutex that protects the + * blocktime context, it can happen that we read more than one faulted + * addresses per vCPU. 
+ * + * One example when we can see >1 faulted addresses for one vCPU: + * + * vcpu1 thread fault thread resolve thread + * ============ ============ ============== + * + * faulted on addr1 + * read uffd msg (addr1) + * MUTEX_LOCK + * add entry (cpu1, addr1) + * MUTEX_UNLOCK + * request remote fault (addr1) + * resolve fault (addr1) + * addr1 resolved, continue.. + * faulted on addr2 + * read uffd msg (addr2) + * MUTEX_LOCK + * add entry (cpu1, addr2) <--------------- [A] + * MUTEX_UNLOCK + * MUTEX_LOCK + * remove entry (cpu1, addr1) + * MUTEX_UNLOCK + * + * In above case, we may see (cpu1, addr1) and (cpu1, addr2) entries to + * appear together at [A], when it gets the lock before the resolve + * thread. Use this counter to maintain such case, and only when it + * reaches zero we know the vCPU is not blocked anymore. + * + * CASE (2): theoretically (the author admit to not have verified + * this..), one vCPU thread can also generate more than one userfaultfd + * message on the same address. It can happen e.g. for whatever reason + * the fault got retried before a resolution arrives. In that extremely + * rare case, we could also see two (cpu1, addr1) entries. + * + * In all cases, be prepared with such re-entrancies with this array. + * + * Using uint8_t should be far enough for now. For example, when + * there're only one resolve thread (postcopy ram listening thread), + * the max (concurrent fault entries) should be two. + */ + uint8_t *vcpu_faults_current; + /* + * The hash that contains addr1->[(cpu1,ts1),(cpu2,ts2) ...] mappings. + * Each of the entry is a tuple of (CPU index, fault timestamp) showing + * that a fault was requested. + */ + GHashTable *vcpu_addr_hash; + /* + * Each bucket stores the count of faults that were resolved within the + * bucket window [2^N us, 2^(N+1) us). + */ + uint64_t latency_buckets[BLOCKTIME_LATENCY_BUCKET_N]; + /* total blocktime when all vCPUs are stopped */ + uint64_t total_blocktime; /* point in time when last page fault was initiated */ - uint32_t last_begin; + uint64_t last_begin; /* number of vCPU are suspended */ int smp_cpus_down; - uint64_t start_time; + + /* + * Fast path for looking up vcpu_index from tid. NOTE: this result + * only reflects the vcpu setup when postcopy is running. It may not + * always match with the current vcpu setup because vcpus can be hot + * attached/detached after migration completes. However this should be + * stable when blocktime is using the structure. + */ + GHashTable *tid_to_vcpu_hash; + /* Count of non-vCPU faults. This is only for debugging purpose. */ + uint64_t non_vcpu_faults; + /* total blocktime when a non-vCPU thread is stopped */ + uint64_t non_vcpu_blocktime_total; /* * Handler for exit event, necessary for @@ -131,11 +216,41 @@ typedef struct PostcopyBlocktimeContext { Notifier exit_notifier; } PostcopyBlocktimeContext; +typedef struct { + /* The time the fault was triggered */ + uint64_t fault_time; + /* + * The vCPU index that was blocked, when cpu==-1, it means it's a + * fault from non-vCPU threads. 
+ */ + int cpu; +} BlocktimeVCPUEntry; + +/* Alloc an entry to record a vCPU fault */ +static BlocktimeVCPUEntry * +blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time) +{ + BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1); + + entry->fault_time = fault_time; + entry->cpu = cpu; + + return entry; +} + +/* Free a @GList of @BlocktimeVCPUEntry */ +static void blocktime_vcpu_list_free(gpointer data) +{ + g_list_free_full(data, g_free); +} + static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx) { - g_free(ctx->page_fault_vcpu_time); - g_free(ctx->vcpu_addr); - g_free(ctx->vcpu_blocktime); + g_hash_table_destroy(ctx->tid_to_vcpu_hash); + g_hash_table_destroy(ctx->vcpu_addr_hash); + g_free(ctx->vcpu_blocktime_total); + g_free(ctx->vcpu_faults_count); + g_free(ctx->vcpu_faults_current); g_free(ctx); } @@ -146,32 +261,65 @@ static void migration_exit_cb(Notifier *n, void *data) destroy_blocktime_context(ctx); } +static GHashTable *blocktime_init_tid_to_vcpu_hash(void) +{ + /* + * TID as an unsigned int can be directly used as the key. However, + * CPU index can NOT be directly used as value, because CPU index can + * be 0, which means NULL. Then when lookup we can never know whether + * it's 0 or "not found". Hence use an indirection for CPU index. + */ + GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, g_free); + CPUState *cpu; + + /* + * Initialize the tid->cpu_id mapping for lookups. The caller needs to + * make sure when reaching here the CPU topology is frozen and will be + * stable for the whole blocktime trapping period. + */ + CPU_FOREACH(cpu) { + int *value = g_new(int, 1); + + *value = cpu->cpu_index; + g_hash_table_insert(table, + GUINT_TO_POINTER((uint32_t)cpu->thread_id), + value); + trace_postcopy_blocktime_tid_cpu_map(cpu->cpu_index, cpu->thread_id); + } + + return table; +} + static struct PostcopyBlocktimeContext *blocktime_context_new(void) { MachineState *ms = MACHINE(qdev_get_machine()); unsigned int smp_cpus = ms->smp.cpus; PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1); - ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus); - ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus); - ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus); - ctx->exit_notifier.notify = migration_exit_cb; - ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_add_exit_notifier(&ctx->exit_notifier); - return ctx; -} + /* Initialize all counters to be zeros */ + memset(ctx->latency_buckets, 0, sizeof(ctx->latency_buckets)); -static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx) -{ - MachineState *ms = MACHINE(qdev_get_machine()); - uint32List *list = NULL; - int i; + ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus); + ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus); + ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus); + ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash(); - for (i = ms->smp.cpus - 1; i >= 0; i--) { - QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]); - } + /* + * The key (host virtual addresses) will always be gpointer-sized on + * either 32bits or 64bits systems, so it'll fit as a direct key. + * + * The value will be a list of BlocktimeVCPUEntry entries. 
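The comment in blocktime_init_tid_to_vcpu_hash explains why the CPU index is boxed in a heap allocation instead of being stored as a direct pointer: vCPU 0 stored directly would be indistinguishable from a failed lookup. A compile-and-run illustration of the pitfall and the indirection (assuming glib is available; build with `pkg-config --cflags --libs glib-2.0`):

#include <glib.h>
#include <stdio.h>

int main(void)
{
    /*
     * Key: thread id as a direct pointer. Value: a boxed int, because a
     * CPU index of 0 squeezed into the pointer would equal NULL, i.e.
     * "not found". The g_free destroy notifier releases the boxes.
     */
    GHashTable *t = g_hash_table_new_full(g_direct_hash, g_direct_equal,
                                          NULL, g_free);
    int *cpu = g_new(int, 1);

    *cpu = 0;                                   /* vCPU 0: the awkward case */
    g_hash_table_insert(t, GUINT_TO_POINTER(1234u), cpu);

    int *found = g_hash_table_lookup(t, GUINT_TO_POINTER(1234u));
    printf("tid 1234 -> %d\n", found ? *found : -1);   /* prints 0, not -1 */

    g_hash_table_destroy(t);
    return 0;
}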
+ */ + ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash, + g_direct_equal, + NULL, + blocktime_vcpu_list_free); + + ctx->exit_notifier.notify = migration_exit_cb; + qemu_add_exit_notifier(&ctx->exit_notifier); - return list; + return ctx; } /* @@ -185,18 +333,64 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info) { MigrationIncomingState *mis = migration_incoming_get_current(); PostcopyBlocktimeContext *bc = mis->blocktime_ctx; + MachineState *ms = MACHINE(qdev_get_machine()); + uint64_t latency_total = 0, faults = 0; + uint32List *list_blocktime = NULL; + uint64List *list_latency = NULL; + uint64List *latency_buckets = NULL; + int i; if (!bc) { return; } + for (i = ms->smp.cpus - 1; i >= 0; i--) { + uint64_t latency, total, count; + + /* Convert ns -> ms */ + QAPI_LIST_PREPEND(list_blocktime, + (uint32_t)(bc->vcpu_blocktime_total[i] / SCALE_MS)); + + /* The rest in nanoseconds */ + total = bc->vcpu_blocktime_total[i]; + latency_total += total; + count = bc->vcpu_faults_count[i]; + faults += count; + + if (count) { + latency = total / count; + } else { + /* No fault detected */ + latency = 0; + } + + QAPI_LIST_PREPEND(list_latency, latency); + } + + for (i = BLOCKTIME_LATENCY_BUCKET_N - 1; i >= 0; i--) { + QAPI_LIST_PREPEND(latency_buckets, bc->latency_buckets[i]); + } + + latency_total += bc->non_vcpu_blocktime_total; + faults += bc->non_vcpu_faults; + + info->has_postcopy_non_vcpu_latency = true; + info->postcopy_non_vcpu_latency = bc->non_vcpu_faults ? + (bc->non_vcpu_blocktime_total / bc->non_vcpu_faults) : 0; info->has_postcopy_blocktime = true; - info->postcopy_blocktime = bc->total_blocktime; + /* Convert ns -> ms */ + info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS); info->has_postcopy_vcpu_blocktime = true; - info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc); + info->postcopy_vcpu_blocktime = list_blocktime; + info->has_postcopy_latency = true; + info->postcopy_latency = faults ? (latency_total / faults) : 0; + info->has_postcopy_vcpu_latency = true; + info->postcopy_vcpu_latency = list_latency; + info->has_postcopy_latency_dist = true; + info->postcopy_latency_dist = latency_buckets; } -static uint32_t get_postcopy_total_blocktime(void) +static uint64_t get_postcopy_total_blocktime(void) { MigrationIncomingState *mis = migration_incoming_get_current(); PostcopyBlocktimeContext *bc = mis->blocktime_ctx; @@ -300,13 +494,13 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis, } #ifdef UFFD_FEATURE_THREAD_ID + /* + * Postcopy blocktime conditionally needs THREAD_ID feature (introduced + * to Linux in 2017). Always try to enable it when QEMU is compiled + * with such environment. 
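This hunk makes QEMU request UFFD_FEATURE_THREAD_ID whenever the build headers define it, since blocktime needs the faulting thread id. A rough probe of that handshake on Linux (sketch only; a real user would re-open the fd to enable features after probing, and unprivileged use may be restricted by the vm.unprivileged_userfaultfd sysctl):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    int ufd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (ufd < 0) {
        perror("userfaultfd");
        return 1;
    }

    /* Probe: request no features; the kernel reports what it supports. */
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };

    if (ioctl(ufd, UFFDIO_API, &api) < 0) {
        perror("UFFDIO_API");
        return 1;
    }
#ifdef UFFD_FEATURE_THREAD_ID
    printf("thread-id faults %s\n",
           (api.features & UFFD_FEATURE_THREAD_ID) ? "available" : "absent");
#endif
    close(ufd);
    return 0;
}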
+ */ if (UFFD_FEATURE_THREAD_ID & supported_features) { asked_features |= UFFD_FEATURE_THREAD_ID; - if (migrate_postcopy_blocktime()) { - if (!mis->blocktime_ctx) { - mis->blocktime_ctx = blocktime_context_new(); - } - } } #endif @@ -487,6 +681,7 @@ out: */ static int init_range(RAMBlock *rb, void *opaque) { + Error **errp = opaque; const char *block_name = qemu_ram_get_idstr(rb); void *host_addr = qemu_ram_get_host_addr(rb); ram_addr_t offset = qemu_ram_get_offset(rb); @@ -507,6 +702,8 @@ static int init_range(RAMBlock *rb, void *opaque) * (Precopy will just overwrite this data, so doesn't need the discard) */ if (ram_discard_range(block_name, 0, length)) { + error_setg(errp, "failed to discard RAM block %s len=%zu", + block_name, length); return -1; } @@ -555,9 +752,9 @@ static int cleanup_range(RAMBlock *rb, void *opaque) * postcopy later; must be called prior to any precopy. * called from arch_init's similarly named ram_postcopy_incoming_init */ -int postcopy_ram_incoming_init(MigrationIncomingState *mis) +int postcopy_ram_incoming_init(MigrationIncomingState *mis, Error **errp) { - if (foreach_not_ignored_block(init_range, NULL)) { + if (foreach_not_ignored_block(init_range, errp)) { return -1; } @@ -752,8 +949,12 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd, pagesize); } +/* + * NOTE: @tid is only used when postcopy-blocktime feature is enabled, and + * also optional: when zero is provided, the fault accounting will be ignored. + */ static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb, - ram_addr_t start, uint64_t haddr) + ram_addr_t start, uint64_t haddr, uint32_t tid) { void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb)); @@ -772,7 +973,7 @@ static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb, return received ? 0 : postcopy_place_page_zero(mis, aligned, rb); } - return migrate_send_rp_req_pages(mis, rb, start, haddr); + return migrate_send_rp_req_pages(mis, rb, start, haddr, tid); } /* @@ -793,83 +994,204 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, qemu_ram_get_idstr(rb), rb_offset); return postcopy_wake_shared(pcfd, client_addr, rb); } - postcopy_request_page(mis, rb, aligned_rbo, client_addr); + /* TODO: support blocktime tracking */ + postcopy_request_page(mis, rb, aligned_rbo, client_addr, 0); return 0; } -static int get_mem_fault_cpu_index(uint32_t pid) +static int blocktime_get_vcpu(PostcopyBlocktimeContext *ctx, uint32_t tid) { - CPUState *cpu_iter; + int *found; - CPU_FOREACH(cpu_iter) { - if (cpu_iter->thread_id == pid) { - trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid); - return cpu_iter->cpu_index; - } + found = g_hash_table_lookup(ctx->tid_to_vcpu_hash, GUINT_TO_POINTER(tid)); + if (!found) { + /* + * NOTE: this is possible, because QEMU's non-vCPU threads can + * also access a missing page. Or, when KVM async pf is enabled, a + * fault can even happen from a kworker.. + */ + return -1; } - trace_get_mem_fault_cpu_index(-1, pid); - return -1; + + return *found; } -static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc) +static uint64_t get_current_ns(void) { - int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - dc->start_time; - return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX; + return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +} + +/* + * Inject an (cpu, fault_time) entry into the database, using addr as key. + * When cpu==-1, it means it's a non-vCPU fault. 
+ */ +static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx, + uintptr_t addr, int cpu, uint64_t time) +{ + BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time); + GHashTable *table = ctx->vcpu_addr_hash; + gpointer key = (gpointer)addr; + GList *head, *list; + gboolean result; + + head = g_hash_table_lookup(table, key); + if (head) { + /* + * If existed, steal the @head for list operation rather than + * freeing it, making sure steal succeeded. + */ + result = g_hash_table_steal(table, key); + assert(result == TRUE); + } + + /* + * Now the key is guaranteed to be absent. Two cases: + * + * (1) There's no existing entry, list contains the only one. Insert. + * (2) There're existing entries, after stealing we own it, prepend the + * result and re-insert. + */ + list = g_list_prepend(head, entry); + g_hash_table_insert(table, key, list); + + trace_postcopy_blocktime_begin(addr, time, cpu, !!head); } /* - * This function is being called when pagefault occurs. It - * tracks down vCPU blocking time. + * This function is being called when pagefault occurs. It tracks down vCPU + * blocking time. It's protected by @page_request_mutex. * * @addr: faulted host virtual address * @ptid: faulted process thread id * @rb: ramblock appropriate to addr */ -static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, - RAMBlock *rb) +void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, + RAMBlock *rb) { - int cpu, already_received; + int cpu; MigrationIncomingState *mis = migration_incoming_get_current(); PostcopyBlocktimeContext *dc = mis->blocktime_ctx; - uint32_t low_time_offset; + uint64_t current; if (!dc || ptid == 0) { return; } - cpu = get_mem_fault_cpu_index(ptid); - if (cpu < 0) { - return; + + /* + * The caller should only inject a blocktime entry when the page is + * yet missing. + */ + assert(!ramblock_recv_bitmap_test(rb, (void *)addr)); + + current = get_current_ns(); + cpu = blocktime_get_vcpu(dc, ptid); + + if (cpu >= 0) { + /* How many faults on this vCPU in total? */ + dc->vcpu_faults_count[cpu]++; + + /* + * Account how many concurrent faults on this vCPU we trapped. See + * comments above vcpu_faults_current[] on why it can be more than one. + */ + if (dc->vcpu_faults_current[cpu]++ == 0) { + dc->smp_cpus_down++; + /* + * We use last_begin to cover (1) the 1st fault on this specific + * vCPU, but meanwhile (2) the last vCPU that got blocked. It's + * only used to calculate system-wide blocktime. + */ + dc->last_begin = current; + } + + /* Making sure it won't overflow - it really should never! */ + assert(dc->vcpu_faults_current[cpu] <= 255); + } else { + /* + * For non-vCPU thread faults, we don't care about tid or cpu index + * or time the thread is blocked (e.g., a kworker trying to help + * KVM when async_pf=on is OK to be blocked and not affect guest + * responsiveness), but we care about latency. Track it with + * cpu=-1. + * + * Note that this will NOT affect blocktime reports on vCPU being + * blocked, but only about system-wide latency reports. + */ + dc->non_vcpu_faults++; } - low_time_offset = get_low_time_offset(dc); - if (dc->vcpu_addr[cpu] == 0) { - qatomic_inc(&dc->smp_cpus_down); + blocktime_fault_inject(dc, addr, cpu, current); +} + +static void blocktime_latency_account(PostcopyBlocktimeContext *ctx, + uint64_t time_us) +{ + /* + * Convert time (in us) to bucket index it belongs. Take extra caution + * of time_us==0 even if normally rare - when happens put into bucket 0. + */ + int index = time_us ? 
(63 - clz64(time_us)) : 0; + + assert(index >= 0); + + /* If it's too large, put into top bucket */ + if (index >= BLOCKTIME_LATENCY_BUCKET_N) { + index = BLOCKTIME_LATENCY_BUCKET_N - 1; } - qatomic_xchg(&dc->last_begin, low_time_offset); - qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset); - qatomic_xchg(&dc->vcpu_addr[cpu], addr); + ctx->latency_buckets[index]++; +} + +typedef struct { + PostcopyBlocktimeContext *ctx; + uint64_t current; + int affected_cpus; + int affected_non_cpus; +} BlockTimeVCPUIter; + +static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data) +{ + BlockTimeVCPUIter *iter = user_data; + PostcopyBlocktimeContext *ctx = iter->ctx; + BlocktimeVCPUEntry *entry = data; + uint64_t time_passed; + int cpu = entry->cpu; /* - * check it here, not at the beginning of the function, - * due to, check could occur early than bitmap_set in - * qemu_ufd_copy_ioctl + * Time should never go back.. so when the fault is resolved it must be + * later than when it was faulted. */ - already_received = ramblock_recv_bitmap_test(rb, (void *)addr); - if (already_received) { - qatomic_xchg(&dc->vcpu_addr[cpu], 0); - qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0); - qatomic_dec(&dc->smp_cpus_down); + assert(iter->current >= entry->fault_time); + time_passed = iter->current - entry->fault_time; + + /* Latency buckets are in microseconds */ + blocktime_latency_account(ctx, time_passed / SCALE_US); + + if (cpu >= 0) { + /* + * If we resolved all pending faults on one vCPU due to this page + * resolution, take a note. + */ + if (--ctx->vcpu_faults_current[cpu] == 0) { + ctx->vcpu_blocktime_total[cpu] += time_passed; + iter->affected_cpus += 1; + } + trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]); + } else { + iter->affected_non_cpus++; + ctx->non_vcpu_blocktime_total += time_passed; + /* + * We do not maintain how many pending non-vCPU faults because we + * do not care about blocktime, only latency. + */ + trace_postcopy_blocktime_end_one(-1, 0); } - trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu], - cpu, already_received); } /* - * This function just provide calculated blocktime per cpu and trace it. - * Total blocktime is calculated in mark_postcopy_blocktime_end. - * + * This function just provide calculated blocktime per cpu and trace it. + * Total blocktime is calculated in mark_postcopy_blocktime_end. It's + * protected by @page_request_mutex. 
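blocktime_latency_account above maps a latency in microseconds to a power-of-two histogram bucket using the position of the highest set bit, clamping overlong samples into the top bucket. The same arithmetic, checked standalone (GCC/Clang __builtin_clzll standing in for QEMU's clz64):

#include <inttypes.h>
#include <stdio.h>

#define BUCKETS 24

/* Bucket N covers [2^N us, 2^(N+1) us); a 0us sample lands in bucket 0. */
static int bucket_index(uint64_t time_us)
{
    int index = time_us ? 63 - __builtin_clzll(time_us) : 0;

    return index >= BUCKETS ? BUCKETS - 1 : index;
}

int main(void)
{
    uint64_t samples[] = { 0, 1, 3, 1000, 1u << 24, 1ull << 40 };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        printf("%" PRIu64 " us -> bucket %d\n",
               samples[i], bucket_index(samples[i]));
    }
    return 0;   /* 0,1,3 -> 0,0,1; 1000 -> 9; 2^24 and 2^40 clamp to 23 */
}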
* * Assume we have 3 CPU * @@ -899,48 +1221,45 @@ static void mark_postcopy_blocktime_end(uintptr_t addr) PostcopyBlocktimeContext *dc = mis->blocktime_ctx; MachineState *ms = MACHINE(qdev_get_machine()); unsigned int smp_cpus = ms->smp.cpus; - int i, affected_cpu = 0; - bool vcpu_total_blocktime = false; - uint32_t read_vcpu_time, low_time_offset; + BlockTimeVCPUIter iter = { + .current = get_current_ns(), + .affected_cpus = 0, + .affected_non_cpus = 0, + .ctx = dc, + }; + gpointer key = (gpointer)addr; + GHashTable *table; + GList *list; if (!dc) { return; } - low_time_offset = get_low_time_offset(dc); - /* lookup cpu, to clear it, - * that algorithm looks straightforward, but it's not - * optimal, more optimal algorithm is keeping tree or hash - * where key is address value is a list of */ - for (i = 0; i < smp_cpus; i++) { - uint32_t vcpu_blocktime = 0; - - read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0); - if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr || - read_vcpu_time == 0) { - continue; - } - qatomic_xchg(&dc->vcpu_addr[i], 0); - vcpu_blocktime = low_time_offset - read_vcpu_time; - affected_cpu += 1; - /* we need to know is that mark_postcopy_end was due to - * faulted page, another possible case it's prefetched - * page and in that case we shouldn't be here */ - if (!vcpu_total_blocktime && - qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) { - vcpu_total_blocktime = true; - } - /* continue cycle, due to one page could affect several vCPUs */ - dc->vcpu_blocktime[i] += vcpu_blocktime; + table = dc->vcpu_addr_hash; + /* the address wasn't tracked at all? */ + list = g_hash_table_lookup(table, key); + if (!list) { + return; } - qatomic_sub(&dc->smp_cpus_down, affected_cpu); - if (vcpu_total_blocktime) { - dc->total_blocktime += low_time_offset - qatomic_fetch_add( - &dc->last_begin, 0); + /* + * Loop over the set of vCPUs that got blocked on this addr, do the + * blocktime accounting. After that, remove the whole list. + */ + g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter); + g_hash_table_remove(table, key); + + /* + * If all vCPUs used to be down, and copying this page would free some + * vCPUs, then the system-level blocktime ends here. 
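The begin/end paths above keep, per faulted address, a list of (cpu, timestamp) entries in a hash table: insertion steals the existing list head before prepending (so the table never frees a list that is being extended), and resolution walks the list once and removes the key, which frees every entry. A compressed sketch of that lifecycle with glib (names hypothetical):

#include <glib.h>
#include <stdio.h>

typedef struct { int cpu; guint64 ts; } Entry;

static void list_free(gpointer data)
{
    g_list_free_full(data, g_free);
}

static void fault_begin(GHashTable *t, gpointer addr, int cpu, guint64 ts)
{
    Entry *e = g_new(Entry, 1);
    GList *head = g_hash_table_lookup(t, addr);

    e->cpu = cpu;
    e->ts = ts;
    if (head) {
        /* Steal so the destroy notifier cannot free the list we extend. */
        g_assert(g_hash_table_steal(t, addr));
    }
    g_hash_table_insert(t, addr, g_list_prepend(head, e));
}

static void fault_end(GHashTable *t, gpointer addr, guint64 now)
{
    for (GList *l = g_hash_table_lookup(t, addr); l; l = l->next) {
        Entry *e = l->data;

        printf("cpu %d blocked %" G_GUINT64_FORMAT " ns\n",
               e->cpu, now - e->ts);
    }
    g_hash_table_remove(t, addr);   /* frees the list and all entries */
}

int main(void)
{
    GHashTable *t = g_hash_table_new_full(g_direct_hash, g_direct_equal,
                                          NULL, list_free);
    gpointer addr = GUINT_TO_POINTER(0x1000u);

    fault_begin(t, addr, 1, 100);
    fault_begin(t, addr, 2, 150);   /* second vCPU trips on the same page */
    fault_end(t, addr, 400);        /* prints 250 then 300 */
    g_hash_table_destroy(t);
    return 0;
}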
+ */ + if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) { + dc->total_blocktime += iter.current - dc->last_begin; } - trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime, - affected_cpu); + dc->smp_cpus_down -= iter.affected_cpus; + + trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus, + iter.affected_non_cpus); } static void postcopy_pause_fault_thread(MigrationIncomingState *mis) @@ -964,7 +1283,7 @@ static void *postcopy_ram_fault_thread(void *opaque) trace_postcopy_ram_fault_thread_entry(); rcu_register_thread(); mis->last_rb = NULL; /* last RAMBlock we sent part of */ - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); struct pollfd *pfd; size_t pfd_len = 2 + mis->postcopy_remote_fds->len; @@ -1068,17 +1387,14 @@ static void *postcopy_ram_fault_thread(void *opaque) qemu_ram_get_idstr(rb), rb_offset, msg.arg.pagefault.feat.ptid); - mark_postcopy_blocktime_begin( - (uintptr_t)(msg.arg.pagefault.address), - msg.arg.pagefault.feat.ptid, rb); - retry: /* * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) */ ret = postcopy_request_page(mis, rb, rb_offset, - msg.arg.pagefault.address); + msg.arg.pagefault.address, + msg.arg.pagefault.feat.ptid); if (ret) { /* May be network failure, try to wait for recovery */ postcopy_pause_fault_thread(mis); @@ -1221,6 +1537,11 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis) return -1; } + if (migrate_postcopy_blocktime()) { + assert(mis->blocktime_ctx == NULL); + mis->blocktime_ctx = blocktime_context_new(); + } + /* Now an eventfd we use to tell the fault-thread to quit */ mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC); if (mis->userfault_event_fd == -1) { @@ -1299,8 +1620,8 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr, qemu_cond_signal(&mis->page_request_cond); } } - qemu_mutex_unlock(&mis->page_request_mutex); mark_postcopy_blocktime_end((uintptr_t)host_addr); + qemu_mutex_unlock(&mis->page_request_mutex); } return ret; } @@ -1385,7 +1706,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp) return false; } -int postcopy_ram_incoming_init(MigrationIncomingState *mis) +int postcopy_ram_incoming_init(MigrationIncomingState *mis, Error **errp) { error_report("postcopy_ram_incoming_init: No OS support"); return -1; @@ -1430,6 +1751,11 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd, { g_assert_not_reached(); } + +void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, + RAMBlock *rb) +{ +} #endif /* ------------------------------------------------------------------------- */ @@ -1586,7 +1912,7 @@ void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) * The new loading channel has its own threads, so it needs to be * blocked too. It's by default true, just be explicit. */ - qemu_file_set_blocking(file, true); + qemu_file_set_blocking(file, true, &error_abort); mis->postcopy_qemufile_dst = file; qemu_sem_post(&mis->postcopy_qemufile_dst_done); trace_postcopy_preempt_new_channel(); @@ -1716,7 +2042,7 @@ void *postcopy_preempt_thread(void *opaque) rcu_register_thread(); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); /* * The preempt channel is established in asynchronous way. 
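The thread_sync_event conversions in this file all implement one pattern: the creator blocks until the new thread signals that it has finished its setup. The same one-shot handoff written with plain pthreads (a sketch of the idiom, not QEMU's QemuEvent; link with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool set;
} Event;

static Event sync_event = { PTHREAD_MUTEX_INITIALIZER,
                            PTHREAD_COND_INITIALIZER, false };

static void event_set(Event *e)
{
    pthread_mutex_lock(&e->lock);
    e->set = true;
    pthread_cond_broadcast(&e->cond);
    pthread_mutex_unlock(&e->lock);
}

static void event_wait(Event *e)
{
    pthread_mutex_lock(&e->lock);
    while (!e->set) {
        pthread_cond_wait(&e->cond, &e->lock);
    }
    pthread_mutex_unlock(&e->lock);
}

static void *worker(void *arg)
{
    /* ... per-thread setup (rcu_register_thread() etc. in QEMU) ... */
    event_set(&sync_event);        /* tell the creator we are live */
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, worker, NULL);
    event_wait(&sync_event);       /* returns only after worker started */
    puts("worker is up");
    pthread_join(tid, NULL);
    return 0;
}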
Wait diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h index a6df1b2..ca19433 100644 --- a/migration/postcopy-ram.h +++ b/migration/postcopy-ram.h @@ -30,7 +30,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis); * postcopy later; must be called prior to any precopy. * called from ram.c's similarly named ram_postcopy_incoming_init */ -int postcopy_ram_incoming_init(MigrationIncomingState *mis); +int postcopy_ram_incoming_init(MigrationIncomingState *mis, Error **errp); /* * At the end of a migration where postcopy_ram_incoming_init was called. @@ -196,5 +196,7 @@ void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file); void postcopy_preempt_setup(MigrationState *s); int postcopy_preempt_establish_channel(MigrationState *s); bool postcopy_is_paused(MigrationStatus status); +void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, + RAMBlock *rb); #endif diff --git a/migration/qemu-file.c b/migration/qemu-file.c index b6ac190..2d4ce17 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -125,7 +125,6 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable) /* * Result: QEMUFile* for a 'return path' for comms in the opposite direction - * NULL if not available */ QEMUFile *qemu_file_get_return_path(QEMUFile *f) { @@ -340,7 +339,8 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) do { struct iovec iov = { f->buf + pending, IO_BUF_SIZE - pending }; - len = qio_channel_readv_full(f->ioc, &iov, 1, pfds, pnfd, 0, + len = qio_channel_readv_full(f->ioc, &iov, 1, pfds, pnfd, + QIO_CHANNEL_READ_FLAG_FD_PRESERVE_BLOCKING, &local_error); if (len == QIO_CHANNEL_ERR_BLOCK) { if (qemu_in_coroutine()) { @@ -348,17 +348,13 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) } else { qio_channel_wait(f->ioc, G_IO_IN); } - } else if (len < 0) { - len = -EIO; } } while (len == QIO_CHANNEL_ERR_BLOCK); if (len > 0) { f->buf_size += len; - } else if (len == 0) { - qemu_file_set_error_obj(f, -EIO, local_error); } else { - qemu_file_set_error_obj(f, len, local_error); + qemu_file_set_error_obj(f, -EIO, local_error); } for (int i = 0; i < nfd; i++) { @@ -887,9 +883,9 @@ void qemu_put_counted_string(QEMUFile *f, const char *str) * both directions, and thus changing the blocking on the main * QEMUFile can also affect the return path. 
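After the qemu_fill_buffer change above, the only case that loops is "would block"; every other failure, including clean EOF, collapses to -EIO. The control flow reduced to plain readv on a possibly non-blocking fd (illustrative only; QEMU's real loop yields in a coroutine or waits on the channel instead of calling poll):

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

/* Read up to cap bytes into buf: >0 bytes read, -EIO on error or EOF. */
static ssize_t fill_buffer(int fd, char *buf, size_t cap)
{
    struct iovec iov = { .iov_base = buf, .iov_len = cap };
    ssize_t len;

    do {
        len = readv(fd, &iov, 1);
        if (len < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            poll(&pfd, 1, -1);      /* wait until readable, then retry */
            len = -2;               /* stand-in for QIO_CHANNEL_ERR_BLOCK */
        }
    } while (len == -2);

    /* Error and clean EOF are treated alike: the stream is unusable. */
    return len > 0 ? len : -EIO;
}

int main(void)
{
    char buf[64];
    ssize_t n = fill_buffer(0, buf, sizeof(buf));

    printf("got %zd\n", n);
    return 0;
}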
*/ -void qemu_file_set_blocking(QEMUFile *f, bool block) +bool qemu_file_set_blocking(QEMUFile *f, bool block, Error **errp) { - qio_channel_set_blocking(f->ioc, block, NULL); + return qio_channel_set_blocking(f->ioc, block, errp); } /* diff --git a/migration/qemu-file.h b/migration/qemu-file.h index f5b9f43..c13c967 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -71,7 +71,7 @@ void qemu_file_set_error(QEMUFile *f, int ret); int qemu_file_shutdown(QEMUFile *f); QEMUFile *qemu_file_get_return_path(QEMUFile *f); int qemu_fflush(QEMUFile *f); -void qemu_file_set_blocking(QEMUFile *f, bool block); +bool qemu_file_set_blocking(QEMUFile *f, bool block, Error **errp); int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size); void qemu_set_offset(QEMUFile *f, off_t off, int whence); off_t qemu_get_offset(QEMUFile *f); diff --git a/migration/ram.c b/migration/ram.c index e12913b..5eef2ef 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -53,6 +53,8 @@ #include "qemu/rcu_queue.h" #include "migration/colo.h" #include "system/cpu-throttle.h" +#include "system/physmem.h" +#include "system/ramblock.h" #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" @@ -228,6 +230,7 @@ bool migrate_ram_is_ignored(RAMBlock *block) MigMode mode = migrate_mode(); return !qemu_ram_is_migratable(block) || mode == MIG_MODE_CPR_TRANSFER || + mode == MIG_MODE_CPR_EXEC || (migrate_ignore_shared() && qemu_ram_is_shared(block) && qemu_ram_is_named_file(block)); } @@ -831,14 +834,24 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, bool ret; /* - * Clear dirty bitmap if needed. This _must_ be called before we - * send any of the page in the chunk because we need to make sure - * we can capture further page content changes when we sync dirty - * log the next time. So as long as we are going to send any of - * the page in the chunk we clear the remote dirty bitmap for all. - * Clearing it earlier won't be a problem, but too late will. + * During the last stage (after source VM stopped), resetting the write + * protections isn't needed as we know there will be either (1) no + * further writes if migration will complete, or (2) migration fails + * at last then tracking isn't needed either. + * + * Do the same for postcopy due to the same reason. */ - migration_clear_memory_region_dirty_bitmap(rb, page); + if (!rs->last_stage && !migration_in_postcopy()) { + /* + * Clear dirty bitmap if needed. This _must_ be called before we + * send any of the page in the chunk because we need to make sure + * we can capture further page content changes when we sync dirty + * log the next time. So as long as we are going to send any of + * the page in the chunk we clear the remote dirty bitmap for all. + * Clearing it earlier won't be a problem, but too late will. 
+ */ + migration_clear_memory_region_dirty_bitmap(rb, page); + } ret = test_and_clear_bit(page, rb->bmap); if (ret) { @@ -848,8 +861,8 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, return ret; } -static void dirty_bitmap_clear_section(MemoryRegionSection *section, - void *opaque) +static int dirty_bitmap_clear_section(MemoryRegionSection *section, + void *opaque) { const hwaddr offset = section->offset_within_region; const hwaddr size = int128_get64(section->size); @@ -868,6 +881,7 @@ static void dirty_bitmap_clear_section(MemoryRegionSection *section, } *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); bitmap_clear(rb->bmap, start, npages); + return 0; } /* @@ -924,10 +938,85 @@ bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) } /* Called with RCU critical section */ +static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb, + ram_addr_t start, + ram_addr_t length) +{ + ram_addr_t addr; + unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS); + uint64_t num_dirty = 0; + unsigned long *dest = rb->bmap; + + /* start address and length is aligned at the start of a word? */ + if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == + (start + rb->offset) && + !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) { + int k; + int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS); + unsigned long * const *src; + unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE; + unsigned long offset = BIT_WORD((word * BITS_PER_LONG) % + DIRTY_MEMORY_BLOCK_SIZE); + unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); + + src = qatomic_rcu_read( + &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks; + + for (k = page; k < page + nr; k++) { + if (src[idx][offset]) { + unsigned long bits = qatomic_xchg(&src[idx][offset], 0); + unsigned long new_dirty; + new_dirty = ~dest[k]; + dest[k] |= bits; + new_dirty &= bits; + num_dirty += ctpopl(new_dirty); + } + + if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) { + offset = 0; + idx++; + } + } + if (num_dirty) { + physical_memory_dirty_bits_cleared(start, length); + } + + if (rb->clear_bmap) { + /* + * Postpone the dirty bitmap clear to the point before we + * really send the pages, also we will split the clear + * dirty procedure into smaller chunks. 
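The aligned fast path in physical_memory_sync_dirty_bitmap moves dirty state a word at a time: atomically fetch-and-clear the source word, OR it into the destination, and count only the bits that were newly set. One word of that transfer, checked in isolation (C11 atomics standing in for qatomic_xchg, __builtin_popcountl for ctpopl):

#include <stdatomic.h>
#include <stdio.h>

/*
 * Merge one source word into dest, returning how many pages became dirty
 * in dest for the first time. The source word is cleared atomically, so
 * a concurrent writer marking new pages is never lost, only deferred to
 * the next sync.
 */
static unsigned long sync_word(_Atomic unsigned long *src,
                               unsigned long *dest)
{
    unsigned long bits = atomic_exchange(src, 0);
    unsigned long new_dirty = ~*dest & bits;

    *dest |= bits;
    return (unsigned long)__builtin_popcountl(new_dirty);
}

int main(void)
{
    _Atomic unsigned long src = 0xf0f0;
    unsigned long dest = 0x00ff;

    printf("newly dirty: %lu\n", sync_word(&src, &dest)); /* 0xf000 -> 4 */
    printf("src now %#lx, dest now %#lx\n",
           (unsigned long)atomic_load(&src), dest);
    return 0;
}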
+ */ + clear_bmap_set(rb, start >> TARGET_PAGE_BITS, + length >> TARGET_PAGE_BITS); + } else { + /* Slow path - still do that in a huge chunk */ + memory_region_clear_dirty_bitmap(rb->mr, start, length); + } + } else { + ram_addr_t offset = rb->offset; + + for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { + if (physical_memory_test_and_clear_dirty( + start + addr + offset, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + long k = (start + addr) >> TARGET_PAGE_BITS; + if (!test_and_set_bit(k, dest)) { + num_dirty++; + } + } + } + } + + return num_dirty; +} + +/* Called with RCU critical section */ static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) { uint64_t new_dirty_pages = - cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); + physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); rs->migration_dirty_pages += new_dirty_pages; rs->num_dirty_pages_period += new_dirty_pages; @@ -1993,9 +2082,8 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) } } - if (migrate_multifd()) { - RAMBlock *block = pss->block; - return ram_save_multifd_page(block, offset); + if (migrate_multifd() && !migration_in_postcopy()) { + return ram_save_multifd_page(pss->block, offset); } return ram_save_page(rs, pss); @@ -2278,16 +2366,18 @@ static int ram_find_and_save_block(RAMState *rs) if (!get_queued_page(rs, pss)) { /* priority queue empty, so just search for something dirty */ int res = find_dirty_block(rs, pss); - if (res != PAGE_DIRTY_FOUND) { - if (res == PAGE_ALL_CLEAN) { - break; - } else if (res == PAGE_TRY_AGAIN) { - continue; - } else if (res < 0) { - pages = res; - break; - } + + if (res == PAGE_ALL_CLEAN) { + break; + } else if (res == PAGE_TRY_AGAIN) { + continue; + } else if (res < 0) { + pages = res; + break; } + + /* Otherwise we must have a dirty page to move */ + assert(res == PAGE_DIRTY_FOUND); } pages = ram_save_host_page(rs, pss); if (pages) { @@ -3280,6 +3370,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque) RAMState *rs = *temp; int ret = 0; + trace_ram_save_complete(rs->migration_dirty_pages, 0); + rs->last_stage = !migration_in_colo_state(); WITH_RCU_READ_LOCK_GUARD() { @@ -3343,6 +3435,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + + trace_ram_save_complete(rs->migration_dirty_pages, 1); + return qemu_fflush(f); } @@ -3558,8 +3653,10 @@ static void colo_init_ram_state(void) * colo cache: this is for secondary VM, we cache the whole * memory of the secondary VM, it is need to hold the global lock * to call this helper. + * + * Returns zero to indicate success or -1 on error. 
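Many hunks in this patch (init_range, colo_init_ram_cache, the loadvm handlers below) convert error_report() calls into the errp convention: the callee fills an Error object and returns -1, and the caller may prepend context before reporting. A toy version of that contract (QEMU's real Error API is richer; the names here are simplified stand-ins):

#include <stdio.h>
#include <stdlib.h>

typedef struct { char msg[256]; } Error;

static void set_error(Error **errp, const char *msg)
{
    if (errp && !*errp) {          /* first error wins, as in QEMU */
        *errp = calloc(1, sizeof(**errp));
        snprintf((*errp)->msg, sizeof((*errp)->msg), "%s", msg);
    }
}

static int alloc_cache(size_t len, Error **errp)
{
    if (len > 1u << 20) {          /* pretend the allocation failed */
        set_error(errp, "can't alloc memory for cache");
        return -1;                 /* plain -1, not -errno */
    }
    return 0;
}

int main(void)
{
    Error *err = NULL;

    if (alloc_cache(1u << 30, &err) < 0) {
        fprintf(stderr, "failed to init colo RAM cache: %s\n", err->msg);
        free(err);
        return 1;
    }
    return 0;
}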
*/ -int colo_init_ram_cache(void) +int colo_init_ram_cache(Error **errp) { RAMBlock *block; @@ -3568,16 +3665,16 @@ int colo_init_ram_cache(void) block->colo_cache = qemu_anon_ram_alloc(block->used_length, NULL, false, false); if (!block->colo_cache) { - error_report("%s: Can't alloc memory for COLO cache of block %s," - "size 0x" RAM_ADDR_FMT, __func__, block->idstr, - block->used_length); + error_setg(errp, "Can't alloc memory for COLO cache of " + "block %s, size 0x" RAM_ADDR_FMT, + block->idstr, block->used_length); RAMBLOCK_FOREACH_NOT_IGNORED(block) { if (block->colo_cache) { qemu_anon_ram_free(block->colo_cache, block->used_length); block->colo_cache = NULL; } } - return -errno; + return -1; } if (!machine_dump_guest_core(current_machine)) { qemu_madvise(block->colo_cache, block->used_length, @@ -3673,7 +3770,9 @@ static int ram_load_cleanup(void *opaque) RAMBlock *rb; RAMBLOCK_FOREACH_NOT_IGNORED(rb) { - qemu_ram_block_writeback(rb); + if (memory_region_is_nonvolatile(rb->mr)) { + qemu_ram_block_writeback(rb); + } } xbzrle_load_cleanup(); @@ -3697,9 +3796,9 @@ static int ram_load_cleanup(void *opaque) * postcopy-ram. postcopy-ram's similarly names * postcopy_ram_incoming_init does the work. */ -int ram_postcopy_incoming_init(MigrationIncomingState *mis) +int ram_postcopy_incoming_init(MigrationIncomingState *mis, Error **errp) { - return postcopy_ram_incoming_init(mis); + return postcopy_ram_incoming_init(mis, errp); } /** @@ -4348,7 +4447,7 @@ static bool ram_has_postcopy(void *opaque) { RAMBlock *rb; RAMBLOCK_FOREACH_NOT_IGNORED(rb) { - if (ramblock_is_pmem(rb)) { + if (ram_block_is_pmem(rb)) { info_report("Block: %s, host: %p is a nvdimm memory, postcopy" "is not supported now!", rb->idstr, rb->host); return false; @@ -4538,8 +4637,7 @@ void postcopy_preempt_shutdown_file(MigrationState *s) static SaveVMHandlers savevm_ram_handlers = { .save_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, - .save_live_complete_postcopy = ram_save_complete, - .save_live_complete_precopy = ram_save_complete, + .save_complete = ram_save_complete, .has_postcopy = ram_has_postcopy, .state_pending_exact = ram_state_pending_exact, .state_pending_estimate = ram_state_pending_estimate, diff --git a/migration/ram.h b/migration/ram.h index 921c39a..24cd0bf 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -86,7 +86,7 @@ void ram_postcopy_migrated_memory_release(MigrationState *ms); void ram_postcopy_send_discard_bitmap(MigrationState *ms); /* For incoming postcopy discard */ int ram_discard_range(const char *block_name, uint64_t start, size_t length); -int ram_postcopy_incoming_init(MigrationIncomingState *mis); +int ram_postcopy_incoming_init(MigrationIncomingState *mis, Error **errp); int ram_load_postcopy(QEMUFile *f, int channel); void ram_handle_zero(void *host, uint64_t size); @@ -109,7 +109,7 @@ void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set); /* ram cache */ -int colo_init_ram_cache(void); +int colo_init_ram_cache(Error **errp); void colo_flush_ram_cache(void); void colo_release_ram_cache(void); void colo_incoming_start_dirty_log(void); diff --git a/migration/savevm.c b/migration/savevm.c index 006514c..7b35ec4 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -266,7 +266,7 @@ typedef struct SaveState { static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), - .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL }, + .handler_pri_head = { [0 ... 
MIG_PRI_MAX] = NULL }, .global_section_id = 0, }; @@ -737,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr) static inline MigrationPriority save_state_priority(SaveStateEntry *se) { - if (se->vmsd) { + if (se->vmsd && se->vmsd->priority) { return se->vmsd->priority; } return MIG_PRI_DEFAULT; @@ -963,13 +963,20 @@ void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd, } } -static int vmstate_load(QEMUFile *f, SaveStateEntry *se) +static int vmstate_load(QEMUFile *f, SaveStateEntry *se, Error **errp) { + int ret; trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)"); if (!se->vmsd) { /* Old style */ - return se->ops->load_state(f, se->opaque, se->load_version_id); + ret = se->ops->load_state(f, se->opaque, se->load_version_id); + if (ret < 0) { + error_setg(errp, "Failed to load vmstate version_id: %d, ret: %d", + se->load_version_id, ret); + } + return ret; } - return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id); + return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id, + errp); } static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, @@ -1049,8 +1056,8 @@ static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc, if (!se->vmsd) { vmstate_save_old_style(f, se, vmdesc); } else { - ret = vmstate_save_state_with_err(f, se->vmsd, se->opaque, vmdesc, - errp); + ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc, + errp); if (ret) { return ret; } @@ -1278,6 +1285,7 @@ void qemu_savevm_state_header(QEMUFile *f) { MigrationState *s = migrate_get_current(); JSONWriter *vmdesc = s->vmdesc; + Error *local_err = NULL; trace_savevm_state_header(); qemu_put_be32(f, QEMU_VM_FILE_MAGIC); @@ -1296,7 +1304,11 @@ void qemu_savevm_state_header(QEMUFile *f) json_writer_start_object(vmdesc, "configuration"); } - vmstate_save_state(f, &vmstate_configuration, &savevm_state, vmdesc); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, + vmdesc, &local_err); + if (local_err) { + error_report_err(local_err); + } if (vmdesc) { json_writer_end_object(vmdesc); @@ -1484,37 +1496,54 @@ bool should_send_vmdesc(void) return !machine->suppress_vmdesc; } +static bool qemu_savevm_complete_exists(SaveStateEntry *se) +{ + return se->ops && se->ops->save_complete; +} + /* - * Calls the save_live_complete_postcopy methods - * causing the last few pages to be sent immediately and doing any associated - * cleanup. + * Invoke the ->save_complete() if necessary. + * Returns: 0 if skip the current SE or succeeded, <0 if error happened. + */ +static int qemu_savevm_complete(SaveStateEntry *se, QEMUFile *f) +{ + int ret; + + if (se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + return 0; + } + } + + trace_savevm_section_start(se->idstr, se->section_id); + save_section_header(f, se, QEMU_VM_SECTION_END); + ret = se->ops->save_complete(f, se->opaque); + trace_savevm_section_end(se->idstr, se->section_id, ret); + save_section_footer(f, se); + + if (ret < 0) { + qemu_file_set_error(f, ret); + } + + return ret; +} + +/* + * Complete saving any postcopy-able devices. + * * Note postcopy also calls qemu_savevm_state_complete_precopy to complete * all the other devices, but that happens at the point we switch to postcopy. 
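qemu_savevm_complete() above hoists the skeleton shared by the precopy and postcopy completion paths: the optional is_active check, the section header, the handler call, the footer, and error propagation. A reduced model of that dispatch over an array of handlers, including the "defer postcopy-capable devices" skip (all names hypothetical):

#include <stdbool.h>
#include <stdio.h>

typedef struct Handler {
    const char *idstr;
    bool (*is_active)(void);       /* optional */
    bool (*has_postcopy)(void);    /* optional */
    int (*save_complete)(void);    /* optional */
} Handler;

static int complete_one(const Handler *h)
{
    if (h->is_active && !h->is_active()) {
        return 0;                  /* skip: nothing to do for this device */
    }
    /* header + payload + footer collapse to one call in this model */
    return h->save_complete();
}

static int complete_all(const Handler *hs, int n, bool in_postcopy)
{
    for (int i = 0; i < n; i++) {
        if (!hs[i].save_complete) {
            continue;
        }
        if (in_postcopy && hs[i].has_postcopy && hs[i].has_postcopy()) {
            continue;              /* finishes later, in the postcopy phase */
        }
        if (complete_one(&hs[i]) < 0) {
            return -1;
        }
    }
    return 0;
}

static bool yes(void) { return true; }
static int ok(void) { puts("saved"); return 0; }

int main(void)
{
    Handler hs[] = { { "ram", NULL, yes, ok }, { "dev", NULL, NULL, ok } };

    return complete_all(hs, 2, true) ? 1 : 0;  /* "dev" saved, "ram" deferred */
}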
*/ void qemu_savevm_state_complete_postcopy(QEMUFile *f) { SaveStateEntry *se; - int ret; QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_complete_postcopy) { + if (!qemu_savevm_complete_exists(se)) { continue; } - if (se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - trace_savevm_section_start(se->idstr, se->section_id); - /* Section type */ - qemu_put_byte(f, QEMU_VM_SECTION_END); - qemu_put_be32(f, se->section_id); - ret = se->ops->save_live_complete_postcopy(f, se->opaque); - trace_savevm_section_end(se->idstr, se->section_id, ret); - save_section_footer(f, se); - if (ret < 0) { - qemu_file_set_error(f, ret); + if (qemu_savevm_complete(se, f) < 0) { return; } } @@ -1560,20 +1589,19 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) { int64_t start_ts_each, end_ts_each; SaveStateEntry *se; - int ret; bool multifd_device_state = multifd_device_state_supported(); if (multifd_device_state) { QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - SaveLiveCompletePrecopyThreadHandler hdlr; + SaveCompletePrecopyThreadHandler hdlr; if (!se->ops || (in_postcopy && se->ops->has_postcopy && se->ops->has_postcopy(se->opaque)) || - !se->ops->save_live_complete_precopy_thread) { + !se->ops->save_complete_precopy_thread) { continue; } - hdlr = se->ops->save_live_complete_precopy_thread; + hdlr = se->ops->save_complete_precopy_thread; multifd_spawn_device_state_save_thread(hdlr, se->idstr, se->instance_id, se->opaque); @@ -1581,32 +1609,25 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) } QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || - (in_postcopy && se->ops->has_postcopy && - se->ops->has_postcopy(se->opaque)) || - !se->ops->save_live_complete_precopy) { + if (!qemu_savevm_complete_exists(se)) { continue; } - if (se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } + if (in_postcopy && se->ops->has_postcopy && + se->ops->has_postcopy(se->opaque)) { + /* + * If postcopy will start soon, and if the SE supports + * postcopy, then we can skip the SE for the postcopy phase. + */ + continue; } start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); - trace_savevm_section_start(se->idstr, se->section_id); - - save_section_header(f, se, QEMU_VM_SECTION_END); - - ret = se->ops->save_live_complete_precopy(f, se->opaque); - trace_savevm_section_end(se->idstr, se->section_id, ret); - save_section_footer(f, se); - if (ret < 0) { - qemu_file_set_error(f, ret); + if (qemu_savevm_complete(se, f) < 0) { goto ret_fail_abort_threads; } end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); + trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, end_ts_each - start_ts_each); } @@ -1896,39 +1917,39 @@ enum LoadVMExitCodes { * quickly. 
*/ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, - uint16_t len) + uint16_t len, Error **errp) { PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE); uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps; size_t page_size = qemu_target_page_size(); - Error *local_err = NULL; trace_loadvm_postcopy_handle_advise(); if (ps != POSTCOPY_INCOMING_NONE) { - error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps); + error_setg(errp, "CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", + ps); return -1; } switch (len) { case 0: if (migrate_postcopy_ram()) { - error_report("RAM postcopy is enabled but have 0 byte advise"); + error_setg(errp, "RAM postcopy is enabled but have 0 byte advise"); return -EINVAL; } return 0; case 8 + 8: if (!migrate_postcopy_ram()) { - error_report("RAM postcopy is disabled but have 16 byte advise"); + error_setg(errp, + "RAM postcopy is disabled but have 16 byte advise"); return -EINVAL; } break; default: - error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len); + error_setg(errp, "CMD_POSTCOPY_ADVISE invalid length (%d)", len); return -EINVAL; } - if (!postcopy_ram_supported_by_host(mis, &local_err)) { - error_report_err(local_err); + if (!postcopy_ram_supported_by_host(mis, errp)) { postcopy_state_set(POSTCOPY_INCOMING_NONE); return -1; } @@ -1951,9 +1972,10 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, * also fails when passed to an older qemu that doesn't * do huge pages. */ - error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64 - " d=%" PRIx64 ")", - remote_pagesize_summary, local_pagesize_summary); + error_setg(errp, + "Postcopy needs matching RAM page sizes " + "(s=%" PRIx64 " d=%" PRIx64 ")", + remote_pagesize_summary, local_pagesize_summary); return -1; } @@ -1963,17 +1985,18 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, * Again, some differences could be dealt with, but for now keep it * simple. */ - error_report("Postcopy needs matching target page sizes (s=%d d=%zd)", - (int)remote_tps, page_size); + error_setg(errp, + "Postcopy needs matching target page sizes (s=%d d=%zd)", + (int)remote_tps, page_size); return -1; } - if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) { - error_report_err(local_err); + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, errp)) { return -1; } - if (ram_postcopy_incoming_init(mis)) { + if (ram_postcopy_incoming_init(mis, errp) < 0) { + error_prepend(errp, "Postcopy RAM incoming init failed: "); return -1; } @@ -1986,7 +2009,7 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, * There can be 0..many of these messages, each encoding multiple pages. 
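The advise handler above compares a page-size "summary" from each side. Because host page sizes are powers of two, OR-ing them yields a bitmask of the sizes in use, and equal summaries mean both hosts back guest RAM with the same set of page sizes. A minimal sketch of that check (hypothetical standalone form):

#include <inttypes.h>
#include <stdio.h>

/* Each distinct page size sets one bit in the summary. */
static uint64_t pagesize_summary(const uint64_t *sizes, int n)
{
    uint64_t summary = 0;

    for (int i = 0; i < n; i++) {
        summary |= sizes[i];
    }
    return summary;
}

int main(void)
{
    uint64_t src[] = { 4096, 2u << 20 };     /* 4k + 2M huge pages */
    uint64_t dst[] = { 4096 };               /* 4k only */
    uint64_t s = pagesize_summary(src, 2);
    uint64_t d = pagesize_summary(dst, 1);

    if (s != d) {
        printf("mismatch (s=%" PRIx64 " d=%" PRIx64 ")\n", s, d);
        return 1;
    }
    return 0;
}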
*/ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, - uint16_t len) + uint16_t len, Error **errp) { int tmp; char ramid[256]; @@ -1999,6 +2022,7 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, /* 1st discard */ tmp = postcopy_ram_prepare_discard(mis); if (tmp) { + error_setg(errp, "Failed to prepare for RAM discard: %d", tmp); return tmp; } break; @@ -2008,8 +2032,9 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, break; default: - error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)", - ps); + error_setg(errp, + "CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)", + ps); return -1; } /* We're expecting a @@ -2018,29 +2043,30 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, * then at least 1 16 byte chunk */ if (len < (1 + 1 + 1 + 1 + 2 * 8)) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + error_setg(errp, "CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); return -1; } tmp = qemu_get_byte(mis->from_src_file); if (tmp != postcopy_ram_discard_version) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp); + error_setg(errp, "CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp); return -1; } if (!qemu_get_counted_string(mis->from_src_file, ramid)) { - error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID"); + error_setg(errp, + "CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID"); return -1; } tmp = qemu_get_byte(mis->from_src_file); if (tmp != 0) { - error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp); + error_setg(errp, "CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp); return -1; } len -= 3 + strlen(ramid); if (len % 16) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + error_setg(errp, "CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); return -1; } trace_loadvm_postcopy_ram_handle_discard_header(ramid, len); @@ -2052,6 +2078,7 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, len -= 16; int ret = ram_discard_range(ramid, start_addr, block_length); if (ret) { + error_setg(errp, "Failed to discard RAM range %s: %d", ramid, ret); return ret; } } @@ -2073,12 +2100,13 @@ static void *postcopy_ram_listen_thread(void *opaque) QEMUFile *f = mis->from_src_file; int load_res; MigrationState *migr = migrate_get_current(); + Error *local_err = NULL; object_ref(OBJECT(migr)); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); trace_postcopy_ram_listen_thread_start(); rcu_register_thread(); @@ -2086,10 +2114,10 @@ static void *postcopy_ram_listen_thread(void *opaque) * Because we're a thread and not a coroutine we can't yield * in qemu_file, and thus we must be blocking now. 
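loadvm_postcopy_ram_handle_discard validates a version byte, a length-prefixed RAMBlock name, a nil terminator, and then consumes 16-byte (start, length) records. A freestanding parser for the same wire layout (big-endian fields as in the migration stream; error handling condensed, and the buffer-based form is an assumption of this sketch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t get_be64(const uint8_t *p)
{
    uint64_t v = 0;

    for (int i = 0; i < 8; i++) {
        v = (v << 8) | p[i];
    }
    return v;
}

/* Layout: version(1) | namelen(1) | name | nil(1) | {start,len}(16)... */
static int parse_discard(const uint8_t *buf, size_t len)
{
    char ramid[256];
    size_t namelen;

    if (len < 4 || buf[0] != 0) {            /* discard version must match */
        return -1;
    }
    namelen = buf[1];
    if (len < 3 + namelen || buf[2 + namelen] != 0) {
        return -1;
    }
    memcpy(ramid, buf + 2, namelen);
    ramid[namelen] = '\0';

    buf += 3 + namelen;
    len -= 3 + namelen;
    if (len == 0 || len % 16) {              /* need whole 16-byte records */
        return -1;
    }
    while (len) {
        printf("discard %s start=%#llx len=%#llx\n", ramid,
               (unsigned long long)get_be64(buf),
               (unsigned long long)get_be64(buf + 8));
        buf += 16;
        len -= 16;
    }
    return 0;
}

int main(void)
{
    uint8_t msg[] = { 0, 3, 'p', 'c', '0', 0,
                      0, 0, 0, 0, 0, 0, 0x10, 0,
                      0, 0, 0, 0, 0, 0, 0x10, 0 };

    return parse_discard(msg, sizeof(msg)) ? 1 : 0;
}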
*/ - qemu_file_set_blocking(f, true); + qemu_file_set_blocking(f, true, &error_fatal); /* TODO: sanity check that only postcopiable data will be loaded here */ - load_res = qemu_loadvm_state_main(f, mis); + load_res = qemu_loadvm_state_main(f, mis, &local_err); /* * This is tricky, but, mis->from_src_file can change after it @@ -2099,7 +2127,7 @@ static void *postcopy_ram_listen_thread(void *opaque) f = mis->from_src_file; /* And non-blocking again so we don't block in any cleanup */ - qemu_file_set_blocking(f, false); + qemu_file_set_blocking(f, false, &error_fatal); trace_postcopy_ram_listen_thread_exit(); if (load_res < 0) { @@ -2115,7 +2143,10 @@ static void *postcopy_ram_listen_thread(void *opaque) __func__, load_res); load_res = 0; /* prevent further exit() */ } else { - error_report("%s: loadvm failed: %d", __func__, load_res); + error_prepend(&local_err, + "loadvm failed during postcopy: %d: ", load_res); + migrate_set_error(migr, local_err); + error_report_err(local_err); migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_FAILED); } @@ -2163,15 +2194,16 @@ static void *postcopy_ram_listen_thread(void *opaque) } /* After this message we must be able to immediately receive postcopy data */ -static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) +static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis, + Error **errp) { PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING); - Error *local_err = NULL; trace_loadvm_postcopy_handle_listen("enter"); if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) { - error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); + error_setg(errp, + "CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); return -1; } if (ps == POSTCOPY_INCOMING_ADVISE) { @@ -2194,14 +2226,14 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) if (migrate_postcopy_ram()) { if (postcopy_ram_incoming_setup(mis)) { postcopy_ram_incoming_cleanup(mis); + error_setg(errp, "Failed to setup incoming postcopy RAM blocks"); return -1; } } trace_loadvm_postcopy_handle_listen("after uffd"); - if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) { - error_report_err(local_err); + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, errp)) { return -1; } @@ -2254,13 +2286,13 @@ static void loadvm_postcopy_handle_run_bh(void *opaque) } /* After all discards we can start running and asking for pages */ -static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) +static int loadvm_postcopy_handle_run(MigrationIncomingState *mis, Error **errp) { PostcopyState ps = postcopy_state_get(); trace_loadvm_postcopy_handle_run(); if (ps != POSTCOPY_INCOMING_LISTENING) { - error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps); + error_setg(errp, "CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps); return -1; } @@ -2318,12 +2350,12 @@ static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis) } } -static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) +static void loadvm_postcopy_handle_resume(MigrationIncomingState *mis) { if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { - error_report("%s: illegal resume received", __func__); + warn_report("%s: illegal resume received", __func__); /* Don't fail the load, only for this. 
*/ - return 0; + return; } /* @@ -2375,8 +2407,6 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) /* Kick the fast ram load thread too */ qemu_sem_post(&mis->postcopy_pause_sem_fast_load); } - - return 0; } /** @@ -2389,7 +2419,7 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) * Returns: Negative values on error * */ -static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) +static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis, Error **errp) { int ret; size_t length; @@ -2399,7 +2429,7 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) trace_loadvm_handle_cmd_packaged(length); if (length > MAX_VM_CMD_PACKAGED_SIZE) { - error_report("Unreasonably large packaged state: %zu", length); + error_setg(errp, "Unreasonably large packaged state: %zu", length); return -1; } @@ -2410,8 +2440,8 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) length); if (ret != length) { object_unref(OBJECT(bioc)); - error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu", - ret, length); + error_setg(errp, "CMD_PACKAGED: Buffer receive fail ret=%d length=%zu", + ret, length); return (ret < 0) ? ret : -EAGAIN; } bioc->usage += length; @@ -2440,7 +2470,7 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) qemu_coroutine_yield(); } while (1); - ret = qemu_loadvm_state_main(packf, mis); + ret = qemu_loadvm_state_main(packf, mis, errp); trace_loadvm_handle_cmd_packaged_main(ret); qemu_fclose(packf); object_unref(OBJECT(bioc)); @@ -2455,32 +2485,35 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) * len (1 byte) + ramblock_name (<255 bytes) */ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis, - uint16_t len) + uint16_t len, Error **errp) { QEMUFile *file = mis->from_src_file; RAMBlock *rb; char block_name[256]; size_t cnt; + int ret; cnt = qemu_get_counted_string(file, block_name); if (!cnt) { - error_report("%s: failed to read block name", __func__); + error_setg(errp, "failed to read block name"); return -EINVAL; } /* Validate before using the data */ - if (qemu_file_get_error(file)) { - return qemu_file_get_error(file); + ret = qemu_file_get_error(file); + if (ret < 0) { + error_setg(errp, "loadvm failed: stream error: %d", ret); + return ret; } if (len != cnt + 1) { - error_report("%s: invalid payload length (%d)", __func__, len); + error_setg(errp, "invalid payload length (%d)", len); return -EINVAL; } rb = qemu_ram_block_by_name(block_name); if (!rb) { - error_report("%s: block '%s' not found", __func__, block_name); + error_setg(errp, "block '%s' not found", block_name); return -EINVAL; } @@ -2491,20 +2524,26 @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis, return 0; } -static int loadvm_process_enable_colo(MigrationIncomingState *mis) +static int loadvm_process_enable_colo(MigrationIncomingState *mis, + Error **errp) { - int ret = migration_incoming_enable_colo(); + ERRP_GUARD(); + int ret; - if (!ret) { - ret = colo_init_ram_cache(); - if (ret) { - migration_incoming_disable_colo(); - } + ret = migration_incoming_enable_colo(errp); + if (ret < 0) { + return ret; + } + + ret = colo_init_ram_cache(errp); + if (ret) { + error_prepend(errp, "failed to init colo RAM cache: %d: ", ret); + migration_incoming_disable_colo(); } return ret; } -static int loadvm_postcopy_handle_switchover_start(void) +static int loadvm_postcopy_handle_switchover_start(Error **errp) { SaveStateEntry *se; @@ -2517,6 +2556,7 @@ static int 
         ret = se->ops->switchover_start(se->opaque);
         if (ret < 0) {
+            error_setg(errp, "Switchover start failed: %d", ret);
             return ret;
         }
     }
@@ -2530,32 +2570,37 @@ static int loadvm_postcopy_handle_switchover_start(void)
  * LOADVM_QUIT All good, but exit the loop
  * <0          Error
  */
-static int loadvm_process_command(QEMUFile *f)
+static int loadvm_process_command(QEMUFile *f, Error **errp)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     uint16_t cmd;
     uint16_t len;
     uint32_t tmp32;
+    int ret;
 
     cmd = qemu_get_be16(f);
     len = qemu_get_be16(f);
 
     /* Check validity before continue processing of cmds */
-    if (qemu_file_get_error(f)) {
-        return qemu_file_get_error(f);
+    ret = qemu_file_get_error(f);
+    if (ret) {
+        error_setg(errp,
+                   "Failed to load VM process command: stream error: %d",
+                   ret);
+        return ret;
     }
 
     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
-        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
+        error_setg(errp, "MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
         return -EINVAL;
     }
 
     trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
 
     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
-        error_report("%s received with bad length - expecting %zu, got %d",
-                     mig_cmd_args[cmd].name,
-                     (size_t)mig_cmd_args[cmd].len, len);
+        error_setg(errp, "%s received with bad length - expecting %zu, got %d",
+                   mig_cmd_args[cmd].name,
+                   (size_t)mig_cmd_args[cmd].len, len);
         return -ERANGE;
     }
 
@@ -2567,10 +2612,6 @@ static int loadvm_process_command(QEMUFile *f)
             return 0;
         }
         mis->to_src_file = qemu_file_get_return_path(f);
-        if (!mis->to_src_file) {
-            error_report("CMD_OPEN_RETURN_PATH failed");
-            return -1;
-        }
 
         /*
         * Switchover ack is enabled but no device uses it, so send an ACK to
        * been created.
         */
        if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {
-            int ret = migrate_send_rp_switchover_ack(mis);
+            ret = migrate_send_rp_switchover_ack(mis);
             if (ret) {
-                error_report(
-                    "Could not send switchover ack RP MSG, err %d (%s)", ret,
-                    strerror(-ret));
+                error_setg_errno(errp, -ret,
+                                 "Could not send switchover ack RP MSG");
                 return ret;
             }
         }
@@ -2592,39 +2632,40 @@ static int loadvm_process_command(QEMUFile *f)
         tmp32 = qemu_get_be32(f);
         trace_loadvm_process_command_ping(tmp32);
         if (!mis->to_src_file) {
-            error_report("CMD_PING (0x%x) received with no return path",
-                         tmp32);
+            error_setg(errp, "CMD_PING (0x%x) received with no return path",
+                       tmp32);
             return -1;
         }
         migrate_send_rp_pong(mis, tmp32);
         break;
 
     case MIG_CMD_PACKAGED:
-        return loadvm_handle_cmd_packaged(mis);
+        return loadvm_handle_cmd_packaged(mis, errp);
 
     case MIG_CMD_POSTCOPY_ADVISE:
-        return loadvm_postcopy_handle_advise(mis, len);
+        return loadvm_postcopy_handle_advise(mis, len, errp);
 
     case MIG_CMD_POSTCOPY_LISTEN:
-        return loadvm_postcopy_handle_listen(mis);
+        return loadvm_postcopy_handle_listen(mis, errp);
 
     case MIG_CMD_POSTCOPY_RUN:
-        return loadvm_postcopy_handle_run(mis);
+        return loadvm_postcopy_handle_run(mis, errp);
 
     case MIG_CMD_POSTCOPY_RAM_DISCARD:
-        return loadvm_postcopy_ram_handle_discard(mis, len);
+        return loadvm_postcopy_ram_handle_discard(mis, len, errp);
 
     case MIG_CMD_POSTCOPY_RESUME:
-        return loadvm_postcopy_handle_resume(mis);
+        loadvm_postcopy_handle_resume(mis);
+        return 0;
 
     case MIG_CMD_RECV_BITMAP:
-        return loadvm_handle_recv_bitmap(mis, len);
+        return loadvm_handle_recv_bitmap(mis, len, errp);
 
     case MIG_CMD_ENABLE_COLO:
-        return loadvm_process_enable_colo(mis);
+        return loadvm_process_enable_colo(mis, errp);
 
     case MIG_CMD_SWITCHOVER_START:
-        return loadvm_postcopy_handle_switchover_start();
+        return loadvm_postcopy_handle_switchover_start(errp);
     }
 
     return 0;
@@ -2674,8 +2715,9 @@ static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
 }
 
 static int
-qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
+qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type, Error **errp)
 {
+    ERRP_GUARD();
     bool trace_downtime = (type == QEMU_VM_SECTION_FULL);
     uint32_t instance_id, version_id, section_id;
     int64_t start_ts, end_ts;
@@ -2686,8 +2728,8 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
     /* Read section start */
     section_id = qemu_get_be32(f);
     if (!qemu_get_counted_string(f, idstr)) {
-        error_report("Unable to read ID string for section %u",
-                     section_id);
+        error_setg(errp, "Unable to read ID string for section %u",
+                   section_id);
         return -EINVAL;
     }
     instance_id = qemu_get_be32(f);
@@ -2695,8 +2737,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
 
     ret = qemu_file_get_error(f);
     if (ret) {
-        error_report("%s: Failed to read instance/version ID: %d",
-                     __func__, ret);
+        error_setg(errp, "Failed to read instance/version ID: %d", ret);
         return ret;
     }
 
@@ -2705,17 +2746,17 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
     /* Find savevm section */
     se = find_se(idstr, instance_id);
     if (se == NULL) {
-        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
-                     "Make sure that your current VM setup matches your "
-                     "saved VM setup, including any hotplugged devices",
-                     idstr, instance_id);
+        error_setg(errp, "Unknown section or instance '%s' %"PRIu32". "
" + "Make sure that your current VM setup matches your " + "saved VM setup, including any hotplugged devices", + idstr, instance_id); return -EINVAL; } /* Validate version */ if (version_id > se->version_id) { - error_report("savevm: unsupported version %d for '%s' v%d", - version_id, idstr, se->version_id); + error_setg(errp, "unsupported version %d for '%s' v%d", + version_id, idstr, se->version_id); return -EINVAL; } se->load_version_id = version_id; @@ -2723,7 +2764,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type) /* Validate if it is a device's state */ if (xen_enabled() && se->is_ram) { - error_report("loadvm: %s RAM loading not allowed on Xen", idstr); + error_setg(errp, "loadvm: %s RAM loading not allowed on Xen", idstr); return -EINVAL; } @@ -2731,10 +2772,11 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type) start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME); } - ret = vmstate_load(f, se); + ret = vmstate_load(f, se, errp); if (ret < 0) { - error_report("error while loading state for instance 0x%"PRIx32" of" - " device '%s'", instance_id, idstr); + error_prepend(errp, + "error while loading state for instance 0x%"PRIx32" of" + " device '%s': ", instance_id, idstr); return ret; } @@ -2745,6 +2787,8 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type) } if (!check_section_footer(f, se)) { + error_setg(errp, "Section footer error, section_id: %d", + section_id); return -EINVAL; } @@ -2752,7 +2796,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type) } static int -qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) +qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type, Error **errp) { bool trace_downtime = (type == QEMU_VM_SECTION_END); int64_t start_ts, end_ts; @@ -2764,8 +2808,7 @@ qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) ret = qemu_file_get_error(f); if (ret) { - error_report("%s: Failed to read section ID: %d", - __func__, ret); + error_setg(errp, "Failed to read section ID: %d", ret); return ret; } @@ -2776,7 +2819,7 @@ qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) } } if (se == NULL) { - error_report("Unknown savevm section %d", section_id); + error_setg(errp, "Unknown section %d", section_id); return -EINVAL; } @@ -2784,10 +2827,8 @@ qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME); } - ret = vmstate_load(f, se); + ret = vmstate_load(f, se, errp); if (ret < 0) { - error_report("error while loading state section id %d(%s)", - section_id, se->idstr); return ret; } @@ -2798,40 +2839,50 @@ qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type) } if (!check_section_footer(f, se)) { + error_setg(errp, "Section footer error, section_id: %d", + section_id); return -EINVAL; } return 0; } -static int qemu_loadvm_state_header(QEMUFile *f) +static int qemu_loadvm_state_header(QEMUFile *f, Error **errp) { unsigned int v; int ret; v = qemu_get_be32(f); if (v != QEMU_VM_FILE_MAGIC) { - error_report("Not a migration stream"); + error_setg(errp, "Not a migration stream, magic: %x != %x", + v, QEMU_VM_FILE_MAGIC); return -EINVAL; } v = qemu_get_be32(f); if (v == QEMU_VM_FILE_VERSION_COMPAT) { - error_report("SaveVM v2 format is obsolete and don't work anymore"); + error_setg(errp, + "SaveVM v2 format is obsolete and no longer supported"); + return -ENOTSUP; } if (v != QEMU_VM_FILE_VERSION) { - error_report("Unsupported migration stream version"); + error_setg(errp, "Unsupported migration stream version, " + "file version %x != %x", + v, QEMU_VM_FILE_VERSION); return 
         return -ENOTSUP;
     }
 
     if (migrate_get_current()->send_configuration) {
-        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
-            error_report("Configuration section missing");
+        v = qemu_get_byte(f);
+        if (v != QEMU_VM_CONFIGURATION) {
+            error_setg(errp, "Configuration section missing, %x != %x",
+                       v, QEMU_VM_CONFIGURATION);
             return -EINVAL;
         }
-        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0,
+                                 errp);
         if (ret) {
            return ret;
        }
@@ -3019,8 +3070,10 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
     return true;
 }
 
-int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
+int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis,
+                           Error **errp)
 {
+    ERRP_GUARD();
     uint8_t section_type;
     int ret = 0;
 
@@ -3028,8 +3081,11 @@ retry:
     while (true) {
         section_type = qemu_get_byte(f);
 
-        ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
+        ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, errp);
         if (ret) {
+            error_prepend(errp,
+                          "Failed to load section ID: stream error: %d: ",
+                          ret);
             break;
         }
 
@@ -3037,20 +3093,20 @@ retry:
         switch (section_type) {
         case QEMU_VM_SECTION_START:
         case QEMU_VM_SECTION_FULL:
-            ret = qemu_loadvm_section_start_full(f, section_type);
+            ret = qemu_loadvm_section_start_full(f, section_type, errp);
             if (ret < 0) {
                 goto out;
             }
             break;
         case QEMU_VM_SECTION_PART:
         case QEMU_VM_SECTION_END:
-            ret = qemu_loadvm_section_part_end(f, section_type);
+            ret = qemu_loadvm_section_part_end(f, section_type, errp);
             if (ret < 0) {
                 goto out;
             }
             break;
         case QEMU_VM_COMMAND:
-            ret = loadvm_process_command(f);
+            ret = loadvm_process_command(f, errp);
             trace_qemu_loadvm_state_section_command(ret);
             if ((ret < 0) || (ret == LOADVM_QUIT)) {
                 goto out;
@@ -3060,7 +3116,7 @@ retry:
             /* This is the end of migration */
             goto out;
         default:
-            error_report("Unknown savevm section type %d", section_type);
+            error_setg(errp, "Unknown section type %d", section_type);
             ret = -EINVAL;
             goto out;
         }
@@ -3088,33 +3144,31 @@ out:
             migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
             /* Reset f to point to the newly created channel */
             f = mis->from_src_file;
+            error_free_or_abort(errp);
             goto retry;
         }
     }
     return ret;
 }
 
-int qemu_loadvm_state(QEMUFile *f)
+int qemu_loadvm_state(QEMUFile *f, Error **errp)
 {
     MigrationState *s = migrate_get_current();
     MigrationIncomingState *mis = migration_incoming_get_current();
-    Error *local_err = NULL;
     int ret;
 
-    if (qemu_savevm_state_blocked(&local_err)) {
-        error_report_err(local_err);
+    if (qemu_savevm_state_blocked(errp)) {
         return -EINVAL;
     }
 
     qemu_loadvm_thread_pool_create(mis);
 
-    ret = qemu_loadvm_state_header(f);
+    ret = qemu_loadvm_state_header(f, errp);
     if (ret) {
         return ret;
     }
 
-    if (qemu_loadvm_state_setup(f, &local_err) != 0) {
-        error_report_err(local_err);
+    if (qemu_loadvm_state_setup(f, errp) != 0) {
         return -EINVAL;
     }
 
@@ -3124,7 +3178,7 @@ int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_pre_loadvm();
 
-    ret = qemu_loadvm_state_main(f, mis);
+    ret = qemu_loadvm_state_main(f, mis, errp);
     qemu_event_set(&mis->main_thread_load_event);
 
     trace_qemu_loadvm_state_post_main(ret);
@@ -3142,8 +3196,15 @@ int qemu_loadvm_state(QEMUFile *f)
         if (migrate_has_error(migrate_get_current()) ||
             !qemu_loadvm_thread_pool_wait(s, mis)) {
             ret = -EINVAL;
+            error_setg(errp,
+                       "Error while loading vmstate");
         } else {
             ret = qemu_file_get_error(f);
+            if (ret < 0) {
+                error_setg(errp,
+                           "Error while loading vmstate: stream error: %d",
+                           ret);
+            }
         }
     }
     /*
@@ -3192,15 +3253,14 @@ int qemu_loadvm_state(QEMUFile *f)
     return ret;
 }
 
-int qemu_load_device_state(QEMUFile *f)
+int qemu_load_device_state(QEMUFile *f, Error **errp)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     int ret;
 
     /* Load QEMU_VM_SECTION_FULL section */
-    ret = qemu_loadvm_state_main(f, mis);
+    ret = qemu_loadvm_state_main(f, mis, errp);
     if (ret < 0) {
-        error_report("Failed to load device state: %d", ret);
         return ret;
     }
 
@@ -3408,6 +3468,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
 
 void qmp_xen_load_devices_state(const char *filename, Error **errp)
 {
+    ERRP_GUARD();
     QEMUFile *f;
     QIOChannelFile *ioc;
     int ret;
@@ -3429,10 +3490,10 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
     f = qemu_file_new_input(QIO_CHANNEL(ioc));
     object_unref(OBJECT(ioc));
 
-    ret = qemu_loadvm_state(f);
+    ret = qemu_loadvm_state(f, errp);
     qemu_fclose(f);
     if (ret < 0) {
-        error_setg(errp, "loading Xen device state failed");
+        error_prepend(errp, "loading Xen device state failed: ");
     }
     migration_incoming_state_destroy();
 }
@@ -3503,13 +3564,12 @@ bool load_snapshot(const char *name, const char *vmstate,
         ret = -EINVAL;
         goto err_drain;
     }
 
-    ret = qemu_loadvm_state(f);
+    ret = qemu_loadvm_state(f, errp);
     migration_incoming_state_destroy();
 
     bdrv_drain_all_end();
 
     if (ret < 0) {
-        error_setg(errp, "Error %d while loading VM state", ret);
         return false;
     }
 
diff --git a/migration/savevm.h b/migration/savevm.h
index 2d5e9c7..c337e3e 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -64,10 +64,11 @@ void qemu_savevm_send_colo_enable(QEMUFile *f);
 void qemu_savevm_live_state(QEMUFile *f);
 int qemu_save_device_state(QEMUFile *f);
 
-int qemu_loadvm_state(QEMUFile *f);
+int qemu_loadvm_state(QEMUFile *f, Error **errp);
 void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
-int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
-int qemu_load_device_state(QEMUFile *f);
+int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis,
+                           Error **errp);
+int qemu_load_device_state(QEMUFile *f, Error **errp);
 int qemu_loadvm_approve_switchover(void);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                     bool in_postcopy);
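Taken together, the savevm.c and savevm.h changes above move the incoming side from error_report() logging to Error propagation: every loader now fills the caller's Error ** at the failure site. A caller-side sketch of the new contract (restore_from_file() is a hypothetical helper invented for illustration, not part of the patch):

    /*
     * Hedged sketch: how the reworked qemu_loadvm_state() is expected to
     * be consumed.  restore_from_file() is hypothetical.
     */
    static bool restore_from_file(QEMUFile *f, Error **errp)
    {
        /* On failure the callee fills *errp; no error_report() needed. */
        if (qemu_loadvm_state(f, errp) < 0) {
            error_prepend(errp, "restore failed: ");
            return false;
        }
        return true;
    }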
diff --git a/migration/tls.c b/migration/tls.c
index 5cbf952..284a619 100644
--- a/migration/tls.c
+++ b/migration/tls.c
@@ -90,6 +90,10 @@ void migration_tls_channel_process_incoming(MigrationState *s,
 
     trace_migration_tls_incoming_handshake_start();
     qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-incoming");
+    if (migrate_postcopy_ram() || migrate_return_path()) {
+        qio_channel_set_feature(QIO_CHANNEL(tioc),
+                                QIO_CHANNEL_FEATURE_CONCURRENT_IO);
+    }
     qio_channel_tls_handshake(tioc,
                               migration_tls_incoming_handshake,
                               NULL,
@@ -149,6 +153,11 @@ void migration_tls_channel_connect(MigrationState *s,
     s->hostname = g_strdup(hostname);
     trace_migration_tls_outgoing_handshake_start(hostname);
     qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-outgoing");
+
+    if (migrate_postcopy_ram() || migrate_return_path()) {
+        qio_channel_set_feature(QIO_CHANNEL(tioc),
+                                QIO_CHANNEL_FEATURE_CONCURRENT_IO);
+    }
     qio_channel_tls_handshake(tioc,
                               migration_tls_outgoing_handshake,
                               s,
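The tls.c hunks reflect that postcopy and the return path read and write one TLS channel from separate threads, so both endpoints flag the channel before the handshake, presumably so the TLS layer can serialize record I/O. A minimal sketch of the pattern (assuming an already-created QIOChannelTLS *tioc; error handling elided):

    /*
     * Sketch under stated assumptions: mark an existing QIOChannelTLS as
     * safe for concurrent readers/writers before the handshake starts.
     */
    static void mark_concurrent_io(QIOChannelTLS *tioc)
    {
        if (migrate_postcopy_ram() || migrate_return_path()) {
            qio_channel_set_feature(QIO_CHANNEL(tioc),
                                    QIO_CHANNEL_FEATURE_CONCURRENT_IO);
        }
    }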
diff --git a/migration/trace-events b/migration/trace-events
index c506e11..e8edd1f 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -105,6 +105,7 @@ ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%"
 ram_postcopy_send_discard_bitmap(void) ""
 ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
 ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
+ram_save_complete(uint64_t dirty_pages, int done) "dirty=%" PRIu64 ", done=%d"
 ram_dirty_bitmap_request(char *str) "%s"
 ram_dirty_bitmap_reload_begin(char *str) "%s"
 ram_dirty_bitmap_reload_complete(char *str) "%s"
@@ -284,8 +285,6 @@ postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t
 postcopy_place_page(void *host_addr) "host=%p"
 postcopy_place_page_zero(void *host_addr) "host=%p"
 postcopy_ram_enable_notify(void) ""
-mark_postcopy_blocktime_begin(uint64_t addr, void *dd, uint32_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %u, cpu: %d, already_received: %d"
-mark_postcopy_blocktime_end(uint64_t addr, void *dd, uint32_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %u, affected_cpu: %d"
 postcopy_pause_fault_thread(void) ""
 postcopy_pause_fault_thread_continued(void) ""
 postcopy_pause_fast_load(void) ""
@@ -309,8 +308,10 @@ postcopy_preempt_tls_handshake(void) ""
 postcopy_preempt_new_channel(void) ""
 postcopy_preempt_thread_entry(void) ""
 postcopy_preempt_thread_exit(void) ""
-
-get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"
+postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
+postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exist: %d"
+postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu, int affected_non_cpus) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d, affected_non_cpus: %d"
+postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
 
 # exec.c
 migration_exec_outgoing(const char *cmd) "cmd=%s"
@@ -353,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
 cpr_state_load(const char *mode) "%s mode"
 cpr_transfer_input(const char *path) "%s"
 cpr_transfer_output(const char *path) "%s"
+cpr_exec(void) ""
 
 # block-dirty-bitmap.c
 send_bitmap_header_enter(void) ""
diff --git a/migration/vfio-stub.c b/migration/vfio-stub.c
new file mode 100644
index 0000000..f59ebe0
--- /dev/null
+++ b/migration/vfio-stub.c
@@ -0,0 +1,16 @@
+/*
+ * QEMU live migration - stubs for VFIO
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "migration.h"
+
+void migration_populate_vfio_info(MigrationInfo *info)
+{
+}
+
+void migration_reset_vfio_bytes_transferred(void)
+{
+}
diff --git a/migration/target.c b/migration/vfio.c
index 12fd399..af6ae2c 100644
--- a/migration/target.c
+++ b/migration/vfio.c
@@ -1,5 +1,5 @@
 /*
- * QEMU live migration - functions that need to be compiled target-specific
+ * QEMU live migration - VFIO
  *
  * This work is licensed under the terms of the GNU GPL, version 2
  * or (at your option) any later version.
@@ -8,13 +8,8 @@
 #include "qemu/osdep.h"
 #include "qapi/qapi-types-migration.h"
 #include "migration.h"
-#include CONFIG_DEVICES
-
-#ifdef CONFIG_VFIO
 #include "hw/vfio/vfio-migration.h"
-#endif
 
-#ifdef CONFIG_VFIO
 void migration_populate_vfio_info(MigrationInfo *info)
 {
     if (vfio_migration_active()) {
@@ -27,12 +22,3 @@ void migration_reset_vfio_bytes_transferred(void)
 {
     vfio_migration_reset_bytes_transferred();
 }
-#else
-void migration_populate_vfio_info(MigrationInfo *info)
-{
-}
-
-void migration_reset_vfio_bytes_transferred(void)
-{
-}
-#endif
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index 741a588..4b01dc1 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -19,6 +19,7 @@
 #include "qemu/error-report.h"
 #include "qemu/queue.h"
 #include "trace.h"
+#include "qapi/error.h"
 
 /* bool */
 
@@ -321,6 +322,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
                   const VMStateField *field)
 {
     int32_t *v = pv;
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        qemu_get_sbe32s(f, v);
+        return 0;
+    }
     *v = qemu_file_get_fd(f);
     return 0;
 }
@@ -329,6 +334,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
                   const VMStateField *field, JSONWriter *vmdesc)
 {
     int32_t *v = pv;
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        qemu_put_sbe32s(f, v);
+        return 0;
+    }
     return qemu_file_put_fd(f, *v);
 }
 
@@ -543,13 +552,17 @@ static int get_tmp(QEMUFile *f, void *pv, size_t size,
                    const VMStateField *field)
 {
     int ret;
+    Error *local_err = NULL;
     const VMStateDescription *vmsd = field->vmsd;
     int version_id = field->version_id;
     void *tmp = g_malloc(size);
 
     /* Writes the parent field which is at the start of the tmp */
     *(void **)tmp = pv;
-    ret = vmstate_load_state(f, vmsd, tmp, version_id);
+    ret = vmstate_load_state(f, vmsd, tmp, version_id, &local_err);
+    if (ret < 0) {
+        error_report_err(local_err);
+    }
     g_free(tmp);
     return ret;
 }
@@ -560,10 +573,14 @@ static int put_tmp(QEMUFile *f, void *pv, size_t size,
     const VMStateDescription *vmsd = field->vmsd;
     void *tmp = g_malloc(size);
     int ret;
+    Error *local_err = NULL;
 
     /* Writes the parent field which is at the start of the tmp */
     *(void **)tmp = pv;
-    ret = vmstate_save_state(f, vmsd, tmp, vmdesc);
+    ret = vmstate_save_state(f, vmsd, tmp, vmdesc, &local_err);
+    if (ret) {
+        error_report_err(local_err);
+    }
     g_free(tmp);
 
     return ret;
@@ -626,6 +643,7 @@ static int get_qtailq(QEMUFile *f, void *pv, size_t unused_size,
                       const VMStateField *field)
 {
     int ret = 0;
+    Error *local_err = NULL;
     const VMStateDescription *vmsd = field->vmsd;
     /* size of a QTAILQ element */
     size_t size = field->size;
@@ -649,8 +667,9 @@ static int get_qtailq(QEMUFile *f, void *pv, size_t unused_size,
 
     while (qemu_get_byte(f)) {
         elm = g_malloc(size);
-        ret = vmstate_load_state(f, vmsd, elm, version_id);
+        ret = vmstate_load_state(f, vmsd, elm, version_id, &local_err);
         if (ret) {
+            error_report_err(local_err);
             return ret;
         }
         QTAILQ_RAW_INSERT_TAIL(pv, elm, entry_offset);
@@ -669,13 +688,15 @@ static int put_qtailq(QEMUFile *f, void *pv, size_t unused_size,
     size_t entry_offset = field->start;
     void *elm;
     int ret;
+    Error *local_err = NULL;
 
     trace_put_qtailq(vmsd->name, vmsd->version_id);
 
     QTAILQ_RAW_FOREACH(elm, pv, entry_offset) {
         qemu_put_byte(f, true);
-        ret = vmstate_save_state(f, vmsd, elm, vmdesc);
+        ret = vmstate_save_state(f, vmsd, elm, vmdesc, &local_err);
         if (ret) {
+            error_report_err(local_err);
             return ret;
         }
     }
@@ -704,6 +725,7 @@ static gboolean put_gtree_elem(gpointer key, gpointer value, gpointer data)
     struct put_gtree_data *capsule = (struct put_gtree_data *)data;
     QEMUFile *f = capsule->f;
     int ret;
+    Error *local_err = NULL;
 
     qemu_put_byte(f, true);
 
@@ -711,16 +733,20 @@ static gboolean put_gtree_elem(gpointer key, gpointer value, gpointer data)
     if (!capsule->key_vmsd) {
         qemu_put_be64(f, (uint64_t)(uintptr_t)(key)); /* direct key */
     } else {
-        ret = vmstate_save_state(f, capsule->key_vmsd, key, capsule->vmdesc);
+        ret = vmstate_save_state(f, capsule->key_vmsd, key, capsule->vmdesc,
+                                 &local_err);
         if (ret) {
+            error_report_err(local_err);
            capsule->ret = ret;
            return true;
        }
    }
 
     /* put the data */
-    ret = vmstate_save_state(f, capsule->val_vmsd, value, capsule->vmdesc);
+    ret = vmstate_save_state(f, capsule->val_vmsd, value, capsule->vmdesc,
+                             &local_err);
     if (ret) {
+        error_report_err(local_err);
         capsule->ret = ret;
         return true;
     }
@@ -772,6 +798,7 @@ static int get_gtree(QEMUFile *f, void *pv, size_t unused_size,
     GTree *tree = *pval;
     void *key, *val;
     int ret = 0;
+    Error *local_err = NULL;
 
     /* in case of direct key, the key vmsd can be {}, ie. check fields */
     if (!direct_key && version_id > key_vmsd->version_id) {
@@ -803,18 +830,16 @@ static int get_gtree(QEMUFile *f, void *pv, size_t unused_size,
             key = (void *)(uintptr_t)qemu_get_be64(f);
         } else {
             key = g_malloc0(key_size);
-            ret = vmstate_load_state(f, key_vmsd, key, version_id);
+            ret = vmstate_load_state(f, key_vmsd, key, version_id, &local_err);
             if (ret) {
-                error_report("%s : failed to load %s (%d)",
-                             field->name, key_vmsd->name, ret);
+                error_report_err(local_err);
                 goto key_error;
             }
         }
         val = g_malloc0(val_size);
-        ret = vmstate_load_state(f, val_vmsd, val, version_id);
+        ret = vmstate_load_state(f, val_vmsd, val, version_id, &local_err);
         if (ret) {
-            error_report("%s : failed to load %s (%d)",
-                         field->name, val_vmsd->name, ret);
+            error_report_err(local_err);
             goto val_error;
         }
         g_tree_insert(tree, key, val);
@@ -851,14 +876,14 @@ static int put_qlist(QEMUFile *f, void *pv, size_t unused_size,
     size_t entry_offset = field->start;
     void *elm;
     int ret;
+    Error *local_err = NULL;
 
     trace_put_qlist(field->name, vmsd->name, vmsd->version_id);
 
     QLIST_RAW_FOREACH(elm, pv, entry_offset) {
         qemu_put_byte(f, true);
-        ret = vmstate_save_state(f, vmsd, elm, vmdesc);
+        ret = vmstate_save_state(f, vmsd, elm, vmdesc, &local_err);
         if (ret) {
-            error_report("%s: failed to save %s (%d)", field->name,
-                         vmsd->name, ret);
+            error_report_err(local_err);
             return ret;
         }
     }
@@ -872,6 +897,7 @@ static int get_qlist(QEMUFile *f, void *pv, size_t unused_size,
                      const VMStateField *field)
 {
     int ret = 0;
+    Error *local_err = NULL;
     const VMStateDescription *vmsd = field->vmsd;
     /* size of a QLIST element */
     size_t size = field->size;
@@ -892,10 +918,9 @@ static int get_qlist(QEMUFile *f, void *pv, size_t unused_size,
 
     while (qemu_get_byte(f)) {
         elm = g_malloc(size);
-        ret = vmstate_load_state(f, vmsd, elm, version_id);
+        ret = vmstate_load_state(f, vmsd, elm, version_id, &local_err);
         if (ret) {
-            error_report("%s: failed to load %s (%d)", field->name,
-                         vmsd->name, ret);
+            error_report_err(local_err);
             g_free(elm);
             return ret;
         }
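The get_fd()/put_fd() hunks above encode the assumption behind cpr-exec: exec() preserves open file descriptors whose close-on-exec flag is cleared, so in MIG_MODE_CPR_EXEC the fd number itself is serialized, while other modes still hand the descriptor over with qemu_file_put_fd(). A standalone sketch of that invariant (plain POSIX, not QEMU code; argv is assumed to re-launch the same binary):

    /*
     * Plain POSIX sketch, not QEMU code: an fd with FD_CLOEXEC cleared
     * keeps its numeric value across exec(), which is why cpr-exec can
     * store the int32 in the stream instead of transferring the fd.
     */
    #include <fcntl.h>
    #include <unistd.h>

    static void keep_fd_across_exec(int fd, char *const argv[])
    {
        int flags = fcntl(fd, F_GETFD);

        fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC); /* survive exec() */
        execv(argv[0], argv); /* on success, fd is still open as 'fd' */
    }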
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 5feaa32..81eadde 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -25,7 +25,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
                                    void *opaque, JSONWriter *vmdesc,
                                    Error **errp);
 static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
-                                   void *opaque);
+                                   void *opaque, Error **errp);
 
 /* Whether this field should exist for either save or load the VM? */
 static bool
@@ -132,29 +132,43 @@ static void vmstate_handle_alloc(void *ptr, const VMStateField *field,
 }
 
 int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
-                       void *opaque, int version_id)
+                       void *opaque, int version_id, Error **errp)
 {
+    ERRP_GUARD();
     const VMStateField *field = vmsd->fields;
     int ret = 0;
 
     trace_vmstate_load_state(vmsd->name, version_id);
     if (version_id > vmsd->version_id) {
-        error_report("%s: incoming version_id %d is too new "
-                     "for local version_id %d",
-                     vmsd->name, version_id, vmsd->version_id);
+        error_setg(errp, "%s: incoming version_id %d is too new "
+                   "for local version_id %d",
+                   vmsd->name, version_id, vmsd->version_id);
         trace_vmstate_load_state_end(vmsd->name, "too new", -EINVAL);
         return -EINVAL;
     }
     if (version_id < vmsd->minimum_version_id) {
-        error_report("%s: incoming version_id %d is too old "
-                     "for local minimum version_id %d",
-                     vmsd->name, version_id, vmsd->minimum_version_id);
+        error_setg(errp, "%s: incoming version_id %d is too old "
+                   "for local minimum version_id %d",
+                   vmsd->name, version_id, vmsd->minimum_version_id);
         trace_vmstate_load_state_end(vmsd->name, "too old", -EINVAL);
         return -EINVAL;
     }
-    if (vmsd->pre_load) {
+    if (vmsd->pre_load_errp) {
+        ret = vmsd->pre_load_errp(opaque, errp);
+        if (ret < 0) {
+            error_prepend(errp, "pre load hook failed for: '%s', "
+                          "version_id: %d, minimum version_id: %d, "
+                          "ret: %d: ", vmsd->name, vmsd->version_id,
+                          vmsd->minimum_version_id, ret);
+            return ret;
+        }
+    } else if (vmsd->pre_load) {
         ret = vmsd->pre_load(opaque);
         if (ret) {
+            error_setg(errp, "pre load hook failed for: '%s', "
+                       "version_id: %d, minimum version_id: %d, ret: %d",
+                       vmsd->name, vmsd->version_id, vmsd->minimum_version_id,
+                       ret);
             return ret;
         }
     }
@@ -192,13 +206,21 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
 
             if (inner_field->flags & VMS_STRUCT) {
                 ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
-                                         inner_field->vmsd->version_id);
+                                         inner_field->vmsd->version_id,
+                                         errp);
             } else if (inner_field->flags & VMS_VSTRUCT) {
                 ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
-                                         inner_field->struct_version_id);
+                                         inner_field->struct_version_id,
+                                         errp);
             } else {
                 ret = inner_field->info->get(f, curr_elem, size,
                                              inner_field);
+                if (ret < 0) {
+                    error_setg(errp,
+                               "Failed to load element of type %s for %s: "
+                               "%d", inner_field->info->name,
+                               inner_field->name, ret);
+                }
             }
            /* If we used a fake temp field.. free it now */
@@ -208,30 +230,47 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
 
             if (ret >= 0) {
                 ret = qemu_file_get_error(f);
+                if (ret < 0) {
+                    error_setg(errp,
+                               "Failed to load %s state: stream error: %d",
+                               vmsd->name, ret);
+                }
             }
             if (ret < 0) {
                 qemu_file_set_error(f, ret);
-                error_report("Failed to load %s:%s", vmsd->name,
-                             field->name);
                 trace_vmstate_load_field_error(field->name, ret);
                 return ret;
             }
         }
     } else if (field->flags & VMS_MUST_EXIST) {
-        error_report("Input validation failed: %s/%s",
-                     vmsd->name, field->name);
+        error_setg(errp, "Input validation failed: %s/%s version_id: %d",
+                   vmsd->name, field->name, vmsd->version_id);
         return -1;
     }
     field++;
 }
 assert(field->flags == VMS_END);
-    ret = vmstate_subsection_load(f, vmsd, opaque);
+    ret = vmstate_subsection_load(f, vmsd, opaque, errp);
     if (ret != 0) {
         qemu_file_set_error(f, ret);
         return ret;
     }
-    if (vmsd->post_load) {
+    if (vmsd->post_load_errp) {
+        ret = vmsd->post_load_errp(opaque, version_id, errp);
+        if (ret < 0) {
+            error_prepend(errp, "post load hook failed for: %s, version_id: "
+                          "%d, minimum_version: %d, ret: %d: ", vmsd->name,
+                          vmsd->version_id, vmsd->minimum_version_id, ret);
+        }
+    } else if (vmsd->post_load) {
         ret = vmsd->post_load(opaque, version_id);
+        if (ret < 0) {
+            error_setg(errp,
+                       "post load hook failed for: %s, version_id: %d, "
+                       "minimum_version: %d, ret: %d",
+                       vmsd->name, vmsd->version_id, vmsd->minimum_version_id,
+                       ret);
+        }
     }
     trace_vmstate_load_state_end(vmsd->name, "end", ret);
     return ret;
@@ -384,12 +423,6 @@ bool vmstate_section_needed(const VMStateDescription *vmsd, void *opaque)
 
 int vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
-                       void *opaque, JSONWriter *vmdesc_id)
-{
-    return vmstate_save_state_v(f, vmsd, opaque, vmdesc_id, vmsd->version_id, NULL);
-}
-
-int vmstate_save_state_with_err(QEMUFile *f, const VMStateDescription *vmsd,
                        void *opaque, JSONWriter *vmdesc_id, Error **errp)
 {
     return vmstate_save_state_v(f, vmsd, opaque, vmdesc_id, vmsd->version_id, errp);
@@ -398,12 +431,20 @@ int vmstate_save_state_with_err(QEMUFile *f, const VMStateDescription *vmsd,
 int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
                          void *opaque, JSONWriter *vmdesc, int version_id, Error **errp)
 {
+    ERRP_GUARD();
     int ret = 0;
     const VMStateField *field = vmsd->fields;
 
     trace_vmstate_save_state_top(vmsd->name);
 
-    if (vmsd->pre_save) {
+    if (vmsd->pre_save_errp) {
+        ret = vmsd->pre_save_errp(opaque, errp);
+        trace_vmstate_save_state_pre_save_res(vmsd->name, ret);
+        if (ret < 0) {
+            error_prepend(errp, "pre-save for %s failed, ret: %d: ",
+                          vmsd->name, ret);
+        }
+    } else if (vmsd->pre_save) {
         ret = vmsd->pre_save(opaque);
         trace_vmstate_save_state_pre_save_res(vmsd->name, ret);
         if (ret) {
@@ -490,7 +531,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
 
                 if (inner_field->flags & VMS_STRUCT) {
                     ret = vmstate_save_state(f, inner_field->vmsd,
-                                             curr_elem, vmdesc_loop);
+                                             curr_elem, vmdesc_loop, errp);
                 } else if (inner_field->flags & VMS_VSTRUCT) {
                     ret = vmstate_save_state_v(f, inner_field->vmsd,
                                                curr_elem, vmdesc_loop,
@@ -566,8 +607,9 @@ vmstate_get_subsection(const VMStateDescription * const *sub,
 }
 
 static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
-                                   void *opaque)
+                                   void *opaque, Error **errp)
 {
+    ERRP_GUARD();
     trace_vmstate_subsection_load(vmsd->name);
 
     while (qemu_peek_byte(f, 0) == QEMU_VM_SUBSECTION) {
@@ -598,6 +640,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
         sub_vmsd = vmstate_get_subsection(vmsd->subsections, idstr);
         if (sub_vmsd == NULL) {
             trace_vmstate_subsection_load_bad(vmsd->name, idstr, "(lookup)");
+            error_setg(errp, "VM subsection '%s' in '%s' does not exist",
+                       idstr, vmsd->name);
             return -ENOENT;
         }
         qemu_file_skip(f, 1); /* subsection */
@@ -605,9 +649,12 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
         qemu_file_skip(f, len); /* idstr */
         version_id = qemu_get_be32(f);
 
-        ret = vmstate_load_state(f, sub_vmsd, opaque, version_id);
+        ret = vmstate_load_state(f, sub_vmsd, opaque, version_id, errp);
         if (ret) {
             trace_vmstate_subsection_load_bad(vmsd->name, idstr, "(child)");
+            error_prepend(errp,
+                          "Loading VM subsection '%s' in '%s' failed: %d: ",
+                          idstr, vmsd->name, ret);
             return ret;
         }
     }
@@ -646,7 +693,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
             qemu_put_byte(f, len);
             qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len);
             qemu_put_be32(f, vmsdsub->version_id);
-            ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp);
+            ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc, errp);
             if (ret) {
                 return ret;
             }
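The vmstate.c changes add errp-aware variants of the VMSD hooks (pre_load_errp, post_load_errp, pre_save_errp) alongside the legacy ones, so a device can describe a failure instead of returning a bare code. A hedged sketch of a converted device (DemoDev, its field, and the sanity check are invented for illustration):

    /* Illustrative only: a made-up device using the new post_load_errp hook. */
    typedef struct DemoDev {
        uint32_t mode;
    } DemoDev;

    static int demo_post_load_errp(void *opaque, int version_id, Error **errp)
    {
        DemoDev *s = opaque;

        if (s->mode > 3) {
            /* vmstate_load_state() will prepend section context to this. */
            error_setg(errp, "invalid mode %u", s->mode);
            return -EINVAL;
        }
        return 0;
    }

    static const VMStateDescription vmstate_demo = {
        .name = "demo-dev",
        .version_id = 1,
        .minimum_version_id = 1,
        .post_load_errp = demo_post_load_errp,
        .fields = (const VMStateField[]) {
            VMSTATE_UINT32(mode, DemoDev),
            VMSTATE_END_OF_LIST()
        },
    };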