Diffstat (limited to 'migration')
 migration/block-active.c         |  48
 migration/block-dirty-bitmap.c   |   4
 migration/channel-block.c        |   4
 migration/channel.c              |   9
 migration/colo.c                 |  34
 migration/cpr-transfer.c         |  74
 migration/cpr.c                  | 266
 migration/cpu-throttle.c         | 199
 migration/dirtyrate.c            |  28
 migration/dirtyrate.h            |   2
 migration/fd.c                   |  27
 migration/file.c                 |   7
 migration/file.h                 |   2
 migration/global_state.c         |   2
 migration/meson.build            |   7
 migration/migration-hmp-cmds.c   | 198
 migration/migration.c            | 942
 migration/migration.h            |  87
 migration/multifd-device-state.c | 212
 migration/multifd-nocomp.c       | 468
 migration/multifd-qatzip.c       | 395
 migration/multifd-qpl.c          |  92
 migration/multifd-uadk.c         | 108
 migration/multifd-zero-page.c    |  37
 migration/multifd-zlib.c         | 101
 migration/multifd-zstd.c         | 108
 migration/multifd.c              | 824
 migration/multifd.h              | 251
 migration/options.c              | 129
 migration/options.h              |   7
 migration/postcopy-ram.c         |  93
 migration/qemu-file.c            |  86
 migration/qemu-file.h            |   4
 migration/ram.c                  | 396
 migration/ram.h                  |  29
 migration/rdma.c                 | 197
 migration/rdma.h                 |  12
 migration/savevm.c               | 452
 migration/savevm.h               |  13
 migration/socket.c               |  18
 migration/socket.h               |   1
 migration/target.c               |   8
 migration/tls.c                  |   5
 migration/tls.h                  |   2
 migration/trace-events           |  35
 migration/vmstate-types.c        |  26
 migration/vmstate.c              | 168
 47 files changed, 4197 insertions(+), 2020 deletions(-)
diff --git a/migration/block-active.c b/migration/block-active.c
new file mode 100644
index 0000000..40e986a
--- /dev/null
+++ b/migration/block-active.c
@@ -0,0 +1,48 @@
+/*
+ * Block activation tracking for migration purposes
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "qapi/error.h"
+#include "migration/migration.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+bool migration_block_activate(Error **errp)
+{
+ ERRP_GUARD();
+
+ assert(bql_locked());
+
+ trace_migration_block_activation("active");
+
+ bdrv_activate_all(errp);
+ if (*errp) {
+ error_report_err(error_copy(*errp));
+ return false;
+ }
+
+ return true;
+}
+
+bool migration_block_inactivate(void)
+{
+ int ret;
+
+ assert(bql_locked());
+
+ trace_migration_block_activation("inactive");
+
+ ret = bdrv_inactivate_all();
+ if (ret) {
+ error_report("%s: bdrv_inactivate_all() failed: %d",
+ __func__, ret);
+ return false;
+ }
+
+ return true;
+}
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index a7d5504..f2c352d 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -62,8 +62,8 @@
#include "block/block.h"
#include "block/block_int.h"
#include "block/dirty-bitmap.h"
-#include "sysemu/block-backend.h"
-#include "sysemu/runstate.h"
+#include "system/block-backend.h"
+#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "migration/misc.h"
diff --git a/migration/channel-block.c b/migration/channel-block.c
index fff8d87..97de5a6 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -123,7 +123,7 @@ qio_channel_block_seek(QIOChannel *ioc,
bioc->offset = offset;
break;
case SEEK_CUR:
- bioc->offset += whence;
+ bioc->offset += offset;
break;
case SEEK_END:
error_setg(errp, "Size of VMstate region is unknown");
@@ -170,7 +170,7 @@ qio_channel_block_set_aio_fd_handler(QIOChannel *ioc,
static void
qio_channel_block_class_init(ObjectClass *klass,
- void *class_data G_GNUC_UNUSED)
+ const void *class_data G_GNUC_UNUSED)
{
QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
diff --git a/migration/channel.c b/migration/channel.c
index f9de064..a547b1f 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -33,6 +33,7 @@
void migration_channel_process_incoming(QIOChannel *ioc)
{
MigrationState *s = migrate_get_current();
+ MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
trace_migration_set_incoming_channel(
@@ -47,6 +48,10 @@ void migration_channel_process_incoming(QIOChannel *ioc)
if (local_err) {
error_report_err(local_err);
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+ if (mis->exit_on_error) {
+ exit(EXIT_FAILURE);
+ }
}
}
@@ -74,7 +79,7 @@ void migration_channel_connect(MigrationState *s,
if (!error) {
/* tls_channel_connect will call back to this
* function after the TLS handshake,
- * so we mustn't call migrate_fd_connect until then
+ * so we mustn't call migration_connect until then
*/
return;
@@ -89,7 +94,7 @@ void migration_channel_connect(MigrationState *s,
qemu_mutex_unlock(&s->qemu_file_lock);
}
}
- migrate_fd_connect(s, error);
+ migration_connect(s, error);
error_free(error);
}
diff --git a/migration/colo.c b/migration/colo.c
index 6449490..e0f713c 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -11,7 +11,7 @@
*/
#include "qemu/osdep.h"
-#include "sysemu/sysemu.h"
+#include "system/system.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "migration.h"
@@ -30,8 +30,8 @@
#include "net/colo.h"
#include "block/block.h"
#include "qapi/qapi-events-migration.h"
-#include "sysemu/cpus.h"
-#include "sysemu/runstate.h"
+#include "system/cpus.h"
+#include "system/runstate.h"
#include "net/filter.h"
#include "options.h"
@@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void)
return;
}
/* Notify COLO incoming thread that failover work is finished */
- qemu_sem_post(&mis->colo_incoming_sem);
+ qemu_event_set(&mis->colo_incoming_event);
/* For Secondary VM, jump to incoming co */
if (mis->colo_incoming_co) {
@@ -195,7 +195,7 @@ static void primary_vm_do_failover(void)
}
/* Notify COLO thread that failover work is finished */
- qemu_sem_post(&s->colo_exit_sem);
+ qemu_event_set(&s->colo_exit_event);
}
COLOMode get_colo_mode(void)
@@ -452,6 +452,9 @@ static int colo_do_checkpoint_transaction(MigrationState *s,
bql_unlock();
goto out;
}
+
+ qemu_savevm_maybe_send_switchover_start(s->to_dst_file);
+
/* Note: device state is saved into buffer */
ret = qemu_save_device_state(fb);
@@ -617,8 +620,8 @@ out:
}
/* Hope this not to be too long to wait here */
- qemu_sem_wait(&s->colo_exit_sem);
- qemu_sem_destroy(&s->colo_exit_sem);
+ qemu_event_wait(&s->colo_exit_event);
+ qemu_event_destroy(&s->colo_exit_event);
/*
* It is safe to unregister notifier after failover finished.
@@ -648,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s)
s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST,
colo_checkpoint_notify_timer, NULL);
- qemu_sem_init(&s->colo_exit_sem, 0);
+ qemu_event_init(&s->colo_exit_event, false);
colo_process_checkpoint(s);
bql_lock();
}
@@ -805,11 +808,11 @@ void colo_shutdown(void)
case COLO_MODE_PRIMARY:
s = migrate_get_current();
qemu_event_set(&s->colo_checkpoint_event);
- qemu_sem_post(&s->colo_exit_sem);
+ qemu_event_set(&s->colo_exit_event);
break;
case COLO_MODE_SECONDARY:
mis = migration_incoming_get_current();
- qemu_sem_post(&mis->colo_incoming_sem);
+ qemu_event_set(&mis->colo_incoming_event);
break;
default:
break;
@@ -824,7 +827,7 @@ static void *colo_process_incoming_thread(void *opaque)
Error *local_err = NULL;
rcu_register_thread();
- qemu_sem_init(&mis->colo_incoming_sem, 0);
+ qemu_event_init(&mis->colo_incoming_event, false);
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_COLO);
@@ -836,7 +839,7 @@ static void *colo_process_incoming_thread(void *opaque)
/* Make sure all file formats throw away their mutable metadata */
bql_lock();
- bdrv_activate_all(&local_err);
+ migration_block_activate(&local_err);
bql_unlock();
if (local_err) {
error_report_err(local_err);
@@ -920,8 +923,8 @@ out:
}
/* Hope this not to be too long to loop here */
- qemu_sem_wait(&mis->colo_incoming_sem);
- qemu_sem_destroy(&mis->colo_incoming_sem);
+ qemu_event_wait(&mis->colo_incoming_event);
+ qemu_event_destroy(&mis->colo_incoming_event);
rcu_unregister_thread();
return NULL;
@@ -935,7 +938,8 @@ void coroutine_fn colo_incoming_co(void)
assert(bql_locked());
assert(migration_incoming_colo_enabled());
- qemu_thread_create(&th, "mig/dst/colo", colo_process_incoming_thread,
+ qemu_thread_create(&th, MIGRATION_THREAD_DST_COLO,
+ colo_process_incoming_thread,
mis, QEMU_THREAD_JOINABLE);
mis->colo_incoming_co = qemu_coroutine_self();
diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
new file mode 100644
index 0000000..00371d1
--- /dev/null
+++ b/migration/cpr-transfer.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022, 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "io/channel-file.h"
+#include "io/channel-socket.h"
+#include "io/net-listener.h"
+#include "migration/cpr.h"
+#include "migration/migration.h"
+#include "migration/savevm.h"
+#include "migration/qemu-file.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp)
+{
+ MigrationAddress *addr = channel->addr;
+
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
+
+ g_autoptr(QIOChannelSocket) sioc = qio_channel_socket_new();
+ QIOChannel *ioc = QIO_CHANNEL(sioc);
+ SocketAddress *saddr = &addr->u.socket;
+
+ if (qio_channel_socket_connect_sync(sioc, saddr, errp) < 0) {
+ return NULL;
+ }
+ trace_cpr_transfer_output(addr->u.socket.u.q_unix.path);
+ qio_channel_set_name(ioc, "cpr-out");
+ return qemu_file_new_output(ioc);
+
+ } else {
+ error_setg(errp, "bad cpr channel address; must be unix");
+ return NULL;
+ }
+}
+
+QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
+{
+ MigrationAddress *addr = channel->addr;
+
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
+ (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
+
+ g_autoptr(QIOChannelSocket) sioc = NULL;
+ SocketAddress *saddr = &addr->u.socket;
+ g_autoptr(QIONetListener) listener = qio_net_listener_new();
+ QIOChannel *ioc;
+
+ qio_net_listener_set_name(listener, "cpr-socket-listener");
+ if (qio_net_listener_open_sync(listener, saddr, 1, errp) < 0) {
+ return NULL;
+ }
+
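+        /* Block here until the sending QEMU connects */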
+ sioc = qio_net_listener_wait_client(listener);
+ ioc = QIO_CHANNEL(sioc);
+ trace_cpr_transfer_input(
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ?
+ addr->u.socket.u.q_unix.path : addr->u.socket.u.fd.str);
+ qio_channel_set_name(ioc, "cpr-in");
+ return qemu_file_new_input(ioc);
+
+ } else {
+        error_setg(errp, "bad cpr channel socket type; must be unix or fd");
+ return NULL;
+ }
+}
diff --git a/migration/cpr.c b/migration/cpr.c
new file mode 100644
index 0000000..a50a57e
--- /dev/null
+++ b/migration/cpr.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2021-2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "migration/cpr.h"
+#include "migration/misc.h"
+#include "migration/options.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/vmstate.h"
+#include "system/runstate.h"
+#include "trace.h"
+
+/*************************************************************************/
+/* cpr state container for all information to be saved. */
+
+typedef QLIST_HEAD(CprFdList, CprFd) CprFdList;
+
+typedef struct CprState {
+ CprFdList fds;
+} CprState;
+
+static CprState cpr_state;
+
+/****************************************************************************/
+
+typedef struct CprFd {
+ char *name;
+ unsigned int namelen;
+ int id;
+ int fd;
+ QLIST_ENTRY(CprFd) next;
+} CprFd;
+
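+/*
+ * VMSTATE_FD transfers the fd itself to the target process (via
+ * SCM_RIGHTS on the unix socket), not just its integer value.
+ */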
+static const VMStateDescription vmstate_cpr_fd = {
+ .name = "cpr fd",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(namelen, CprFd),
+ VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen),
+ VMSTATE_INT32(id, CprFd),
+ VMSTATE_FD(fd, CprFd),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+void cpr_save_fd(const char *name, int id, int fd)
+{
+ CprFd *elem = g_new0(CprFd, 1);
+
+ trace_cpr_save_fd(name, id, fd);
+ elem->name = g_strdup(name);
+ elem->namelen = strlen(name) + 1;
+ elem->id = id;
+ elem->fd = fd;
+ QLIST_INSERT_HEAD(&cpr_state.fds, elem, next);
+}
+
+static CprFd *find_fd(CprFdList *head, const char *name, int id)
+{
+ CprFd *elem;
+
+ QLIST_FOREACH(elem, head, next) {
+ if (!strcmp(elem->name, name) && elem->id == id) {
+ return elem;
+ }
+ }
+ return NULL;
+}
+
+void cpr_delete_fd(const char *name, int id)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+
+ if (elem) {
+ QLIST_REMOVE(elem, next);
+ g_free(elem->name);
+ g_free(elem);
+ }
+
+ trace_cpr_delete_fd(name, id);
+}
+
+int cpr_find_fd(const char *name, int id)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+ int fd = elem ? elem->fd : -1;
+
+ trace_cpr_find_fd(name, id, fd);
+ return fd;
+}
+
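+/*
+ * Save @fd under (name, id) unless one was already saved; abort if a
+ * different value was already saved.
+ */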
+void cpr_resave_fd(const char *name, int id, int fd)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+ int old_fd = elem ? elem->fd : -1;
+
+ if (old_fd < 0) {
+ cpr_save_fd(name, id, fd);
+ } else if (old_fd != fd) {
+ error_setg(&error_fatal,
+ "internal error: cpr fd '%s' id %d value %d "
+ "already saved with a different value %d",
+ name, id, fd, old_fd);
+ }
+}
+
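+/* Return the fd already saved under (name, id), else open @path and save it */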
+int cpr_open_fd(const char *path, int flags, const char *name, int id,
+ Error **errp)
+{
+ int fd = cpr_find_fd(name, id);
+
+ if (fd < 0) {
+ fd = qemu_open(path, flags, errp);
+ if (fd >= 0) {
+ cpr_save_fd(name, id, fd);
+ }
+ }
+ return fd;
+}
+
+/*************************************************************************/
+#define CPR_STATE "CprState"
+
+static const VMStateDescription vmstate_cpr_state = {
+ .name = CPR_STATE,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next),
+ VMSTATE_END_OF_LIST()
+ }
+};
+/*************************************************************************/
+
+static QEMUFile *cpr_state_file;
+
+QIOChannel *cpr_state_ioc(void)
+{
+ return qemu_file_get_ioc(cpr_state_file);
+}
+
+static MigMode incoming_mode = MIG_MODE_NONE;
+
+MigMode cpr_get_incoming_mode(void)
+{
+ return incoming_mode;
+}
+
+void cpr_set_incoming_mode(MigMode mode)
+{
+ incoming_mode = mode;
+}
+
+bool cpr_is_incoming(void)
+{
+ return incoming_mode != MIG_MODE_NONE;
+}
+
+int cpr_state_save(MigrationChannel *channel, Error **errp)
+{
+ int ret;
+ QEMUFile *f;
+ MigMode mode = migrate_mode();
+
+ trace_cpr_state_save(MigMode_str(mode));
+
+ if (mode == MIG_MODE_CPR_TRANSFER) {
+ g_assert(channel);
+ f = cpr_transfer_output(channel, errp);
+ } else {
+ return 0;
+ }
+ if (!f) {
+ return -1;
+ }
+
+ qemu_put_be32(f, QEMU_CPR_FILE_MAGIC);
+ qemu_put_be32(f, QEMU_CPR_FILE_VERSION);
+
+ ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0);
+ if (ret) {
+ error_setg(errp, "vmstate_save_state error %d", ret);
+ qemu_fclose(f);
+ return ret;
+ }
+
+ /*
+ * Close the socket only partially so we can later detect when the other
+ * end closes by getting a HUP event.
+ */
+ qemu_fflush(f);
+ qio_channel_shutdown(qemu_file_get_ioc(f), QIO_CHANNEL_SHUTDOWN_WRITE,
+ NULL);
+ cpr_state_file = f;
+ return 0;
+}
+
+int cpr_state_load(MigrationChannel *channel, Error **errp)
+{
+ int ret;
+ uint32_t v;
+ QEMUFile *f;
+ MigMode mode = 0;
+
+ if (channel) {
+ mode = MIG_MODE_CPR_TRANSFER;
+ cpr_set_incoming_mode(mode);
+ f = cpr_transfer_input(channel, errp);
+ } else {
+ return 0;
+ }
+ if (!f) {
+ return -1;
+ }
+
+ trace_cpr_state_load(MigMode_str(mode));
+
+ v = qemu_get_be32(f);
+ if (v != QEMU_CPR_FILE_MAGIC) {
+ error_setg(errp, "Not a migration stream (bad magic %x)", v);
+ qemu_fclose(f);
+ return -EINVAL;
+ }
+ v = qemu_get_be32(f);
+ if (v != QEMU_CPR_FILE_VERSION) {
+ error_setg(errp, "Unsupported migration stream version %d", v);
+ qemu_fclose(f);
+ return -ENOTSUP;
+ }
+
+ ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1);
+ if (ret) {
+ error_setg(errp, "vmstate_load_state error %d", ret);
+ qemu_fclose(f);
+ return ret;
+ }
+
+ /*
+ * Let the caller decide when to close the socket (and generate a HUP event
+ * for the sending side).
+ */
+ cpr_state_file = f;
+
+ return ret;
+}
+
+void cpr_state_close(void)
+{
+ if (cpr_state_file) {
+ qemu_fclose(cpr_state_file);
+ cpr_state_file = NULL;
+ }
+}
+
+bool cpr_incoming_needed(void *opaque)
+{
+ MigMode mode = migrate_mode();
+ return mode == MIG_MODE_CPR_TRANSFER;
+}
diff --git a/migration/cpu-throttle.c b/migration/cpu-throttle.c
new file mode 100644
index 0000000..0642e6b
--- /dev/null
+++ b/migration/cpu-throttle.c
@@ -0,0 +1,199 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "hw/core/cpu.h"
+#include "qemu/main-loop.h"
+#include "system/cpus.h"
+#include "system/cpu-throttle.h"
+#include "migration.h"
+#include "migration-stats.h"
+#include "trace.h"
+
+/* vcpu throttling controls */
+static QEMUTimer *throttle_timer, *throttle_dirty_sync_timer;
+static unsigned int throttle_percentage;
+static bool throttle_dirty_sync_timer_active;
+static uint64_t throttle_dirty_sync_count_prev;
+
+#define CPU_THROTTLE_PCT_MIN 1
+#define CPU_THROTTLE_PCT_MAX 99
+#define CPU_THROTTLE_TIMESLICE_NS 10000000
+
+/* Make sure the RAMBlock dirty bitmap is synchronized every five seconds */
+#define CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS 5000
+
+static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
+{
+ double pct;
+ double throttle_ratio;
+ int64_t sleeptime_ns, endtime_ns;
+
+ if (!cpu_throttle_get_percentage()) {
+ return;
+ }
+
+ pct = (double)cpu_throttle_get_percentage() / 100;
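+    /*
+     * pct is the fraction of time the vCPU should sleep; for every
+     * timeslice of run time it must sleep pct / (1 - pct) as long.
+     */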
+ throttle_ratio = pct / (1 - pct);
+ /* Add 1ns to fix double's rounding error (like 0.9999999...) */
+ sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
+ endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
+ while (sleeptime_ns > 0 && !cpu->stop) {
+ if (sleeptime_ns > SCALE_MS) {
+ qemu_cond_timedwait_bql(cpu->halt_cond,
+ sleeptime_ns / SCALE_MS);
+ } else {
+ bql_unlock();
+ g_usleep(sleeptime_ns / SCALE_US);
+ bql_lock();
+ }
+ sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ }
+ qatomic_set(&cpu->throttle_thread_scheduled, 0);
+}
+
+static void cpu_throttle_timer_tick(void *opaque)
+{
+ CPUState *cpu;
+ double pct;
+
+ /* Stop the timer if needed */
+ if (!cpu_throttle_get_percentage()) {
+ return;
+ }
+ CPU_FOREACH(cpu) {
+ if (!qatomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
+ async_run_on_cpu(cpu, cpu_throttle_thread,
+ RUN_ON_CPU_NULL);
+ }
+ }
+
+ pct = (double)cpu_throttle_get_percentage() / 100;
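+    /*
+     * The timeslice is pure run time, so the full period (run + sleep)
+     * is timeslice / (1 - pct).
+     */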
+ timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
+ CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
+}
+
+void cpu_throttle_set(int new_throttle_pct)
+{
+ /*
+     * Record whether the throttle is already active before modifying
+     * throttle_percentage.
+ */
+ bool throttle_active = cpu_throttle_active();
+
+ trace_cpu_throttle_set(new_throttle_pct);
+
+ /* Ensure throttle percentage is within valid range */
+ new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
+ new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
+
+ qatomic_set(&throttle_percentage, new_throttle_pct);
+
+ if (!throttle_active) {
+ cpu_throttle_timer_tick(NULL);
+ }
+}
+
+void cpu_throttle_stop(void)
+{
+ qatomic_set(&throttle_percentage, 0);
+ cpu_throttle_dirty_sync_timer(false);
+}
+
+bool cpu_throttle_active(void)
+{
+ return (cpu_throttle_get_percentage() != 0);
+}
+
+int cpu_throttle_get_percentage(void)
+{
+ return qatomic_read(&throttle_percentage);
+}
+
+void cpu_throttle_dirty_sync_timer_tick(void *opaque)
+{
+ uint64_t sync_cnt = stat64_get(&mig_stats.dirty_sync_count);
+
+ /*
+ * The first iteration copies all memory anyhow and has no
+ * effect on guest performance, therefore omit it to avoid
+ * paying extra for the sync penalty.
+ */
+ if (sync_cnt <= 1) {
+ goto end;
+ }
+
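+    /*
+     * If no dirty sync ran during the last timeslice, force one so that
+     * throttle decisions are based on fresh dirty page statistics.
+     */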
+ if (sync_cnt == throttle_dirty_sync_count_prev) {
+ trace_cpu_throttle_dirty_sync();
+ WITH_RCU_READ_LOCK_GUARD() {
+ migration_bitmap_sync_precopy(false);
+ }
+ }
+
+end:
+ throttle_dirty_sync_count_prev = stat64_get(&mig_stats.dirty_sync_count);
+
+ timer_mod(throttle_dirty_sync_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) +
+ CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS);
+}
+
+static bool cpu_throttle_dirty_sync_active(void)
+{
+ return qatomic_read(&throttle_dirty_sync_timer_active);
+}
+
+void cpu_throttle_dirty_sync_timer(bool enable)
+{
+ assert(throttle_dirty_sync_timer);
+
+ if (enable) {
+ if (!cpu_throttle_dirty_sync_active()) {
+ /*
+ * Always reset the dirty sync count cache, in case migration
+ * was cancelled once.
+ */
+ throttle_dirty_sync_count_prev = 0;
+ timer_mod(throttle_dirty_sync_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) +
+ CPU_THROTTLE_DIRTY_SYNC_TIMESLICE_MS);
+ qatomic_set(&throttle_dirty_sync_timer_active, 1);
+ }
+ } else {
+ if (cpu_throttle_dirty_sync_active()) {
+ timer_del(throttle_dirty_sync_timer);
+ qatomic_set(&throttle_dirty_sync_timer_active, 0);
+ }
+ }
+}
+
+void cpu_throttle_init(void)
+{
+ throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
+ cpu_throttle_timer_tick, NULL);
+ throttle_dirty_sync_timer =
+ timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
+ cpu_throttle_dirty_sync_timer_tick, NULL);
+}
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 1d9db81..986624c 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -14,7 +14,7 @@
#include "qemu/error-report.h"
#include "hw/core/cpu.h"
#include "qapi/error.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
@@ -24,11 +24,12 @@
#include "dirtyrate.h"
#include "monitor/hmp.h"
#include "monitor/monitor.h"
-#include "qapi/qmp/qdict.h"
-#include "sysemu/kvm.h"
-#include "sysemu/runstate.h"
-#include "exec/memory.h"
+#include "qobject/qdict.h"
+#include "system/kvm.h"
+#include "system/runstate.h"
+#include "system/memory.h"
#include "qemu/xxhash.h"
+#include "migration.h"
/*
 * total_dirty_pages is protected by BQL and is used
@@ -149,12 +150,12 @@ int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
unsigned int flag,
bool one_shot)
{
- DirtyPageRecord *records;
+ DirtyPageRecord *records = NULL;
int64_t init_time_ms;
int64_t duration;
int64_t dirtyrate;
int i = 0;
- unsigned int gen_id;
+ unsigned int gen_id = 0;
retry:
init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -228,8 +229,7 @@ static int time_unit_to_power(TimeUnit time_unit)
case TIME_UNIT_MILLISECOND:
return -3;
default:
- assert(false); /* unreachable */
- return 0;
+ g_assert_not_reached();
}
}
@@ -437,6 +437,7 @@ static void get_ramblock_dirty_info(RAMBlock *block,
struct DirtyRateConfig *config)
{
uint64_t sample_pages_per_gigabytes = config->sample_pages_per_gigabytes;
+ gsize len;
/* Right shift 30 bits to calc ramblock size in GB */
info->sample_pages_count = (qemu_ram_get_used_length(block) *
@@ -445,7 +446,9 @@ static void get_ramblock_dirty_info(RAMBlock *block,
info->ramblock_pages = qemu_ram_get_used_length(block) >>
qemu_target_page_bits();
info->ramblock_addr = qemu_ram_get_host_addr(block);
- strcpy(info->idstr, qemu_ram_get_idstr(block));
+ len = g_strlcpy(info->idstr, qemu_ram_get_idstr(block),
+ sizeof(info->idstr));
+ g_assert(len < sizeof(info->idstr));
}
static void free_ramblock_dirty_info(struct RamblockDirtyInfo *infos, int count)
@@ -840,8 +843,9 @@ void qmp_calc_dirty_rate(int64_t calc_time,
init_dirtyrate_stat(config);
- qemu_thread_create(&thread, "get_dirtyrate", get_dirtyrate_thread,
- (void *)&config, QEMU_THREAD_DETACHED);
+ qemu_thread_create(&thread, MIGRATION_THREAD_DIRTY_RATE,
+ get_dirtyrate_thread, (void *)&config,
+ QEMU_THREAD_DETACHED);
}
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index 869c060..35225c3 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -13,7 +13,7 @@
#ifndef QEMU_MIGRATION_DIRTYRATE_H
#define QEMU_MIGRATION_DIRTYRATE_H
-#include "sysemu/dirtyrate.h"
+#include "system/dirtyrate.h"
/*
* Sample 512 pages per GB as default.
diff --git a/migration/fd.c b/migration/fd.c
index aab5189..9bf9be6 100644
--- a/migration/fd.c
+++ b/migration/fd.c
@@ -25,6 +25,29 @@
#include "io/channel-util.h"
#include "trace.h"
+static bool fd_is_pipe(int fd)
+{
+ struct stat statbuf;
+
+ if (fstat(fd, &statbuf) == -1) {
+ return false;
+ }
+
+ return S_ISFIFO(statbuf.st_mode);
+}
+
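+/*
+ * Only sockets and pipes are valid for fd: migration; anything else
+ * triggers the deprecation warning at the call sites below.
+ */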
+static bool migration_fd_valid(int fd)
+{
+ if (fd_is_socket(fd)) {
+ return true;
+ }
+
+ if (fd_is_pipe(fd)) {
+ return true;
+ }
+
+ return false;
+}
void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp)
{
@@ -34,7 +57,7 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **
return;
}
- if (!fd_is_socket(fd)) {
+ if (!migration_fd_valid(fd)) {
warn_report("fd: migration to a file is deprecated."
" Use file: instead.");
}
@@ -68,7 +91,7 @@ void fd_start_incoming_migration(const char *fdname, Error **errp)
return;
}
- if (!fd_is_socket(fd)) {
+ if (!migration_fd_valid(fd)) {
warn_report("fd: migration to a file is deprecated."
" Use file: instead.");
}
diff --git a/migration/file.c b/migration/file.c
index db870f2..bb8031e 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -6,7 +6,7 @@
*/
#include "qemu/osdep.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
@@ -112,7 +112,6 @@ void file_start_outgoing_migration(MigrationState *s,
error_setg_errno(errp, errno,
"failed to truncate migration file to offset %" PRIx64,
offset);
- object_unref(OBJECT(fioc));
return;
}
@@ -120,7 +119,6 @@ void file_start_outgoing_migration(MigrationState *s,
ioc = QIO_CHANNEL(fioc);
if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
- object_unref(OBJECT(fioc));
return;
}
qio_channel_set_name(ioc, "migration-file-outgoing");
@@ -198,12 +196,13 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp)
}
int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
- int niov, RAMBlock *block, Error **errp)
+ int niov, MultiFDPages_t *pages, Error **errp)
{
ssize_t ret = 0;
int i, slice_idx, slice_num;
uintptr_t base, next, offset;
size_t len;
+ RAMBlock *block = pages->block;
slice_idx = 0;
slice_num = 1;
diff --git a/migration/file.h b/migration/file.h
index 9f71e87..1a1115f 100644
--- a/migration/file.h
+++ b/migration/file.h
@@ -21,6 +21,6 @@ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp);
void file_cleanup_outgoing_migration(void);
bool file_send_channel_create(gpointer opaque, Error **errp);
int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
- int niov, RAMBlock *block, Error **errp);
+ int niov, MultiFDPages_t *pages, Error **errp);
int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp);
#endif
diff --git a/migration/global_state.c b/migration/global_state.c
index 3a9796c..c1f90fc 100644
--- a/migration/global_state.c
+++ b/migration/global_state.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
-#include "sysemu/runstate.h"
+#include "system/runstate.h"
#include "qapi/error.h"
#include "migration.h"
#include "migration/global_state.h"
diff --git a/migration/meson.build b/migration/meson.build
index 5ce2acb4..9aa48b2 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -11,8 +11,12 @@ migration_files = files(
system_ss.add(files(
'block-dirty-bitmap.c',
+ 'block-active.c',
'channel.c',
'channel-block.c',
+ 'cpr.c',
+ 'cpr-transfer.c',
+ 'cpu-throttle.c',
'dirtyrate.c',
'exec.c',
'fd.c',
@@ -21,6 +25,8 @@ system_ss.add(files(
'migration-hmp-cmds.c',
'migration.c',
'multifd.c',
+ 'multifd-device-state.c',
+ 'multifd-nocomp.c',
'multifd-zlib.c',
'multifd-zero-page.c',
'options.c',
@@ -41,6 +47,7 @@ system_ss.add(when: rdma, if_true: files('rdma.c'))
system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
system_ss.add(when: uadk, if_true: files('multifd-uadk.c'))
+system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
if_true: files('ram.c',
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 7d608d2..e8a563c 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -21,15 +21,15 @@
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-visit-migration.h"
-#include "qapi/qmp/qdict.h"
+#include "qobject/qdict.h"
#include "qapi/string-input-visitor.h"
#include "qapi/string-output-visitor.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
-#include "sysemu/runstate.h"
+#include "system/runstate.h"
#include "ui/qemu-spice.h"
-#include "sysemu/sysemu.h"
+#include "system/system.h"
#include "options.h"
#include "migration.h"
@@ -37,27 +37,28 @@ static void migration_global_dump(Monitor *mon)
{
MigrationState *ms = migrate_get_current();
- monitor_printf(mon, "globals:\n");
- monitor_printf(mon, "store-global-state: %s\n",
+ monitor_printf(mon, "Globals:\n");
+ monitor_printf(mon, " store-global-state: %s\n",
ms->store_global_state ? "on" : "off");
- monitor_printf(mon, "only-migratable: %s\n",
+ monitor_printf(mon, " only-migratable: %s\n",
only_migratable ? "on" : "off");
- monitor_printf(mon, "send-configuration: %s\n",
+ monitor_printf(mon, " send-configuration: %s\n",
ms->send_configuration ? "on" : "off");
- monitor_printf(mon, "send-section-footer: %s\n",
+ monitor_printf(mon, " send-section-footer: %s\n",
ms->send_section_footer ? "on" : "off");
- monitor_printf(mon, "clear-bitmap-shift: %u\n",
+ monitor_printf(mon, " send-switchover-start: %s\n",
+ ms->send_switchover_start ? "on" : "off");
+ monitor_printf(mon, " clear-bitmap-shift: %u\n",
ms->clear_bitmap_shift);
}
void hmp_info_migrate(Monitor *mon, const QDict *qdict)
{
+ bool show_all = qdict_get_try_bool(qdict, "all", false);
MigrationInfo *info;
info = qmp_query_migrate(NULL);
- migration_global_dump(mon);
-
if (info->blocked_reasons) {
strList *reasons = info->blocked_reasons;
monitor_printf(mon, "Outgoing migration blocked:\n");
@@ -68,7 +69,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
}
if (info->has_status) {
- monitor_printf(mon, "Migration status: %s",
+ monitor_printf(mon, "Status: %s",
MigrationStatus_str(info->status));
if (info->status == MIGRATION_STATUS_FAILED && info->error_desc) {
monitor_printf(mon, " (%s)\n", info->error_desc);
@@ -76,107 +77,130 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "\n");
}
- monitor_printf(mon, "total time: %" PRIu64 " ms\n",
- info->total_time);
- if (info->has_expected_downtime) {
- monitor_printf(mon, "expected downtime: %" PRIu64 " ms\n",
- info->expected_downtime);
- }
- if (info->has_downtime) {
- monitor_printf(mon, "downtime: %" PRIu64 " ms\n",
- info->downtime);
+ if (info->total_time) {
+ monitor_printf(mon, "Time (ms): total=%" PRIu64,
+ info->total_time);
+ if (info->has_setup_time) {
+ monitor_printf(mon, ", setup=%" PRIu64,
+ info->setup_time);
+ }
+ if (info->has_expected_downtime) {
+ monitor_printf(mon, ", exp_down=%" PRIu64,
+ info->expected_downtime);
+ }
+ if (info->has_downtime) {
+ monitor_printf(mon, ", down=%" PRIu64,
+ info->downtime);
+ }
+ monitor_printf(mon, "\n");
}
- if (info->has_setup_time) {
- monitor_printf(mon, "setup: %" PRIu64 " ms\n",
- info->setup_time);
+ }
+
+ if (info->has_socket_address) {
+ SocketAddressList *addr;
+
+ monitor_printf(mon, "Sockets: [\n");
+
+ for (addr = info->socket_address; addr; addr = addr->next) {
+ char *s = socket_uri(addr->value);
+ monitor_printf(mon, "\t%s\n", s);
+ g_free(s);
}
+ monitor_printf(mon, "]\n");
}
if (info->ram) {
- monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n",
- info->ram->transferred >> 10);
- monitor_printf(mon, "throughput: %0.2f mbps\n",
+ monitor_printf(mon, "RAM info:\n");
+ monitor_printf(mon, " Throughput (Mbps): %0.2f\n",
info->ram->mbps);
- monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n",
- info->ram->remaining >> 10);
- monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n",
+ monitor_printf(mon, " Sizes (KiB): pagesize=%" PRIu64
+ ", total=%" PRIu64 ",\n",
+ info->ram->page_size >> 10,
info->ram->total >> 10);
- monitor_printf(mon, "duplicate: %" PRIu64 " pages\n",
- info->ram->duplicate);
- monitor_printf(mon, "normal: %" PRIu64 " pages\n",
- info->ram->normal);
- monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n",
- info->ram->normal_bytes >> 10);
- monitor_printf(mon, "dirty sync count: %" PRIu64 "\n",
- info->ram->dirty_sync_count);
- monitor_printf(mon, "page size: %" PRIu64 " kbytes\n",
- info->ram->page_size >> 10);
- monitor_printf(mon, "multifd bytes: %" PRIu64 " kbytes\n",
- info->ram->multifd_bytes >> 10);
- monitor_printf(mon, "pages-per-second: %" PRIu64 "\n",
+ monitor_printf(mon, " transferred=%" PRIu64
+ ", remain=%" PRIu64 ",\n",
+ info->ram->transferred >> 10,
+ info->ram->remaining >> 10);
+ monitor_printf(mon, " precopy=%" PRIu64
+ ", multifd=%" PRIu64
+ ", postcopy=%" PRIu64,
+ info->ram->precopy_bytes >> 10,
+ info->ram->multifd_bytes >> 10,
+ info->ram->postcopy_bytes >> 10);
+
+ if (info->vfio) {
+ monitor_printf(mon, ", vfio=%" PRIu64,
+ info->vfio->transferred >> 10);
+ }
+ monitor_printf(mon, "\n");
+
+ monitor_printf(mon, " Pages: normal=%" PRIu64 ", zero=%" PRIu64
+ ", rate_per_sec=%" PRIu64 "\n",
+ info->ram->normal,
+ info->ram->duplicate,
info->ram->pages_per_second);
+ monitor_printf(mon, " Others: dirty_syncs=%" PRIu64,
+ info->ram->dirty_sync_count);
if (info->ram->dirty_pages_rate) {
- monitor_printf(mon, "dirty pages rate: %" PRIu64 " pages\n",
+ monitor_printf(mon, ", dirty_pages_rate=%" PRIu64,
info->ram->dirty_pages_rate);
}
if (info->ram->postcopy_requests) {
- monitor_printf(mon, "postcopy request count: %" PRIu64 "\n",
+ monitor_printf(mon, ", postcopy_req=%" PRIu64,
info->ram->postcopy_requests);
}
- if (info->ram->precopy_bytes) {
- monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n",
- info->ram->precopy_bytes >> 10);
- }
if (info->ram->downtime_bytes) {
- monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n",
- info->ram->downtime_bytes >> 10);
- }
- if (info->ram->postcopy_bytes) {
- monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n",
- info->ram->postcopy_bytes >> 10);
+ monitor_printf(mon, ", downtime_ram=%" PRIu64,
+ info->ram->downtime_bytes);
}
if (info->ram->dirty_sync_missed_zero_copy) {
- monitor_printf(mon,
- "Zero-copy-send fallbacks happened: %" PRIu64 " times\n",
+ monitor_printf(mon, ", zerocopy_fallbacks=%" PRIu64,
info->ram->dirty_sync_missed_zero_copy);
}
+ monitor_printf(mon, "\n");
+ }
+
+ if (!show_all) {
+ goto out;
}
+ migration_global_dump(mon);
+
if (info->xbzrle_cache) {
- monitor_printf(mon, "cache size: %" PRIu64 " bytes\n",
- info->xbzrle_cache->cache_size);
- monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n",
- info->xbzrle_cache->bytes >> 10);
- monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n",
- info->xbzrle_cache->pages);
- monitor_printf(mon, "xbzrle cache miss: %" PRIu64 " pages\n",
- info->xbzrle_cache->cache_miss);
- monitor_printf(mon, "xbzrle cache miss rate: %0.2f\n",
- info->xbzrle_cache->cache_miss_rate);
- monitor_printf(mon, "xbzrle encoding rate: %0.2f\n",
- info->xbzrle_cache->encoding_rate);
- monitor_printf(mon, "xbzrle overflow: %" PRIu64 "\n",
+ monitor_printf(mon, "XBZRLE: size=%" PRIu64
+ ", transferred=%" PRIu64
+ ", pages=%" PRIu64
+ ", miss=%" PRIu64 "\n"
+ " miss_rate=%0.2f"
+ ", encode_rate=%0.2f"
+ ", overflow=%" PRIu64 "\n",
+ info->xbzrle_cache->cache_size,
+ info->xbzrle_cache->bytes,
+ info->xbzrle_cache->pages,
+ info->xbzrle_cache->cache_miss,
+ info->xbzrle_cache->cache_miss_rate,
+ info->xbzrle_cache->encoding_rate,
info->xbzrle_cache->overflow);
}
if (info->has_cpu_throttle_percentage) {
- monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n",
+ monitor_printf(mon, "CPU Throttle (%%): %" PRIu64 "\n",
info->cpu_throttle_percentage);
}
if (info->has_dirty_limit_throttle_time_per_round) {
- monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n",
+ monitor_printf(mon, "Dirty-limit Throttle (us): %" PRIu64 "\n",
info->dirty_limit_throttle_time_per_round);
}
if (info->has_dirty_limit_ring_full_time) {
- monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n",
+ monitor_printf(mon, "Dirty-limit Ring Full (us): %" PRIu64 "\n",
info->dirty_limit_ring_full_time);
}
if (info->has_postcopy_blocktime) {
- monitor_printf(mon, "postcopy blocktime: %u\n",
+ monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n",
info->postcopy_blocktime);
}
@@ -187,28 +211,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime,
&error_abort);
visit_complete(v, &str);
- monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str);
+ monitor_printf(mon, "Postcopy vCPU Blocktime: %s\n", str);
g_free(str);
visit_free(v);
}
- if (info->has_socket_address) {
- SocketAddressList *addr;
-
- monitor_printf(mon, "socket address: [\n");
-
- for (addr = info->socket_address; addr; addr = addr->next) {
- char *s = socket_uri(addr->value);
- monitor_printf(mon, "\t%s\n", s);
- g_free(s);
- }
- monitor_printf(mon, "]\n");
- }
-
- if (info->vfio) {
- monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n",
- info->vfio->transferred >> 10);
- }
+out:
qapi_free_MigrationInfo(info);
}
@@ -576,6 +584,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
p->has_multifd_zlib_level = true;
visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
break;
+ case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL:
+ p->has_multifd_qatzip_level = true;
+ visit_type_uint8(v, param, &p->multifd_qatzip_level, &err);
+ break;
case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
p->has_multifd_zstd_level = true;
visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
@@ -636,7 +648,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
visit_type_bool(v, param, &p->direct_io, &err);
break;
default:
- assert(0);
+ g_assert_not_reached();
}
if (err) {
diff --git a/migration/migration.c b/migration/migration.c
index 3dea06d..4098870 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -14,6 +14,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/ctype.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
@@ -22,11 +23,12 @@
#include "fd.h"
#include "file.h"
#include "socket.h"
-#include "sysemu/runstate.h"
-#include "sysemu/sysemu.h"
-#include "sysemu/cpu-throttle.h"
+#include "system/runstate.h"
+#include "system/system.h"
+#include "system/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
+#include "migration/cpr.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
@@ -43,7 +45,7 @@
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
-#include "qapi/qmp/qnull.h"
+#include "qobject/qnull.h"
#include "qemu/rcu.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
@@ -59,13 +61,13 @@
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
-#include "sysemu/cpus.h"
+#include "system/cpus.h"
#include "yank_functions.h"
-#include "sysemu/qtest.h"
+#include "system/qtest.h"
#include "options.h"
-#include "sysemu/dirtylimit.h"
+#include "system/dirtylimit.h"
#include "qemu/sockets.h"
-#include "sysemu/kvm.h"
+#include "system/kvm.h"
#define NOTIFIER_ELEM_INIT(array, elem) \
[elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
@@ -75,6 +77,7 @@
static NotifierWithReturnList migration_state_notifiers[] = {
NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
+ NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER),
};
/* Messages sent on the return path from destination to source */
@@ -92,6 +95,9 @@ enum mig_rp_message_type {
MIG_RP_MSG_MAX
};
+/* Migration channel types */
+enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY };
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
@@ -102,12 +108,10 @@ static MigrationIncomingState *current_incoming;
static GSList *migration_blockers[MIG_MODE__MAX];
static bool migration_object_check(MigrationState *ms, Error **errp);
-static int migration_maybe_pause(MigrationState *s,
- int *current_active_state,
- int new_state);
-static void migrate_fd_cancel(MigrationState *s);
+static bool migration_switchover_start(MigrationState *s, Error **errp);
static bool close_return_path_on_source(MigrationState *s);
static void migration_completion_end(MigrationState *s);
+static void migrate_hup_delete(MigrationState *s);
static void migration_downtime_start(MigrationState *s)
{
@@ -115,6 +119,27 @@ static void migration_downtime_start(MigrationState *s)
s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
}
+/*
+ * This is unfortunate: incoming migration actually needs the outgoing
+ * migration state (MigrationState) to be there too, e.g. to query
+ * capabilities and parameters, use its locks, record setup errors, etc.
+ *
+ * NOTE: when calling this, make sure current_migration exists and has
+ * not been freed yet! Otherwise accessing the refcount is already a
+ * use-after-free itself.
+ *
+ * TODO: Move shared part of incoming / outgoing out into separate object.
+ * Then this is not needed.
+ */
+static void migrate_incoming_ref_outgoing_state(void)
+{
+ object_ref(migrate_get_current());
+}
+static void migrate_incoming_unref_outgoing_state(void)
+{
+ object_unref(migrate_get_current());
+}
+
static void migration_downtime_end(MigrationState *s)
{
int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -125,9 +150,19 @@ static void migration_downtime_end(MigrationState *s)
*/
if (!s->downtime) {
s->downtime = now - s->downtime_start;
+ trace_vmstate_downtime_checkpoint("src-downtime-end");
+ }
+}
+
+static void precopy_notify_complete(void)
+{
+ Error *local_err = NULL;
+
+ if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
+ error_report_err(local_err);
}
- trace_vmstate_downtime_checkpoint("src-downtime-end");
+ trace_migration_precopy_complete();
}
static bool migration_needs_multiple_sockets(void)
@@ -135,6 +170,21 @@ static bool migration_needs_multiple_sockets(void)
return migrate_multifd() || migrate_postcopy_preempt();
}
+static RunState migration_get_target_runstate(void)
+{
+ /*
+ * When the global state is not migrated, it means we don't know the
+ * runstate of the src QEMU. We don't have much choice but to assume
+ * the VM is running. NOTE: this is a pretty rare case; so far only Xen
+ * uses it.
+ */
+ if (!global_state_received()) {
+ return RUN_STATE_RUNNING;
+ }
+
+ return global_state_get_runstate();
+}
+
static bool transport_supports_multi_channels(MigrationAddress *addr)
{
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
@@ -203,9 +253,33 @@ migration_channels_and_transport_compatible(MigrationAddress *addr,
return false;
}
+ if (migrate_mode() == MIG_MODE_CPR_TRANSFER &&
+ addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
+ error_setg(errp, "Migration requires streamable transport (e.g. unix)");
+ return false;
+ }
+
+ return true;
+}
+
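+/*
+ * RDMA is the only transport with capability restrictions; e.g. it is
+ * incompatible with xbzrle and multifd.
+ */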
+static bool
+migration_capabilities_and_transport_compatible(MigrationAddress *addr,
+ Error **errp)
+{
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
+ return migrate_rdma_caps_check(migrate_get_current()->capabilities,
+ errp);
+ }
+
return true;
}
+static bool migration_transport_compatible(MigrationAddress *addr, Error **errp)
+{
+ return migration_channels_and_transport_compatible(addr, errp) &&
+ migration_capabilities_and_transport_compatible(addr, errp);
+}
+
static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
@@ -263,6 +337,9 @@ void migration_object_init(void)
ram_mig_init();
dirty_bitmap_mig_init();
+
+ /* Initialize cpu throttle timers */
+ cpu_throttle_init();
}
typedef struct {
@@ -306,17 +383,6 @@ void migration_bh_schedule(QEMUBHFunc *cb, void *opaque)
qemu_bh_schedule(bh);
}
-void migration_cancel(const Error *error)
-{
- if (error) {
- migrate_set_error(current_migration, error);
- }
- if (migrate_dirty_limit()) {
- qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
- }
- migrate_fd_cancel(current_migration);
-}
-
void migration_shutdown(void)
{
/*
@@ -329,7 +395,7 @@ void migration_shutdown(void)
* Cancel the current migration - that will (eventually)
* stop the migration using this structure
*/
- migration_cancel(NULL);
+ migration_cancel();
object_unref(OBJECT(current_migration));
/*
@@ -379,6 +445,24 @@ void migration_incoming_state_destroy(void)
multifd_recv_cleanup();
+ /*
+ * RAM state cleanup needs to happen after multifd cleanup, because
+ * multifd threads can use some of its states (receivedmap).
+ * The VFIO load_cleanup() implementation is BQL-sensitive. It requires
+ * that the BQL NOT be taken when recycling load threads, so that it won't
+ * block the load threads from making progress on address space
+ * modification operations.
+ *
+ * To make it work, we could try to not take BQL for all load_cleanup(),
+ * or conditionally unlock BQL only if bql_locked() in VFIO.
+ *
+ * Since most existing call sites take BQL for load_cleanup(), make
+ * it simple by always taking BQL as the rule, so that VFIO can unlock
+ * and retake it unconditionally.
+ */
+ assert(bql_locked());
+ qemu_loadvm_state_cleanup(mis);
+
if (mis->to_src_file) {
/* Tell source that we are done */
migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
@@ -410,6 +494,7 @@ void migration_incoming_state_destroy(void)
mis->postcopy_qemufile_dst = NULL;
}
+ cpr_set_incoming_mode(MIG_MODE_NONE);
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
@@ -563,6 +648,16 @@ void migrate_add_address(SocketAddress *address)
QAPI_CLONE(SocketAddress, address));
}
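+/* True if @uri begins with an alphabetic scheme followed by ':' */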
+bool migrate_is_uri(const char *uri)
+{
+ while (*uri && *uri != ':') {
+ if (!qemu_isalpha(*uri++)) {
+ return false;
+ }
+ }
+ return *uri == ':';
+}
+
bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
Error **errp)
{
@@ -660,7 +755,8 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
if (channels) {
/* To verify that Migrate channel list has only item */
if (channels->next) {
- error_setg(errp, "Channel list has more than one entries");
+ error_setg(errp, "Channel list must have only one entry, "
+ "for type 'main'");
return;
}
addr = channels->value->addr;
@@ -675,7 +771,7 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
}
/* transport mechanism not suitable for migration? */
- if (!migration_channels_and_transport_compatible(addr, errp)) {
+ if (!migration_transport_compatible(addr, errp)) {
return;
}
@@ -694,14 +790,6 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
}
#ifdef CONFIG_RDMA
} else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
- if (migrate_xbzrle()) {
- error_setg(errp, "RDMA and XBZRLE can't be used together");
- return;
- }
- if (migrate_multifd()) {
- error_setg(errp, "RDMA and multifd can't be used together");
- return;
- }
rdma_start_incoming_migration(&addr->u.rdma, errp);
#endif
} else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
@@ -711,34 +799,17 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
} else {
error_setg(errp, "unknown migration protocol: %s", uri);
}
+
+ /* Close cpr socket to tell source that we are listening */
+ cpr_state_close();
}
static void process_incoming_migration_bh(void *opaque)
{
- Error *local_err = NULL;
MigrationIncomingState *mis = opaque;
trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter");
- /* If capability late_block_activate is set:
- * Only fire up the block code now if we're going to restart the
- * VM, else 'cont' will do it.
- * This causes file locking to happen; so we don't want it to happen
- * unless we really are starting the VM.
- */
- if (!migrate_late_block_activate() ||
- (autostart && (!global_state_received() ||
- runstate_is_live(global_state_get_runstate())))) {
- /* Make sure all file formats throw away their mutable metadata.
- * If we get an error here, just don't restart the VM yet. */
- bdrv_activate_all(&local_err);
- if (local_err) {
- error_report_err(local_err);
- local_err = NULL;
- autostart = false;
- }
- }
-
/*
* This must happen after all error conditions are dealt with and
* we're sure the VM is going to be running on this host.
@@ -751,10 +822,23 @@ static void process_incoming_migration_bh(void *opaque)
dirty_bitmap_mig_before_vm_start();
- if (!global_state_received() ||
- runstate_is_live(global_state_get_runstate())) {
+ if (runstate_is_live(migration_get_target_runstate())) {
if (autostart) {
- vm_start();
+ /*
+ * Block activation is always delayed until VM starts, either
+ * here (which means we need to start the dest VM right now..),
+ * or until qmp_cont() later.
+ *
+ * We used to have cap 'late-block-activate' but now we do this
+ * unconditionally, as it has no harm but only benefit. E.g.,
+ * it's not part of migration ABI on the time of disk activation.
+ *
+ * Make sure all file formats throw away their mutable
+ * metadata. If error, don't restart the VM yet.
+ */
+ if (migration_block_activate(NULL)) {
+ vm_start();
+ }
} else {
runstate_set(RUN_STATE_PAUSED);
}
@@ -813,7 +897,7 @@ process_incoming_migration_co(void *opaque)
* postcopy thread.
*/
trace_process_incoming_migration_co_postcopy_end_main();
- return;
+ goto out;
}
/* Else if something went wrong then just fall out of the normal exit */
}
@@ -829,7 +913,8 @@ process_incoming_migration_co(void *opaque)
}
migration_bh_schedule(process_incoming_migration_bh, mis);
- return;
+ goto out;
+
fail:
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_FAILED);
@@ -846,6 +931,9 @@ fail:
exit(EXIT_FAILURE);
}
+out:
+ /* Pairs with the refcount taken in qmp_migrate_incoming() */
+ migrate_incoming_unref_outgoing_state();
}
/**
@@ -856,9 +944,8 @@ static void migration_incoming_setup(QEMUFile *f)
{
MigrationIncomingState *mis = migration_incoming_get_current();
- if (!mis->from_src_file) {
- mis->from_src_file = f;
- }
+ assert(!mis->from_src_file);
+ mis->from_src_file = f;
qemu_file_set_blocking(f, false);
}
@@ -910,28 +997,19 @@ void migration_fd_process_incoming(QEMUFile *f)
migration_incoming_process();
}
-/*
- * Returns true when we want to start a new incoming migration process,
- * false otherwise.
- */
-static bool migration_should_start_incoming(bool main_channel)
+static bool migration_has_main_and_multifd_channels(void)
{
- /* Multifd doesn't start unless all channels are established */
- if (migrate_multifd()) {
- return migration_has_all_channels();
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ if (!mis->from_src_file) {
+ /* main channel not established */
+ return false;
}
- /* Preempt channel only starts when the main channel is created */
- if (migrate_postcopy_preempt()) {
- return main_channel;
+ if (migrate_multifd() && !multifd_recv_all_channels_created()) {
+ return false;
}
- /*
- * For all the rest types of migration, we should only reach here when
- * it's the main channel that's being created, and we should always
- * proceed with this channel.
- */
- assert(main_channel);
+ /* main and all multifd channels are established */
return true;
}
@@ -940,59 +1018,81 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
QEMUFile *f;
- bool default_channel = true;
+ uint8_t channel;
uint32_t channel_magic = 0;
int ret = 0;
- if (migrate_multifd() && !migrate_mapped_ram() &&
- !migrate_postcopy_ram() &&
- qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
- /*
- * With multiple channels, it is possible that we receive channels
- * out of order on destination side, causing incorrect mapping of
- * source channels on destination side. Check channel MAGIC to
- * decide type of channel. Please note this is best effort, postcopy
- * preempt channel does not send any magic number so avoid it for
- * postcopy live migration. Also tls live migration already does
- * tls handshake while initializing main channel so with tls this
- * issue is not possible.
- */
- ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
- sizeof(channel_magic), errp);
+ if (!migration_has_main_and_multifd_channels()) {
+ if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+ /*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort,
+ * postcopy preempt channel does not send any magic number so
+ * avoid it for postcopy live migration. Also tls live migration
+ * already does tls handshake while initializing main channel so
+ * with tls this issue is not possible.
+ */
+ ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
+ sizeof(channel_magic), errp);
+ if (ret != 0) {
+ return;
+ }
- if (ret != 0) {
+ channel_magic = be32_to_cpu(channel_magic);
+ if (channel_magic == QEMU_VM_FILE_MAGIC) {
+ channel = CH_MAIN;
+ } else if (channel_magic == MULTIFD_MAGIC) {
+ assert(migrate_multifd());
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file &&
+ mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ /* reconnect main channel for postcopy recovery */
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "unknown channel magic: %u", channel_magic);
+ return;
+ }
+ } else if (mis->from_src_file && migrate_multifd()) {
+ /*
+ * Non-peekable channels like tls/file are processed as
+ * multifd channels when multifd is enabled.
+ */
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file) {
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "non-peekable channel used without multifd");
return;
}
-
- default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
} else {
- default_channel = !mis->from_src_file;
+ assert(migrate_postcopy_preempt());
+ channel = CH_POSTCOPY;
}
if (multifd_recv_setup(errp) != 0) {
return;
}
- if (default_channel) {
+ if (channel == CH_MAIN) {
f = qemu_file_new_input(ioc);
migration_incoming_setup(f);
- } else {
+ } else if (channel == CH_MULTIFD) {
/* Multiple connections */
- assert(migration_needs_multiple_sockets());
- if (migrate_multifd()) {
- multifd_recv_new_channel(ioc, &local_err);
- } else {
- assert(migrate_postcopy_preempt());
- f = qemu_file_new_input(ioc);
- postcopy_preempt_new_channel(mis, f);
- }
+ multifd_recv_new_channel(ioc, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
+ } else if (channel == CH_POSTCOPY) {
+ assert(!mis->postcopy_qemufile_dst);
+ f = qemu_file_new_input(ioc);
+ postcopy_preempt_new_channel(mis, f);
+ return;
}
- if (migration_should_start_incoming(default_channel)) {
+ if (migration_has_main_and_multifd_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
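
The classifier above hinges on peeking the stream's first four bytes without consuming them, so the eventual reader still sees the magic. A minimal standalone sketch of the same idea over a plain POSIX socket — the fd name and the two magic constants here are made up; QEMU's migration_channel_read_peek() wraps this for QIOChannels:

    #include <stdint.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>

    /* Hypothetical magic values; the real ones live in QEMU's headers. */
    #define VM_FILE_MAGIC  0x5145564dU
    #define MF_MAGIC       0x11223344U

    /* Classify an incoming channel by peeking at its first four bytes. */
    static int classify_channel(int sockfd)
    {
        uint32_t magic;
        /* MSG_PEEK leaves the bytes queued for the real reader. */
        ssize_t n = recv(sockfd, &magic, sizeof(magic),
                         MSG_PEEK | MSG_WAITALL);

        if (n != sizeof(magic)) {
            return -1;                /* short read or error */
        }
        magic = ntohl(magic);         /* stream bytes are big-endian */
        if (magic == VM_FILE_MAGIC) {
            return 0;                 /* main channel */
        }
        if (magic == MF_MAGIC) {
            return 1;                 /* multifd channel */
        }
        return -1;                    /* unknown: reject */
    }
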
@@ -1009,18 +1109,13 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
*/
bool migration_has_all_channels(void)
{
- MigrationIncomingState *mis = migration_incoming_get_current();
-
- if (!mis->from_src_file) {
+ if (!migration_has_main_and_multifd_channels()) {
return false;
}
- if (migrate_multifd()) {
- return multifd_recv_all_channels_created();
- }
-
- if (migrate_postcopy_preempt()) {
- return mis->postcopy_qemufile_dst != NULL;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) {
+ return false;
}
return true;
@@ -1105,14 +1200,14 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}
-/*
- * Return true if we're already in the middle of a migration
- * (i.e. any of the active or setup states)
- */
-bool migration_is_setup_or_active(void)
+bool migration_is_running(void)
{
MigrationState *s = current_migration;
+ if (!s) {
+ return false;
+ }
+
switch (s->state) {
case MIGRATION_STATUS_ACTIVE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
@@ -1123,36 +1218,20 @@ bool migration_is_setup_or_active(void)
case MIGRATION_STATUS_PRE_SWITCHOVER:
case MIGRATION_STATUS_DEVICE:
case MIGRATION_STATUS_WAIT_UNPLUG:
+ case MIGRATION_STATUS_CANCELLING:
case MIGRATION_STATUS_COLO:
return true;
-
default:
return false;
-
}
}
-bool migration_is_running(void)
+static bool migration_is_active(void)
{
MigrationState *s = current_migration;
- switch (s->state) {
- case MIGRATION_STATUS_ACTIVE:
- case MIGRATION_STATUS_POSTCOPY_ACTIVE:
- case MIGRATION_STATUS_POSTCOPY_PAUSED:
- case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
- case MIGRATION_STATUS_POSTCOPY_RECOVER:
- case MIGRATION_STATUS_SETUP:
- case MIGRATION_STATUS_PRE_SWITCHOVER:
- case MIGRATION_STATUS_DEVICE:
- case MIGRATION_STATUS_WAIT_UNPLUG:
- case MIGRATION_STATUS_CANCELLING:
- return true;
-
- default:
- return false;
-
- }
+ return (s->state == MIGRATION_STATUS_ACTIVE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
static bool migrate_show_downtime(MigrationState *s)
@@ -1397,39 +1476,52 @@ void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
}
}
-static void migrate_fd_cleanup(MigrationState *s)
+static void migration_cleanup_json_writer(MigrationState *s)
+{
+ g_clear_pointer(&s->vmdesc, json_writer_free);
+}
+
+static void migration_cleanup(MigrationState *s)
{
MigrationEventType type;
+ QEMUFile *tmp = NULL;
+
+ trace_migration_cleanup();
+
+ migration_cleanup_json_writer(s);
g_free(s->hostname);
s->hostname = NULL;
- json_writer_free(s->vmdesc);
- s->vmdesc = NULL;
qemu_savevm_state_cleanup();
+ cpr_state_close();
+ migrate_hup_delete(s);
close_return_path_on_source(s);
- if (s->to_dst_file) {
- QEMUFile *tmp;
-
- trace_migrate_fd_cleanup();
+ if (s->migration_thread_running) {
bql_unlock();
- if (s->migration_thread_running) {
- qemu_thread_join(&s->thread);
- s->migration_thread_running = false;
- }
+ qemu_thread_join(&s->thread);
+ s->migration_thread_running = false;
bql_lock();
+ }
- multifd_send_shutdown();
- qemu_mutex_lock(&s->qemu_file_lock);
+ WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+ /*
+ * Close the file handle without the lock to make sure the critical
+ * section won't block for long.
+ */
tmp = s->to_dst_file;
s->to_dst_file = NULL;
- qemu_mutex_unlock(&s->qemu_file_lock);
+ }
+
+ if (tmp) {
/*
- * Close the file handle without the lock to make sure the
- * critical section won't block for long.
+         * We only need to shut down multifd if tmp!=NULL, because if
+         * tmp==NULL, it means the main channel isn't established, while
+         * multifd is only set up after that (in migration_thread()).
*/
+ multifd_send_shutdown();
migration_ioc_unregister_yank_from_file(tmp);
qemu_fclose(tmp);
}
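
The shape of the critical section above — detach the pointer under the lock, then close it outside — keeps the lock hold time short even when the close itself is slow. A minimal sketch of that pattern with QEMU's lock guard macro (the helper name is hypothetical):

    static void close_file_outside_lock(QemuMutex *lock, QEMUFile **filep)
    {
        QEMUFile *tmp = NULL;

        WITH_QEMU_LOCK_GUARD(lock) {
            /* Only detach under the lock; the close happens below. */
            tmp = *filep;
            *filep = NULL;
        }
        if (tmp) {
            qemu_fclose(tmp);   /* potentially slow; done without the lock */
        }
    }
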
@@ -1451,9 +1543,9 @@ static void migrate_fd_cleanup(MigrationState *s)
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
-static void migrate_fd_cleanup_bh(void *opaque)
+static void migration_cleanup_bh(void *opaque)
{
- migrate_fd_cleanup(opaque);
+ migration_cleanup(opaque);
}
void migrate_set_error(MigrationState *s, const Error *error)
@@ -1483,7 +1575,7 @@ static void migrate_error_free(MigrationState *s)
}
}
-static void migrate_fd_error(MigrationState *s, const Error *error)
+static void migration_connect_set_error(MigrationState *s, const Error *error)
{
MigrationStatus current = s->state;
MigrationStatus next;
@@ -1512,11 +1604,17 @@ static void migrate_fd_error(MigrationState *s, const Error *error)
migrate_set_error(s, error);
}
-static void migrate_fd_cancel(MigrationState *s)
+void migration_cancel(void)
{
+ MigrationState *s = migrate_get_current();
int old_state ;
+ bool setup = (s->state == MIGRATION_STATUS_SETUP);
+
+ trace_migration_cancel();
- trace_migrate_fd_cancel();
+ if (migrate_dirty_limit()) {
+ qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
+ }
WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
if (s->rp_state.from_dst_file) {
@@ -1532,7 +1630,7 @@ static void migrate_fd_cancel(MigrationState *s)
}
/* If the migration is paused, kick it out of the pause */
if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
- qemu_sem_post(&s->pause_sem);
+ qemu_event_set(&s->pause_event);
}
migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
} while (s->state != MIGRATION_STATUS_CANCELLING);
@@ -1549,15 +1647,16 @@ static void migrate_fd_cancel(MigrationState *s)
}
}
}
- if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
- Error *local_err = NULL;
- bdrv_activate_all(&local_err);
- if (local_err) {
- error_report_err(local_err);
- } else {
- s->block_inactive = false;
- }
+ /*
+ * If qmp_migrate_finish has not been called, then there is no path that
+ * will complete the cancellation. Do it now.
+ */
+ if (setup && !s->to_dst_file) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
+ MIGRATION_STATUS_CANCELLED);
+ cpr_state_close();
+ migrate_hup_delete(s);
}
}
@@ -1644,42 +1743,7 @@ bool migration_incoming_postcopy_advised(void)
bool migration_in_bg_snapshot(void)
{
- return migrate_background_snapshot() &&
- migration_is_setup_or_active();
-}
-
-bool migration_is_idle(void)
-{
- MigrationState *s = current_migration;
-
- if (!s) {
- return true;
- }
-
- switch (s->state) {
- case MIGRATION_STATUS_NONE:
- case MIGRATION_STATUS_CANCELLED:
- case MIGRATION_STATUS_COMPLETED:
- case MIGRATION_STATUS_FAILED:
- return true;
- default:
- return false;
- }
-}
-
-bool migration_is_active(void)
-{
- MigrationState *s = current_migration;
-
- return (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
-}
-
-bool migration_is_device(void)
-{
- MigrationState *s = current_migration;
-
- return s->state == MIGRATION_STATUS_DEVICE;
+ return migrate_background_snapshot() && migration_is_running();
}
bool migration_thread_is_self(void)
@@ -1691,7 +1755,9 @@ bool migration_thread_is_self(void)
bool migrate_mode_is_cpr(MigrationState *s)
{
- return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+ MigMode mode = s->parameters.mode;
+ return mode == MIG_MODE_CPR_REBOOT ||
+ mode == MIG_MODE_CPR_TRANSFER;
}
int migrate_init(MigrationState *s, Error **errp)
@@ -1720,7 +1786,10 @@ int migrate_init(MigrationState *s, Error **errp)
s->migration_thread_running = false;
error_free(s->error);
s->error = NULL;
- s->vmdesc = NULL;
+
+ if (should_send_vmdesc()) {
+ s->vmdesc = json_writer_new(false);
+ }
migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
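
Allocating the vmdesc writer up front in migrate_init() pairs with the g_clear_pointer(..., json_writer_free) in migration_cleanup_json_writer() earlier in this patch. As a rough sketch of the writer's lifecycle — field names invented for illustration; see qobject/json-writer.h for the API:

    JSONWriter *w = json_writer_new(false);     /* false = compact output */

    json_writer_start_object(w, NULL);
    json_writer_str(w, "idstr", "example-device");
    json_writer_int64(w, "instance_id", 0);
    json_writer_end_object(w);

    /* ... emit json_writer_get_string(w) into the stream ... */
    g_clear_pointer(&w, json_writer_free);      /* idempotent cleanup */
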
@@ -1745,7 +1814,7 @@ static bool is_busy(Error **reasonp, Error **errp)
ERRP_GUARD();
/* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
- if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
+ if (runstate_check(RUN_STATE_SAVE_VM) || migration_is_running()) {
error_propagate_prepend(errp, *reasonp,
"disallowing migration blocker "
"(migration/snapshot in progress) for: ");
@@ -1877,6 +1946,17 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
return;
}
+ /*
+ * Making sure MigrationState is available until incoming migration
+ * completes.
+ *
+ * NOTE: QEMU _might_ leak this refcount in some failure paths, but
+     * that's OK. This is the minimum change we need to at least make
+     * sure the success case is clean on the refcount. We could try
+     * harder to make it accurate for any kind of failure, but that
+     * might be overkill and wouldn't bring us much benefit.
+ */
+ migrate_incoming_ref_outgoing_state();
once = false;
}
@@ -2066,6 +2146,40 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
return true;
}
+static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested,
+ Error **errp);
+
+static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb,
+ void *opaque)
+{
+ s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP);
+ g_source_set_callback(s->hup_source, cb, opaque, NULL);
+ g_source_attach(s->hup_source, NULL);
+}
+
+static void migrate_hup_delete(MigrationState *s)
+{
+ if (s->hup_source) {
+ g_source_destroy(s->hup_source);
+ g_source_unref(s->hup_source);
+ s->hup_source = NULL;
+ }
+}
+
+static gboolean qmp_migrate_finish_cb(QIOChannel *channel,
+ GIOCondition cond,
+ void *opaque)
+{
+ MigrationAddress *addr = opaque;
+
+ qmp_migrate_finish(addr, false, NULL);
+
+ cpr_state_close();
+ migrate_hup_delete(migrate_get_current());
+ qapi_free_MigrationAddress(addr);
+ return G_SOURCE_REMOVE;
+}
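
migrate_hup_add() attaches a one-shot G_IO_HUP watch to the cpr-state channel so the migration connect is deferred until the target closes its end. The same wait-for-HUP pattern on a bare GLib channel might look like this — fd and callback are illustrative; the real code watches a QIOChannel via qio_channel_create_watch():

    #include <glib.h>

    static gboolean on_hup(GIOChannel *ch, GIOCondition cond, gpointer data)
    {
        /* Peer closed its end: run the deferred connect here. */
        g_print("peer closed, continuing\n");
        return G_SOURCE_REMOVE;     /* one-shot: detach the source */
    }

    static void watch_for_hup(int fd)
    {
        GIOChannel *ch = g_io_channel_unix_new(fd);

        /* Fires when the peer closes the socket (G_IO_HUP). */
        g_io_add_watch(ch, G_IO_HUP, on_hup, NULL);
        g_io_channel_unref(ch);     /* the watch holds its own reference */
    }
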
+
void qmp_migrate(const char *uri, bool has_channels,
MigrationChannelList *channels, bool has_detach, bool detach,
bool has_resume, bool resume, Error **errp)
@@ -2075,6 +2189,8 @@ void qmp_migrate(const char *uri, bool has_channels,
MigrationState *s = migrate_get_current();
g_autoptr(MigrationChannel) channel = NULL;
MigrationAddress *addr = NULL;
+ MigrationChannel *channelv[MIGRATION_CHANNEL_TYPE__MAX] = { NULL };
+ MigrationChannel *cpr_channel = NULL;
/*
* Having preliminary checks for uri and channel
@@ -2085,12 +2201,22 @@ void qmp_migrate(const char *uri, bool has_channels,
}
if (channels) {
- /* To verify that Migrate channel list has only item */
- if (channels->next) {
- error_setg(errp, "Channel list has more than one entries");
+ for ( ; channels; channels = channels->next) {
+ MigrationChannelType type = channels->value->channel_type;
+
+ if (channelv[type]) {
+ error_setg(errp, "Channel list has more than one %s entry",
+ MigrationChannelType_str(type));
+ return;
+ }
+ channelv[type] = channels->value;
+ }
+ cpr_channel = channelv[MIGRATION_CHANNEL_TYPE_CPR];
+ addr = channelv[MIGRATION_CHANNEL_TYPE_MAIN]->addr;
+ if (!addr) {
+ error_setg(errp, "Channel list has no main entry");
return;
}
- addr = channels->value->addr;
}
if (uri) {
@@ -2102,7 +2228,12 @@ void qmp_migrate(const char *uri, bool has_channels,
}
/* transport mechanism not suitable for migration? */
- if (!migration_channels_and_transport_compatible(addr, errp)) {
+ if (!migration_transport_compatible(addr, errp)) {
+ return;
+ }
+
+ if (s->parameters.mode == MIG_MODE_CPR_TRANSFER && !cpr_channel) {
+ error_setg(errp, "missing 'cpr' migration channel");
return;
}
@@ -2112,6 +2243,41 @@ void qmp_migrate(const char *uri, bool has_channels,
return;
}
+ if (cpr_state_save(cpr_channel, &local_err)) {
+ goto out;
+ }
+
+ /*
+ * For cpr-transfer, the target may not be listening yet on the migration
+ * channel, because first it must finish cpr_load_state. The target tells
+ * us it is listening by closing the cpr-state socket. Wait for that HUP
+ * event before connecting in qmp_migrate_finish.
+ *
+ * The HUP could occur because the target fails while reading CPR state,
+ * in which case the target will not listen for the incoming migration
+ * connection, so qmp_migrate_finish will fail to connect, and then recover.
+ */
+ if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) {
+ migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb,
+ QAPI_CLONE(MigrationAddress, addr));
+
+ } else {
+ qmp_migrate_finish(addr, resume_requested, errp);
+ }
+
+out:
+ if (local_err) {
+ migration_connect_set_error(s, local_err);
+ error_propagate(errp, local_err);
+ }
+}
+
+static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested,
+ Error **errp)
+{
+ MigrationState *s = migrate_get_current();
+ Error *local_err = NULL;
+
if (!resume_requested) {
if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
return;
@@ -2146,7 +2312,7 @@ void qmp_migrate(const char *uri, bool has_channels,
if (!resume_requested) {
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
- migrate_fd_error(s, local_err);
+ migration_connect_set_error(s, local_err);
error_propagate(errp, local_err);
return;
}
@@ -2154,7 +2320,18 @@ void qmp_migrate(const char *uri, bool has_channels,
void qmp_migrate_cancel(Error **errp)
{
- migration_cancel(NULL);
+ /*
+ * After postcopy migration has started, the source machine is not
+ * recoverable in case of a migration error. This also means the
+     * cancel command cannot be used, since cancel should allow the
+ * machine to continue operation.
+ */
+ if (migration_in_postcopy()) {
+ error_setg(errp, "Postcopy migration in progress, cannot cancel.");
+ return;
+ }
+
+ migration_cancel();
}
void qmp_migrate_continue(MigrationStatus state, Error **errp)
@@ -2165,7 +2342,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp)
MigrationStatus_str(s->state));
return;
}
- qemu_sem_post(&s->pause_sem);
+ qemu_event_set(&s->pause_event);
}
int migration_rp_wait(MigrationState *s)
@@ -2273,7 +2450,7 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s,
*/
static void migration_release_dst_files(MigrationState *ms)
{
- QEMUFile *file;
+ QEMUFile *file = NULL;
WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
/*
@@ -2318,7 +2495,7 @@ static void *source_return_path_thread(void *opaque)
trace_source_return_path_thread_entry();
rcu_register_thread();
- while (migration_is_setup_or_active()) {
+ while (migration_is_running()) {
trace_source_return_path_thread_loop_top();
header_type = qemu_get_be16(rp);
@@ -2473,7 +2650,7 @@ static int open_return_path_on_source(MigrationState *ms)
trace_open_return_path_on_source();
- qemu_thread_create(&ms->rp_state.rp_thread, "mig/src/rp-thr",
+ qemu_thread_create(&ms->rp_state.rp_thread, MIGRATION_THREAD_SRC_RETURN,
source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
ms->rp_state.rp_thread_created = true;
@@ -2528,23 +2705,30 @@ static int postcopy_start(MigrationState *ms, Error **errp)
int ret;
QIOChannelBuffer *bioc;
QEMUFile *fb;
- uint64_t bandwidth = migrate_max_postcopy_bandwidth();
- bool restart_block = false;
- int cur_state = MIGRATION_STATUS_ACTIVE;
+
+ /*
+     * Now we're 100% sure to switch to postcopy, so the JSON writer won't
+     * be useful anymore. Free it early if it is there. Clearing
+ * the vmdesc also means any follow up vmstate_save()s will start to
+ * skip all JSON operations, which can shrink postcopy downtime.
+ */
+ migration_cleanup_json_writer(ms);
if (migrate_postcopy_preempt()) {
migration_wait_main_channel(ms);
if (postcopy_preempt_establish_channel(ms)) {
- migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
+ if (ms->state != MIGRATION_STATUS_CANCELLING) {
+ migrate_set_state(&ms->state, ms->state,
+ MIGRATION_STATUS_FAILED);
+ }
error_setg(errp, "%s: Failed to establish preempt channel",
__func__);
return -1;
}
}
- if (!migrate_pause_before_switchover()) {
- migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) {
+ return -1;
}
trace_postcopy_start();
@@ -2557,27 +2741,19 @@ static int postcopy_start(MigrationState *ms, Error **errp)
goto fail;
}
- ret = migration_maybe_pause(ms, &cur_state,
- MIGRATION_STATUS_POSTCOPY_ACTIVE);
- if (ret < 0) {
- error_setg_errno(errp, -ret, "%s: Failed in migration_maybe_pause()",
- __func__);
+ if (!migration_switchover_start(ms, errp)) {
goto fail;
}
- ret = bdrv_inactivate_all();
- if (ret < 0) {
- error_setg_errno(errp, -ret, "%s: Failed in bdrv_inactivate_all()",
- __func__);
- goto fail;
- }
- restart_block = true;
-
/*
* Cause any non-postcopiable, but iterative devices to
* send out their final data.
*/
- qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
+ ret = qemu_savevm_state_complete_precopy_iterable(ms->to_dst_file, true);
+ if (ret) {
+ error_setg(errp, "Postcopy save non-postcopiable iterables failed");
+ goto fail;
+ }
/*
* in Finish migrate and with the io-lock held everything should
@@ -2589,12 +2765,6 @@ static int postcopy_start(MigrationState *ms, Error **errp)
ram_postcopy_send_discard_bitmap(ms);
}
- /*
- * send rest of state - note things that are doing postcopy
- * will notice we're in POSTCOPY_ACTIVE and not actually
- * wrap their state up here
- */
- migration_rate_set(bandwidth);
if (migrate_postcopy_ram()) {
/* Ping just for debugging, helps line traces up */
qemu_savevm_send_ping(ms->to_dst_file, 2);
@@ -2622,7 +2792,12 @@ static int postcopy_start(MigrationState *ms, Error **errp)
*/
qemu_savevm_send_postcopy_listen(fb);
- qemu_savevm_state_complete_precopy(fb, false, false);
+ ret = qemu_savevm_state_complete_precopy_non_iterable(fb, true);
+ if (ret) {
+ error_setg(errp, "Postcopy save non-iterable device states failed");
+ goto fail_closefb;
+ }
+
if (migrate_postcopy_ram()) {
qemu_savevm_send_ping(fb, 3);
}
@@ -2641,8 +2816,6 @@ static int postcopy_start(MigrationState *ms, Error **errp)
goto fail_closefb;
}
- restart_block = false;
-
/* Now send that blob */
if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
error_setg(errp, "%s: Failed to send packaged data", __func__);
@@ -2658,8 +2831,6 @@ static int postcopy_start(MigrationState *ms, Error **errp)
migration_downtime_end(ms);
- bql_unlock();
-
if (migrate_postcopy_ram()) {
/*
* Although this ping is just for debug, it could potentially be
@@ -2675,11 +2846,22 @@ static int postcopy_start(MigrationState *ms, Error **errp)
ret = qemu_file_get_error(ms->to_dst_file);
if (ret) {
error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
- bql_lock();
goto fail;
}
trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
+ /*
+     * Now that postcopy has officially started, switch to the postcopy
+     * bandwidth the user specified.
+ */
+ migration_rate_set(migrate_max_postcopy_bandwidth());
+
+    /* Switchover looks fine now; switch to postcopy-active */
+ migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ bql_unlock();
+
return ret;
fail_closefb:
@@ -2687,67 +2869,104 @@ fail_closefb:
fail:
migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
MIGRATION_STATUS_FAILED);
- if (restart_block) {
- /* A failure happened early enough that we know the destination hasn't
- * accessed block devices, so we're safe to recover.
- */
- Error *local_err = NULL;
-
- bdrv_activate_all(&local_err);
- if (local_err) {
- error_report_err(local_err);
- }
- }
+ migration_block_activate(NULL);
migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
bql_unlock();
return -1;
}
/**
- * migration_maybe_pause: Pause if required to by
- * migrate_pause_before_switchover called with the BQL locked
- * Returns: 0 on success
+ * @migration_switchover_prepare: Start VM switchover procedure
+ *
+ * @s: The migration state object pointer
+ *
+ * Prepares for the switchover, depending on "pause-before-switchover"
+ * capability.
+ *
+ * If cap set, state machine goes like:
+ * [postcopy-]active -> pre-switchover -> device
+ *
+ * If cap not set:
+ * [postcopy-]active -> device
+ *
+ * Returns: true on success, false if interrupted.
*/
-static int migration_maybe_pause(MigrationState *s,
- int *current_active_state,
- int new_state)
+static bool migration_switchover_prepare(MigrationState *s)
{
+ /* Concurrent cancellation? Quit */
+ if (s->state == MIGRATION_STATUS_CANCELLING) {
+ return false;
+ }
+
+ /*
+     * Whether in precopy or postcopy, since we still hold the BQL the
+     * state must not change concurrently to CANCELLING, so it must be
+     * either ACTIVE or POSTCOPY_ACTIVE.
+ */
+ assert(migration_is_active());
+
+    /* If the pre-switchover stage is not requested, switch directly to DEVICE */
if (!migrate_pause_before_switchover()) {
- return 0;
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_DEVICE);
+ return true;
}
- /* Since leaving this state is not atomic with posting the semaphore
+ /*
+     * Since leaving this state is not atomic with setting the event,
* it's possible that someone could have issued multiple migrate_continue
- * and the semaphore is incorrectly positive at this point;
- * the docs say it's undefined to reinit a semaphore that's already
- * init'd, so use timedwait to eat up any existing posts.
+     * and the event is incorrectly set at this point, so reset it.
*/
- while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
- /* This block intentionally left blank */
- }
+ qemu_event_reset(&s->pause_event);
+
+ /* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER);
+ bql_unlock();
+
+ qemu_event_wait(&s->pause_event);
+ bql_lock();
/*
- * If the migration is cancelled when it is in the completion phase,
- * the migration state is set to MIGRATION_STATUS_CANCELLING.
- * So we don't need to wait a semaphore, otherwise we would always
- * wait for the 'pause_sem' semaphore.
+     * After the BQL is released and retaken, the state can be CANCELLING
+     * if cancellation happened during the wait. Only change the state if
+     * it's still pre-switchover.
*/
- if (s->state != MIGRATION_STATUS_CANCELLING) {
- bql_unlock();
- migrate_set_state(&s->state, *current_active_state,
- MIGRATION_STATUS_PRE_SWITCHOVER);
- qemu_sem_wait(&s->pause_sem);
- migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
- new_state);
- *current_active_state = new_state;
- bql_lock();
+ migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
+ MIGRATION_STATUS_DEVICE);
+
+ return s->state == MIGRATION_STATUS_DEVICE;
+}
+
+static bool migration_switchover_start(MigrationState *s, Error **errp)
+{
+ ERRP_GUARD();
+
+ if (!migration_switchover_prepare(s)) {
+ error_setg(errp, "Switchover is interrupted");
+ return false;
}
- return s->state == new_state ? 0 : -EINVAL;
+ /* Inactivate disks except in COLO */
+ if (!migrate_colo()) {
+ /*
+ * Inactivate before sending QEMU_VM_EOF so that the
+ * bdrv_activate_all() on the other end won't fail.
+ */
+ if (!migration_block_inactivate()) {
+ error_setg(errp, "Block inactivate failed during switchover");
+ return false;
+ }
+ }
+
+ migration_rate_set(RATE_LIMIT_DISABLED);
+
+ precopy_notify_complete();
+
+ qemu_savevm_maybe_send_switchover_start(s->to_dst_file);
+
+ return true;
}
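
The switch from a semaphore to a QemuEvent is what lets the old "timedwait to eat up existing posts" loop disappear: an event is binary, so a reset-then-wait sequence cannot observe stale counts. A condensed sketch of the handshake, assuming only the pause/continue path touches the event:

    QemuEvent pause_event;

    qemu_event_init(&pause_event, false);     /* start unsignalled */

    /* switchover side */
    qemu_event_reset(&pause_event);           /* discard stale signals */
    /* ... publish PRE_SWITCHOVER state, drop the BQL ... */
    qemu_event_wait(&pause_event);            /* block until continue */

    /* migrate-continue side */
    qemu_event_set(&pause_event);             /* wake the waiter */
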
-static int migration_completion_precopy(MigrationState *s,
- int *current_active_state)
+static int migration_completion_precopy(MigrationState *s)
{
int ret;
@@ -2760,20 +2979,12 @@ static int migration_completion_precopy(MigrationState *s,
}
}
- ret = migration_maybe_pause(s, current_active_state,
- MIGRATION_STATUS_DEVICE);
- if (ret < 0) {
+ if (!migration_switchover_start(s, NULL)) {
+ ret = -EFAULT;
goto out_unlock;
}
- /*
- * Inactivate disks except in COLO, and track that we have done so in order
- * to remember to reactivate them if migration fails or is cancelled.
- */
- s->block_inactive = !migrate_colo();
- migration_rate_set(RATE_LIMIT_DISABLED);
- ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
- s->block_inactive);
+ ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false);
out_unlock:
bql_unlock();
return ret;
@@ -2798,31 +3009,6 @@ static void migration_completion_postcopy(MigrationState *s)
trace_migration_completion_postcopy_end_after_complete();
}
-static void migration_completion_failed(MigrationState *s,
- int current_active_state)
-{
- if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_DEVICE)) {
- /*
- * If not doing postcopy, vm_start() will be called: let's
- * regain control on images.
- */
- Error *local_err = NULL;
-
- bql_lock();
- bdrv_activate_all(&local_err);
- if (local_err) {
- error_report_err(local_err);
- } else {
- s->block_inactive = false;
- }
- bql_unlock();
- }
-
- migrate_set_state(&s->state, current_active_state,
- MIGRATION_STATUS_FAILED);
-}
-
/**
* migration_completion: Used by migration_thread when there's not much left.
* The caller 'breaks' the loop when this returns.
@@ -2832,11 +3018,10 @@ static void migration_completion_failed(MigrationState *s,
static void migration_completion(MigrationState *s)
{
int ret = 0;
- int current_active_state = s->state;
Error *local_err = NULL;
if (s->state == MIGRATION_STATUS_ACTIVE) {
- ret = migration_completion_precopy(s, &current_active_state);
+ ret = migration_completion_precopy(s);
} else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
migration_completion_postcopy(s);
} else {
@@ -2876,7 +3061,9 @@ fail:
error_free(local_err);
}
- migration_completion_failed(s, current_active_state);
+ if (s->state != MIGRATION_STATUS_CANCELLING) {
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+ }
}
/**
@@ -2899,7 +3086,7 @@ static void bg_migration_completion(MigrationState *s)
qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
qemu_fflush(s->to_dst_file);
} else if (s->state == MIGRATION_STATUS_CANCELLING) {
- goto fail;
+ return;
}
if (qemu_file_get_error(s->to_dst_file)) {
@@ -3283,10 +3470,17 @@ static MigIterateState migration_iteration_run(MigrationState *s)
static void migration_iteration_finish(MigrationState *s)
{
- /* If we enabled cpu throttling for auto-converge, turn it off. */
- cpu_throttle_stop();
-
bql_lock();
+
+ /*
+ * If we enabled cpu throttling for auto-converge, turn it off.
+ * Stopping CPU throttle should be serialized by BQL to avoid
+ * racing for the throttle_dirty_sync_timer.
+ */
+ if (migrate_auto_converge()) {
+ cpu_throttle_stop();
+ }
+
switch (s->state) {
case MIGRATION_STATUS_COMPLETED:
runstate_set(RUN_STATE_POSTMIGRATE);
@@ -3299,6 +3493,11 @@ static void migration_iteration_finish(MigrationState *s)
case MIGRATION_STATUS_FAILED:
case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_CANCELLING:
+ /*
+         * Re-activate the block drives if they're inactivated. Note that
+         * COLO shouldn't use block_active at all, so it should be a no-op
+         * there.
+ */
+ migration_block_activate(NULL);
if (runstate_is_live(s->vm_old_state)) {
if (!runstate_check(RUN_STATE_SHUTDOWN)) {
vm_start();
@@ -3316,7 +3515,7 @@ static void migration_iteration_finish(MigrationState *s)
break;
}
- migration_bh_schedule(migrate_fd_cleanup_bh, s);
+ migration_bh_schedule(migration_cleanup_bh, s);
bql_unlock();
}
@@ -3344,7 +3543,7 @@ static void bg_migration_iteration_finish(MigrationState *s)
break;
}
- migration_bh_schedule(migrate_fd_cleanup_bh, s);
+ migration_bh_schedule(migration_cleanup_bh, s);
bql_unlock();
}
@@ -3462,11 +3661,11 @@ static void *migration_thread(void *opaque)
Error *local_err = NULL;
int ret;
- thread = migration_threads_add("live_migration", qemu_get_thread_id());
+ thread = migration_threads_add(MIGRATION_THREAD_SRC_MAIN,
+ qemu_get_thread_id());
rcu_register_thread();
- object_ref(OBJECT(s));
update_iteration_initial_status(s);
if (!multifd_send_setup()) {
@@ -3503,6 +3702,11 @@ static void *migration_thread(void *opaque)
qemu_savevm_send_colo_enable(s->to_dst_file);
}
+ if (migrate_auto_converge()) {
+ /* Start RAMBlock dirty bitmap sync timer */
+ cpu_throttle_dirty_sync_timer(true);
+ }
+
bql_lock();
ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
bql_unlock();
@@ -3599,7 +3803,6 @@ static void *bg_migration_thread(void *opaque)
int ret;
rcu_register_thread();
- object_ref(OBJECT(s));
migration_rate_set(RATE_LIMIT_DISABLED);
@@ -3657,12 +3860,8 @@ static void *bg_migration_thread(void *opaque)
if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
goto fail;
}
- /*
- * Put vCPUs in sync with shadow context structures, then
- * save their state to channel-buffer along with devices.
- */
- cpu_synchronize_all_states();
- if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
+
+ if (qemu_savevm_state_complete_precopy_non_iterable(fb, false)) {
goto fail;
}
/*
@@ -3726,7 +3925,7 @@ fail_setup:
return NULL;
}
-void migrate_fd_connect(MigrationState *s, Error *error_in)
+void migration_connect(MigrationState *s, Error *error_in)
{
Error *local_err = NULL;
uint64_t rate_limit;
@@ -3736,24 +3935,24 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
/*
* If there's a previous error, free it and prepare for another one.
 * Meanwhile if migration completes successfully, no error will be
- * dumped when calling migrate_fd_cleanup().
+ * dumped when calling migration_cleanup().
*/
migrate_error_free(s);
s->expected_downtime = migrate_downtime_limit();
if (error_in) {
- migrate_fd_error(s, error_in);
+ migration_connect_set_error(s, error_in);
if (resume) {
/*
* Don't do cleanup for resume if channel is invalid, but only dump
* the error. We wait for another channel connect from the user.
* The error_report still gives HMP user a hint on what failed.
- * It's normally done in migrate_fd_cleanup(), but call it here
+ * It's normally done in migration_cleanup(), but call it here
* explicitly.
*/
error_report_err(error_copy(s->error));
} else {
- migrate_fd_cleanup(s);
+ migration_cleanup(s);
}
return;
}
@@ -3811,11 +4010,19 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
}
}
+ /*
+     * Take a refcount to make sure the migration object won't get freed
+     * by the main thread if it has already entered migration_shutdown().
+ *
+ * The refcount will be released at the end of the thread function.
+ */
+ object_ref(OBJECT(s));
+
if (migrate_background_snapshot()) {
- qemu_thread_create(&s->thread, "mig/snapshot",
+ qemu_thread_create(&s->thread, MIGRATION_THREAD_SNAPSHOT,
bg_migration_thread, s, QEMU_THREAD_JOINABLE);
} else {
- qemu_thread_create(&s->thread, "mig/src/main",
+ qemu_thread_create(&s->thread, MIGRATION_THREAD_SRC_MAIN,
migration_thread, s, QEMU_THREAD_JOINABLE);
}
s->migration_thread_running = true;
@@ -3823,17 +4030,20 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
fail:
migrate_set_error(s, local_err);
- migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+ if (s->state != MIGRATION_STATUS_CANCELLING) {
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+ }
error_report_err(local_err);
- migrate_fd_cleanup(s);
+ migration_cleanup(s);
}
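
The refcount taken in migration_connect() above is dropped at the end of the thread function, a common QOM lifetime idiom. A stripped-down sketch — the worker and spawn helper names are hypothetical:

    static void *worker_thread(void *opaque)
    {
        MigrationState *s = opaque;

        /* ... long-running work that dereferences s ... */

        object_unref(OBJECT(s));    /* balance the ref taken by the spawner */
        return NULL;
    }

    static void spawn_worker(MigrationState *s)
    {
        /* Pin the object so a concurrent shutdown can't free it. */
        object_ref(OBJECT(s));
        qemu_thread_create(&s->thread, "worker", worker_thread, s,
                           QEMU_THREAD_JOINABLE);
    }
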
-static void migration_class_init(ObjectClass *klass, void *data)
+static void migration_class_init(ObjectClass *klass, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->user_creatable = false;
- device_class_set_props(dc, migration_properties);
+ device_class_set_props_n(dc, migration_properties,
+ migration_properties_count);
}
static void migration_instance_finalize(Object *obj)
@@ -3844,7 +4054,7 @@ static void migration_instance_finalize(Object *obj)
qemu_mutex_destroy(&ms->qemu_file_lock);
qemu_sem_destroy(&ms->wait_unplug_sem);
qemu_sem_destroy(&ms->rate_limit_sem);
- qemu_sem_destroy(&ms->pause_sem);
+ qemu_event_destroy(&ms->pause_event);
qemu_sem_destroy(&ms->postcopy_pause_sem);
qemu_sem_destroy(&ms->rp_state.rp_sem);
qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
@@ -3859,7 +4069,7 @@ static void migration_instance_init(Object *obj)
ms->state = MIGRATION_STATUS_NONE;
ms->mbps = -1;
ms->pages_per_second = -1;
- qemu_sem_init(&ms->pause_sem, 0);
+ qemu_event_init(&ms->pause_event, false);
qemu_mutex_init(&ms->error_mutex);
migrate_params_init(&ms->parameters);
diff --git a/migration/migration.h b/migration/migration.h
index 38aa140..739289d 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -17,7 +17,7 @@
#include "exec/cpu-common.h"
#include "hw/qdev-core.h"
#include "qapi/qapi-types-migration.h"
-#include "qapi/qmp/json-writer.h"
+#include "qobject/json-writer.h"
#include "qemu/thread.h"
#include "qemu/coroutine.h"
#include "io/channel.h"
@@ -25,10 +25,25 @@
#include "net/announce.h"
#include "qom/object.h"
#include "postcopy-ram.h"
-#include "sysemu/runstate.h"
+#include "system/runstate.h"
#include "migration/misc.h"
+#define MIGRATION_THREAD_SNAPSHOT "mig/snapshot"
+#define MIGRATION_THREAD_DIRTY_RATE "mig/dirtyrate"
+
+#define MIGRATION_THREAD_SRC_MAIN "mig/src/main"
+#define MIGRATION_THREAD_SRC_MULTIFD "mig/src/send_%d"
+#define MIGRATION_THREAD_SRC_RETURN "mig/src/return"
+#define MIGRATION_THREAD_SRC_TLS "mig/src/tls"
+
+#define MIGRATION_THREAD_DST_COLO "mig/dst/colo"
+#define MIGRATION_THREAD_DST_MULTIFD "mig/dst/recv_%d"
+#define MIGRATION_THREAD_DST_FAULT "mig/dst/fault"
+#define MIGRATION_THREAD_DST_LISTEN "mig/dst/listen"
+#define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt"
+
struct PostcopyBlocktimeContext;
+typedef struct ThreadPool ThreadPool;
#define MIGRATION_RESUME_ACK_VALUE (1)
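
Some of these names are printf templates (note the %d in the multifd entries), so callers format them before naming the thread. A plausible usage sketch for channel p->id — the surrounding variables are assumptions:

    g_autofree char *name =
        g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, p->id);

    qemu_thread_create(&p->thread, name, multifd_send_thread, p,
                       QEMU_THREAD_JOINABLE);
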
@@ -83,9 +98,9 @@ struct MigrationIncomingState {
void (*transport_cleanup)(void *data);
/*
* Used to sync thread creations. Note that we can't create threads in
- * parallel with this sem.
+ * parallel with this event.
*/
- QemuSemaphore thread_sync_sem;
+ QemuEvent thread_sync_event;
/*
* Free at the start of the main state load, set as the main thread finishes
* loading state.
@@ -171,7 +186,11 @@ struct MigrationIncomingState {
/* The coroutine we should enter (back) after failover */
Coroutine *colo_incoming_co;
- QemuSemaphore colo_incoming_sem;
+ QemuEvent colo_incoming_event;
+
+ /* Optional load threads pool and its thread exit request flag */
+ ThreadPool *load_threads;
+ bool load_threads_abort;
/*
* PostcopyBlocktimeContext to keep information for postcopy
@@ -356,17 +375,14 @@ struct MigrationState {
/* Flag set once the migration thread is running (and needs joining) */
bool migration_thread_running;
- /* Flag set once the migration thread called bdrv_inactivate_all */
- bool block_inactive;
-
/* Migration is waiting for guest to unplug device */
QemuSemaphore wait_unplug_sem;
/* Migration is paused due to pause-before-switchover */
- QemuSemaphore pause_sem;
+ QemuEvent pause_event;
- /* The semaphore is used to notify COLO thread that failover is finished */
- QemuSemaphore colo_exit_sem;
+ /* The event is used to notify COLO thread that failover is finished */
+ QemuEvent colo_exit_event;
/* The event is used to notify COLO thread to do checkpoint */
QemuEvent colo_checkpoint_event;
@@ -389,6 +405,8 @@ struct MigrationState {
bool send_configuration;
/* Whether we send section footer during migration */
bool send_section_footer;
+ /* Whether we send switchover start notification during migration */
+ bool send_switchover_start;
/* Needed by postcopy-pause state */
QemuSemaphore postcopy_pause_sem;
@@ -432,6 +450,39 @@ struct MigrationState {
* Default value is false. (since 8.1)
*/
bool multifd_flush_after_each_section;
+
+ /*
+ * This variable only makes sense when set on the machine that is
+ * the destination of a multifd migration with TLS enabled. It
+ * affects the behavior of the last send->recv iteration with
+ * regards to termination of the TLS session.
+ *
+ * When set:
+ *
+ * - the destination QEMU instance can expect to never get a
+ * GNUTLS_E_PREMATURE_TERMINATION error. Manifested as the error
+ * message: "The TLS connection was non-properly terminated".
+ *
+ * When clear:
+ *
+ * - the destination QEMU instance can expect to see a
+ * GNUTLS_E_PREMATURE_TERMINATION error in any multifd channel
+ * whenever the last recv() call of that channel happens after
+ * the source QEMU instance has already issued shutdown() on the
+ * channel.
+ *
+ * Commit 637280aeb2 (since 9.1) introduced a side effect that
+ * causes the destination instance to not be affected by the
+ * premature termination, while commit 1d457daf86 (since 10.0)
+ * causes the premature termination condition to be once again
+ * reachable.
+ *
+ * NOTE: Regardless of the state of this option, a premature
+ * termination of the TLS connection might happen due to error at
+ * any moment prior to the last send->recv iteration.
+ */
+ bool multifd_clean_tls_termination;
+
/*
* This decides the size of guest memory chunk that will be used
* to track dirty bitmap clearing. The size of memory chunk will
@@ -457,6 +508,8 @@ struct MigrationState {
bool switchover_acked;
/* Is this a rdma migration */
bool rdma_migration;
+
+ GSource *hup_source;
};
void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
@@ -471,7 +524,7 @@ bool migration_has_all_channels(void);
void migrate_set_error(MigrationState *s, const Error *error);
bool migrate_has_error(MigrationState *s);
-void migrate_fd_connect(MigrationState *s, Error *error_in);
+void migration_connect(MigrationState *s, Error *error_in);
int migration_call_notifiers(MigrationState *s, MigrationEventType type,
Error **errp);
@@ -508,8 +561,6 @@ bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm,
Error **errp);
void migrate_add_address(SocketAddress *address);
-bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
- Error **errp);
int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
#define qemu_ram_foreach_block \
@@ -519,7 +570,7 @@ void migration_make_urgent_request(void);
void migration_consume_urgent_request(void);
bool migration_rate_limit(void);
void migration_bh_schedule(QEMUBHFunc *cb, void *opaque);
-void migration_cancel(const Error *error);
+void migration_cancel(void);
void migration_populate_vfio_info(MigrationInfo *info);
void migration_reset_vfio_bytes_transferred(void);
@@ -537,4 +588,10 @@ int migration_rp_wait(MigrationState *s);
*/
void migration_rp_kick(MigrationState *s);
+void migration_bitmap_sync_precopy(bool last_stage);
+
+/* migration/block-dirty-bitmap.c */
+void dirty_bitmap_mig_init(void);
+bool should_send_vmdesc(void);
+
#endif
diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
new file mode 100644
index 0000000..94222d0
--- /dev/null
+++ b/migration/multifd-device-state.c
@@ -0,0 +1,212 @@
+/*
+ * Multifd device state migration
+ *
+ * Copyright (C) 2024,2025 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/lockable.h"
+#include "block/thread-pool.h"
+#include "migration.h"
+#include "migration/misc.h"
+#include "multifd.h"
+#include "options.h"
+
+static struct {
+ QemuMutex queue_job_mutex;
+
+ MultiFDSendData *send_data;
+
+ ThreadPool *threads;
+ bool threads_abort;
+} *multifd_send_device_state;
+
+void multifd_device_state_send_setup(void)
+{
+ assert(!multifd_send_device_state);
+ multifd_send_device_state = g_malloc(sizeof(*multifd_send_device_state));
+
+ qemu_mutex_init(&multifd_send_device_state->queue_job_mutex);
+
+ multifd_send_device_state->send_data = multifd_send_data_alloc();
+
+ multifd_send_device_state->threads = thread_pool_new();
+ multifd_send_device_state->threads_abort = false;
+}
+
+void multifd_device_state_send_cleanup(void)
+{
+ g_clear_pointer(&multifd_send_device_state->threads, thread_pool_free);
+ g_clear_pointer(&multifd_send_device_state->send_data,
+ multifd_send_data_free);
+
+ qemu_mutex_destroy(&multifd_send_device_state->queue_job_mutex);
+
+ g_clear_pointer(&multifd_send_device_state, g_free);
+}
+
+void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state)
+{
+ g_clear_pointer(&device_state->idstr, g_free);
+ g_clear_pointer(&device_state->buf, g_free);
+}
+
+static void multifd_device_state_fill_packet(MultiFDSendParams *p)
+{
+ MultiFDDeviceState_t *device_state = &p->data->u.device_state;
+ MultiFDPacketDeviceState_t *packet = p->packet_device_state;
+
+ packet->hdr.flags = cpu_to_be32(p->flags);
+ strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr) - 1);
+ packet->idstr[sizeof(packet->idstr) - 1] = 0;
+ packet->instance_id = cpu_to_be32(device_state->instance_id);
+ packet->next_packet_size = cpu_to_be32(p->next_packet_size);
+}
+
+static void multifd_prepare_header_device_state(MultiFDSendParams *p)
+{
+ p->iov[0].iov_len = sizeof(*p->packet_device_state);
+ p->iov[0].iov_base = p->packet_device_state;
+ p->iovs_num++;
+}
+
+void multifd_device_state_send_prepare(MultiFDSendParams *p)
+{
+ MultiFDDeviceState_t *device_state = &p->data->u.device_state;
+
+ assert(multifd_payload_device_state(p->data));
+
+ multifd_prepare_header_device_state(p);
+
+ assert(!(p->flags & MULTIFD_FLAG_SYNC));
+
+ p->next_packet_size = device_state->buf_len;
+ if (p->next_packet_size > 0) {
+ p->iov[p->iovs_num].iov_base = device_state->buf;
+ p->iov[p->iovs_num].iov_len = p->next_packet_size;
+ p->iovs_num++;
+ }
+
+ p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE;
+
+ multifd_device_state_fill_packet(p);
+}
+
+bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
+ char *data, size_t len)
+{
+ /* Device state submissions can come from multiple threads */
+ QEMU_LOCK_GUARD(&multifd_send_device_state->queue_job_mutex);
+ MultiFDDeviceState_t *device_state;
+
+ assert(multifd_payload_empty(multifd_send_device_state->send_data));
+
+ multifd_set_payload_type(multifd_send_device_state->send_data,
+ MULTIFD_PAYLOAD_DEVICE_STATE);
+ device_state = &multifd_send_device_state->send_data->u.device_state;
+ device_state->idstr = g_strdup(idstr);
+ device_state->instance_id = instance_id;
+ device_state->buf = g_memdup2(data, len);
+ device_state->buf_len = len;
+
+ if (!multifd_send(&multifd_send_device_state->send_data)) {
+ multifd_send_data_clear(multifd_send_device_state->send_data);
+ return false;
+ }
+
+ return true;
+}
+
+bool multifd_device_state_supported(void)
+{
+ return migrate_multifd() && !migrate_mapped_ram() &&
+ migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE;
+}
+
+static void multifd_device_state_save_thread_data_free(void *opaque)
+{
+ SaveLiveCompletePrecopyThreadData *data = opaque;
+
+ g_clear_pointer(&data->idstr, g_free);
+ g_free(data);
+}
+
+static int multifd_device_state_save_thread(void *opaque)
+{
+ SaveLiveCompletePrecopyThreadData *data = opaque;
+ g_autoptr(Error) local_err = NULL;
+
+ if (!data->hdlr(data, &local_err)) {
+ MigrationState *s = migrate_get_current();
+
+ /*
+ * Can't call abort_device_state_save_threads() here since new
+         * save threads could still be in the process of being launched
+ * (if, for example, the very first save thread launched exited
+ * with an error very quickly).
+ */
+
+ assert(local_err);
+
+ /*
+         * In case multiple save threads fail, which thread's error we
+         * end up setting is purely arbitrary.
+ */
+ migrate_set_error(s, local_err);
+ }
+
+ return 0;
+}
+
+bool multifd_device_state_save_thread_should_exit(void)
+{
+ return qatomic_read(&multifd_send_device_state->threads_abort);
+}
+
+void
+multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr,
+ char *idstr, uint32_t instance_id,
+ void *opaque)
+{
+ SaveLiveCompletePrecopyThreadData *data;
+
+ assert(multifd_device_state_supported());
+ assert(multifd_send_device_state);
+
+ assert(!qatomic_read(&multifd_send_device_state->threads_abort));
+
+ data = g_new(SaveLiveCompletePrecopyThreadData, 1);
+ data->hdlr = hdlr;
+ data->idstr = g_strdup(idstr);
+ data->instance_id = instance_id;
+ data->handler_opaque = opaque;
+
+ thread_pool_submit_immediate(multifd_send_device_state->threads,
+ multifd_device_state_save_thread,
+ data,
+ multifd_device_state_save_thread_data_free);
+}
+
+void multifd_abort_device_state_save_threads(void)
+{
+ assert(multifd_device_state_supported());
+
+ qatomic_set(&multifd_send_device_state->threads_abort, true);
+}
+
+bool multifd_join_device_state_save_threads(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ assert(multifd_device_state_supported());
+
+ thread_pool_wait(multifd_send_device_state->threads);
+
+ return !migrate_has_error(s);
+}
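
A device opting into this pool provides a SaveLiveCompletePrecopyThreadHandler and launches it via multifd_spawn_device_state_save_thread(). A hedged sketch of such a handler — the serialization step and the buffer size are invented for illustration:

    static bool my_save_thread(SaveLiveCompletePrecopyThreadData *d,
                               Error **errp)
    {
        /* Stop early if another save thread has already failed. */
        if (multifd_device_state_save_thread_should_exit()) {
            return true;
        }

        size_t len = 4096;                      /* made-up payload size */
        g_autofree char *buf = g_malloc0(len);

        /* ... serialize the device behind d->handler_opaque into buf ... */

        if (!multifd_queue_device_state(d->idstr, d->instance_id, buf, len)) {
            error_setg(errp, "%s: failed to queue device state", d->idstr);
            return false;
        }
        return true;
    }
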
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
new file mode 100644
index 0000000..b48eae3
--- /dev/null
+++ b/migration/multifd-nocomp.c
@@ -0,0 +1,468 @@
+/*
+ * Multifd RAM migration without compression
+ *
+ * Copyright (c) 2019-2020 Red Hat Inc
+ *
+ * Authors:
+ * Juan Quintela <quintela@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "system/ramblock.h"
+#include "exec/target_page.h"
+#include "file.h"
+#include "migration-stats.h"
+#include "multifd.h"
+#include "options.h"
+#include "migration.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qemu-file.h"
+
+static MultiFDSendData *multifd_ram_send;
+
+void multifd_ram_payload_alloc(MultiFDPages_t *pages)
+{
+ pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
+}
+
+void multifd_ram_payload_free(MultiFDPages_t *pages)
+{
+ g_clear_pointer(&pages->offset, g_free);
+}
+
+void multifd_ram_save_setup(void)
+{
+ multifd_ram_send = multifd_send_data_alloc();
+}
+
+void multifd_ram_save_cleanup(void)
+{
+ g_clear_pointer(&multifd_ram_send, multifd_send_data_free);
+}
+
+static void multifd_set_file_bitmap(MultiFDSendParams *p)
+{
+ MultiFDPages_t *pages = &p->data->u.ram;
+
+ assert(pages->block);
+
+ for (int i = 0; i < pages->normal_num; i++) {
+ ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true);
+ }
+
+ for (int i = pages->normal_num; i < pages->num; i++) {
+ ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false);
+ }
+}
+
+static int multifd_nocomp_send_setup(MultiFDSendParams *p, Error **errp)
+{
+ uint32_t page_count = multifd_ram_page_count();
+
+ if (migrate_zero_copy_send()) {
+ p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
+ }
+
+ if (!migrate_mapped_ram()) {
+ /* We need one extra place for the packet header */
+ p->iov = g_new0(struct iovec, page_count + 1);
+ } else {
+ p->iov = g_new0(struct iovec, page_count);
+ }
+
+ return 0;
+}
+
+static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+ g_free(p->iov);
+ p->iov = NULL;
+}
+
+static void multifd_ram_prepare_header(MultiFDSendParams *p)
+{
+ p->iov[0].iov_len = p->packet_len;
+ p->iov[0].iov_base = p->packet;
+ p->iovs_num++;
+}
+
+static void multifd_send_prepare_iovs(MultiFDSendParams *p)
+{
+ MultiFDPages_t *pages = &p->data->u.ram;
+ uint32_t page_size = multifd_ram_page_size();
+
+ for (int i = 0; i < pages->normal_num; i++) {
+ p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
+ p->iov[p->iovs_num].iov_len = page_size;
+ p->iovs_num++;
+ }
+
+ p->next_packet_size = pages->normal_num * page_size;
+}
+
+static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
+{
+ bool use_zero_copy_send = migrate_zero_copy_send();
+ int ret;
+
+ multifd_send_zero_page_detect(p);
+
+ if (migrate_mapped_ram()) {
+ multifd_send_prepare_iovs(p);
+ multifd_set_file_bitmap(p);
+
+ return 0;
+ }
+
+ if (!use_zero_copy_send) {
+ /*
+ * Only !zerocopy needs the header in IOV; zerocopy will
+ * send it separately.
+ */
+ multifd_ram_prepare_header(p);
+ }
+
+ multifd_send_prepare_iovs(p);
+ p->flags |= MULTIFD_FLAG_NOCOMP;
+
+ multifd_send_fill_packet(p);
+
+ if (use_zero_copy_send) {
+ /* Send header first, without zerocopy */
+ ret = qio_channel_write_all(p->c, (void *)p->packet,
+ p->packet_len, errp);
+ if (ret != 0) {
+ return -1;
+ }
+
+ stat64_add(&mig_stats.multifd_bytes, p->packet_len);
+ }
+
+ return 0;
+}
+
+static int multifd_nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
+{
+ p->iov = g_new0(struct iovec, multifd_ram_page_count());
+ return 0;
+}
+
+static void multifd_nocomp_recv_cleanup(MultiFDRecvParams *p)
+{
+ g_free(p->iov);
+ p->iov = NULL;
+}
+
+static int multifd_nocomp_recv(MultiFDRecvParams *p, Error **errp)
+{
+ uint32_t flags;
+
+ if (migrate_mapped_ram()) {
+ return multifd_file_recv_data(p, errp);
+ }
+
+ flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+
+ if (flags != MULTIFD_FLAG_NOCOMP) {
+ error_setg(errp, "multifd %u: flags received %x flags expected %x",
+ p->id, flags, MULTIFD_FLAG_NOCOMP);
+ return -1;
+ }
+
+ multifd_recv_zero_page_process(p);
+
+ if (!p->normal_num) {
+ return 0;
+ }
+
+ for (int i = 0; i < p->normal_num; i++) {
+ p->iov[i].iov_base = p->host + p->normal[i];
+ p->iov[i].iov_len = multifd_ram_page_size();
+ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
+ }
+ return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
+}
+
+static void multifd_pages_reset(MultiFDPages_t *pages)
+{
+ /*
+     * We don't need to touch the offset[] array, because it will be
+ * overwritten later when reused.
+ */
+ pages->num = 0;
+ pages->normal_num = 0;
+ pages->block = NULL;
+}
+
+void multifd_ram_fill_packet(MultiFDSendParams *p)
+{
+ MultiFDPacket_t *packet = p->packet;
+ MultiFDPages_t *pages = &p->data->u.ram;
+ uint32_t zero_num = pages->num - pages->normal_num;
+
+ packet->pages_alloc = cpu_to_be32(multifd_ram_page_count());
+ packet->normal_pages = cpu_to_be32(pages->normal_num);
+ packet->zero_pages = cpu_to_be32(zero_num);
+
+ if (pages->block) {
+ pstrcpy(packet->ramblock, sizeof(packet->ramblock),
+ pages->block->idstr);
+ }
+
+ for (int i = 0; i < pages->num; i++) {
+ /* there are architectures where ram_addr_t is 32 bit */
+ uint64_t temp = pages->offset[i];
+
+ packet->offset[i] = cpu_to_be64(temp);
+ }
+
+ trace_multifd_send_ram_fill(p->id, pages->normal_num,
+ zero_num);
+}
+
+int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp)
+{
+ MultiFDPacket_t *packet = p->packet;
+ uint32_t page_count = multifd_ram_page_count();
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t pages_per_packet = be32_to_cpu(packet->pages_alloc);
+ int i;
+
+ if (pages_per_packet > page_count) {
+ error_setg(errp, "multifd: received packet with %u pages, expected %u",
+ pages_per_packet, page_count);
+ return -1;
+ }
+
+ p->normal_num = be32_to_cpu(packet->normal_pages);
+ if (p->normal_num > pages_per_packet) {
+ error_setg(errp, "multifd: received packet with %u non-zero pages, "
+ "which exceeds maximum expected pages %u",
+ p->normal_num, pages_per_packet);
+ return -1;
+ }
+
+ p->zero_num = be32_to_cpu(packet->zero_pages);
+ if (p->zero_num > pages_per_packet - p->normal_num) {
+ error_setg(errp,
+ "multifd: received packet with %u zero pages, expected maximum %u",
+ p->zero_num, pages_per_packet - p->normal_num);
+ return -1;
+ }
+
+ if (p->normal_num == 0 && p->zero_num == 0) {
+ return 0;
+ }
+
+ /* make sure that ramblock is 0 terminated */
+ packet->ramblock[255] = 0;
+ p->block = qemu_ram_block_by_name(packet->ramblock);
+ if (!p->block) {
+ error_setg(errp, "multifd: unknown ram block %s",
+ packet->ramblock);
+ return -1;
+ }
+
+ p->host = p->block->host;
+ for (i = 0; i < p->normal_num; i++) {
+ uint64_t offset = be64_to_cpu(packet->offset[i]);
+
+ if (offset > (p->block->used_length - page_size)) {
+ error_setg(errp, "multifd: offset too long %" PRIu64
+ " (max " RAM_ADDR_FMT ")",
+ offset, p->block->used_length);
+ return -1;
+ }
+ p->normal[i] = offset;
+ }
+
+ for (i = 0; i < p->zero_num; i++) {
+ uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
+
+ if (offset > (p->block->used_length - page_size)) {
+ error_setg(errp, "multifd: offset too long %" PRIu64
+ " (max " RAM_ADDR_FMT ")",
+ offset, p->block->used_length);
+ return -1;
+ }
+ p->zero[i] = offset;
+ }
+
+ return 0;
+}
+
+static inline bool multifd_queue_empty(MultiFDPages_t *pages)
+{
+ return pages->num == 0;
+}
+
+static inline bool multifd_queue_full(MultiFDPages_t *pages)
+{
+ return pages->num == multifd_ram_page_count();
+}
+
+static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset)
+{
+ pages->offset[pages->num++] = offset;
+}
+
+/* Returns true if enqueue successful, false otherwise */
+bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+{
+ MultiFDPages_t *pages;
+
+retry:
+ pages = &multifd_ram_send->u.ram;
+
+ if (multifd_payload_empty(multifd_ram_send)) {
+ multifd_pages_reset(pages);
+ multifd_set_payload_type(multifd_ram_send, MULTIFD_PAYLOAD_RAM);
+ }
+
+ /* If the queue is empty, we can already enqueue now */
+ if (multifd_queue_empty(pages)) {
+ pages->block = block;
+ multifd_enqueue(pages, offset);
+ return true;
+ }
+
+ /*
+     * Not empty, and we need a flush. That can be because of either:
+ *
+ * (1) The page is not on the same ramblock of previous ones, or,
+ * (2) The queue is full.
+ *
+ * After flush, always retry.
+ */
+ if (pages->block != block || multifd_queue_full(pages)) {
+ if (!multifd_send(&multifd_ram_send)) {
+ return false;
+ }
+ goto retry;
+ }
+
+ /* Not empty, and we still have space, do it! */
+ multifd_enqueue(pages, offset);
+ return true;
+}
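
Note the retry contract above: a full queue or a ramblock switch is flushed and retried internally, so callers only ever see false on a hard channel failure. A caller sketch over a hypothetical array of dirty offsets:

    /* block, dirty_offsets and ndirty are assumptions for illustration. */
    for (size_t i = 0; i < ndirty; i++) {
        if (!multifd_queue_page(block, dirty_offsets[i])) {
            return -1;      /* send channel failed: abort this round */
        }
    }
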
+
+/*
+ * We have two modes for multifd flushes:
+ *
+ * - Per-section mode: this is the legacy way to flush, it requires one
+ * MULTIFD_FLAG_SYNC message for each RAM_SAVE_FLAG_EOS.
+ *
+ * - Per-round mode: this is the modern way to flush, it requires one
+ * MULTIFD_FLAG_SYNC message only for each round of RAM scan. Normally
+ * it's paired with a new RAM_SAVE_FLAG_MULTIFD_FLUSH message in network
+ * based migrations.
+ *
+ * One thing to mention is that mapped-ram always uses the modern way to sync.
+ */
+
+/* Do we need a per-section multifd flush (legacy way)? */
+bool multifd_ram_sync_per_section(void)
+{
+ if (!migrate_multifd()) {
+ return false;
+ }
+
+ if (migrate_mapped_ram()) {
+ return false;
+ }
+
+ return migrate_multifd_flush_after_each_section();
+}
+
+/* Do we need a per-round multifd flush (modern way)? */
+bool multifd_ram_sync_per_round(void)
+{
+ if (!migrate_multifd()) {
+ return false;
+ }
+
+ if (migrate_mapped_ram()) {
+ return true;
+ }
+
+ return !migrate_multifd_flush_after_each_section();
+}
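
To make the two modes concrete, here is a rough sketch of where a RAM save path would consult these predicates; the actual call sites live in ram.c, so this is not a verbatim excerpt:

    /* At the end of every RAM_SAVE_FLAG_EOS section (legacy mode): */
    if (multifd_ram_sync_per_section()) {
        multifd_ram_flush_and_sync(f);
    }

    /* Once per completed scan over all of guest RAM (modern mode): */
    if (multifd_ram_sync_per_round()) {
        multifd_ram_flush_and_sync(f);
    }
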
+
+int multifd_ram_flush_and_sync(QEMUFile *f)
+{
+ MultiFDSyncReq req;
+ int ret;
+
+ if (!migrate_multifd() || migration_in_postcopy()) {
+ return 0;
+ }
+
+ if (!multifd_payload_empty(multifd_ram_send)) {
+ if (!multifd_send(&multifd_ram_send)) {
+ error_report("%s: multifd_send fail", __func__);
+ return -1;
+ }
+ }
+
+ /* File migrations only need to sync with threads */
+ req = migrate_mapped_ram() ? MULTIFD_SYNC_LOCAL : MULTIFD_SYNC_ALL;
+
+ ret = multifd_send_sync_main(req);
+ if (ret) {
+ return ret;
+ }
+
+ /* If we don't need to sync with remote at all, nothing else to do */
+ if (req == MULTIFD_SYNC_LOCAL) {
+ return 0;
+ }
+
+ /*
+     * Old QEMUs don't understand RAM_SAVE_FLAG_MULTIFD_FLUSH; they rely
+     * on RAM_SAVE_FLAG_EOS instead.
+ */
+ if (migrate_multifd_flush_after_each_section()) {
+ return 0;
+ }
+
+ qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
+ qemu_fflush(f);
+
+ return 0;
+}
+
+bool multifd_send_prepare_common(MultiFDSendParams *p)
+{
+ MultiFDPages_t *pages = &p->data->u.ram;
+ multifd_ram_prepare_header(p);
+ multifd_send_zero_page_detect(p);
+
+ if (!pages->normal_num) {
+ p->next_packet_size = 0;
+ return false;
+ }
+
+ return true;
+}
+
+static const MultiFDMethods multifd_nocomp_ops = {
+ .send_setup = multifd_nocomp_send_setup,
+ .send_cleanup = multifd_nocomp_send_cleanup,
+ .send_prepare = multifd_nocomp_send_prepare,
+ .recv_setup = multifd_nocomp_recv_setup,
+ .recv_cleanup = multifd_nocomp_recv_cleanup,
+ .recv = multifd_nocomp_recv
+};
+
+static void multifd_nocomp_register(void)
+{
+ multifd_register_ops(MULTIFD_COMPRESSION_NONE, &multifd_nocomp_ops);
+}
+
+migration_init(multifd_nocomp_register);
diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c
new file mode 100644
index 0000000..7419e5d
--- /dev/null
+++ b/migration/multifd-qatzip.c
@@ -0,0 +1,395 @@
+/*
+ * Multifd QATzip compression implementation
+ *
+ * Copyright (c) Bytedance
+ *
+ * Authors:
+ * Bryan Zhang <bryan.zhang@bytedance.com>
+ * Hao Xiang <hao.xiang@bytedance.com>
+ * Yichen Wang <yichen.wang@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "system/ramblock.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qapi-types-migration.h"
+#include "options.h"
+#include "multifd.h"
+#include <qatzip.h>
+
+typedef struct {
+ /*
+ * Unique session for use with QATzip API
+ */
+ QzSession_T sess;
+
+ /*
+ * For compression: Buffer for pages to compress
+ * For decompression: Buffer for data to decompress
+ */
+ uint8_t *in_buf;
+ uint32_t in_len;
+
+ /*
+ * For compression: Output buffer of compressed data
+ * For decompression: Output buffer of decompressed data
+ */
+ uint8_t *out_buf;
+ uint32_t out_len;
+} QatzipData;
+
+/**
+ * qatzip_send_setup: Set up QATzip session and private buffers.
+ *
+ * @param p Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return 0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_send_setup(MultiFDSendParams *p, Error **errp)
+{
+ QatzipData *q;
+ QzSessionParamsDeflate_T params;
+ const char *err_msg;
+ int ret;
+
+ q = g_new0(QatzipData, 1);
+ p->compress_data = q;
+ /* We need one extra place for the packet header */
+ p->iov = g_new0(struct iovec, 2);
+
+ /*
+     * Initialize the QAT device with software fallback enabled by default.
+     * This allows QATzip to fall back to the CPU path when the QAT hardware
+     * reaches maximum throughput.
+ */
+ ret = qzInit(&q->sess, true);
+ if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+ err_msg = "qzInit failed";
+ goto err;
+ }
+
+ ret = qzGetDefaultsDeflate(&params);
+ if (ret != QZ_OK) {
+ err_msg = "qzGetDefaultsDeflate failed";
+ goto err;
+ }
+
+ /* Make sure to use configured QATzip compression level. */
+ params.common_params.comp_lvl = migrate_multifd_qatzip_level();
+ ret = qzSetupSessionDeflate(&q->sess, &params);
+ if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+ err_msg = "qzSetupSessionDeflate failed";
+ goto err;
+ }
+
+ if (MULTIFD_PACKET_SIZE > UINT32_MAX) {
+ err_msg = "packet size too large for QAT";
+ goto err;
+ }
+
+ q->in_len = MULTIFD_PACKET_SIZE;
+ /*
+     * PINNED_MEM is an enum from the qatzip headers requesting memory
+     * allocated with kzalloc_node() for QAT DMA purposes. When the QAT
+     * device is not available or the software fallback is in use, the
+     * allocation flag must be COMMON_MEM instead.
+ */
+ q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM);
+ if (!q->in_buf) {
+ q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM);
+ if (!q->in_buf) {
+ err_msg = "qzMalloc failed";
+ goto err;
+ }
+ }
+
+ q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess);
+ q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM);
+ if (!q->out_buf) {
+ q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM);
+ if (!q->out_buf) {
+ err_msg = "qzMalloc failed";
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg);
+ return -1;
+}
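
The pinned-then-common allocation above appears twice in this function and again on the receive side; a small helper capturing the pattern might look like this (a sketch only, not part of the patch):

    static void *qz_malloc_with_fallback(size_t len)
    {
        /* Prefer DMA-able pinned memory for the QAT hardware path... */
        void *buf = qzMalloc(len, 0, PINNED_MEM);

        if (!buf) {
            /* ...and fall back to ordinary memory when the device is
             * absent or the software path is in use. */
            buf = qzMalloc(len, 0, COMMON_MEM);
        }
        return buf;
    }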
+
+/**
+ * qatzip_send_cleanup: Tear down QATzip session and release private buffers.
+ *
+ * @param p Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return None
+ */
+static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+ QatzipData *q = p->compress_data;
+
+ if (q) {
+ if (q->in_buf) {
+ qzFree(q->in_buf);
+ }
+ if (q->out_buf) {
+ qzFree(q->out_buf);
+ }
+ (void)qzTeardownSession(&q->sess);
+ (void)qzClose(&q->sess);
+ g_free(q);
+ }
+
+ g_free(p->iov);
+ p->iov = NULL;
+ p->compress_data = NULL;
+}
+
+/**
+ * qatzip_send_prepare: Compress pages and update IO channel info.
+ *
+ * @param p Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return 0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
+{
+ uint32_t page_size = multifd_ram_page_size();
+ MultiFDPages_t *pages = &p->data->u.ram;
+ QatzipData *q = p->compress_data;
+ int ret;
+ unsigned int in_len, out_len;
+
+ if (!multifd_send_prepare_common(p)) {
+ goto out;
+ }
+
+ /*
+ * Unlike other multifd compression implementations, we use a non-streaming
+     * API and place all the data into one buffer, rather than feeding the
+     * compression API one page at a time. Based on initial benchmarks, the
+     * non-streaming API outperforms the streaming one, and QEMU's logic is
+     * friendly to the non-streaming API anyway. If either of these
+     * observations stops holding, we can revisit adding a streaming
+     * implementation.
+ */
+ for (int i = 0; i < pages->normal_num; i++) {
+ memcpy(q->in_buf + (i * page_size),
+ pages->block->host + pages->offset[i],
+ page_size);
+ }
+
+ in_len = pages->normal_num * page_size;
+ if (in_len > q->in_len) {
+ error_setg(errp, "multifd %u: unexpectedly large input", p->id);
+ return -1;
+ }
+ out_len = q->out_len;
+
+ ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1);
+ if (ret != QZ_OK) {
+ error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK",
+ p->id, ret);
+ return -1;
+ }
+ if (in_len != pages->normal_num * page_size) {
+ error_setg(errp, "multifd %u: QATzip failed to compress all input",
+ p->id);
+ return -1;
+ }
+
+ p->iov[p->iovs_num].iov_base = q->out_buf;
+ p->iov[p->iovs_num].iov_len = out_len;
+ p->iovs_num++;
+ p->next_packet_size = out_len;
+
+out:
+ p->flags |= MULTIFD_FLAG_QATZIP;
+ multifd_send_fill_packet(p);
+ return 0;
+}
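
The error handling above relies on qzCompress() treating both length arguments as in/out values; a sketch of that contract under the standard QATzip semantics (the function and parameter names below are stand-ins):

    static int compress_all_or_fail(QzSession_T *sess,
                                    unsigned char *in_buf, unsigned int nbytes,
                                    unsigned char *out_buf, unsigned int cap,
                                    unsigned int *produced)
    {
        unsigned int in_len = nbytes;   /* in: bytes available; out: consumed */
        unsigned int out_len = cap;     /* in: buffer capacity; out: produced */

        /* last = 1 marks the request as complete (no more input follows) */
        if (qzCompress(sess, in_buf, &in_len, out_buf, &out_len, 1) != QZ_OK ||
            in_len != nbytes) {
            return -1;  /* partial consumption is a hard error, as above */
        }
        *produced = out_len;  /* becomes next_packet_size on the wire */
        return 0;
    }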
+
+/**
+ * qatzip_recv_setup: Set up QATzip session and allocate private buffers.
+ *
+ * @param p Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return 0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp)
+{
+ QatzipData *q;
+ QzSessionParamsDeflate_T params;
+ const char *err_msg;
+ int ret;
+
+ q = g_new0(QatzipData, 1);
+ p->compress_data = q;
+
+ /*
+     * Initialize the QAT device with software fallback enabled by default.
+     * This allows QATzip to fall back to the CPU path when the QAT hardware
+     * reaches maximum throughput.
+ */
+ ret = qzInit(&q->sess, true);
+ if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+ err_msg = "qzInit failed";
+ goto err;
+ }
+
+ ret = qzGetDefaultsDeflate(&params);
+ if (ret != QZ_OK) {
+ err_msg = "qzGetDefaultsDeflate failed";
+ goto err;
+ }
+
+ ret = qzSetupSessionDeflate(&q->sess, &params);
+ if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+ err_msg = "qzSetupSessionDeflate failed";
+ goto err;
+ }
+
+ /*
+     * Reserve extra space for incoming packets: the current implementation
+     * never falls back to sending pages uncompressed, even when compression
+     * expands the data.
+ */
+ q->in_len = MULTIFD_PACKET_SIZE * 2;
+ /*
+     * PINNED_MEM is an enum from the qatzip headers requesting memory
+     * allocated with kzalloc_node() for QAT DMA purposes. When the QAT
+     * device is not available or the software fallback is in use, the
+     * allocation flag must be COMMON_MEM instead.
+ */
+ q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM);
+ if (!q->in_buf) {
+ q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM);
+ if (!q->in_buf) {
+ err_msg = "qzMalloc failed";
+ goto err;
+ }
+ }
+
+ q->out_len = MULTIFD_PACKET_SIZE;
+ q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM);
+ if (!q->out_buf) {
+ q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM);
+ if (!q->out_buf) {
+ err_msg = "qzMalloc failed";
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg);
+ return -1;
+}
+
+/**
+ * qatzip_recv_cleanup: Tear down QATzip session and release private buffers.
+ *
+ * @param p Multifd channel params
+ * @return None
+ */
+static void qatzip_recv_cleanup(MultiFDRecvParams *p)
+{
+ QatzipData *q = p->compress_data;
+
+ if (q) {
+ if (q->in_buf) {
+ qzFree(q->in_buf);
+ }
+ if (q->out_buf) {
+ qzFree(q->out_buf);
+ }
+ (void)qzTeardownSession(&q->sess);
+ (void)qzClose(&q->sess);
+ g_free(q);
+ }
+ p->compress_data = NULL;
+}
+
+
+/**
+ * qatzip_recv: Decompress pages and copy them to the appropriate
+ * locations.
+ *
+ * @param p Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return 0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_recv(MultiFDRecvParams *p, Error **errp)
+{
+ QatzipData *q = p->compress_data;
+ int ret;
+ unsigned int in_len, out_len;
+ uint32_t in_size = p->next_packet_size;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t expected_size = p->normal_num * page_size;
+ uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+
+ if (in_size > q->in_len) {
+ error_setg(errp, "multifd %u: received unexpectedly large packet",
+ p->id);
+ return -1;
+ }
+
+ if (flags != MULTIFD_FLAG_QATZIP) {
+ error_setg(errp, "multifd %u: flags received %x flags expected %x",
+ p->id, flags, MULTIFD_FLAG_QATZIP);
+ return -1;
+ }
+
+ multifd_recv_zero_page_process(p);
+ if (!p->normal_num) {
+ assert(in_size == 0);
+ return 0;
+ }
+
+ ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ in_len = in_size;
+ out_len = q->out_len;
+ ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len);
+ if (ret != QZ_OK) {
+ error_setg(errp, "multifd %u: qzDecompress failed", p->id);
+ return -1;
+ }
+ if (out_len != expected_size) {
+ error_setg(errp, "multifd %u: packet size received %u size expected %u",
+ p->id, out_len, expected_size);
+ return -1;
+ }
+
+ /* Copy each page to its appropriate location. */
+ for (int i = 0; i < p->normal_num; i++) {
+ memcpy(p->host + p->normal[i], q->out_buf + page_size * i, page_size);
+ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
+ }
+ return 0;
+}
+
+static MultiFDMethods multifd_qatzip_ops = {
+ .send_setup = qatzip_send_setup,
+ .send_cleanup = qatzip_send_cleanup,
+ .send_prepare = qatzip_send_prepare,
+ .recv_setup = qatzip_recv_setup,
+ .recv_cleanup = qatzip_recv_cleanup,
+ .recv = qatzip_recv
+};
+
+static void multifd_qatzip_register(void)
+{
+ multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops);
+}
+
+migration_init(multifd_qatzip_register);
diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
index 9265098..52902eb 100644
--- a/migration/multifd-qpl.c
+++ b/migration/multifd-qpl.c
@@ -14,7 +14,7 @@
#include "qemu/module.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "multifd.h"
#include "qpl/qpl.h"
@@ -220,21 +220,13 @@ static void multifd_qpl_deinit(QplData *qpl)
}
}
-/**
- * multifd_qpl_send_setup: set up send side
- *
- * Set up the channel with QPL compression.
- *
- * Returns 0 on success or -1 on error
- *
- * @p: Params for the channel being used
- * @errp: pointer to an error
- */
static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp)
{
QplData *qpl;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t page_count = multifd_ram_page_count();
- qpl = multifd_qpl_init(p->page_count, p->page_size, errp);
+ qpl = multifd_qpl_init(page_count, page_size, errp);
if (!qpl) {
return -1;
}
@@ -245,18 +237,10 @@ static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp)
* additional two IOVs are used to store packet header and compressed data
* length
*/
- p->iov = g_new0(struct iovec, p->page_count + 2);
+ p->iov = g_new0(struct iovec, page_count + 2);
return 0;
}
-/**
- * multifd_qpl_send_cleanup: clean up send side
- *
- * Close the channel and free memory.
- *
- * @p: Params for the channel being used
- * @errp: pointer to an error
- */
static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
{
multifd_qpl_deinit(p->compress_data);
@@ -404,13 +388,14 @@ retry:
static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p)
{
QplData *qpl = p->compress_data;
- uint32_t size = p->page_size;
+ MultiFDPages_t *pages = &p->data->u.ram;
+ uint32_t size = multifd_ram_page_size();
qpl_job *job = qpl->sw_job;
uint8_t *zbuf = qpl->zbuf;
uint8_t *buf;
- for (int i = 0; i < p->pages->normal_num; i++) {
- buf = p->pages->block->host + p->pages->offset[i];
+ for (int i = 0; i < pages->normal_num; i++) {
+ buf = pages->block->host + pages->offset[i];
multifd_qpl_prepare_comp_job(job, buf, zbuf, size);
if (qpl_execute_job(job) == QPL_STS_OK) {
multifd_qpl_fill_packet(i, p, zbuf, job->total_out);
@@ -434,8 +419,8 @@ static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p)
static void multifd_qpl_compress_pages(MultiFDSendParams *p)
{
QplData *qpl = p->compress_data;
- MultiFDPages_t *pages = p->pages;
- uint32_t size = p->page_size;
+ MultiFDPages_t *pages = &p->data->u.ram;
+ uint32_t size = multifd_ram_page_size();
QplHwJob *hw_job;
uint8_t *buf;
uint8_t *zbuf;
@@ -484,20 +469,10 @@ static void multifd_qpl_compress_pages(MultiFDSendParams *p)
}
}
-/**
- * multifd_qpl_send_prepare: prepare data to be able to send
- *
- * Create a compressed buffer with all the pages that we are going to
- * send.
- *
- * Returns 0 on success or -1 on error
- *
- * @p: Params for the channel being used
- * @errp: pointer to an error
- */
static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp)
{
QplData *qpl = p->compress_data;
+ MultiFDPages_t *pages = &p->data->u.ram;
uint32_t len = 0;
if (!multifd_send_prepare_common(p)) {
@@ -505,7 +480,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp)
}
/* The first IOV is used to store the compressed page lengths */
- len = p->pages->normal_num * sizeof(uint32_t);
+ len = pages->normal_num * sizeof(uint32_t);
multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len);
if (qpl->hw_avail) {
multifd_qpl_compress_pages(p);
@@ -519,21 +494,13 @@ out:
return 0;
}
-/**
- * multifd_qpl_recv_setup: set up receive side
- *
- * Create the compressed channel and buffer.
- *
- * Returns 0 on success or -1 on error
- *
- * @p: Params for the channel being used
- * @errp: pointer to an error
- */
static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp)
{
QplData *qpl;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t page_count = multifd_ram_page_count();
- qpl = multifd_qpl_init(p->page_count, p->page_size, errp);
+ qpl = multifd_qpl_init(page_count, page_size, errp);
if (!qpl) {
return -1;
}
@@ -541,13 +508,6 @@ static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp)
return 0;
}
-/**
- * multifd_qpl_recv_cleanup: set up receive side
- *
- * Close the channel and free memory.
- *
- * @p: Params for the channel being used
- */
static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p)
{
multifd_qpl_deinit(p->compress_data);
@@ -600,7 +560,7 @@ static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p,
Error **errp)
{
QplData *qpl = p->compress_data;
- uint32_t size = p->page_size;
+ uint32_t size = multifd_ram_page_size();
qpl_job *job = qpl->sw_job;
uint8_t *zbuf = qpl->zbuf;
uint8_t *addr;
@@ -638,7 +598,7 @@ static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p,
static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp)
{
QplData *qpl = p->compress_data;
- uint32_t size = p->page_size;
+ uint32_t size = multifd_ram_page_size();
uint8_t *zbuf = qpl->zbuf;
uint8_t *addr;
uint32_t len;
@@ -688,17 +648,6 @@ static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp)
}
return 0;
}
-/**
- * multifd_qpl_recv: read the data from the channel into actual pages
- *
- * Read the compressed buffer, and uncompress it into the actual
- * pages.
- *
- * Returns 0 on success or -1 on error
- *
- * @p: Params for the channel being used
- * @errp: pointer to an error
- */
static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp)
{
QplData *qpl = p->compress_data;
@@ -728,8 +677,9 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp)
}
for (int i = 0; i < p->normal_num; i++) {
qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]);
- assert(qpl->zlen[i] <= p->page_size);
+ assert(qpl->zlen[i] <= multifd_ram_page_size());
zbuf_len += qpl->zlen[i];
+ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
}
/* read compressed pages */
@@ -745,7 +695,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp)
return multifd_qpl_decompress_pages_slow_path(p, errp);
}
-static MultiFDMethods multifd_qpl_ops = {
+static const MultiFDMethods multifd_qpl_ops = {
.send_setup = multifd_qpl_send_setup,
.send_cleanup = multifd_qpl_send_cleanup,
.send_prepare = multifd_qpl_send_prepare,
diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c
index d12353f..fd7cd9b 100644
--- a/migration/multifd-uadk.c
+++ b/migration/multifd-uadk.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include "qemu/module.h"
#include "qapi/error.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "migration.h"
#include "multifd.h"
#include "options.h"
@@ -103,19 +103,13 @@ static void multifd_uadk_uninit_sess(struct wd_data *wd)
g_free(wd);
}
-/**
- * multifd_uadk_send_setup: setup send side
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp)
{
struct wd_data *wd;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t page_count = multifd_ram_page_count();
- wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp);
+ wd = multifd_uadk_init_sess(page_count, page_size, true, errp);
if (!wd) {
return -1;
}
@@ -128,24 +122,18 @@ static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp)
* length
*/
- p->iov = g_new0(struct iovec, p->page_count + 2);
+ p->iov = g_new0(struct iovec, page_count + 2);
return 0;
}
-/**
- * multifd_uadk_send_cleanup: cleanup send side
- *
- * Close the channel and return memory.
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp)
{
struct wd_data *wd = p->compress_data;
multifd_uadk_uninit_sess(wd);
p->compress_data = NULL;
+ g_free(p->iov);
+ p->iov = NULL;
}
static inline void prepare_next_iov(MultiFDSendParams *p, void *base,
@@ -157,40 +145,31 @@ static inline void prepare_next_iov(MultiFDSendParams *p, void *base,
p->iovs_num++;
}
-/**
- * multifd_uadk_send_prepare: prepare data to be able to send
- *
- * Create a compressed buffer with all the pages that we are going to
- * send.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
{
struct wd_data *uadk_data = p->compress_data;
uint32_t hdr_size;
+ uint32_t page_size = multifd_ram_page_size();
uint8_t *buf = uadk_data->buf;
int ret = 0;
+ MultiFDPages_t *pages = &p->data->u.ram;
if (!multifd_send_prepare_common(p)) {
goto out;
}
- hdr_size = p->pages->normal_num * sizeof(uint32_t);
+ hdr_size = pages->normal_num * sizeof(uint32_t);
/* prepare the header that stores the lengths of all compressed data */
prepare_next_iov(p, uadk_data->buf_hdr, hdr_size);
- for (int i = 0; i < p->pages->normal_num; i++) {
+ for (int i = 0; i < pages->normal_num; i++) {
struct wd_comp_req creq = {
.op_type = WD_DIR_COMPRESS,
- .src = p->pages->block->host + p->pages->offset[i],
- .src_len = p->page_size,
+ .src = pages->block->host + pages->offset[i],
+ .src_len = page_size,
.dst = buf,
             /* Set dst_len to double the src in case the compressed output >= page_size */
- .dst_len = p->page_size * 2,
+ .dst_len = page_size * 2,
};
if (uadk_data->handle) {
@@ -200,7 +179,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
p->id, ret, creq.status);
return -1;
}
- if (creq.dst_len < p->page_size) {
+ if (creq.dst_len < page_size) {
uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len);
prepare_next_iov(p, buf, creq.dst_len);
buf += creq.dst_len;
@@ -212,11 +191,11 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
* than page_size as well because at the receive end we can skip the
* decompression. But it is tricky to find the right number here.
*/
- if (!uadk_data->handle || creq.dst_len >= p->page_size) {
- uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size);
- prepare_next_iov(p, p->pages->block->host + p->pages->offset[i],
- p->page_size);
- buf += p->page_size;
+ if (!uadk_data->handle || creq.dst_len >= page_size) {
+ uadk_data->buf_hdr[i] = cpu_to_be32(page_size);
+ prepare_next_iov(p, pages->block->host + pages->offset[i],
+ page_size);
+ buf += page_size;
}
}
out:
@@ -225,21 +204,13 @@ out:
return 0;
}
-/**
- * multifd_uadk_recv_setup: setup receive side
- *
- * Create the compressed channel and buffer.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp)
{
struct wd_data *wd;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t page_count = multifd_ram_page_count();
- wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp);
+ wd = multifd_uadk_init_sess(page_count, page_size, false, errp);
if (!wd) {
return -1;
}
@@ -247,13 +218,6 @@ static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp)
return 0;
}
-/**
- * multifd_uadk_recv_cleanup: cleanup receive side
- *
- * Close the channel and return memory.
- *
- * @p: Params for the channel that we are using
- */
static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p)
{
struct wd_data *wd = p->compress_data;
@@ -262,17 +226,6 @@ static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p)
p->compress_data = NULL;
}
-/**
- * multifd_uadk_recv: read the data from the channel into actual pages
- *
- * Read the compressed buffer, and uncompress it into the actual
- * pages.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
{
struct wd_data *uadk_data = p->compress_data;
@@ -280,6 +233,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
uint32_t hdr_len = p->normal_num * sizeof(uint32_t);
uint32_t data_len = 0;
+ uint32_t page_size = multifd_ram_page_size();
uint8_t *buf = uadk_data->buf;
int ret = 0;
@@ -306,7 +260,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
for (int i = 0; i < p->normal_num; i++) {
uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]);
data_len += uadk_data->buf_hdr[i];
- assert(uadk_data->buf_hdr[i] <= p->page_size);
+ assert(uadk_data->buf_hdr[i] <= page_size);
}
/* read compressed data */
@@ -322,12 +276,12 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
.src = buf,
.src_len = uadk_data->buf_hdr[i],
.dst = p->host + p->normal[i],
- .dst_len = p->page_size,
+ .dst_len = page_size,
};
- if (uadk_data->buf_hdr[i] == p->page_size) {
- memcpy(p->host + p->normal[i], buf, p->page_size);
- buf += p->page_size;
+ if (uadk_data->buf_hdr[i] == page_size) {
+ memcpy(p->host + p->normal[i], buf, page_size);
+ buf += page_size;
continue;
}
@@ -343,7 +297,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
p->id, ret, creq.status);
return -1;
}
- if (creq.dst_len != p->page_size) {
+ if (creq.dst_len != page_size) {
error_setg(errp, "multifd %u: decompressed length error", p->id);
return -1;
}
@@ -353,7 +307,7 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
return 0;
}
-static MultiFDMethods multifd_uadk_ops = {
+static const MultiFDMethods multifd_uadk_ops = {
.send_setup = multifd_uadk_send_setup,
.send_cleanup = multifd_uadk_send_cleanup,
.send_prepare = multifd_uadk_send_prepare,
diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
index e1b8370..4cde868 100644
--- a/migration/multifd-zero-page.c
+++ b/migration/multifd-zero-page.c
@@ -12,8 +12,9 @@
#include "qemu/osdep.h"
#include "qemu/cutils.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "migration.h"
+#include "migration-stats.h"
#include "multifd.h"
#include "options.h"
#include "ram.h"
@@ -46,14 +47,14 @@ static void swap_page_offset(ram_addr_t *pages_offset, int a, int b)
*/
void multifd_send_zero_page_detect(MultiFDSendParams *p)
{
- MultiFDPages_t *pages = p->pages;
+ MultiFDPages_t *pages = &p->data->u.ram;
RAMBlock *rb = pages->block;
int i = 0;
int j = pages->num - 1;
if (!multifd_zero_page_enabled()) {
pages->normal_num = pages->num;
- return;
+ goto out;
}
/*
@@ -63,7 +64,7 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p)
while (i <= j) {
uint64_t offset = pages->offset[i];
- if (!buffer_is_zero(rb->host + offset, p->page_size)) {
+ if (!buffer_is_zero(rb->host + offset, multifd_ram_page_size())) {
i++;
continue;
}
@@ -74,15 +75,37 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p)
}
pages->normal_num = i;
+
+out:
+ stat64_add(&mig_stats.normal_pages, pages->normal_num);
+ stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num);
}
void multifd_recv_zero_page_process(MultiFDRecvParams *p)
{
for (int i = 0; i < p->zero_num; i++) {
void *page = p->host + p->zero[i];
- if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) {
- memset(page, 0, p->page_size);
- } else {
+ bool received =
+ ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]);
+
+ /*
+         * During multifd migration a zero page is written to memory only
+         * if it is migrated more than once.
+         *
+         * That becomes a problem when both the multifd and postcopy options
+         * are enabled: if a zero page skipped during the multifd phase is
+         * accessed during the postcopy phase, a page fault occurs, but the
+         * fault is never served because 'receivedmap' claims the page was
+         * already received. The thread accessing that page may then hang.
+         *
+         * When postcopy is enabled, always write the zero page whenever
+         * it is migrated.
+ */
+ if (migrate_postcopy_ram() || received) {
+ memset(page, 0, multifd_ram_page_size());
+ }
+ if (!received) {
ramblock_recv_bitmap_set_offset(p->block, p->zero[i]);
}
}
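
The logic above reduces to a small predicate; a standalone restatement for clarity (not QEMU code):

    static bool zero_page_needs_write(bool postcopy_ram, bool already_received)
    {
        /*
         * With postcopy, always materialize the zero page so a later
         * access can never fault on a page that 'receivedmap' claims is
         * present; otherwise only re-migrated pages need clearing.
         */
        return postcopy_ram || already_received;
    }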
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 2ced694..8820b2a 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include <zlib.h>
#include "qemu/rcu.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qapi/error.h"
#include "migration.h"
@@ -34,17 +34,7 @@ struct zlib_data {
/* Multifd zlib compression */
-/**
- * zlib_send_setup: setup send side
- *
- * Setup each channel with zlib compression.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
+static int multifd_zlib_send_setup(MultiFDSendParams *p, Error **errp)
{
struct zlib_data *z = g_new0(struct zlib_data, 1);
z_stream *zs = &z->zs;
@@ -86,15 +76,7 @@ err_free_z:
return -1;
}
-/**
- * zlib_send_cleanup: cleanup send side
- *
- * Close the channel and return memory.
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
+static void multifd_zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
{
struct zlib_data *z = p->compress_data;
@@ -110,23 +92,13 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
p->iov = NULL;
}
-/**
- * zlib_send_prepare: prepare date to be able to send
- *
- * Create a compressed buffer with all the pages that we are going to
- * send.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
+static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp)
{
- MultiFDPages_t *pages = p->pages;
+ MultiFDPages_t *pages = &p->data->u.ram;
struct zlib_data *z = p->compress_data;
z_stream *zs = &z->zs;
uint32_t out_size = 0;
+ uint32_t page_size = multifd_ram_page_size();
int ret;
uint32_t i;
@@ -147,8 +119,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
* with compression. zlib does not guarantee that this is safe,
* therefore copy the page before calling deflate().
*/
- memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
- zs->avail_in = p->page_size;
+ memcpy(z->buf, pages->block->host + pages->offset[i], page_size);
+ zs->avail_in = page_size;
zs->next_in = z->buf;
zs->avail_out = available;
@@ -188,17 +160,7 @@ out:
return 0;
}
-/**
- * zlib_recv_setup: setup receive side
- *
- * Create the compressed channel and buffer.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp)
+static int multifd_zlib_recv_setup(MultiFDRecvParams *p, Error **errp)
{
struct zlib_data *z = g_new0(struct zlib_data, 1);
z_stream *zs = &z->zs;
@@ -224,14 +186,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp)
return 0;
}
-/**
- * zlib_recv_cleanup: setup receive side
- *
- * For no compression this function does nothing.
- *
- * @p: Params for the channel that we are using
- */
-static void zlib_recv_cleanup(MultiFDRecvParams *p)
+static void multifd_zlib_recv_cleanup(MultiFDRecvParams *p)
{
struct zlib_data *z = p->compress_data;
@@ -242,25 +197,15 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p)
p->compress_data = NULL;
}
-/**
- * zlib_recv: read the data from the channel into actual pages
- *
- * Read the compressed buffer, and uncompress it into the actual
- * pages.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zlib_recv(MultiFDRecvParams *p, Error **errp)
+static int multifd_zlib_recv(MultiFDRecvParams *p, Error **errp)
{
struct zlib_data *z = p->compress_data;
z_stream *zs = &z->zs;
uint32_t in_size = p->next_packet_size;
/* we measure the change of total_out */
uint32_t out_size = zs->total_out;
- uint32_t expected_size = p->normal_num * p->page_size;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t expected_size = p->normal_num * page_size;
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
int ret;
int i;
@@ -296,7 +241,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp)
flush = Z_SYNC_FLUSH;
}
- zs->avail_out = p->page_size;
+ zs->avail_out = page_size;
zs->next_out = p->host + p->normal[i];
/*
@@ -310,8 +255,8 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp)
do {
ret = inflate(zs, flush);
} while (ret == Z_OK && zs->avail_in
- && (zs->total_out - start) < p->page_size);
- if (ret == Z_OK && (zs->total_out - start) < p->page_size) {
+ && (zs->total_out - start) < page_size);
+ if (ret == Z_OK && (zs->total_out - start) < page_size) {
error_setg(errp, "multifd %u: inflate generated too few output",
p->id);
return -1;
@@ -332,13 +277,13 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp)
return 0;
}
-static MultiFDMethods multifd_zlib_ops = {
- .send_setup = zlib_send_setup,
- .send_cleanup = zlib_send_cleanup,
- .send_prepare = zlib_send_prepare,
- .recv_setup = zlib_recv_setup,
- .recv_cleanup = zlib_recv_cleanup,
- .recv = zlib_recv
+static const MultiFDMethods multifd_zlib_ops = {
+ .send_setup = multifd_zlib_send_setup,
+ .send_cleanup = multifd_zlib_send_cleanup,
+ .send_prepare = multifd_zlib_send_prepare,
+ .recv_setup = multifd_zlib_recv_setup,
+ .recv_cleanup = multifd_zlib_recv_cleanup,
+ .recv = multifd_zlib_recv
};
static void multifd_zlib_register(void)
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index ca17b7e..3c2dcf7 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include <zstd.h>
#include "qemu/rcu.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qapi/error.h"
#include "migration.h"
@@ -37,17 +37,7 @@ struct zstd_data {
/* Multifd zstd compression */
-/**
- * zstd_send_setup: setup send side
- *
- * Setup each channel with zstd compression.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
+static int multifd_zstd_send_setup(MultiFDSendParams *p, Error **errp)
{
struct zstd_data *z = g_new0(struct zstd_data, 1);
int res;
@@ -83,15 +73,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
return 0;
}
-/**
- * zstd_send_cleanup: cleanup send side
- *
- * Close the channel and return memory.
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp)
+static void multifd_zstd_send_cleanup(MultiFDSendParams *p, Error **errp)
{
struct zstd_data *z = p->compress_data;
@@ -106,20 +88,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp)
p->iov = NULL;
}
-/**
- * zstd_send_prepare: prepare date to be able to send
- *
- * Create a compressed buffer with all the pages that we are going to
- * send.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
+static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp)
{
- MultiFDPages_t *pages = p->pages;
+ MultiFDPages_t *pages = &p->data->u.ram;
struct zstd_data *z = p->compress_data;
int ret;
uint32_t i;
@@ -138,8 +109,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
if (i == pages->normal_num - 1) {
flush = ZSTD_e_flush;
}
- z->in.src = p->pages->block->host + pages->offset[i];
- z->in.size = p->page_size;
+ z->in.src = pages->block->host + pages->offset[i];
+ z->in.size = multifd_ram_page_size();
z->in.pos = 0;
/*
@@ -152,9 +123,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
*/
do {
ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush);
- } while (ret > 0 && (z->in.size - z->in.pos > 0)
- && (z->out.size - z->out.pos > 0));
- if (ret > 0 && (z->in.size - z->in.pos > 0)) {
+ } while (ret > 0 && (z->in.size > z->in.pos)
+ && (z->out.size > z->out.pos));
+ if (ret > 0 && (z->in.size > z->in.pos)) {
error_setg(errp, "multifd %u: compressStream buffer too small",
p->id);
return -1;
@@ -176,17 +147,7 @@ out:
return 0;
}
-/**
- * zstd_recv_setup: setup receive side
- *
- * Create the compressed channel and buffer.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp)
+static int multifd_zstd_recv_setup(MultiFDRecvParams *p, Error **errp)
{
struct zstd_data *z = g_new0(struct zstd_data, 1);
int ret;
@@ -220,14 +181,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp)
return 0;
}
-/**
- * zstd_recv_cleanup: setup receive side
- *
- * For no compression this function does nothing.
- *
- * @p: Params for the channel that we are using
- */
-static void zstd_recv_cleanup(MultiFDRecvParams *p)
+static void multifd_zstd_recv_cleanup(MultiFDRecvParams *p)
{
struct zstd_data *z = p->compress_data;
@@ -239,22 +193,12 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p)
p->compress_data = NULL;
}
-/**
- * zstd_recv: read the data from the channel into actual pages
- *
- * Read the compressed buffer, and uncompress it into the actual
- * pages.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int zstd_recv(MultiFDRecvParams *p, Error **errp)
+static int multifd_zstd_recv(MultiFDRecvParams *p, Error **errp)
{
uint32_t in_size = p->next_packet_size;
uint32_t out_size = 0;
- uint32_t expected_size = p->normal_num * p->page_size;
+ uint32_t page_size = multifd_ram_page_size();
+ uint32_t expected_size = p->normal_num * page_size;
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
struct zstd_data *z = p->compress_data;
int ret;
@@ -286,7 +230,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp)
for (i = 0; i < p->normal_num; i++) {
ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
z->out.dst = p->host + p->normal[i];
- z->out.size = p->page_size;
+ z->out.size = page_size;
z->out.pos = 0;
/*
@@ -299,9 +243,9 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp)
*/
do {
ret = ZSTD_decompressStream(z->zds, &z->out, &z->in);
- } while (ret > 0 && (z->in.size - z->in.pos > 0)
- && (z->out.pos < p->page_size));
- if (ret > 0 && (z->out.pos < p->page_size)) {
+ } while (ret > 0 && (z->in.size > z->in.pos)
+ && (z->out.pos < page_size));
+ if (ret > 0 && (z->out.pos < page_size)) {
error_setg(errp, "multifd %u: decompressStream buffer too small",
p->id);
return -1;
@@ -321,13 +265,13 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp)
return 0;
}
-static MultiFDMethods multifd_zstd_ops = {
- .send_setup = zstd_send_setup,
- .send_cleanup = zstd_send_cleanup,
- .send_prepare = zstd_send_prepare,
- .recv_setup = zstd_recv_setup,
- .recv_cleanup = zstd_recv_cleanup,
- .recv = zstd_recv
+static const MultiFDMethods multifd_zstd_ops = {
+ .send_setup = multifd_zstd_send_setup,
+ .send_cleanup = multifd_zstd_send_cleanup,
+ .send_prepare = multifd_zstd_send_prepare,
+ .recv_setup = multifd_zstd_recv_setup,
+ .recv_cleanup = multifd_zstd_recv_cleanup,
+ .recv = multifd_zstd_recv
};
static void multifd_zstd_register(void)
diff --git a/migration/multifd.c b/migration/multifd.c
index 0b4cbad..b255778 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -12,15 +12,18 @@
#include "qemu/osdep.h"
#include "qemu/cutils.h"
+#include "qemu/iov.h"
#include "qemu/rcu.h"
#include "exec/target_page.h"
-#include "sysemu/sysemu.h"
-#include "exec/ramblock.h"
+#include "system/system.h"
+#include "system/ramblock.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "file.h"
+#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
+#include "savevm.h"
#include "socket.h"
#include "tls.h"
#include "qemu-file.h"
@@ -33,11 +36,6 @@
#include "io/channel-socket.h"
#include "yank_functions.h"
-/* Multiple fd's */
-
-#define MULTIFD_MAGIC 0x11223344U
-#define MULTIFD_VERSION 1
-
typedef struct {
uint32_t magic;
uint32_t version;
@@ -49,8 +47,10 @@ typedef struct {
struct {
MultiFDSendParams *params;
- /* array of pages to sent */
- MultiFDPages_t *pages;
+
+    /* The multifd_send() body is not thread-safe; it needs serialization */
+ QemuMutex multifd_send_mutex;
+
/*
* Global number of generated multifd packets.
*
@@ -78,7 +78,7 @@ struct {
*/
int exiting;
/* multifd ops */
- MultiFDMethods *ops;
+ const MultiFDMethods *ops;
} *multifd_send_state;
struct {
@@ -95,236 +95,70 @@ struct {
uint64_t packet_num;
int exiting;
/* multifd ops */
- MultiFDMethods *ops;
+ const MultiFDMethods *ops;
} *multifd_recv_state;
-static bool multifd_use_packets(void)
-{
- return !migrate_mapped_ram();
-}
-
-void multifd_send_channel_created(void)
-{
- qemu_sem_post(&multifd_send_state->channels_created);
-}
-
-static void multifd_set_file_bitmap(MultiFDSendParams *p)
+MultiFDSendData *multifd_send_data_alloc(void)
{
- MultiFDPages_t *pages = p->pages;
+ MultiFDSendData *new = g_new0(MultiFDSendData, 1);
- assert(pages->block);
+ multifd_ram_payload_alloc(&new->u.ram);
+ /* Device state allocates its payload on-demand */
- for (int i = 0; i < p->pages->normal_num; i++) {
- ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true);
- }
-
- for (int i = p->pages->normal_num; i < p->pages->num; i++) {
- ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false);
- }
+ return new;
}
-/* Multifd without compression */
-
-/**
- * nocomp_send_setup: setup send side
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
+void multifd_send_data_clear(MultiFDSendData *data)
{
- if (migrate_zero_copy_send()) {
- p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
+ if (multifd_payload_empty(data)) {
+ return;
}
- if (multifd_use_packets()) {
- /* We need one extra place for the packet header */
- p->iov = g_new0(struct iovec, p->page_count + 1);
- } else {
- p->iov = g_new0(struct iovec, p->page_count);
+ switch (data->type) {
+ case MULTIFD_PAYLOAD_DEVICE_STATE:
+ multifd_send_data_clear_device_state(&data->u.device_state);
+ break;
+ default:
+ /* Nothing to do */
+ break;
}
- return 0;
-}
-
-/**
- * nocomp_send_cleanup: cleanup send side
- *
- * For no compression this function does nothing.
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
-{
- g_free(p->iov);
- p->iov = NULL;
- return;
+ data->type = MULTIFD_PAYLOAD_NONE;
}
-static void multifd_send_prepare_iovs(MultiFDSendParams *p)
+void multifd_send_data_free(MultiFDSendData *data)
{
- MultiFDPages_t *pages = p->pages;
-
- for (int i = 0; i < pages->normal_num; i++) {
- p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
- p->iov[p->iovs_num].iov_len = p->page_size;
- p->iovs_num++;
- }
-
- p->next_packet_size = pages->normal_num * p->page_size;
-}
-
-/**
- * nocomp_send_prepare: prepare date to be able to send
- *
- * For no compression we just have to calculate the size of the
- * packet.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
-{
- bool use_zero_copy_send = migrate_zero_copy_send();
- int ret;
-
- multifd_send_zero_page_detect(p);
-
- if (!multifd_use_packets()) {
- multifd_send_prepare_iovs(p);
- multifd_set_file_bitmap(p);
-
- return 0;
- }
-
- if (!use_zero_copy_send) {
- /*
- * Only !zerocopy needs the header in IOV; zerocopy will
- * send it separately.
- */
- multifd_send_prepare_header(p);
+ if (!data) {
+ return;
}
- multifd_send_prepare_iovs(p);
- p->flags |= MULTIFD_FLAG_NOCOMP;
+    /* This also frees the device state payload */
+ multifd_send_data_clear(data);
- multifd_send_fill_packet(p);
-
- if (use_zero_copy_send) {
- /* Send header first, without zerocopy */
- ret = qio_channel_write_all(p->c, (void *)p->packet,
- p->packet_len, errp);
- if (ret != 0) {
- return -1;
- }
- }
+ multifd_ram_payload_free(&data->u.ram);
- return 0;
-}
-
-/**
- * nocomp_recv_setup: setup receive side
- *
- * For no compression this function does nothing.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
-{
- p->iov = g_new0(struct iovec, p->page_count);
- return 0;
+ g_free(data);
}
-/**
- * nocomp_recv_cleanup: setup receive side
- *
- * For no compression this function does nothing.
- *
- * @p: Params for the channel that we are using
- */
-static void nocomp_recv_cleanup(MultiFDRecvParams *p)
+static bool multifd_use_packets(void)
{
- g_free(p->iov);
- p->iov = NULL;
+ return !migrate_mapped_ram();
}
-/**
- * nocomp_recv: read the data from the channel
- *
- * For no compression we just need to read things into the correct place.
- *
- * Returns 0 for success or -1 for error
- *
- * @p: Params for the channel that we are using
- * @errp: pointer to an error
- */
-static int nocomp_recv(MultiFDRecvParams *p, Error **errp)
+void multifd_send_channel_created(void)
{
- uint32_t flags;
-
- if (!multifd_use_packets()) {
- return multifd_file_recv_data(p, errp);
- }
-
- flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
-
- if (flags != MULTIFD_FLAG_NOCOMP) {
- error_setg(errp, "multifd %u: flags received %x flags expected %x",
- p->id, flags, MULTIFD_FLAG_NOCOMP);
- return -1;
- }
-
- multifd_recv_zero_page_process(p);
-
- if (!p->normal_num) {
- return 0;
- }
-
- for (int i = 0; i < p->normal_num; i++) {
- p->iov[i].iov_base = p->host + p->normal[i];
- p->iov[i].iov_len = p->page_size;
- ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
- }
- return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
+ qemu_sem_post(&multifd_send_state->channels_created);
}
-static MultiFDMethods multifd_nocomp_ops = {
- .send_setup = nocomp_send_setup,
- .send_cleanup = nocomp_send_cleanup,
- .send_prepare = nocomp_send_prepare,
- .recv_setup = nocomp_recv_setup,
- .recv_cleanup = nocomp_recv_cleanup,
- .recv = nocomp_recv
-};
+static const MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {};
-static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
- [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
-};
-
-void multifd_register_ops(int method, MultiFDMethods *ops)
+void multifd_register_ops(int method, const MultiFDMethods *ops)
{
- assert(0 < method && method < MULTIFD_COMPRESSION__MAX);
+ assert(0 <= method && method < MULTIFD_COMPRESSION__MAX);
+ assert(!multifd_ops[method]);
multifd_ops[method] = ops;
}
-/* Reset a MultiFDPages_t* object for the next use */
-static void multifd_pages_reset(MultiFDPages_t *pages)
-{
- /*
- * We don't need to touch offset[] array, because it will be
- * overwritten later when reused.
- */
- pages->num = 0;
- pages->normal_num = 0;
- pages->block = NULL;
-}
-
static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
{
MultiFDInit_t msg = {};
@@ -389,160 +223,95 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
return msg.id;
}
-static MultiFDPages_t *multifd_pages_init(uint32_t n)
-{
- MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
-
- pages->allocated = n;
- pages->offset = g_new0(ram_addr_t, n);
-
- return pages;
-}
-
-static void multifd_pages_clear(MultiFDPages_t *pages)
-{
- multifd_pages_reset(pages);
- pages->allocated = 0;
- g_free(pages->offset);
- pages->offset = NULL;
- g_free(pages);
-}
-
+/* Fills a RAM multifd packet */
void multifd_send_fill_packet(MultiFDSendParams *p)
{
MultiFDPacket_t *packet = p->packet;
- MultiFDPages_t *pages = p->pages;
uint64_t packet_num;
- uint32_t zero_num = pages->num - pages->normal_num;
- int i;
+ bool sync_packet = p->flags & MULTIFD_FLAG_SYNC;
+
+ memset(packet, 0, p->packet_len);
- packet->flags = cpu_to_be32(p->flags);
- packet->pages_alloc = cpu_to_be32(p->pages->allocated);
- packet->normal_pages = cpu_to_be32(pages->normal_num);
- packet->zero_pages = cpu_to_be32(zero_num);
+ packet->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
+ packet->hdr.version = cpu_to_be32(MULTIFD_VERSION);
+
+ packet->hdr.flags = cpu_to_be32(p->flags);
packet->next_packet_size = cpu_to_be32(p->next_packet_size);
packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num);
packet->packet_num = cpu_to_be64(packet_num);
- if (pages->block) {
- strncpy(packet->ramblock, pages->block->idstr, 256);
- }
-
- for (i = 0; i < pages->num; i++) {
- /* there are architectures where ram_addr_t is 32 bit */
- uint64_t temp = pages->offset[i];
+ p->packets_sent++;
- packet->offset[i] = cpu_to_be64(temp);
+ if (!sync_packet) {
+ multifd_ram_fill_packet(p);
}
- p->packets_sent++;
- p->total_normal_pages += pages->normal_num;
- p->total_zero_pages += zero_num;
-
- trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num,
- p->flags, p->next_packet_size);
+ trace_multifd_send_fill(p->id, packet_num,
+ p->flags, p->next_packet_size);
}
-static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
+static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p,
+ const MultiFDPacketHdr_t *hdr,
+ Error **errp)
{
- MultiFDPacket_t *packet = p->packet;
- int i;
+ uint32_t magic = be32_to_cpu(hdr->magic);
+ uint32_t version = be32_to_cpu(hdr->version);
- packet->magic = be32_to_cpu(packet->magic);
- if (packet->magic != MULTIFD_MAGIC) {
- error_setg(errp, "multifd: received packet "
- "magic %x and expected magic %x",
- packet->magic, MULTIFD_MAGIC);
+ if (magic != MULTIFD_MAGIC) {
+ error_setg(errp, "multifd: received packet magic %x, expected %x",
+ magic, MULTIFD_MAGIC);
return -1;
}
- packet->version = be32_to_cpu(packet->version);
- if (packet->version != MULTIFD_VERSION) {
- error_setg(errp, "multifd: received packet "
- "version %u and expected version %u",
- packet->version, MULTIFD_VERSION);
+ if (version != MULTIFD_VERSION) {
+ error_setg(errp, "multifd: received packet version %u, expected %u",
+ version, MULTIFD_VERSION);
return -1;
}
- p->flags = be32_to_cpu(packet->flags);
-
- packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
- /*
- * If we received a packet that is 100 times bigger than expected
- * just stop migration. It is a magic number.
- */
- if (packet->pages_alloc > p->page_count) {
- error_setg(errp, "multifd: received packet "
- "with size %u and expected a size of %u",
- packet->pages_alloc, p->page_count) ;
- return -1;
- }
+ p->flags = be32_to_cpu(hdr->flags);
- p->normal_num = be32_to_cpu(packet->normal_pages);
- if (p->normal_num > packet->pages_alloc) {
- error_setg(errp, "multifd: received packet "
- "with %u normal pages and expected maximum pages are %u",
- p->normal_num, packet->pages_alloc) ;
- return -1;
- }
+ return 0;
+}
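
For reference, the common wire header validated here, assuming MultiFDPacketHdr_t lays out its fields in the order they are used above (all fields big-endian on the wire; the magic value is 0x11223344 per the old in-file definition removed further down):

    /* Sketch of the header shared by RAM and device-state packets */
    typedef struct {
        uint32_t magic;    /* MULTIFD_MAGIC */
        uint32_t version;  /* MULTIFD_VERSION */
        uint32_t flags;    /* MULTIFD_FLAG_* bits, e.g. SYNC, DEVICE_STATE */
    } MultiFDPacketHdrSketch;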
- p->zero_num = be32_to_cpu(packet->zero_pages);
- if (p->zero_num > packet->pages_alloc - p->normal_num) {
- error_setg(errp, "multifd: received packet "
- "with %u zero pages and expected maximum zero pages are %u",
- p->zero_num, packet->pages_alloc - p->normal_num) ;
- return -1;
- }
+static int multifd_recv_unfill_packet_device_state(MultiFDRecvParams *p,
+ Error **errp)
+{
+ MultiFDPacketDeviceState_t *packet = p->packet_dev_state;
+ packet->instance_id = be32_to_cpu(packet->instance_id);
p->next_packet_size = be32_to_cpu(packet->next_packet_size);
- p->packet_num = be64_to_cpu(packet->packet_num);
- p->packets_recved++;
- p->total_normal_pages += p->normal_num;
- p->total_zero_pages += p->zero_num;
- trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num,
- p->flags, p->next_packet_size);
+ return 0;
+}
- if (p->normal_num == 0 && p->zero_num == 0) {
- return 0;
- }
+static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp)
+{
+ const MultiFDPacket_t *packet = p->packet;
+ int ret = 0;
- /* make sure that ramblock is 0 terminated */
- packet->ramblock[255] = 0;
- p->block = qemu_ram_block_by_name(packet->ramblock);
- if (!p->block) {
- error_setg(errp, "multifd: unknown ram block %s",
- packet->ramblock);
- return -1;
- }
+ p->next_packet_size = be32_to_cpu(packet->next_packet_size);
+ p->packet_num = be64_to_cpu(packet->packet_num);
- p->host = p->block->host;
- for (i = 0; i < p->normal_num; i++) {
- uint64_t offset = be64_to_cpu(packet->offset[i]);
+    /* Always unfill: old QEMUs (<9.0) send data along with SYNC */
+ ret = multifd_ram_unfill_packet(p, errp);
- if (offset > (p->block->used_length - p->page_size)) {
- error_setg(errp, "multifd: offset too long %" PRIu64
- " (max " RAM_ADDR_FMT ")",
- offset, p->block->used_length);
- return -1;
- }
- p->normal[i] = offset;
- }
+ trace_multifd_recv_unfill(p->id, p->packet_num, p->flags,
+ p->next_packet_size);
- for (i = 0; i < p->zero_num; i++) {
- uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
+ return ret;
+}
- if (offset > (p->block->used_length - p->page_size)) {
- error_setg(errp, "multifd: offset too long %" PRIu64
- " (max " RAM_ADDR_FMT ")",
- offset, p->block->used_length);
- return -1;
- }
- p->zero[i] = offset;
+static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
+{
+ p->packets_recved++;
+
+ if (p->flags & MULTIFD_FLAG_DEVICE_STATE) {
+ return multifd_recv_unfill_packet_device_state(p, errp);
}
- return 0;
+ return multifd_recv_unfill_packet_ram(p, errp);
}
static bool multifd_send_should_exit(void)
@@ -568,35 +337,32 @@ static void multifd_send_kick_main(MultiFDSendParams *p)
}
/*
- * How we use multifd_send_state->pages and channel->pages?
+ * multifd_send() works by exchanging the MultiFDSendData object
+ * provided by the caller with an unused MultiFDSendData object from
+ * the next channel that is found to be idle.
*
- * We create a pages for each channel, and a main one. Each time that
- * we need to send a batch of pages we interchange the ones between
- * multifd_send_state and the channel that is sending it. There are
- * two reasons for that:
- * - to not have to do so many mallocs during migration
- * - to make easier to know what to free at the end of migration
+ * The channel owns the data until it finishes transmitting and the
+ * caller owns the empty object until it fills it with data and calls
+ * this function again. No locking necessary.
*
- * This way we always know who is the owner of each "pages" struct,
- * and we don't need any locking. It belongs to the migration thread
- * or to the channel thread. Switching is safe because the migration
- * thread is using the channel mutex when changing it, and the channel
- * have to had finish with its own, otherwise pending_job can't be
- * false.
+ * Switching is safe because both the migration thread and the channel
+ * thread have barriers in place to serialize access.
*
  * Returns true on success, false otherwise.
*/
-static bool multifd_send_pages(void)
+bool multifd_send(MultiFDSendData **send_data)
{
int i;
static int next_channel;
MultiFDSendParams *p = NULL; /* make happy gcc */
- MultiFDPages_t *pages = multifd_send_state->pages;
+ MultiFDSendData *tmp;
if (multifd_send_should_exit()) {
return false;
}
+ QEMU_LOCK_GUARD(&multifd_send_state->multifd_send_mutex);
+
/* We wait here, until at least one channel is ready */
qemu_sem_wait(&multifd_send_state->channels_ready);
@@ -626,66 +392,24 @@ static bool multifd_send_pages(void)
* qatomic_store_release() in multifd_send_thread().
*/
smp_mb_acquire();
- assert(!p->pages->num);
- multifd_send_state->pages = p->pages;
- p->pages = pages;
- /*
- * Making sure p->pages is setup before marking pending_job=true. Pairs
- * with the qatomic_load_acquire() in multifd_send_thread().
- */
- qatomic_store_release(&p->pending_job, true);
- qemu_sem_post(&p->sem);
-
- return true;
-}
-
-static inline bool multifd_queue_empty(MultiFDPages_t *pages)
-{
- return pages->num == 0;
-}
-static inline bool multifd_queue_full(MultiFDPages_t *pages)
-{
- return pages->num == pages->allocated;
-}
-
-static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset)
-{
- pages->offset[pages->num++] = offset;
-}
+ assert(multifd_payload_empty(p->data));
-/* Returns true if enqueue successful, false otherwise */
-bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
-{
- MultiFDPages_t *pages;
-
-retry:
- pages = multifd_send_state->pages;
-
- /* If the queue is empty, we can already enqueue now */
- if (multifd_queue_empty(pages)) {
- pages->block = block;
- multifd_enqueue(pages, offset);
- return true;
- }
+ /*
+ * Swap the pointers. The channel gets the client data for
+ * transferring and the client gets back an unused data slot.
+ */
+ tmp = *send_data;
+ *send_data = p->data;
+ p->data = tmp;
/*
- * Not empty, meanwhile we need a flush. It can because of either:
- *
- * (1) The page is not on the same ramblock of previous ones, or,
- * (2) The queue is full.
- *
- * After flush, always retry.
+     * Make sure p->data is set up before marking pending_job=true. Pairs
+ * with the qatomic_load_acquire() in multifd_send_thread().
*/
- if (pages->block != block || multifd_queue_full(pages)) {
- if (!multifd_send_pages()) {
- return false;
- }
- goto retry;
- }
+ qatomic_store_release(&p->pending_job, true);
+ qemu_sem_post(&p->sem);
- /* Not empty, and we still have space, do it! */
- multifd_enqueue(pages, offset);
return true;
}
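
A caller-side sketch of the ownership exchange described above (illustrative; error handling elided):

    MultiFDSendData *data = multifd_send_data_alloc();

    /* ... fill data->u.ram or a device-state payload ... */

    if (multifd_send(&data)) {
        /*
         * On success 'data' now points at an idle channel's empty slot,
         * ready to be refilled; the channel owns the old payload.
         */
    }

    multifd_send_data_free(data);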
@@ -775,7 +499,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
* channels have no I/O handler callback registered when reaching
* here, because migration thread will wait for all multifd channel
* establishments to complete during setup. Since
- * migrate_fd_cleanup() will be scheduled in main thread too, all
+ * migration_cleanup() will be scheduled in main thread too, all
* previous callbacks should guarantee to be completed when
* reaching here. See multifd_send_state.channels_created and its
* usage. In the future, we could replace this with an assert
@@ -790,12 +514,13 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
qemu_sem_destroy(&p->sem_sync);
g_free(p->name);
p->name = NULL;
- multifd_pages_clear(p->pages);
- p->pages = NULL;
+ g_clear_pointer(&p->data, multifd_send_data_free);
p->packet_len = 0;
+ g_clear_pointer(&p->packet_device_state, g_free);
g_free(p->packet);
p->packet = NULL;
multifd_send_state->ops->send_cleanup(p, errp);
+ assert(!p->iov);
return *errp == NULL;
}
@@ -804,12 +529,12 @@ static void multifd_send_cleanup_state(void)
{
file_cleanup_outgoing_migration();
socket_cleanup_outgoing_migration();
+ multifd_device_state_send_cleanup();
qemu_sem_destroy(&multifd_send_state->channels_created);
qemu_sem_destroy(&multifd_send_state->channels_ready);
+ qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex);
g_free(multifd_send_state->params);
multifd_send_state->params = NULL;
- multifd_pages_clear(multifd_send_state->pages);
- multifd_send_state->pages = NULL;
g_free(multifd_send_state);
multifd_send_state = NULL;
}
@@ -822,6 +547,36 @@ void multifd_send_shutdown(void)
return;
}
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
+ /* thread_created implies the TLS handshake has succeeded */
+ if (p->tls_thread_created && p->thread_created) {
+ Error *local_err = NULL;
+ /*
+ * The destination expects the TLS session to always be
+ * properly terminated. This helps to detect a premature
+ * termination in the middle of the stream. Note that
+ * older QEMUs always break the connection on the source
+ * and the destination always sees
+ * GNUTLS_E_PREMATURE_TERMINATION.
+ */
+ migration_tls_channel_end(p->c, &local_err);
+
+ /*
+ * The above can return an error in case the migration has
+ * already failed. If the migration succeeded, errors are
+ * not expected but there's no need to kill the source.
+ */
+ if (local_err && !migration_has_failed(migrate_get_current())) {
+ warn_report(
+ "multifd_send_%d: Failed to terminate TLS connection: %s",
+ p->id, error_get_pretty(local_err));
+ break;
+ }
+ }
+ }
+
multifd_send_terminate_threads();
for (i = 0; i < migrate_multifd_channels(); i++) {
@@ -854,20 +609,12 @@ static int multifd_zero_copy_flush(QIOChannel *c)
return ret;
}
-int multifd_send_sync_main(void)
+int multifd_send_sync_main(MultiFDSyncReq req)
{
int i;
bool flush_zero_copy;
- if (!migrate_multifd()) {
- return 0;
- }
- if (multifd_send_state->pages->num) {
- if (!multifd_send_pages()) {
- error_report("%s: multifd_send_pages fail", __func__);
- return -1;
- }
- }
+ assert(req != MULTIFD_SYNC_NONE);
flush_zero_copy = migrate_zero_copy_send();
@@ -884,8 +631,8 @@ int multifd_send_sync_main(void)
* We should be the only user so far, so not possible to be set by
* others concurrently.
*/
- assert(qatomic_read(&p->pending_sync) == false);
- qatomic_set(&p->pending_sync, true);
+ assert(qatomic_read(&p->pending_sync) == MULTIFD_SYNC_NONE);
+ qatomic_set(&p->pending_sync, req);
qemu_sem_post(&p->sem);
}
for (i = 0; i < migrate_multifd_channels(); i++) {
@@ -937,26 +684,46 @@ static void *multifd_send_thread(void *opaque)
}
/*
- * Read pending_job flag before p->pages. Pairs with the
- * qatomic_store_release() in multifd_send_pages().
+ * Read pending_job flag before p->data. Pairs with the
+ * qatomic_store_release() in multifd_send().
*/
if (qatomic_load_acquire(&p->pending_job)) {
- MultiFDPages_t *pages = p->pages;
+ bool is_device_state = multifd_payload_device_state(p->data);
+ size_t total_size;
+ int write_flags_masked = 0;
+ p->flags = 0;
p->iovs_num = 0;
- assert(pages->num);
+ assert(!multifd_payload_empty(p->data));
- ret = multifd_send_state->ops->send_prepare(p, &local_err);
- if (ret != 0) {
- break;
+ if (is_device_state) {
+ multifd_device_state_send_prepare(p);
+
+ /* Device state packets cannot be sent via zerocopy */
+ write_flags_masked |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
+ } else {
+ ret = multifd_send_state->ops->send_prepare(p, &local_err);
+ if (ret != 0) {
+ break;
+ }
}
+ /*
+ * The packet header in the zerocopy RAM case is accounted for
+     * in multifd_nocomp_send_prepare(), where it is actually sent.
+ */
+ total_size = iov_size(p->iov, p->iovs_num);
+
if (migrate_mapped_ram()) {
+ assert(!is_device_state);
+
ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
- p->pages->block, &local_err);
+ &p->data->u.ram, &local_err);
} else {
ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
- NULL, 0, p->write_flags,
+ NULL, 0,
+ p->write_flags & ~write_flags_masked,
&local_err);
}
@@ -964,29 +731,29 @@ static void *multifd_send_thread(void *opaque)
break;
}
- stat64_add(&mig_stats.multifd_bytes,
- p->next_packet_size + p->packet_len);
- stat64_add(&mig_stats.normal_pages, pages->normal_num);
- stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num);
+ stat64_add(&mig_stats.multifd_bytes, total_size);
- multifd_pages_reset(p->pages);
p->next_packet_size = 0;
+ multifd_send_data_clear(p->data);
/*
- * Making sure p->pages is published before saying "we're
+ * Making sure p->data is published before saying "we're
* free". Pairs with the smp_mb_acquire() in
- * multifd_send_pages().
+ * multifd_send().
*/
qatomic_store_release(&p->pending_job, false);
} else {
+ MultiFDSyncReq req = qatomic_read(&p->pending_sync);
+
/*
* If not a normal job, must be a sync request. Note that
* pending_sync is a standalone flag (unlike pending_job), so
* it doesn't require explicit memory barriers.
*/
- assert(qatomic_read(&p->pending_sync));
+ assert(req != MULTIFD_SYNC_NONE);
- if (use_packets) {
+ /* Only push the SYNC message if it involves a remote sync */
+ if (req == MULTIFD_SYNC_ALL) {
p->flags = MULTIFD_FLAG_SYNC;
multifd_send_fill_packet(p);
ret = qio_channel_write_all(p->c, (void *)p->packet,
@@ -996,10 +763,9 @@ static void *multifd_send_thread(void *opaque)
}
/* p->next_packet_size will always be zero for a SYNC packet */
stat64_add(&mig_stats.multifd_bytes, p->packet_len);
- p->flags = 0;
}
- qatomic_set(&p->pending_sync, false);
+ qatomic_set(&p->pending_sync, MULTIFD_SYNC_NONE);
qemu_sem_post(&p->sem_sync);
}
}
@@ -1015,8 +781,7 @@ out:
rcu_unregister_thread();
migration_threads_remove(thread);
- trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages,
- p->total_zero_pages);
+ trace_multifd_send_thread_end(p->id, p->packets_sent);
return NULL;
}
@@ -1069,7 +834,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p,
args->p = p;
p->tls_thread_created = true;
- qemu_thread_create(&p->tls_thread, "mig/src/tls",
+ qemu_thread_create(&p->tls_thread, MIGRATION_THREAD_SRC_TLS,
multifd_tls_handshake_thread, args,
QEMU_THREAD_JOINABLE);
return true;
@@ -1156,9 +921,8 @@ static bool multifd_new_send_channel_create(gpointer opaque, Error **errp)
bool multifd_send_setup(void)
{
MigrationState *s = migrate_get_current();
- Error *local_err = NULL;
int thread_count, ret = 0;
- uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ uint32_t page_count = multifd_ram_page_count();
bool use_packets = multifd_use_packets();
uint8_t i;
@@ -1169,7 +933,7 @@ bool multifd_send_setup(void)
thread_count = migrate_multifd_channels();
multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
- multifd_send_state->pages = multifd_pages_init(page_count);
+ qemu_mutex_init(&multifd_send_state->multifd_send_mutex);
qemu_sem_init(&multifd_send_state->channels_created, 0);
qemu_sem_init(&multifd_send_state->channels_ready, 0);
qatomic_set(&multifd_send_state->exiting, 0);
@@ -1177,26 +941,27 @@ bool multifd_send_setup(void)
for (i = 0; i < thread_count; i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
+ Error *local_err = NULL;
qemu_sem_init(&p->sem, 0);
qemu_sem_init(&p->sem_sync, 0);
p->id = i;
- p->pages = multifd_pages_init(page_count);
+ p->data = multifd_send_data_alloc();
if (use_packets) {
p->packet_len = sizeof(MultiFDPacket_t)
+ sizeof(uint64_t) * page_count;
p->packet = g_malloc0(p->packet_len);
- p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
- p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+ p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state));
+ p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
+ p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION);
}
- p->name = g_strdup_printf("mig/src/send_%d", i);
- p->page_size = qemu_target_page_size();
- p->page_count = page_count;
+ p->name = g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, i);
p->write_flags = 0;
if (!multifd_new_send_channel_create(p, &local_err)) {
- return false;
+ migrate_set_error(s, local_err);
+ ret = -1;
}
}
@@ -1209,24 +974,30 @@ bool multifd_send_setup(void)
qemu_sem_wait(&multifd_send_state->channels_created);
}
+ if (ret) {
+ goto err;
+ }
+
for (i = 0; i < thread_count; i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
+ Error *local_err = NULL;
ret = multifd_send_state->ops->send_setup(p, &local_err);
if (ret) {
- break;
+ migrate_set_error(s, local_err);
+ goto err;
}
+ assert(p->iov);
}
- if (ret) {
- migrate_set_error(s, local_err);
- error_report_err(local_err);
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_FAILED);
- return false;
- }
+ multifd_device_state_send_setup();
return true;
+
+err:
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_FAILED);
+ return false;
}
bool multifd_recv(void)
@@ -1353,11 +1124,14 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p)
qemu_mutex_destroy(&p->mutex);
qemu_sem_destroy(&p->sem_sync);
qemu_sem_destroy(&p->sem);
+ g_free(p->data);
+ p->data = NULL;
g_free(p->name);
p->name = NULL;
p->packet_len = 0;
g_free(p->packet);
p->packet = NULL;
+ g_clear_pointer(&p->packet_dev_state, g_free);
g_free(p->normal);
p->normal = NULL;
g_free(p->zero);
@@ -1459,8 +1233,37 @@ void multifd_recv_sync_main(void)
trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}
+static int multifd_device_state_recv(MultiFDRecvParams *p, Error **errp)
+{
+ g_autofree char *dev_state_buf = NULL;
+ int ret;
+
+ dev_state_buf = g_malloc(p->next_packet_size);
+
+ ret = qio_channel_read_all(p->c, dev_state_buf, p->next_packet_size, errp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ if (p->packet_dev_state->idstr[sizeof(p->packet_dev_state->idstr) - 1]
+ != 0) {
+ error_setg(errp, "unterminated multifd device state idstr");
+ return -1;
+ }
+
+ if (!qemu_loadvm_load_state_buffer(p->packet_dev_state->idstr,
+ p->packet_dev_state->instance_id,
+ dev_state_buf, p->next_packet_size,
+ errp)) {
+ ret = -1;
+ }
+
+ return ret;
+}
+
static void *multifd_recv_thread(void *opaque)
{
+ MigrationState *s = migrate_get_current();
MultiFDRecvParams *p = opaque;
Error *local_err = NULL;
bool use_packets = multifd_use_packets();
@@ -1469,19 +1272,65 @@ static void *multifd_recv_thread(void *opaque)
trace_multifd_recv_thread_start(p->id);
rcu_register_thread();
+ if (!s->multifd_clean_tls_termination) {
+ p->read_flags = QIO_CHANNEL_READ_FLAG_RELAXED_EOF;
+ }
+
while (true) {
+ MultiFDPacketHdr_t hdr;
uint32_t flags = 0;
+ bool is_device_state = false;
bool has_data = false;
+ uint8_t *pkt_buf;
+ size_t pkt_len;
+
p->normal_num = 0;
if (use_packets) {
+ struct iovec iov = {
+ .iov_base = (void *)&hdr,
+ .iov_len = sizeof(hdr)
+ };
+
if (multifd_recv_should_exit()) {
break;
}
- ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
- p->packet_len, &local_err);
- if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */
+ ret = qio_channel_readv_full_all_eof(p->c, &iov, 1, NULL, NULL,
+ p->read_flags, &local_err);
+ if (!ret) {
+ /* EOF */
+ assert(!local_err);
+ break;
+ }
+
+ if (ret == -1) {
+ break;
+ }
+
+ ret = multifd_recv_unfill_packet_header(p, &hdr, &local_err);
+ if (ret) {
+ break;
+ }
+
+ is_device_state = p->flags & MULTIFD_FLAG_DEVICE_STATE;
+ if (is_device_state) {
+ pkt_buf = (uint8_t *)p->packet_dev_state + sizeof(hdr);
+ pkt_len = sizeof(*p->packet_dev_state) - sizeof(hdr);
+ } else {
+ pkt_buf = (uint8_t *)p->packet + sizeof(hdr);
+ pkt_len = p->packet_len - sizeof(hdr);
+ }
+
+ ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len,
+ &local_err);
+ if (!ret) {
+ /* EOF */
+ error_setg(&local_err, "multifd: unexpected EOF after packet header");
+ break;
+ }
+
+ if (ret == -1) {
break;
}
@@ -1495,7 +1344,18 @@ static void *multifd_recv_thread(void *opaque)
flags = p->flags;
/* recv methods don't know how to handle the SYNC flag */
p->flags &= ~MULTIFD_FLAG_SYNC;
- has_data = p->normal_num || p->zero_num;
+
+ if (is_device_state) {
+ has_data = p->next_packet_size > 0;
+ } else {
+ /*
+ * Even if it's a SYNC packet, this needs to be set
+ * because older QEMUs (<9.0) still send data along with
+ * the SYNC packet.
+ */
+ has_data = p->normal_num || p->zero_num;
+ }
+
qemu_mutex_unlock(&p->mutex);
} else {
/*
@@ -1524,19 +1384,40 @@ static void *multifd_recv_thread(void *opaque)
}
if (has_data) {
- ret = multifd_recv_state->ops->recv(p, &local_err);
+ /*
+             * Multifd threads should not be active or receiving data
+             * while migration is in the postcopy phase. Two threads
+             * writing to the same memory area could easily corrupt
+             * the guest state.
+ */
+ assert(!migration_in_postcopy());
+ if (is_device_state) {
+ assert(use_packets);
+ ret = multifd_device_state_recv(p, &local_err);
+ } else {
+ ret = multifd_recv_state->ops->recv(p, &local_err);
+ }
if (ret != 0) {
break;
}
+ } else if (is_device_state) {
+ error_setg(&local_err,
+ "multifd: received empty device state packet");
+ break;
}
if (use_packets) {
if (flags & MULTIFD_FLAG_SYNC) {
+ if (is_device_state) {
+ error_setg(&local_err,
+ "multifd: received SYNC device state packet");
+ break;
+ }
+
qemu_sem_post(&multifd_recv_state->sem_sync);
qemu_sem_wait(&p->sem_sync);
}
} else {
- p->total_normal_pages += p->data->size / qemu_target_page_size();
p->data->size = 0;
/*
* Order data->size update before clearing
@@ -1553,9 +1434,7 @@ static void *multifd_recv_thread(void *opaque)
}
rcu_unregister_thread();
- trace_multifd_recv_thread_end(p->id, p->packets_recved,
- p->total_normal_pages,
- p->total_zero_pages);
+ trace_multifd_recv_thread_end(p->id, p->packets_recved);
return NULL;
}
@@ -1563,7 +1442,7 @@ static void *multifd_recv_thread(void *opaque)
int multifd_recv_setup(Error **errp)
{
int thread_count;
- uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ uint32_t page_count = multifd_ram_page_count();
bool use_packets = multifd_use_packets();
uint8_t i;
@@ -1603,12 +1482,11 @@ int multifd_recv_setup(Error **errp)
p->packet_len = sizeof(MultiFDPacket_t)
+ sizeof(uint64_t) * page_count;
p->packet = g_malloc0(p->packet_len);
+ p->packet_dev_state = g_malloc0(sizeof(*p->packet_dev_state));
}
- p->name = g_strdup_printf("mig/dst/recv_%d", i);
+ p->name = g_strdup_printf(MIGRATION_THREAD_DST_MULTIFD, i);
p->normal = g_new0(ram_addr_t, page_count);
p->zero = g_new0(ram_addr_t, page_count);
- p->page_count = page_count;
- p->page_size = qemu_target_page_size();
}
for (i = 0; i < thread_count; i++) {
@@ -1681,17 +1559,3 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
QEMU_THREAD_JOINABLE);
qatomic_inc(&multifd_recv_state->count);
}
-
-bool multifd_send_prepare_common(MultiFDSendParams *p)
-{
- multifd_send_zero_page_detect(p);
-
- if (!p->pages->normal_num) {
- p->next_packet_size = 0;
- return false;
- }
-
- multifd_send_prepare_header(p);
-
- return true;
-}
diff --git a/migration/multifd.h b/migration/multifd.h
index 0ecd6f4..9b6d81e 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,9 +13,27 @@
#ifndef QEMU_MIGRATION_MULTIFD_H
#define QEMU_MIGRATION_MULTIFD_H
+#include "exec/target_page.h"
#include "ram.h"
typedef struct MultiFDRecvData MultiFDRecvData;
+typedef struct MultiFDSendData MultiFDSendData;
+
+typedef enum {
+ /* No sync request */
+ MULTIFD_SYNC_NONE = 0,
+ /* Sync locally on the sender threads without pushing messages */
+ MULTIFD_SYNC_LOCAL,
+ /*
+     * Sync not only on the sender threads, but also push a MULTIFD_FLAG_SYNC
+     * message to the wire for each iochannel (which requests a remote sync).
+     *
+     * When remote sync is used, it must be paired with a follow-up
+     * RAM_SAVE_FLAG_EOS / RAM_SAVE_FLAG_MULTIFD_FLUSH message on the main
+     * channel.
+ */
+ MULTIFD_SYNC_ALL,
+} MultiFDSyncReq;
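
A hedged sketch of the two request flavors at a call site (only multifd_send_sync_main() and the flush flag come from this series; f stands for the main migration channel):

    /* Local-only: drain the sender threads, nothing goes on the wire */
    if (multifd_send_sync_main(MULTIFD_SYNC_LOCAL) < 0) {
        return -1;
    }

    /*
     * Remote: also emits MULTIFD_FLAG_SYNC on every channel, so it must
     * be paired with a flush message on the main channel, per the
     * comment above.
     */
    if (multifd_send_sync_main(MULTIFD_SYNC_ALL) < 0) {
        return -1;
    }
    qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
    qemu_fflush(f);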
bool multifd_send_setup(void);
void multifd_send_shutdown(void);
@@ -26,22 +44,34 @@ void multifd_recv_shutdown(void);
bool multifd_recv_all_channels_created(void);
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
void multifd_recv_sync_main(void);
-int multifd_send_sync_main(void);
+int multifd_send_sync_main(MultiFDSyncReq req);
bool multifd_queue_page(RAMBlock *block, ram_addr_t offset);
bool multifd_recv(void);
MultiFDRecvData *multifd_get_recv_data(void);
+/* Multiple fd's */
+
+#define MULTIFD_MAGIC 0x11223344U
+#define MULTIFD_VERSION 1
+
/* Multifd Compression flags */
#define MULTIFD_FLAG_SYNC (1 << 0)
-/* We reserve 4 bits for compression methods */
-#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1)
+/* We reserve 5 bits for compression methods */
+#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1)
/* we need to be compatible. Before compression value was 0 */
#define MULTIFD_FLAG_NOCOMP (0 << 1)
#define MULTIFD_FLAG_ZLIB (1 << 1)
#define MULTIFD_FLAG_ZSTD (2 << 1)
#define MULTIFD_FLAG_QPL (4 << 1)
#define MULTIFD_FLAG_UADK (8 << 1)
+#define MULTIFD_FLAG_QATZIP (16 << 1)
+
+/*
+ * If set, it means that this packet contains device state
+ * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t).
+ */
+#define MULTIFD_FLAG_DEVICE_STATE (32 << 1)
/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
@@ -50,6 +80,11 @@ typedef struct {
uint32_t magic;
uint32_t version;
uint32_t flags;
+} __attribute__((packed)) MultiFDPacketHdr_t;
+
+typedef struct {
+ MultiFDPacketHdr_t hdr;
+
/* maximum number of allocated pages */
uint32_t pages_alloc;
/* non zero pages */
@@ -71,15 +106,27 @@ typedef struct {
} __attribute__((packed)) MultiFDPacket_t;
typedef struct {
+ MultiFDPacketHdr_t hdr;
+
+ char idstr[256];
+ uint32_t instance_id;
+
+ /* size of the next packet that contains the actual data */
+ uint32_t next_packet_size;
+} __attribute__((packed)) MultiFDPacketDeviceState_t;
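
On the wire a device state payload therefore arrives as two reads: this fixed-size struct, then next_packet_size opaque bytes. A hedged sender-side sketch, assuming the same big-endian convention the RAM packet uses (packet, idstr, instance_id, buf and len are illustrative):

    packet->hdr.flags = cpu_to_be32(MULTIFD_FLAG_DEVICE_STATE);
    /* the receiver rejects an unterminated idstr, as seen earlier */
    strncpy(packet->idstr, idstr, sizeof(packet->idstr));
    packet->instance_id = cpu_to_be32(instance_id);
    packet->next_packet_size = cpu_to_be32(len);
    /* iov[0] = the header struct above, iov[1] = { buf, len } */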
+
+typedef struct {
/* number of used pages */
uint32_t num;
/* number of normal pages */
uint32_t normal_num;
- /* number of allocated pages */
- uint32_t allocated;
- /* offset of each page */
- ram_addr_t *offset;
+ /*
+     * Pointer to the ramblock. NOTE: it's the caller's responsibility to
+     * make sure the pointer is always valid!
+ */
RAMBlock *block;
+ /* offset array of each page, managed by multifd */
+ ram_addr_t *offset;
} MultiFDPages_t;
struct MultiFDRecvData {
@@ -90,6 +137,48 @@ struct MultiFDRecvData {
};
typedef struct {
+ char *idstr;
+ uint32_t instance_id;
+ char *buf;
+ size_t buf_len;
+} MultiFDDeviceState_t;
+
+typedef enum {
+ MULTIFD_PAYLOAD_NONE,
+ MULTIFD_PAYLOAD_RAM,
+ MULTIFD_PAYLOAD_DEVICE_STATE,
+} MultiFDPayloadType;
+
+typedef struct MultiFDPayload {
+ MultiFDPages_t ram;
+ MultiFDDeviceState_t device_state;
+} MultiFDPayload;
+
+struct MultiFDSendData {
+ MultiFDPayloadType type;
+ MultiFDPayload u;
+};
+
+static inline bool multifd_payload_empty(MultiFDSendData *data)
+{
+ return data->type == MULTIFD_PAYLOAD_NONE;
+}
+
+static inline bool multifd_payload_device_state(MultiFDSendData *data)
+{
+ return data->type == MULTIFD_PAYLOAD_DEVICE_STATE;
+}
+
+static inline void multifd_set_payload_type(MultiFDSendData *data,
+ MultiFDPayloadType type)
+{
+ assert(multifd_payload_empty(data));
+ assert(type != MULTIFD_PAYLOAD_NONE);
+
+ data->type = type;
+}
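
Taken together with the alloc/clear/free declarations further below, the intended lifecycle of a send slot is roughly as follows (a sketch; multifd_send_thread() above relies on clear returning the slot to MULTIFD_PAYLOAD_NONE):

    MultiFDSendData *d = multifd_send_data_alloc();

    assert(multifd_payload_empty(d));
    multifd_set_payload_type(d, MULTIFD_PAYLOAD_RAM);
    /* ... fill d->u.ram ... */

    multifd_send_data_clear(d);   /* back to MULTIFD_PAYLOAD_NONE */
    multifd_send_data_free(d);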
+
+typedef struct {
/* Fields are only written at creating/deletion time */
/* No lock required for them, they are read only */
@@ -106,10 +195,6 @@ typedef struct {
QIOChannel *c;
/* packet allocated len */
uint32_t packet_len;
- /* guest page size */
- uint32_t page_size;
- /* number of pages in a full packet */
- uint32_t page_count;
/* multifd flags for sending ram */
int write_flags;
@@ -121,7 +206,7 @@ typedef struct {
/* multifd flags for each packet */
uint32_t flags;
/*
- * The sender thread has work to do if either of below boolean is set.
+     * The sender thread has work to do if either of the fields below is set.
*
* @pending_job: a job is pending
* @pending_sync: a sync request is pending
@@ -130,26 +215,19 @@ typedef struct {
* cleared by the multifd sender threads.
*/
bool pending_job;
- bool pending_sync;
- /* array of pages to sent.
- * The owner of 'pages' depends of 'pending_job' value:
- * pending_job == 0 -> migration_thread can use it.
- * pending_job != 0 -> multifd_channel can use it.
- */
- MultiFDPages_t *pages;
+ MultiFDSyncReq pending_sync;
+
+ MultiFDSendData *data;
/* thread local variables. No locking required */
- /* pointer to the packet */
+ /* pointers to the possible packet types */
MultiFDPacket_t *packet;
+ MultiFDPacketDeviceState_t *packet_device_state;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
/* packets sent through this channel */
uint64_t packets_sent;
- /* non zero pages sent through this channel */
- uint64_t total_normal_pages;
- /* zero pages sent through this channel */
- uint64_t total_zero_pages;
/* buffers to send */
struct iovec *iov;
/* number of iovs used */
@@ -173,10 +251,6 @@ typedef struct {
QIOChannel *c;
/* packet allocated len */
uint32_t packet_len;
- /* guest page size */
- uint32_t page_size;
- /* number of pages in a full packet */
- uint32_t page_count;
/* syncs main thread and channels */
QemuSemaphore sem_sync;
@@ -196,8 +270,9 @@ typedef struct {
/* thread local variables. No locking required */
- /* pointer to the packet */
+ /* pointers to the possible packet types */
MultiFDPacket_t *packet;
+ MultiFDPacketDeviceState_t *packet_dev_state;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
/* packets received through this channel */
@@ -206,10 +281,6 @@ typedef struct {
RAMBlock *block;
/* ramblock host address */
uint8_t *host;
- /* non zero pages recv through this channel */
- uint64_t total_normal_pages;
- /* zero pages recv through this channel */
- uint64_t total_zero_pages;
/* buffers to recv */
struct iovec *iov;
/* Pages that are not zero */
@@ -222,36 +293,126 @@ typedef struct {
uint32_t zero_num;
/* used for de-compression methods */
void *compress_data;
+ /* Flags for the QIOChannel */
+ int read_flags;
} MultiFDRecvParams;
typedef struct {
- /* Setup for sending side */
+ /*
+     * The send_setup, send_cleanup and send_prepare hooks are only called
+     * on the QEMU instance at the migration source.
+ */
+
+ /*
+ * Setup for sending side. Called once per channel during channel
+ * setup phase.
+ *
+ * Must allocate p->iov. If packets are in use (default), one
+ * extra iovec must be allocated for the packet header. Any memory
+ * allocated in this hook must be released at send_cleanup.
+ *
+ * p->write_flags may be used for passing flags to the QIOChannel.
+ *
+     * p->compress_data may be used by compression methods to store
+ * compression data.
+ */
int (*send_setup)(MultiFDSendParams *p, Error **errp);
- /* Cleanup for sending side */
+
+ /*
+ * Cleanup for sending side. Called once per channel during
+ * channel cleanup phase.
+ */
void (*send_cleanup)(MultiFDSendParams *p, Error **errp);
- /* Prepare the send packet */
+
+ /*
+ * Prepare the send packet. Called as a result of multifd_send()
+ * on the client side, with p pointing to the MultiFDSendParams of
+ * a channel that is currently idle.
+ *
+ * Must populate p->iov with the data to be sent, increment
+ * p->iovs_num to match the amount of iovecs used and set
+ * p->next_packet_size with the amount of data currently present
+ * in p->iov.
+ *
+ * Must indicate whether this is a compression packet by setting
+ * p->flags.
+ *
+ * As a last step, if packets are in use (default), must prepare
+ * the packet by calling multifd_send_fill_packet().
+ */
int (*send_prepare)(MultiFDSendParams *p, Error **errp);
- /* Setup for receiving side */
+
+ /*
+     * The recv_setup, recv_cleanup and recv hooks are only called on the
+     * QEMU instance at the migration destination.
+ */
+
+ /*
+ * Setup for receiving side. Called once per channel during
+ * channel setup phase. May be empty.
+ *
+     * May allocate data structures for receiving data. May use
+ * p->iov. Compression methods may use p->compress_data.
+ */
int (*recv_setup)(MultiFDRecvParams *p, Error **errp);
- /* Cleanup for receiving side */
+
+ /*
+ * Cleanup for receiving side. Called once per channel during
+ * channel cleanup phase. May be empty.
+ */
void (*recv_cleanup)(MultiFDRecvParams *p);
- /* Read all data */
+
+ /*
+ * Data receive method. Called as a result of multifd_recv() on
+ * the client side, with p pointing to the MultiFDRecvParams of a
+ * channel that is currently idle. Only called if there is data
+ * available to receive.
+ *
+ * Must validate p->flags according to what was set at
+ * send_prepare.
+ *
+ * Must read the data from the QIOChannel p->c.
+ */
int (*recv)(MultiFDRecvParams *p, Error **errp);
} MultiFDMethods;
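
Given the expanded contract above, a hedged skeleton of a method set (every example_* name is illustrative; whether the data iovecs start at index 0 or 1 depends on packet use, per the send_setup comment):

    static int example_send_setup(MultiFDSendParams *p, Error **errp)
    {
        /* one extra iovec for the packet header when packets are in use */
        p->iov = g_new0(struct iovec, multifd_ram_page_count() + 1);
        return 0;
    }

    static void example_send_cleanup(MultiFDSendParams *p, Error **errp)
    {
        g_clear_pointer(&p->iov, g_free);   /* channel cleanup asserts !p->iov */
    }

    static int example_send_prepare(MultiFDSendParams *p, Error **errp)
    {
        if (!multifd_send_prepare_common(p)) {
            return 0;   /* every page was zero; nothing to transmit */
        }
        /*
         * ... append data iovecs, bump p->iovs_num, set
         * p->next_packet_size and the method's MULTIFD_FLAG_* bit ...
         */
        multifd_send_fill_packet(p);
        return 0;
    }

    static const MultiFDMethods example_ops = {
        .send_setup = example_send_setup,
        .send_cleanup = example_send_cleanup,
        .send_prepare = example_send_prepare,
        /* .recv_setup, .recv_cleanup and .recv elided */
    };
    /* registered once at startup, e.g.
     * multifd_register_ops(MULTIFD_COMPRESSION_ZLIB, &example_ops); */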
-void multifd_register_ops(int method, MultiFDMethods *ops);
+void multifd_register_ops(int method, const MultiFDMethods *ops);
void multifd_send_fill_packet(MultiFDSendParams *p);
bool multifd_send_prepare_common(MultiFDSendParams *p);
void multifd_send_zero_page_detect(MultiFDSendParams *p);
void multifd_recv_zero_page_process(MultiFDRecvParams *p);
-static inline void multifd_send_prepare_header(MultiFDSendParams *p)
+void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
+bool multifd_send(MultiFDSendData **send_data);
+MultiFDSendData *multifd_send_data_alloc(void);
+void multifd_send_data_clear(MultiFDSendData *data);
+void multifd_send_data_free(MultiFDSendData *data);
+
+static inline uint32_t multifd_ram_page_size(void)
{
- p->iov[0].iov_len = p->packet_len;
- p->iov[0].iov_base = p->packet;
- p->iovs_num++;
+ return qemu_target_page_size();
}
-void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
+static inline uint32_t multifd_ram_page_count(void)
+{
+ return MULTIFD_PACKET_SIZE / qemu_target_page_size();
+}
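
Concretely, with the usual 4 KiB target page size this gives 512 KiB / 4 KiB = 128 pages per packet; centralizing these two helpers is what allows the per-channel page_size and page_count copies to be dropped from both params structs above.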
+
+void multifd_ram_save_setup(void);
+void multifd_ram_save_cleanup(void);
+int multifd_ram_flush_and_sync(QEMUFile *f);
+bool multifd_ram_sync_per_round(void);
+bool multifd_ram_sync_per_section(void);
+void multifd_ram_payload_alloc(MultiFDPages_t *pages);
+void multifd_ram_payload_free(MultiFDPages_t *pages);
+void multifd_ram_fill_packet(MultiFDSendParams *p);
+int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
+
+void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state);
+
+void multifd_device_state_send_setup(void);
+void multifd_device_state_send_cleanup(void);
+
+void multifd_device_state_send_prepare(MultiFDSendParams *p);
#endif
diff --git a/migration/options.c b/migration/options.c
index 645f550..162c72c 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -19,16 +19,17 @@
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-visit-migration.h"
#include "qapi/qmp/qerror.h"
-#include "qapi/qmp/qnull.h"
-#include "sysemu/runstate.h"
+#include "qobject/qnull.h"
+#include "system/runstate.h"
#include "migration/colo.h"
+#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
#include "qemu-file.h"
#include "ram.h"
#include "options.h"
-#include "sysemu/kvm.h"
+#include "system/kvm.h"
/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
@@ -55,6 +56,13 @@
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
+/*
+ * 1: best speed, ... 9: best compress ratio
+ * There is some nuance here. Refer to QATzip documentation to understand
+ * the mapping of QATzip levels to standard deflate levels.
+ */
+#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1
+
/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1
@@ -78,19 +86,23 @@
#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000 /* milliseconds */
#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT 1 /* MB/s */
-Property migration_properties[] = {
+const Property migration_properties[] = {
DEFINE_PROP_BOOL("store-global-state", MigrationState,
store_global_state, true),
DEFINE_PROP_BOOL("send-configuration", MigrationState,
send_configuration, true),
DEFINE_PROP_BOOL("send-section-footer", MigrationState,
send_section_footer, true),
+ DEFINE_PROP_BOOL("send-switchover-start", MigrationState,
+ send_switchover_start, true),
DEFINE_PROP_BOOL("multifd-flush-after-each-section", MigrationState,
multifd_flush_after_each_section, false),
DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
DEFINE_PROP_BOOL("x-preempt-pre-7-2", MigrationState,
preempt_pre_7_2, false),
+ DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState,
+ multifd_clean_tls_termination, true),
/* Migration parameters */
DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
@@ -123,6 +135,9 @@ Property migration_properties[] = {
DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
parameters.multifd_zlib_level,
DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
+ DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState,
+ parameters.multifd_qatzip_level,
+ DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL),
DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
parameters.multifd_zstd_level,
DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
@@ -186,8 +201,8 @@ Property migration_properties[] = {
MIGRATION_CAPABILITY_SWITCHOVER_ACK),
DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT),
DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM),
- DEFINE_PROP_END_OF_LIST(),
};
+const size_t migration_properties_count = ARRAY_SIZE(migration_properties);
bool migrate_auto_converge(void)
{
@@ -196,6 +211,13 @@ bool migrate_auto_converge(void)
return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
}
+bool migrate_send_switchover_start(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ return s->send_switchover_start;
+}
+
bool migrate_background_snapshot(void)
{
MigrationState *s = migrate_get_current();
@@ -329,13 +351,6 @@ bool migrate_xbzrle(void)
return s->capabilities[MIGRATION_CAPABILITY_XBZRLE];
}
-bool migrate_zero_blocks(void)
-{
- MigrationState *s = migrate_get_current();
-
- return s->capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
-}
-
bool migrate_zero_copy_send(void)
{
MigrationState *s = migrate_get_current();
@@ -433,6 +448,24 @@ static bool migrate_incoming_started(void)
return !!migration_incoming_get_current()->transport_data;
}
+bool migrate_rdma_caps_check(bool *caps, Error **errp)
+{
+ if (caps[MIGRATION_CAPABILITY_XBZRLE]) {
+ error_setg(errp, "RDMA and XBZRLE can't be used together");
+ return false;
+ }
+ if (caps[MIGRATION_CAPABILITY_MULTIFD]) {
+ error_setg(errp, "RDMA and multifd can't be used together");
+ return false;
+ }
+ if (caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
+ error_setg(errp, "RDMA and postcopy-ram can't be used together");
+ return false;
+ }
+
+ return true;
+}
+
/**
* @migration_caps_check - check capability compatibility
*
@@ -447,6 +480,10 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
ERRP_GUARD();
MigrationIncomingState *mis = migration_incoming_get_current();
+ if (new_caps[MIGRATION_CAPABILITY_ZERO_BLOCKS]) {
+ warn_report("zero-blocks capability is deprecated");
+ }
+
#ifndef CONFIG_REPLICATION
if (new_caps[MIGRATION_CAPABILITY_X_COLO]) {
error_setg(errp, "QEMU compiled without replication module"
@@ -472,11 +509,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
error_setg(errp, "Postcopy is not compatible with ignore-shared");
return false;
}
-
- if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
- error_setg(errp, "Postcopy is not yet compatible with multifd");
- return false;
- }
}
if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
@@ -536,7 +568,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
return false;
}
- if (migrate_incoming_started()) {
+ if (!migrate_postcopy_preempt() && migrate_incoming_started()) {
error_setg(errp,
"Postcopy preempt must be set before incoming starts");
return false;
@@ -544,7 +576,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
}
if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
- if (migrate_incoming_started()) {
+ if (!migrate_multifd() && migrate_incoming_started()) {
error_setg(errp, "Multifd must be set before incoming starts");
return false;
}
@@ -592,26 +624,13 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
}
}
- return true;
-}
-
-bool migrate_cap_set(int cap, bool value, Error **errp)
-{
- MigrationState *s = migrate_get_current();
- bool new_caps[MIGRATION_CAPABILITY__MAX];
-
- if (migration_is_running()) {
- error_setg(errp, "There's a migration process in progress");
- return false;
- }
-
- memcpy(new_caps, s->capabilities, sizeof(new_caps));
- new_caps[cap] = value;
-
- if (!migrate_caps_check(s->capabilities, new_caps, errp)) {
+ /*
+     * On the destination side, check the cases where a capability is set
+     * after the incoming thread has started.
+ */
+ if (migrate_rdma() && !migrate_rdma_caps_check(new_caps, errp)) {
return false;
}
- s->capabilities[cap] = value;
return true;
}
@@ -758,8 +777,11 @@ uint64_t migrate_max_postcopy_bandwidth(void)
MigMode migrate_mode(void)
{
- MigrationState *s = migrate_get_current();
- MigMode mode = s->parameters.mode;
+ MigMode mode = cpr_get_incoming_mode();
+
+ if (mode == MIG_MODE_NONE) {
+ mode = migrate_get_current()->parameters.mode;
+ }
assert(mode >= 0 && mode < MIG_MODE__MAX);
return mode;
@@ -787,6 +809,13 @@ int migrate_multifd_zlib_level(void)
return s->parameters.multifd_zlib_level;
}
+int migrate_multifd_qatzip_level(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ return s->parameters.multifd_qatzip_level;
+}
+
int migrate_multifd_zstd_level(void)
{
MigrationState *s = migrate_get_current();
@@ -892,6 +921,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
params->multifd_compression = s->parameters.multifd_compression;
params->has_multifd_zlib_level = true;
params->multifd_zlib_level = s->parameters.multifd_zlib_level;
+ params->has_multifd_qatzip_level = true;
+ params->multifd_qatzip_level = s->parameters.multifd_qatzip_level;
params->has_multifd_zstd_level = true;
params->multifd_zstd_level = s->parameters.multifd_zstd_level;
params->has_xbzrle_cache_size = true;
@@ -946,6 +977,7 @@ void migrate_params_init(MigrationParameters *params)
params->has_multifd_channels = true;
params->has_multifd_compression = true;
params->has_multifd_zlib_level = true;
+ params->has_multifd_qatzip_level = true;
params->has_multifd_zstd_level = true;
params->has_xbzrle_cache_size = true;
params->has_max_postcopy_bandwidth = true;
@@ -1038,6 +1070,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp)
return false;
}
+ if (params->has_multifd_qatzip_level &&
+ ((params->multifd_qatzip_level > 9) ||
+ (params->multifd_qatzip_level < 1))) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level",
+ "a value between 1 and 9");
+ return false;
+ }
+
if (params->has_multifd_zstd_level &&
(params->multifd_zstd_level > 20)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
@@ -1173,6 +1213,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
dest->tls_hostname = params->tls_hostname->u.s;
}
+ if (params->tls_authz) {
+ assert(params->tls_authz->type == QTYPE_QSTRING);
+ dest->tls_authz = params->tls_authz->u.s;
+ }
+
if (params->has_max_bandwidth) {
dest->max_bandwidth = params->max_bandwidth;
}
@@ -1195,6 +1240,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
if (params->has_multifd_compression) {
dest->multifd_compression = params->multifd_compression;
}
+ if (params->has_multifd_qatzip_level) {
+ dest->multifd_qatzip_level = params->multifd_qatzip_level;
+ }
if (params->has_multifd_zlib_level) {
dest->multifd_zlib_level = params->multifd_zlib_level;
}
@@ -1315,6 +1363,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
if (params->has_multifd_compression) {
s->parameters.multifd_compression = params->multifd_compression;
}
+ if (params->has_multifd_qatzip_level) {
+ s->parameters.multifd_qatzip_level = params->multifd_qatzip_level;
+ }
if (params->has_multifd_zlib_level) {
s->parameters.multifd_zlib_level = params->multifd_zlib_level;
}
diff --git a/migration/options.h b/migration/options.h
index a239702..82d8397 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -20,7 +20,8 @@
/* migration properties */
-extern Property migration_properties[];
+extern const Property migration_properties[];
+extern const size_t migration_properties_count;
/* capabilities */
@@ -40,7 +41,6 @@ bool migrate_release_ram(void);
bool migrate_return_path(void);
bool migrate_validate_uuid(void);
bool migrate_xbzrle(void);
-bool migrate_zero_blocks(void);
bool migrate_zero_copy_send(void);
/*
@@ -57,8 +57,8 @@ bool migrate_tls(void);
/* capabilities helpers */
+bool migrate_rdma_caps_check(bool *caps, Error **errp);
bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp);
-bool migrate_cap_set(int cap, bool value, Error **errp);
/* parameters */
@@ -78,6 +78,7 @@ uint64_t migrate_max_postcopy_bandwidth(void);
int migrate_multifd_channels(void);
MultiFDCompression migrate_multifd_compression(void);
int migrate_multifd_zlib_level(void);
+int migrate_multifd_qatzip_level(void);
int migrate_multifd_zstd_level(void);
uint8_t migrate_throttle_trigger_threshold(void);
const char *migrate_tls_authz(void);
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 1c374b7..75fd310 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -27,11 +27,11 @@
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
-#include "sysemu/sysemu.h"
+#include "system/system.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "socket.h"
#include "yank_functions.h"
#include "tls.h"
@@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis,
QemuThread *thread, const char *name,
void *(*fn)(void *), int joinable)
{
- qemu_sem_init(&mis->thread_sync_sem, 0);
+ qemu_event_init(&mis->thread_sync_event, false);
qemu_thread_create(thread, name, fn, mis, joinable);
- qemu_sem_wait(&mis->thread_sync_sem);
- qemu_sem_destroy(&mis->thread_sync_sem);
+ qemu_event_wait(&mis->thread_sync_event);
+ qemu_event_destroy(&mis->thread_sync_event);
}
/* Postcopy needs to detect accesses to pages that haven't yet been copied
@@ -651,8 +651,8 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
mis->have_fault_thread = false;
}
- if (enable_mlock) {
- if (os_mlock() < 0) {
+ if (should_mlock(mlock_state)) {
+ if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) {
error_report("mlock: %s", strerror(errno));
/*
* It doesn't feel right to fail at this point, we have a valid
@@ -746,18 +746,10 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
RAMBlock *rb)
{
size_t pagesize = qemu_ram_pagesize(rb);
- struct uffdio_range range;
- int ret;
trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
- range.start = ROUND_DOWN(client_addr, pagesize);
- range.len = pagesize;
- ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
- if (ret) {
- error_report("%s: Failed to wake: %zx in %s (%s)",
- __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
- strerror(errno));
- }
- return ret;
+ return uffd_wakeup(pcfd->fd,
+ (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize),
+ pagesize);
}
static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
@@ -972,7 +964,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
trace_postcopy_ram_fault_thread_entry();
rcu_register_thread();
mis->last_rb = NULL; /* last RAMBlock we sent part of */
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
struct pollfd *pfd;
size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
@@ -1238,7 +1230,8 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
return -1;
}
- postcopy_thread_create(mis, &mis->fault_thread, "mig/dst/fault",
+ postcopy_thread_create(mis, &mis->fault_thread,
+ MIGRATION_THREAD_DST_FAULT,
postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
mis->have_fault_thread = true;
@@ -1258,7 +1251,8 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
* This thread needs to be created after the temp pages because
* it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
*/
- postcopy_thread_create(mis, &mis->postcopy_prio_thread, "mig/dst/preempt",
+ postcopy_thread_create(mis, &mis->postcopy_prio_thread,
+ MIGRATION_THREAD_DST_PREEMPT,
postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
}
@@ -1275,18 +1269,10 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
int ret;
if (from_addr) {
- struct uffdio_copy copy_struct;
- copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
- copy_struct.src = (uint64_t)(uintptr_t)from_addr;
- copy_struct.len = pagesize;
- copy_struct.mode = 0;
- ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
+ ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize,
+ false);
} else {
- struct uffdio_zeropage zero_struct;
- zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
- zero_struct.range.len = pagesize;
- zero_struct.mode = 0;
- ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
+ ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false);
}
if (!ret) {
qemu_mutex_lock(&mis->page_request_mutex);
@@ -1343,18 +1329,16 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
RAMBlock *rb)
{
size_t pagesize = qemu_ram_pagesize(rb);
+ int e;
/* copy also acks to the kernel waking the stalled thread up
* TODO: We can inhibit that ack and only do it if it was requested
* which would be slightly cheaper, but we'd have to be careful
* of the order of updating our page state.
*/
- if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
- int e = errno;
- error_report("%s: %s copy host: %p from: %p (size: %zd)",
- __func__, strerror(e), host, from, pagesize);
-
- return -e;
+ e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb);
+ if (e) {
+ return e;
}
trace_postcopy_place_page(host);
@@ -1376,12 +1360,10 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
* but it's not available for everything (e.g. hugetlbpages)
*/
if (qemu_ram_is_uf_zeroable(rb)) {
- if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
- int e = errno;
- error_report("%s: %s zero host: %p",
- __func__, strerror(e), host);
-
- return -e;
+ int e;
+ e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb);
+ if (e) {
+ return e;
}
return postcopy_notify_shared_wake(rb,
qemu_ram_block_host_offset(rb,
@@ -1411,49 +1393,42 @@ int postcopy_ram_incoming_init(MigrationIncomingState *mis)
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
uint64_t client_addr, uint64_t rb_offset)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
RAMBlock *rb)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
RAMBlock *rb)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
int postcopy_wake_shared(struct PostCopyFD *pcfd,
uint64_t client_addr,
RAMBlock *rb)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
#endif
@@ -1741,7 +1716,7 @@ void *postcopy_preempt_thread(void *opaque)
rcu_register_thread();
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
/*
 * The preempt channel is established in an asynchronous way. Wait
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index b6d2f58..b6ac190 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -37,6 +37,11 @@
#define IO_BUF_SIZE 32768
#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
+typedef struct FdEntry {
+ QTAILQ_ENTRY(FdEntry) entry;
+ int fd;
+} FdEntry;
+
struct QEMUFile {
QIOChannel *ioc;
bool is_writable;
@@ -51,6 +56,9 @@ struct QEMUFile {
int last_error;
Error *last_error_obj;
+
+ bool can_pass_fd;
+ QTAILQ_HEAD(, FdEntry) fds;
};
/*
@@ -109,6 +117,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
object_ref(ioc);
f->ioc = ioc;
f->is_writable = is_writable;
+ f->can_pass_fd = qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS);
+ QTAILQ_INIT(&f->fds);
return f;
}
@@ -310,6 +320,10 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
int len;
int pending;
Error *local_error = NULL;
+ g_autofree int *fds = NULL;
+ size_t nfd = 0;
+ int **pfds = f->can_pass_fd ? &fds : NULL;
+ size_t *pnfd = f->can_pass_fd ? &nfd : NULL;
assert(!qemu_file_is_writable(f));
@@ -325,10 +339,9 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
}
do {
- len = qio_channel_read(f->ioc,
- (char *)f->buf + pending,
- IO_BUF_SIZE - pending,
- &local_error);
+ struct iovec iov = { f->buf + pending, IO_BUF_SIZE - pending };
+ len = qio_channel_readv_full(f->ioc, &iov, 1, pfds, pnfd, 0,
+ &local_error);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(f->ioc, G_IO_IN);
@@ -348,9 +361,66 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
qemu_file_set_error_obj(f, len, local_error);
}
+ for (int i = 0; i < nfd; i++) {
+ FdEntry *fde = g_new0(FdEntry, 1);
+ fde->fd = fds[i];
+ QTAILQ_INSERT_TAIL(&f->fds, fde, entry);
+ }
+
return len;
}
+int qemu_file_put_fd(QEMUFile *f, int fd)
+{
+ int ret = 0;
+ QIOChannel *ioc = qemu_file_get_ioc(f);
+ Error *err = NULL;
+ struct iovec iov = { (void *)" ", 1 };
+
+ /*
+ * Send a dummy byte so qemu_fill_buffer on the receiving side does not
+ * fail with a len=0 error. Flush first to maintain ordering wrt other
+ * data.
+ */
+
+ qemu_fflush(f);
+ if (qio_channel_writev_full(ioc, &iov, 1, &fd, 1, 0, &err) < 1) {
+ error_report_err(error_copy(err));
+ qemu_file_set_error_obj(f, -EIO, err);
+ ret = -1;
+ }
+ trace_qemu_file_put_fd(f->ioc->name, fd, ret);
+ return ret;
+}
+
+int qemu_file_get_fd(QEMUFile *f)
+{
+ int fd = -1;
+ FdEntry *fde;
+
+ if (!f->can_pass_fd) {
+ Error *err = NULL;
+ error_setg(&err, "%s does not support fd passing", f->ioc->name);
+ error_report_err(error_copy(err));
+ qemu_file_set_error_obj(f, -EIO, err);
+ goto out;
+ }
+
+ /* Force the dummy byte and its fd passenger to appear. */
+ qemu_peek_byte(f, 0);
+
+ fde = QTAILQ_FIRST(&f->fds);
+ if (fde) {
+ qemu_get_byte(f); /* Drop the dummy byte */
+ fd = fde->fd;
+ QTAILQ_REMOVE(&f->fds, fde, entry);
+ g_free(fde);
+ }
+out:
+ trace_qemu_file_get_fd(f->ioc->name, fd);
+ return fd;
+}
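
A hedged usage sketch of the new pair (f and fd are illustrative; the two sides must make the calls at matching points in the stream, since each descriptor rides on its own dummy byte):

    /* source side: send a descriptor in-band, ordered with the stream */
    if (qemu_file_put_fd(f, fd) < 0) {
        return -1;
    }

    /* destination side: claims the fd and consumes the dummy byte */
    int fd = qemu_file_get_fd(f);
    if (fd < 0) {
        return -1;   /* nothing queued, or the channel can't pass fds */
    }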
+
/** Closes the file
*
* Returns negative error value if any error happened on previous operations or
@@ -361,11 +431,17 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
*/
int qemu_fclose(QEMUFile *f)
{
+ FdEntry *fde, *next;
int ret = qemu_fflush(f);
int ret2 = qio_channel_close(f->ioc, NULL);
if (ret >= 0) {
ret = ret2;
}
+ QTAILQ_FOREACH_SAFE(fde, &f->fds, entry, next) {
+ warn_report("qemu_fclose: received fd %d was never claimed", fde->fd);
+ close(fde->fd);
+ g_free(fde);
+ }
g_clear_pointer(&f->ioc, object_unref);
error_free(f->last_error_obj);
g_free(f);
@@ -485,8 +561,6 @@ void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
}
stat64_add(&mig_stats.qemu_file_transferred, buflen);
-
- return;
}
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 11c2120..f5b9f43 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -33,6 +33,8 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc);
QEMUFile *qemu_file_new_output(QIOChannel *ioc);
int qemu_fclose(QEMUFile *f);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(QEMUFile, qemu_fclose)
+
/*
* qemu_file_transferred:
*
@@ -79,5 +81,7 @@ size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
off_t pos);
QIOChannel *qemu_file_get_ioc(QEMUFile *file);
+int qemu_file_put_fd(QEMUFile *f, int fd);
+int qemu_file_get_fd(QEMUFile *f);
#endif
diff --git a/migration/ram.c b/migration/ram.c
index edec1a2..2140785 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -48,19 +48,19 @@
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
-#include "exec/ram_addr.h"
+#include "system/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
-#include "sysemu/cpu-throttle.h"
+#include "system/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
-#include "sysemu/runstate.h"
+#include "system/runstate.h"
#include "rdma.h"
#include "options.h"
-#include "sysemu/dirtylimit.h"
-#include "sysemu/kvm.h"
+#include "system/dirtylimit.h"
+#include "system/kvm.h"
#include "hw/boards.h" /* for machine_dump_guest_core() */
@@ -72,27 +72,6 @@
/* ram save/restore */
/*
- * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
- * worked for pages that were filled with the same char. We switched
- * it to only search for the zero value. And to avoid confusion with
- * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
- *
- * RAM_SAVE_FLAG_FULL was obsoleted in 2009.
- *
- * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1.
- */
-#define RAM_SAVE_FLAG_FULL 0x01
-#define RAM_SAVE_FLAG_ZERO 0x02
-#define RAM_SAVE_FLAG_MEM_SIZE 0x04
-#define RAM_SAVE_FLAG_PAGE 0x08
-#define RAM_SAVE_FLAG_EOS 0x10
-#define RAM_SAVE_FLAG_CONTINUE 0x20
-#define RAM_SAVE_FLAG_XBZRLE 0x40
-/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */
-#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
-/* We can't use any flag that is bigger than 0x200 */
-
-/*
* mapped-ram migration supports O_DIRECT, so we need to make sure the
* userspace buffer, the IO operation size and the file offset are
* aligned according to the underlying device's block size. The first
@@ -112,6 +91,36 @@
XBZRLECacheStats xbzrle_counters;
+/*
+ * This structure describes a specific location of a guest page. In QEMU,
+ * it's described in a tuple of (ramblock, offset).
+ */
+struct PageLocation {
+ RAMBlock *block;
+ unsigned long offset;
+};
+typedef struct PageLocation PageLocation;
+
+/**
+ * PageLocationHint: describes a hint to a page location
+ *
+ * @valid: set if the hint is valid and should be consumed
+ * @location: the hint content
+ *
+ * In postcopy preempt mode, the urgent channel may provide hints to the
+ * background channel, so that the source QEMU can try to migrate whatever
+ * is right after the requested urgent pages.
+ *
+ * This is based on the assumption that the VM (already running on the
+ * destination side) tends to access the memory with spatial locality.
+ * This is also the default behavior of vanilla postcopy (preempt off).
+ */
+struct PageLocationHint {
+ bool valid;
+ PageLocation location;
+};
+typedef struct PageLocationHint PageLocationHint;
+
/* used by the search for pages to send */
struct PageSearchStatus {
/* The migration channel used for a specific host page */
@@ -216,7 +225,9 @@ static bool postcopy_preempt_active(void)
bool migrate_ram_is_ignored(RAMBlock *block)
{
+ MigMode mode = migrate_mode();
return !qemu_ram_is_migratable(block) ||
+ mode == MIG_MODE_CPR_TRANSFER ||
(migrate_ignore_shared() && qemu_ram_is_shared(block)
&& qemu_ram_is_named_file(block));
}
@@ -414,6 +425,13 @@ struct RAMState {
* RAM migration.
*/
unsigned int postcopy_bmap_sync_requested;
+ /*
+ * Page hint during postcopy when preempt mode is on. Return path
+ * thread sets it, while background migration thread consumes it.
+ *
+ * Protected by @bitmap_mutex.
+ */
+ PageLocationHint page_hint;
};
typedef struct RAMState RAMState;
@@ -467,13 +485,6 @@ void ram_transferred_add(uint64_t bytes)
}
}
-struct MigrationOps {
- int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
-};
-typedef struct MigrationOps MigrationOps;
-
-MigrationOps *migration_ops;
-
static int ram_save_host_page_urgent(PageSearchStatus *pss);
/* NOTE: page is the PFN not real ram_addr_t. */
@@ -820,14 +831,22 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
bool ret;
/*
- * Clear dirty bitmap if needed. This _must_ be called before we
- * send any of the page in the chunk because we need to make sure
- * we can capture further page content changes when we sync dirty
- * log the next time. So as long as we are going to send any of
- * the page in the chunk we clear the remote dirty bitmap for all.
- * Clearing it earlier won't be a problem, but too late will.
+ * During the last stage (after source VM stopped), resetting the write
+ * protections isn't needed as we know there will be either (1) no
+ * further writes if migration will complete, or (2) migration fails
+ * at last then tracking isn't needed either.
*/
- migration_clear_memory_region_dirty_bitmap(rb, page);
+ if (!rs->last_stage) {
+ /*
+ * Clear dirty bitmap if needed. This _must_ be called before we
+ * send any of the page in the chunk because we need to make sure
+ * we can capture further page content changes when we sync dirty
+ * log the next time. So as long as we are going to send any of
+ * the page in the chunk we clear the remote dirty bitmap for all.
+ * Clearing it earlier won't be a problem, but too late will.
+ */
+ migration_clear_memory_region_dirty_bitmap(rb, page);
+ }
ret = test_and_clear_bit(page, rb->bmap);
if (ret) {
@@ -837,8 +856,8 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
return ret;
}
-static void dirty_bitmap_clear_section(MemoryRegionSection *section,
- void *opaque)
+static int dirty_bitmap_clear_section(MemoryRegionSection *section,
+ void *opaque)
{
const hwaddr offset = section->offset_within_region;
const hwaddr size = int128_get64(section->size);
@@ -857,6 +876,7 @@ static void dirty_bitmap_clear_section(MemoryRegionSection *section,
}
*cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
bitmap_clear(rb->bmap, start, npages);
+ return 0;
}
/*
@@ -1088,9 +1108,10 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage)
}
}
-static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
+void migration_bitmap_sync_precopy(bool last_stage)
{
Error *local_err = NULL;
+ assert(ram_state);
/*
* The current notifier usage is just an optimization to migration, so we
@@ -1101,7 +1122,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
local_err = NULL;
}
- migration_bitmap_sync(rs, last_stage);
+ migration_bitmap_sync(ram_state, last_stage);
if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
error_report_err(local_err);
@@ -1169,32 +1190,6 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
}
/*
- * @pages: the number of pages written by the control path,
- * < 0 - error
- * > 0 - number of pages written
- *
- * Return true if the pages has been saved, otherwise false is returned.
- */
-static bool control_save_page(PageSearchStatus *pss,
- ram_addr_t offset, int *pages)
-{
- int ret;
-
- ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
- TARGET_PAGE_SIZE);
- if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
- return false;
- }
-
- if (ret == RAM_SAVE_CONTROL_DELAYED) {
- *pages = 1;
- return true;
- }
- *pages = ret;
- return true;
-}
-
-/*
* directly send the page to the stream
*
* Returns the number of pages written.
@@ -1322,19 +1317,12 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
pss->page = 0;
pss->block = QLIST_NEXT_RCU(pss->block, next);
if (!pss->block) {
- if (migrate_multifd() &&
- (!migrate_multifd_flush_after_each_section() ||
- migrate_mapped_ram())) {
+ if (multifd_ram_sync_per_round()) {
QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
- int ret = multifd_send_sync_main();
+ int ret = multifd_ram_flush_and_sync(f);
if (ret < 0) {
return ret;
}
-
- if (!migrate_mapped_ram()) {
- qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
- qemu_fflush(f);
- }
}
/* Hit the end of the list */
@@ -1765,19 +1753,17 @@ bool ram_write_tracking_available(void)
bool ram_write_tracking_compatible(void)
{
- assert(0);
- return false;
+ g_assert_not_reached();
}
int ram_write_tracking_start(void)
{
- assert(0);
- return -1;
+ g_assert_not_reached();
}
void ram_write_tracking_stop(void)
{
- assert(0);
+ g_assert_not_reached();
}
#endif /* defined(__linux__) */
@@ -1795,7 +1781,7 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
RAMBlock *block;
ram_addr_t offset;
- bool dirty;
+ bool dirty = false;
do {
block = unqueue_page(rs, &offset);
@@ -1987,53 +1973,40 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
}
/**
- * ram_save_target_page_legacy: save one target page
- *
- * Returns the number of pages written
+ * ram_save_target_page: save one target page to the precopy thread
+ * OR to multifd workers.
*
* @rs: current RAM state
* @pss: data about the page we want to send
*/
-static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
+static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
{
ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
int res;
- if (control_save_page(pss, offset, &res)) {
- return res;
- }
+ /* Hand over to RDMA first */
+ if (migrate_rdma()) {
+ res = rdma_control_save_page(pss->pss_channel, pss->block->offset,
+ offset, TARGET_PAGE_SIZE);
- if (save_zero_page(rs, pss, offset)) {
- return 1;
+ if (res == RAM_SAVE_CONTROL_DELAYED) {
+ res = 1;
+ }
+ return res;
}
- return ram_save_page(rs, pss);
-}
-
-/**
- * ram_save_target_page_multifd: send one target page to multifd workers
- *
- * Returns 1 if the page was queued, -1 otherwise.
- *
- * @rs: current RAM state
- * @pss: data about the page we want to send
- */
-static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss)
-{
- RAMBlock *block = pss->block;
- ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
-
- /*
- * While using multifd live migration, we still need to handle zero
- * page checking on the migration main thread.
- */
- if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
+ if (!migrate_multifd()
+ || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
if (save_zero_page(rs, pss, offset)) {
return 1;
}
}
- return ram_save_multifd_page(block, offset);
+ if (migrate_multifd() && !migration_in_postcopy()) {
+ return ram_save_multifd_page(pss->block, offset);
+ }
+
+ return ram_save_page(rs, pss);
}
/* Should be called before sending a host page */
@@ -2091,6 +2064,21 @@ static void pss_host_page_finish(PageSearchStatus *pss)
pss->host_page_start = pss->host_page_end = 0;
}
+static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ /* If there's a pending hint not consumed, don't bother */
+ if (hint->valid) {
+ return;
+ }
+
+ /* Provide a hint to the background stream otherwise */
+ hint->location.block = pss->block;
+ hint->location.offset = pss->page;
+ hint->valid = true;
+}
+
/*
 * Send an urgent host page specified by `pss'. Needs to be called with
* bitmap_mutex held.
@@ -2122,7 +2110,7 @@ static int ram_save_host_page_urgent(PageSearchStatus *pss)
if (page_dirty) {
        /* Be strict about the return code; it must be 1 here */
- if (migration_ops->ram_save_target_page(rs, pss) != 1) {
+ if (ram_save_target_page(rs, pss) != 1) {
error_report_once("%s: ram_save_target_page failed", __func__);
ret = -1;
goto out;
@@ -2136,6 +2124,7 @@ out:
/* For urgent requests, flush immediately if sent */
if (sent) {
qemu_fflush(pss->pss_channel);
+ ram_page_hint_update(rs, pss);
}
return ret;
}
@@ -2191,7 +2180,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
if (preempt_active) {
qemu_mutex_unlock(&rs->bitmap_mutex);
}
- tmppages = migration_ops->ram_save_target_page(rs, pss);
+ tmppages = ram_save_target_page(rs, pss);
if (tmppages >= 0) {
pages += tmppages;
/*
@@ -2223,6 +2212,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return (res < 0 ? res : pages);
}
+static bool ram_page_hint_valid(RAMState *rs)
+{
+    /* There's only a page hint during postcopy preempt mode */
+ if (!postcopy_preempt_active()) {
+ return false;
+ }
+
+ return rs->page_hint.valid;
+}
+
+static void ram_page_hint_collect(RAMState *rs, RAMBlock **block,
+ unsigned long *page)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ assert(hint->valid);
+
+ *block = hint->location.block;
+ *page = hint->location.offset;
+
+ /* Mark the hint consumed */
+ hint->valid = false;
+}
+
/**
* ram_find_and_save_block: finds a dirty page and sends it to f
*
@@ -2239,6 +2252,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
static int ram_find_and_save_block(RAMState *rs)
{
PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
+ unsigned long next_page;
+ RAMBlock *next_block;
int pages = 0;
/* No dirty page as there is zero RAM */
@@ -2258,7 +2273,14 @@ static int ram_find_and_save_block(RAMState *rs)
rs->last_page = 0;
}
- pss_init(pss, rs->last_seen_block, rs->last_page);
+ if (ram_page_hint_valid(rs)) {
+ ram_page_hint_collect(rs, &next_block, &next_page);
+ } else {
+ next_block = rs->last_seen_block;
+ next_page = rs->last_page;
+ }
+
+ pss_init(pss, next_block, next_page);
    while (true) {
if (!get_queued_page(rs, pss)) {
@@ -2387,9 +2409,15 @@ static void ram_save_cleanup(void *opaque)
ram_bitmaps_destroy();
xbzrle_cleanup();
+ multifd_ram_save_cleanup();
ram_state_cleanup(rsp);
- g_free(migration_ops);
- migration_ops = NULL;
+}
+
+static void ram_page_hint_reset(PageLocationHint *hint)
+{
+ hint->location.block = NULL;
+ hint->location.offset = 0;
+ hint->valid = false;
}
static void ram_state_reset(RAMState *rs)
@@ -2404,6 +2432,8 @@ static void ram_state_reset(RAMState *rs)
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_started = false;
+
+ ram_page_hint_reset(&rs->page_hint);
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -2783,7 +2813,7 @@ static bool ram_init_bitmaps(RAMState *rs, Error **errp)
if (!ret) {
goto out_unlock;
}
- migration_bitmap_sync_precopy(rs, false);
+ migration_bitmap_sync_precopy(false);
}
}
out_unlock:
@@ -2860,7 +2890,7 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
size_t used_len, start, npages;
/* This function is currently expected to be used during live migration */
- if (!migration_is_setup_or_active()) {
+ if (!migration_is_running()) {
return;
}
@@ -3055,27 +3085,43 @@ static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp)
return ret;
}
- migration_ops = g_malloc0(sizeof(MigrationOps));
-
if (migrate_multifd()) {
- migration_ops->ram_save_target_page = ram_save_target_page_multifd;
- } else {
- migration_ops->ram_save_target_page = ram_save_target_page_legacy;
+ multifd_ram_save_setup();
}
+ /*
+     * This operation is unfortunate.
+     *
+     * For legacy QEMUs using per-section sync
+     * =======================================
+     *
+     * This must exist because the EOS below requires the SYNC messages
+     * per-channel to work.
+     *
+     * For modern QEMUs using per-round sync
+     * =====================================
+     *
+     * Logically such a sync is not needed, and recv threads should not
+     * run until setup is ready (using things like channels_ready on the
+     * src). Then we should be all fine.
+     *
+     * However, even if we add channels_ready to the recv side in new
+     * QEMUs, old QEMUs won't have it, so this sync is still needed to
+     * make sure multifd recv threads won't start processing guest pages
+     * early before ram_load_setup() is properly done.
+     *
+     * Let's stick with this. Fortunately, the overhead of syncing during
+     * setup is low because the VM is still running, so at least it's not
+     * accounted as part of downtime.
+ */
bql_unlock();
- ret = multifd_send_sync_main();
+ ret = multifd_ram_flush_and_sync(f);
bql_lock();
if (ret < 0) {
error_setg(errp, "%s: multifd synchronization failed", __func__);
return ret;
}
- if (migrate_multifd() && !migrate_multifd_flush_after_each_section()
- && !migrate_mapped_ram()) {
- qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
- }
-
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
ret = qemu_fflush(f);
if (ret < 0) {
@@ -3207,11 +3253,9 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
}
out:
- if (ret >= 0
- && migration_is_setup_or_active()) {
- if (migrate_multifd() && migrate_multifd_flush_after_each_section() &&
- !migrate_mapped_ram()) {
- ret = multifd_send_sync_main();
+ if (ret >= 0 && migration_is_running()) {
+ if (multifd_ram_sync_per_section()) {
+ ret = multifd_ram_flush_and_sync(f);
if (ret < 0) {
return ret;
}
@@ -3248,7 +3292,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
WITH_RCU_READ_LOCK_GUARD() {
if (!migration_in_postcopy()) {
- migration_bitmap_sync_precopy(rs, true);
+ migration_bitmap_sync_precopy(true);
}
ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
@@ -3283,9 +3327,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
}
}
- ret = multifd_send_sync_main();
- if (ret < 0) {
- return ret;
+ if (multifd_ram_sync_per_section()) {
+ /*
+ * Only the old dest QEMU will need this sync, because each EOS
+ * will require one SYNC message on each channel.
+ */
+ ret = multifd_ram_flush_and_sync(f);
+ if (ret < 0) {
+ return ret;
+ }
}
if (migrate_mapped_ram()) {
@@ -3330,7 +3380,7 @@ static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
if (!migration_in_postcopy()) {
bql_lock();
WITH_RCU_READ_LOCK_GUARD() {
- migration_bitmap_sync_precopy(rs, false);
+ migration_bitmap_sync_precopy(false);
}
bql_unlock();
}
@@ -3631,7 +3681,9 @@ static int ram_load_cleanup(void *opaque)
RAMBlock *rb;
RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
- qemu_ram_block_writeback(rb);
+ if (memory_region_is_nonvolatile(rb->mr)) {
+ qemu_ram_block_writeback(rb);
+ }
}
xbzrle_load_cleanup();
@@ -3796,15 +3848,7 @@ int ram_load_postcopy(QEMUFile *f, int channel)
TARGET_PAGE_SIZE);
}
break;
- case RAM_SAVE_FLAG_MULTIFD_FLUSH:
- multifd_recv_sync_main();
- break;
case RAM_SAVE_FLAG_EOS:
- /* normal exit */
- if (migrate_multifd() &&
- migrate_multifd_flush_after_each_section()) {
- multifd_recv_sync_main();
- }
break;
default:
error_report("Unknown combination of migration flags: 0x%x"
@@ -4004,8 +4048,6 @@ static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
/* Skip pages array */
qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
-
- return;
}
static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
@@ -4294,6 +4336,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
* it will be necessary to reduce the granularity of this
* critical section.
*/
+ trace_ram_load_start();
WITH_RCU_READ_LOCK_GUARD() {
if (postcopy_running) {
/*
@@ -4460,6 +4503,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
return 0;
}
+static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp)
+{
+ int ret;
+
+ if (migrate_multifd()) {
+ /*
+ * When multifd is enabled, source QEMU needs to make sure all the
+ * pages queued before postcopy starts have been flushed.
+ *
+ * The load of these pages must happen before switching to postcopy.
+     * This is because loading of guest pages in multifd recv threads
+     * is (so far) non-atomic, so the load cannot happen while vCPUs
+     * are running on the destination side.
+     *
+     * This flush and sync will guarantee that those pages are loaded
+     * _before_ postcopy starts on the destination. The rationale is
+     * that this happens before the VM stops (and before source QEMU
+     * sends all the rest of the postcopy messages). So when the destination QEMU
+ * receives the postcopy messages, it must have received the sync
+ * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH,
+ * or RAM_SAVE_FLAG_EOS), and such message would guarantee that
+ * all previous guest pages queued in the multifd channels are
+ * completely loaded.
+ */
+ ret = multifd_ram_flush_and_sync(f);
+ if (ret < 0) {
+ error_setg(errp, "%s: multifd flush and sync failed", __func__);
+ return false;
+ }
+ }
+
+ qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+ return true;
+}
+
void postcopy_preempt_shutdown_file(MigrationState *s)
{
qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
@@ -4479,6 +4558,7 @@ static SaveVMHandlers savevm_ram_handlers = {
.load_setup = ram_load_setup,
.load_cleanup = ram_load_cleanup,
.resume_prepare = ram_resume_prepare,
+ .save_postcopy_prepare = ram_save_postcopy_prepare,
};
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
@@ -4498,7 +4578,7 @@ static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
return;
}
- if (!migration_is_idle()) {
+ if (migration_is_running()) {
/*
* Precopy code on the source cannot deal with the size of RAM blocks
* changing at random points in time - especially after sending the
@@ -4506,8 +4586,10 @@ static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
* Abort and indicate a proper reason.
*/
error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
- migration_cancel(err);
+ migrate_set_error(migrate_get_current(), err);
error_free(err);
+
+ migration_cancel();
}
switch (ps) {
diff --git a/migration/ram.h b/migration/ram.h
index bc0318b..921c39a 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -33,6 +33,34 @@
#include "exec/cpu-common.h"
#include "io/channel.h"
+/*
+ * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
+ * worked for pages that were filled with the same char. We switched
+ * it to only search for the zero value, and renamed it to avoid
+ * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
+ *
+ * RAM_SAVE_FLAG_FULL (0x01) was obsoleted in 2009.
+ *
+ * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1.
+ *
+ * RAM_SAVE_FLAG_HOOK is only used in RDMA. Whenever this is found in the
+ * data stream, the flags will be passed to rdma functions on the
+ * incoming-migration side.
+ *
+ * We can't use any flag that is bigger than 0x200, because the flags are
+ * always assumed to be encoded in a ramblock address offset, which is a
+ * multiple of PAGE_SIZE. This means QEMU supports migration with any
+ * architecture that has PAGE_SIZE >= 1K (0x400).
+ */
+#define RAM_SAVE_FLAG_ZERO 0x002
+#define RAM_SAVE_FLAG_MEM_SIZE 0x004
+#define RAM_SAVE_FLAG_PAGE 0x008
+#define RAM_SAVE_FLAG_EOS 0x010
+#define RAM_SAVE_FLAG_CONTINUE 0x020
+#define RAM_SAVE_FLAG_XBZRLE 0x040
+#define RAM_SAVE_FLAG_HOOK 0x080
+#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
+
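Because every RAMBlock offset on the wire is page aligned, its low bits are guaranteed to be zero and are free to carry the flags above. A minimal standalone sketch of the encoding (not QEMU code; the 4 KiB page size is an assumption for illustration):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 0x1000           /* assumed 4 KiB target page */
    #define FLAG_MASK (PAGE_SIZE - 1)  /* low bits are free for flags */

    int main(void)
    {
        uint64_t offset = 42 * (uint64_t)PAGE_SIZE; /* page aligned */
        uint64_t flags = 0x010 | 0x020;             /* e.g. EOS | CONTINUE */
        uint64_t wire = offset | flags;             /* one value on the stream */

        /* The receiver splits the two halves back apart. */
        assert((wire & ~(uint64_t)FLAG_MASK) == offset);
        assert((wire & FLAG_MASK) == flags);
        printf("offset=0x%" PRIx64 " flags=0x%" PRIx64 "\n",
               wire & ~(uint64_t)FLAG_MASK, wire & FLAG_MASK);
        return 0;
    }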
extern XBZRLECacheStats xbzrle_counters;
/* Should be holding either ram_list.mutex, or the RCU lock. */
@@ -44,6 +72,7 @@ extern XBZRLECacheStats xbzrle_counters;
INTERNAL_RAMBLOCK_FOREACH(block) \
if (!qemu_ram_is_migratable(block)) {} else
+void ram_mig_init(void);
int xbzrle_cache_resize(uint64_t new_size, Error **errp);
uint64_t ram_bytes_remaining(void);
uint64_t ram_bytes_total(void);
diff --git a/migration/rdma.c b/migration/rdma.c
index 855753c..2d839fc 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -30,7 +30,7 @@
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
-#include "exec/memory.h"
+#include "system/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
@@ -768,156 +768,12 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
}
/*
- * As of now, IPv6 over RoCE / iWARP is not supported by linux.
- * We will try the next addrinfo struct, and fail if there are
- * no other valid addresses to bind against.
- *
- * If user is listening on '[::]', then we will not have a opened a device
- * yet and have no way of verifying if the device is RoCE or not.
- *
- * In this case, the source VM will throw an error for ALL types of
- * connections (both IPv4 and IPv6) if the destination machine does not have
- * a regular infiniband network available for use.
- *
- * The only way to guarantee that an error is thrown for broken kernels is
- * for the management software to choose a *specific* interface at bind time
- * and validate what time of hardware it is.
- *
- * Unfortunately, this puts the user in a fix:
- *
- * If the source VM connects with an IPv4 address without knowing that the
- * destination has bound to '[::]' the migration will unconditionally fail
- * unless the management software is explicitly listening on the IPv4
- * address while using a RoCE-based device.
- *
- * If the source VM connects with an IPv6 address, then we're OK because we can
- * throw an error on the source (and similarly on the destination).
- *
- * But in mixed environments, this will be broken for a while until it is fixed
- * inside linux.
- *
- * We do provide a *tiny* bit of help in this function: We can list all of the
- * devices in the system and check to see if all the devices are RoCE or
- * Infiniband.
- *
- * If we detect that we have a *pure* RoCE environment, then we can safely
- * thrown an error even if the management software has specified '[::]' as the
- * bind address.
- *
- * However, if there is are multiple hetergeneous devices, then we cannot make
- * this assumption and the user just has to be sure they know what they are
- * doing.
- *
- * Patches are being reviewed on linux-rdma.
- */
-static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
-{
- /* This bug only exists in linux, to our knowledge. */
-#ifdef CONFIG_LINUX
- struct ibv_port_attr port_attr;
-
- /*
- * Verbs are only NULL if management has bound to '[::]'.
- *
- * Let's iterate through all the devices and see if there any pure IB
- * devices (non-ethernet).
- *
- * If not, then we can safely proceed with the migration.
- * Otherwise, there are no guarantees until the bug is fixed in linux.
- */
- if (!verbs) {
- int num_devices;
- struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
- bool roce_found = false;
- bool ib_found = false;
-
- for (int x = 0; x < num_devices; x++) {
- verbs = ibv_open_device(dev_list[x]);
- /*
- * ibv_open_device() is not documented to set errno. If
- * it does, it's somebody else's doc bug. If it doesn't,
- * the use of errno below is wrong.
- * TODO Find out whether ibv_open_device() sets errno.
- */
- if (!verbs) {
- if (errno == EPERM) {
- continue;
- } else {
- error_setg_errno(errp, errno,
- "could not open RDMA device context");
- return -1;
- }
- }
-
- if (ibv_query_port(verbs, 1, &port_attr)) {
- ibv_close_device(verbs);
- error_setg(errp,
- "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
- ib_found = true;
- } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- roce_found = true;
- }
-
- ibv_close_device(verbs);
-
- }
-
- if (roce_found) {
- if (ib_found) {
- warn_report("migrations may fail:"
- " IPv6 over RoCE / iWARP in linux"
- " is broken. But since you appear to have a"
- " mixed RoCE / IB environment, be sure to only"
- " migrate over the IB fabric until the kernel "
- " fixes the bug.");
- } else {
- error_setg(errp, "RDMA ERROR: "
- "You only have RoCE / iWARP devices in your systems"
- " and your management software has specified '[::]'"
- ", but IPv6 over RoCE / iWARP is not supported in Linux.");
- return -1;
- }
- }
-
- return 0;
- }
-
- /*
- * If we have a verbs context, that means that some other than '[::]' was
- * used by the management software for binding. In which case we can
- * actually warn the user about a potentially broken kernel.
- */
-
- /* IB ports start with 1, not 0 */
- if (ibv_query_port(verbs, 1, &port_attr)) {
- error_setg(errp, "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- error_setg(errp, "RDMA ERROR: "
- "Linux kernel's RoCE / iWARP does not support IPv6 "
- "(but patches on linux-rdma in progress)");
- return -1;
- }
-
-#endif
-
- return 0;
-}
-
-/*
* Figure out which RDMA device corresponds to the requested IP hostname
* Also create the initial connection manager identifiers for opening
* the connection.
*/
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
- Error *err = NULL;
int ret;
struct rdma_addrinfo *res;
char port_str[16];
@@ -953,9 +809,8 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
goto err_resolve_get_addr;
}
- /* Try all addresses, saving the first error in @err */
+    /* Try all addresses; exit the loop on the first successfully resolved address */
for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
- Error **local_errp = err ? NULL : &err;
inet_ntop(e->ai_family,
&((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
@@ -964,25 +819,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
RDMA_RESOLVE_TIMEOUT_MS);
if (ret >= 0) {
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
- error_free(err);
goto route;
}
}
rdma_freeaddrinfo(res);
- if (err) {
- error_propagate(errp, err);
- } else {
- error_setg(errp, "RDMA ERROR: could not resolve address %s",
- rdma->host);
- }
+ error_setg(errp, "RDMA ERROR: could not resolve address %s", rdma->host);
goto err_resolve_get_addr;
route:
@@ -2611,7 +2453,6 @@ err_rdma_source_connect:
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
- Error *err = NULL;
int ret;
struct rdma_cm_id *listen_id;
char ip[40] = "unknown";
@@ -2661,9 +2502,8 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
goto err_dest_init_bind_addr;
}
- /* Try all addresses, saving the first error in @err */
+ /* Try all addresses */
for (e = res; e != NULL; e = e->ai_next) {
- Error **local_errp = err ? NULL : &err;
inet_ntop(e->ai_family,
&((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
@@ -2672,24 +2512,12 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
if (ret < 0) {
continue;
}
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
- error_free(err);
break;
}
rdma_freeaddrinfo(res);
if (!e) {
- if (err) {
- error_propagate(errp, err);
- } else {
- error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
- }
+ error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
goto err_dest_init_bind_addr;
}
@@ -3284,14 +3112,11 @@ err:
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
ram_addr_t offset, size_t size)
{
- if (!migrate_rdma() || migration_in_postcopy()) {
- return RAM_SAVE_CONTROL_NOT_SUPP;
- }
+ assert(migrate_rdma());
int ret = qemu_rdma_save_page(f, block_offset, offset, size);
- if (ret != RAM_SAVE_CONTROL_DELAYED &&
- ret != RAM_SAVE_CONTROL_NOT_SUPP) {
+ if (ret != RAM_SAVE_CONTROL_DELAYED) {
if (ret < 0) {
qemu_file_set_error(f, ret);
}
@@ -3829,7 +3654,7 @@ int rdma_block_notification_handle(QEMUFile *f, const char *name)
int rdma_registration_start(QEMUFile *f, uint64_t flags)
{
- if (!migrate_rdma() || migration_in_postcopy()) {
+ if (!migrate_rdma()) {
return 0;
}
@@ -3861,7 +3686,7 @@ int rdma_registration_stop(QEMUFile *f, uint64_t flags)
RDMAControlHeader head = { .len = 0, .repeat = 1 };
int ret;
- if (!migrate_rdma() || migration_in_postcopy()) {
+ if (!migrate_rdma()) {
return 0;
}
@@ -3985,7 +3810,7 @@ static void qio_channel_rdma_finalize(Object *obj)
}
static void qio_channel_rdma_class_init(ObjectClass *klass,
- void *class_data G_GNUC_UNUSED)
+ const void *class_data G_GNUC_UNUSED)
{
QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
@@ -4174,7 +3999,7 @@ void rdma_start_outgoing_migration(void *opaque,
s->to_dst_file = rdma_new_output(rdma);
s->rdma_migration = true;
- migrate_fd_connect(s, NULL);
+ migration_connect(s, NULL);
return;
return_path_err:
qemu_rdma_cleanup(rdma);
diff --git a/migration/rdma.h b/migration/rdma.h
index a8d27f3..f74f16a 100644
--- a/migration/rdma.h
+++ b/migration/rdma.h
@@ -19,7 +19,7 @@
#ifndef QEMU_MIGRATION_RDMA_H
#define QEMU_MIGRATION_RDMA_H
-#include "exec/memory.h"
+#include "system/memory.h"
void rdma_start_outgoing_migration(void *opaque, InetSocketAddress *host_port,
Error **errp);
@@ -33,14 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp);
#define RAM_CONTROL_ROUND 1
#define RAM_CONTROL_FINISH 3
-/*
- * Whenever this is found in the data stream, the flags
- * will be passed to rdma functions in the incoming-migration
- * side.
- */
-#define RAM_SAVE_FLAG_HOOK 0x80
-
-#define RAM_SAVE_CONTROL_NOT_SUPP -1000
#define RAM_SAVE_CONTROL_DELAYED -2000
#ifdef CONFIG_RDMA
@@ -63,7 +55,7 @@ static inline
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
ram_addr_t offset, size_t size)
{
- return RAM_SAVE_CONTROL_NOT_SUPP;
+ g_assert_not_reached();
}
#endif
#endif
diff --git a/migration/savevm.c b/migration/savevm.c
index deb5783..bb04a45 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
#include "migration/register.h"
#include "migration/global_state.h"
#include "migration/channel-block.h"
+#include "multifd.h"
#include "ram.h"
#include "qemu-file.h"
#include "savevm.h"
@@ -46,27 +47,29 @@
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qemu/error-report.h"
-#include "sysemu/cpus.h"
-#include "exec/memory.h"
+#include "system/cpus.h"
+#include "system/memory.h"
#include "exec/target_page.h"
+#include "exec/page-vary.h"
#include "trace.h"
#include "qemu/iov.h"
#include "qemu/job.h"
#include "qemu/main-loop.h"
#include "block/snapshot.h"
+#include "block/thread-pool.h"
#include "qemu/cutils.h"
#include "io/channel-buffer.h"
#include "io/channel-file.h"
-#include "sysemu/replay.h"
-#include "sysemu/runstate.h"
-#include "sysemu/sysemu.h"
-#include "sysemu/xen.h"
+#include "system/replay.h"
+#include "system/runstate.h"
+#include "system/system.h"
+#include "system/xen.h"
#include "migration/colo.h"
#include "qemu/bitmap.h"
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"
-#include "sysemu/qtest.h"
+#include "system/qtest.h"
#include "options.h"
const unsigned int postcopy_ram_discard_version;
@@ -90,6 +93,7 @@ enum qemu_vm_cmd {
MIG_CMD_ENABLE_COLO, /* Enable COLO */
MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */
MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */
+ MIG_CMD_SWITCHOVER_START, /* Switchover start notification */
MIG_CMD_MAX
};
@@ -109,6 +113,7 @@ static struct mig_cmd_args {
[MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" },
[MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" },
[MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
+ [MIG_CMD_SWITCHOVER_START] = { .len = 0, .name = "SWITCHOVER_START" },
[MIG_CMD_MAX] = { .len = -1, .name = "MAX" },
};
@@ -130,6 +135,35 @@ static struct mig_cmd_args {
*/
/***********************************************************/
+/* Optional load threads pool support */
+
+static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis)
+{
+ assert(!mis->load_threads);
+ mis->load_threads = thread_pool_new();
+ mis->load_threads_abort = false;
+}
+
+static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis)
+{
+ qatomic_set(&mis->load_threads_abort, true);
+
+ bql_unlock(); /* Load threads might be waiting for BQL */
+ g_clear_pointer(&mis->load_threads, thread_pool_free);
+ bql_lock();
+}
+
+static bool qemu_loadvm_thread_pool_wait(MigrationState *s,
+ MigrationIncomingState *mis)
+{
+ bql_unlock(); /* Let load threads do work requiring BQL */
+ thread_pool_wait(mis->load_threads);
+ bql_lock();
+
+ return !migrate_has_error(s);
+}
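The bql_unlock()/bql_lock() bracket around thread_pool_wait() is essential: a load thread may itself need the BQL, so waiting for the pool while holding the lock could deadlock. A minimal standalone sketch of the same pattern using plain pthreads (the big lock and worker here are stand-ins, not QEMU APIs):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *arg)
    {
        /* The worker needs the big lock for part of its job... */
        pthread_mutex_lock(&big_lock);
        puts("worker: did work under the big lock");
        pthread_mutex_unlock(&big_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_mutex_lock(&big_lock);
        pthread_create(&t, NULL, worker, NULL);

        /*
         * ...so the main thread must drop the lock before joining,
         * just like the bql_unlock()/bql_lock() bracket above;
         * joining while still holding it would deadlock.
         */
        pthread_mutex_unlock(&big_lock);
        pthread_join(t, NULL);
        pthread_mutex_lock(&big_lock);

        puts("main: joined safely");
        pthread_mutex_unlock(&big_lock);
        return 0;
    }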
+
+/***********************************************************/
/* savevm/loadvm support */
static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
@@ -232,7 +266,7 @@ typedef struct SaveState {
static SaveState savevm_state = {
.handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
- .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
+ .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL },
.global_section_id = 0,
};
@@ -306,7 +340,7 @@ static int configuration_pre_load(void *opaque)
* predates the variable-target-page-bits support and is using the
* minimum possible value for this CPU.
*/
- state->target_page_bits = qemu_target_page_bits_min();
+ state->target_page_bits = migration_legacy_page_bits();
return 0;
}
@@ -429,8 +463,7 @@ static const VMStateInfo vmstate_info_capability = {
*/
static bool vmstate_target_page_bits_needed(void *opaque)
{
- return qemu_target_page_bits()
- > qemu_target_page_bits_min();
+ return qemu_target_page_bits() > migration_legacy_page_bits();
}
static const VMStateDescription vmstate_target_page_bits = {
@@ -704,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr)
static inline MigrationPriority save_state_priority(SaveStateEntry *se)
{
- if (se->vmsd) {
+ if (se->vmsd && se->vmsd->priority) {
return se->vmsd->priority;
}
return MIG_PRI_DEFAULT;
@@ -860,23 +893,6 @@ static void vmstate_check(const VMStateDescription *vmsd)
}
}
-/*
- * See comment in hw/intc/xics.c:icp_realize()
- *
- * This function can be removed when
- * pre_2_10_vmstate_register_dummy_icp() is removed.
- */
-int vmstate_replace_hack_for_ppc(VMStateIf *obj, int instance_id,
- const VMStateDescription *vmsd,
- void *opaque)
-{
- SaveStateEntry *se = find_se(vmsd->name, instance_id);
-
- if (se) {
- savevm_state_handler_remove(se);
- }
- return vmstate_register(obj, instance_id, vmsd, opaque);
-}
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
@@ -1218,6 +1234,19 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
}
+static void qemu_savevm_send_switchover_start(QEMUFile *f)
+{
+ trace_savevm_send_switchover_start();
+ qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL);
+}
+
+void qemu_savevm_maybe_send_switchover_start(QEMUFile *f)
+{
+ if (migrate_send_switchover_start()) {
+ qemu_savevm_send_switchover_start(f);
+ }
+}
+
bool qemu_savevm_state_blocked(Error **errp)
{
SaveStateEntry *se;
@@ -1248,8 +1277,7 @@ void qemu_savevm_non_migratable_list(strList **reasons)
void qemu_savevm_state_header(QEMUFile *f)
{
MigrationState *s = migrate_get_current();
-
- s->vmdesc = json_writer_new(false);
+ JSONWriter *vmdesc = s->vmdesc;
trace_savevm_state_header();
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
@@ -1258,16 +1286,21 @@ void qemu_savevm_state_header(QEMUFile *f)
if (s->send_configuration) {
qemu_put_byte(f, QEMU_VM_CONFIGURATION);
- /*
- * This starts the main json object and is paired with the
- * json_writer_end_object in
- * qemu_savevm_state_complete_precopy_non_iterable
- */
- json_writer_start_object(s->vmdesc, NULL);
+ if (vmdesc) {
+ /*
+ * This starts the main json object and is paired with the
+ * json_writer_end_object in
+ * qemu_savevm_state_complete_precopy_non_iterable
+ */
+ json_writer_start_object(vmdesc, NULL);
+ json_writer_start_object(vmdesc, "configuration");
+ }
- json_writer_start_object(s->vmdesc, "configuration");
- vmstate_save_state(f, &vmstate_configuration, &savevm_state, s->vmdesc);
- json_writer_end_object(s->vmdesc);
+ vmstate_save_state(f, &vmstate_configuration, &savevm_state, vmdesc);
+
+ if (vmdesc) {
+ json_writer_end_object(vmdesc);
+ }
}
}
@@ -1313,16 +1346,19 @@ int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
{
ERRP_GUARD();
MigrationState *ms = migrate_get_current();
+ JSONWriter *vmdesc = ms->vmdesc;
SaveStateEntry *se;
int ret = 0;
- json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
- json_writer_start_array(ms->vmdesc, "devices");
+ if (vmdesc) {
+ json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
+ json_writer_start_array(vmdesc, "devices");
+ }
trace_savevm_state_setup();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->vmsd && se->vmsd->early_setup) {
- ret = vmstate_save(f, se, ms->vmdesc, errp);
+ ret = vmstate_save(f, se, vmdesc, errp);
if (ret) {
migrate_set_error(ms, *errp);
qemu_file_set_error(f, ret);
@@ -1441,11 +1477,11 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
return all_finished;
}
-static bool should_send_vmdesc(void)
+bool should_send_vmdesc(void)
{
MachineState *machine = MACHINE(qdev_get_machine());
- bool in_postcopy = migration_in_postcopy();
- return !machine->suppress_vmdesc && !in_postcopy;
+
+ return !machine->suppress_vmdesc;
}
/*
@@ -1487,12 +1523,62 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f)
qemu_fflush(f);
}
-static
+bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp)
+{
+ SaveStateEntry *se;
+ bool ret;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (!se->ops || !se->ops->save_postcopy_prepare) {
+ continue;
+ }
+
+ if (se->ops->is_active) {
+ if (!se->ops->is_active(se->opaque)) {
+ continue;
+ }
+ }
+
+ trace_savevm_section_start(se->idstr, se->section_id);
+
+ save_section_header(f, se, QEMU_VM_SECTION_PART);
+ ret = se->ops->save_postcopy_prepare(f, se->opaque, errp);
+ save_section_footer(f, se);
+
+ trace_savevm_section_end(se->idstr, se->section_id, ret);
+
+ if (!ret) {
+ assert(*errp);
+ return false;
+ }
+ }
+
+ return true;
+}
+
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
int64_t start_ts_each, end_ts_each;
SaveStateEntry *se;
int ret;
+ bool multifd_device_state = multifd_device_state_supported();
+
+ if (multifd_device_state) {
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ SaveLiveCompletePrecopyThreadHandler hdlr;
+
+ if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
+ se->ops->has_postcopy(se->opaque)) ||
+ !se->ops->save_live_complete_precopy_thread) {
+ continue;
+ }
+
+ hdlr = se->ops->save_live_complete_precopy_thread;
+ multifd_spawn_device_state_save_thread(hdlr,
+ se->idstr, se->instance_id,
+ se->opaque);
+ }
+ }
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops ||
@@ -1518,21 +1604,39 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
save_section_footer(f, se);
if (ret < 0) {
qemu_file_set_error(f, ret);
- return -1;
+ goto ret_fail_abort_threads;
}
end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id,
end_ts_each - start_ts_each);
}
+ if (multifd_device_state) {
+ if (migrate_has_error(migrate_get_current())) {
+ multifd_abort_device_state_save_threads();
+ }
+
+ if (!multifd_join_device_state_save_threads()) {
+ qemu_file_set_error(f, -EINVAL);
+ return -1;
+ }
+ }
+
trace_vmstate_downtime_checkpoint("src-iterable-saved");
return 0;
+
+ret_fail_abort_threads:
+ if (multifd_device_state) {
+ multifd_abort_device_state_save_threads();
+ multifd_join_device_state_save_threads();
+ }
+
+ return -1;
}
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
- bool in_postcopy,
- bool inactivate_disks)
+ bool in_postcopy)
{
MigrationState *ms = migrate_get_current();
int64_t start_ts_each, end_ts_each;
@@ -1542,6 +1646,9 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
Error *local_err = NULL;
int ret;
+    /* Make sure CPU states are synchronized before saving non-iterable state */
+ cpu_synchronize_all_states();
+
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->vmsd && se->vmsd->early_setup) {
/* Already saved during qemu_savevm_state_setup(). */
@@ -1563,76 +1670,42 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
end_ts_each - start_ts_each);
}
- if (inactivate_disks) {
- /* Inactivate before sending QEMU_VM_EOF so that the
- * bdrv_activate_all() on the other end won't fail. */
- ret = bdrv_inactivate_all();
- if (ret) {
- error_setg(&local_err, "%s: bdrv_inactivate_all() failed (%d)",
- __func__, ret);
- migrate_set_error(ms, local_err);
- error_report_err(local_err);
- qemu_file_set_error(f, ret);
- return ret;
- }
- }
if (!in_postcopy) {
/* Postcopy stream will still be going */
qemu_put_byte(f, QEMU_VM_EOF);
- }
- json_writer_end_array(vmdesc);
- json_writer_end_object(vmdesc);
- vmdesc_len = strlen(json_writer_get(vmdesc));
+ if (vmdesc) {
+ json_writer_end_array(vmdesc);
+ json_writer_end_object(vmdesc);
+ vmdesc_len = strlen(json_writer_get(vmdesc));
- if (should_send_vmdesc()) {
- qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
- qemu_put_be32(f, vmdesc_len);
- qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
+ qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
+ qemu_put_be32(f, vmdesc_len);
+ qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
+ }
}
- /* Free it now to detect any inconsistencies. */
- json_writer_free(vmdesc);
- ms->vmdesc = NULL;
-
trace_vmstate_downtime_checkpoint("src-non-iterable-saved");
return 0;
}
-int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
- bool inactivate_disks)
+int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
{
int ret;
- Error *local_err = NULL;
- bool in_postcopy = migration_in_postcopy();
- if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
- error_report_err(local_err);
+ ret = qemu_savevm_state_complete_precopy_iterable(f, false);
+ if (ret) {
+ return ret;
}
- trace_savevm_state_complete_precopy();
-
- cpu_synchronize_all_states();
-
- if (!in_postcopy || iterable_only) {
- ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
+ if (!iterable_only) {
+ ret = qemu_savevm_state_complete_precopy_non_iterable(f, false);
if (ret) {
return ret;
}
}
- if (iterable_only) {
- goto flush;
- }
-
- ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
- inactivate_disks);
- if (ret) {
- return ret;
- }
-
-flush:
return qemu_fflush(f);
}
@@ -1730,7 +1803,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
ret = qemu_file_get_error(f);
if (ret == 0) {
- qemu_savevm_state_complete_precopy(f, false, false);
+ qemu_savevm_maybe_send_switchover_start(f);
+ qemu_savevm_state_complete_precopy(f, false);
ret = qemu_file_get_error(f);
}
if (ret != 0) {
@@ -1756,7 +1830,7 @@ cleanup:
void qemu_savevm_live_state(QEMUFile *f)
{
/* save QEMU_VM_SECTION_END section */
- qemu_savevm_state_complete_precopy(f, true, false);
+ qemu_savevm_state_complete_precopy(f, true);
qemu_put_byte(f, QEMU_VM_EOF);
}
@@ -2004,7 +2078,7 @@ static void *postcopy_ram_listen_thread(void *opaque)
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_POSTCOPY_ACTIVE);
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
trace_postcopy_ram_listen_thread_start();
rcu_register_thread();
@@ -2013,6 +2087,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
* in qemu_file, and thus we must be blocking now.
*/
qemu_file_set_blocking(f, true);
+
+ /* TODO: sanity check that only postcopiable data will be loaded here */
load_res = qemu_loadvm_state_main(f, mis);
/*
@@ -2073,8 +2149,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
* (If something broke then qemu will have to exit anyway since it's
* got a bad migration state).
*/
+ bql_lock();
migration_incoming_state_destroy();
- qemu_loadvm_state_cleanup();
+ bql_unlock();
rcu_unregister_thread();
mis->have_listen_thread = false;
@@ -2129,7 +2206,8 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
}
mis->have_listen_thread = true;
- postcopy_thread_create(mis, &mis->listen_thread, "mig/dst/listen",
+ postcopy_thread_create(mis, &mis->listen_thread,
+ MIGRATION_THREAD_DST_LISTEN,
postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
trace_loadvm_postcopy_handle_listen("return");
@@ -2138,7 +2216,6 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
static void loadvm_postcopy_handle_run_bh(void *opaque)
{
- Error *local_err = NULL;
MigrationIncomingState *mis = opaque;
trace_vmstate_downtime_checkpoint("dst-postcopy-bh-enter");
@@ -2154,22 +2231,20 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
trace_vmstate_downtime_checkpoint("dst-postcopy-bh-announced");
- /* Make sure all file formats throw away their mutable metadata.
- * If we get an error here, just don't restart the VM yet. */
- bdrv_activate_all(&local_err);
- if (local_err) {
- error_report_err(local_err);
- local_err = NULL;
- autostart = false;
- }
-
- trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated");
-
dirty_bitmap_mig_before_vm_start();
if (autostart) {
- /* Hold onto your hats, starting the CPU */
- vm_start();
+ /*
+ * Make sure all file formats throw away their mutable metadata.
+ * If we get an error here, just don't restart the VM yet.
+ */
+ bool success = migration_block_activate(NULL);
+
+ trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated");
+
+ if (success) {
+ vm_start();
+ }
} else {
/* leave it paused and let management decide when to start the CPU */
runstate_set(RUN_STATE_PAUSED);
@@ -2429,6 +2504,26 @@ static int loadvm_process_enable_colo(MigrationIncomingState *mis)
return ret;
}
+static int loadvm_postcopy_handle_switchover_start(void)
+{
+ SaveStateEntry *se;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ int ret;
+
+ if (!se->ops || !se->ops->switchover_start) {
+ continue;
+ }
+
+ ret = se->ops->switchover_start(se->opaque);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
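A device that wants to act on MIG_CMD_SWITCHOVER_START registers a hook through the switchover_start callback iterated above. A hypothetical sketch (the device name and callback body are invented for illustration; only the SaveVMHandlers field itself is real):

    /* Hypothetical device hook; names are made up for illustration. */
    static int mydev_switchover_start(void *opaque)
    {
        /* e.g. stop queueing new device state before the switchover */
        return 0;
    }

    static SaveVMHandlers savevm_mydev_handlers = {
        .switchover_start = mydev_switchover_start,
        /* ...other handlers... */
    };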
+
/*
* Process an incoming 'QEMU_VM_COMMAND'
* 0 just a normal return
@@ -2527,6 +2622,9 @@ static int loadvm_process_command(QEMUFile *f)
case MIG_CMD_ENABLE_COLO:
return loadvm_process_enable_colo(mis);
+
+ case MIG_CMD_SWITCHOVER_START:
+ return loadvm_postcopy_handle_switchover_start();
}
return 0;
@@ -2576,8 +2674,7 @@ static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
}
static int
-qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
- uint8_t type)
+qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
{
bool trace_downtime = (type == QEMU_VM_SECTION_FULL);
uint32_t instance_id, version_id, section_id;
@@ -2655,8 +2752,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
}
static int
-qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
- uint8_t type)
+qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type)
{
bool trace_downtime = (type == QEMU_VM_SECTION_END);
int64_t start_ts, end_ts;
@@ -2732,13 +2828,11 @@ static int qemu_loadvm_state_header(QEMUFile *f)
if (migrate_get_current()->send_configuration) {
if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
error_report("Configuration section missing");
- qemu_loadvm_state_cleanup();
return -EINVAL;
}
ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
if (ret) {
- qemu_loadvm_state_cleanup();
return ret;
}
}
@@ -2790,16 +2884,68 @@ static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp)
return 0;
}
-void qemu_loadvm_state_cleanup(void)
+struct LoadThreadData {
+ MigrationLoadThread function;
+ void *opaque;
+};
+
+static int qemu_loadvm_load_thread(void *thread_opaque)
+{
+ struct LoadThreadData *data = thread_opaque;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ g_autoptr(Error) local_err = NULL;
+
+ if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) {
+ MigrationState *s = migrate_get_current();
+
+ /*
+ * Can't set load_threads_abort here since processing of main migration
+ * channel data could still be happening, resulting in launching of new
+ * load threads.
+ */
+
+ assert(local_err);
+
+ /*
+         * In case multiple load threads fail, which thread's error we
+         * end up reporting is purely arbitrary.
+ */
+ migrate_set_error(s, local_err);
+ }
+
+ return 0;
+}
+
+void qemu_loadvm_start_load_thread(MigrationLoadThread function,
+ void *opaque)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ struct LoadThreadData *data;
+
+ /* We only set it from this thread so it's okay to read it directly */
+ assert(!mis->load_threads_abort);
+
+ data = g_new(struct LoadThreadData, 1);
+ data->function = function;
+ data->opaque = opaque;
+
+ thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread,
+ data, g_free);
+}
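Load threads are cancelled cooperatively: each one is expected to poll the abort flag it receives and bail out between chunks of work. A standalone sketch of that pattern with C11 atomics (stand-in names, not QEMU's thread pool API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_bool abort_flag;

    static void *load_worker(void *arg)
    {
        /* A long-running load loop polls the abort flag between chunks. */
        for (int chunk = 0; chunk < 100; chunk++) {
            if (atomic_load(&abort_flag)) {
                puts("worker: aborting early");
                return NULL;
            }
            usleep(1000); /* stand-in for loading one chunk of state */
        }
        puts("worker: finished all chunks");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, load_worker, NULL);
        usleep(10000);
        atomic_store(&abort_flag, true); /* like setting load_threads_abort */
        pthread_join(t, NULL);
        return 0;
    }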
+
+void qemu_loadvm_state_cleanup(MigrationIncomingState *mis)
{
SaveStateEntry *se;
trace_loadvm_state_cleanup();
+
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->ops && se->ops->load_cleanup) {
se->ops->load_cleanup(se->opaque);
}
}
+
+ qemu_loadvm_thread_pool_destroy(mis);
}
/* Return true if we should continue the migration, or false. */
@@ -2891,14 +3037,14 @@ retry:
switch (section_type) {
case QEMU_VM_SECTION_START:
case QEMU_VM_SECTION_FULL:
- ret = qemu_loadvm_section_start_full(f, mis, section_type);
+ ret = qemu_loadvm_section_start_full(f, section_type);
if (ret < 0) {
goto out;
}
break;
case QEMU_VM_SECTION_PART:
case QEMU_VM_SECTION_END:
- ret = qemu_loadvm_section_part_end(f, mis, section_type);
+ ret = qemu_loadvm_section_part_end(f, section_type);
if (ret < 0) {
goto out;
}
@@ -2950,6 +3096,7 @@ out:
int qemu_loadvm_state(QEMUFile *f)
{
+ MigrationState *s = migrate_get_current();
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
int ret;
@@ -2959,6 +3106,8 @@ int qemu_loadvm_state(QEMUFile *f)
return -EINVAL;
}
+ qemu_loadvm_thread_pool_create(mis);
+
ret = qemu_loadvm_state_header(f);
if (ret) {
return ret;
@@ -2981,13 +3130,27 @@ int qemu_loadvm_state(QEMUFile *f)
trace_qemu_loadvm_state_post_main(ret);
if (mis->have_listen_thread) {
- /* Listen thread still going, can't clean up yet */
+ /*
+ * Postcopy listen thread still going, don't synchronize the
+ * cpus yet.
+ */
return ret;
}
+ /* When reaching here, it must be precopy */
if (ret == 0) {
- ret = qemu_file_get_error(f);
+ if (migrate_has_error(migrate_get_current()) ||
+ !qemu_loadvm_thread_pool_wait(s, mis)) {
+ ret = -EINVAL;
+ } else {
+ ret = qemu_file_get_error(f);
+ }
}
+ /*
+ * Set this flag unconditionally so we'll catch further attempts to
+ * start additional threads via an appropriate assert()
+ */
+ qatomic_set(&mis->load_threads_abort, true);
/*
* Try to read in the VMDESC section as well, so that dumping tools that
@@ -3024,7 +3187,6 @@ int qemu_loadvm_state(QEMUFile *f)
}
}
- qemu_loadvm_state_cleanup();
cpu_synchronize_all_post_init();
return ret;
@@ -3064,6 +3226,29 @@ int qemu_loadvm_approve_switchover(void)
return migrate_send_rp_switchover_ack(mis);
}
+bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
+ char *buf, size_t len, Error **errp)
+{
+ SaveStateEntry *se;
+
+ se = find_se(idstr, instance_id);
+ if (!se) {
+ error_setg(errp,
+ "Unknown idstr %s or instance id %u for load state buffer",
+ idstr, instance_id);
+ return false;
+ }
+
+ if (!se->ops || !se->ops->load_state_buffer) {
+ error_setg(errp,
+ "idstr %s / instance %u has no load state buffer operation",
+ idstr, instance_id);
+ return false;
+ }
+
+ return se->ops->load_state_buffer(se->opaque, buf, len, errp);
+}
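A hypothetical call-site sketch for the new entry point (the packet structure and its fields are invented here; only qemu_loadvm_load_state_buffer() itself comes from this patch):

    /* e.g. from a multifd device-state packet handler */
    Error *local_err = NULL;

    if (!qemu_loadvm_load_state_buffer(pkt->idstr, pkt->instance_id,
                                       pkt->data, pkt->data_size,
                                       &local_err)) {
        error_report_err(local_err);
    }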
+
bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
bool has_devices, strList *devices, Error **errp)
{
@@ -3211,11 +3396,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
* side of the migration take control of the images.
*/
if (live && !saved_vm_running) {
- ret = bdrv_inactivate_all();
- if (ret) {
- error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
- __func__, ret);
- }
+ migration_block_inactivate();
}
}
@@ -3286,6 +3467,7 @@ bool load_snapshot(const char *name, const char *vmstate,
/* Don't even try to load empty VM states */
ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
if (ret < 0) {
+ error_setg(errp, "Snapshot can not be found");
return false;
} else if (sn.vm_state_size == 0) {
error_setg(errp, "This is a disk-only snapshot. Revert to it "
@@ -3365,12 +3547,14 @@ void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
qemu_ram_set_idstr(mr->ram_block,
memory_region_name(mr), dev);
qemu_ram_set_migratable(mr->ram_block);
+ ram_block_add_cpr_blocker(mr->ram_block, &error_fatal);
}
void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
{
qemu_ram_unset_idstr(mr->ram_block);
qemu_ram_unset_migratable(mr->ram_block);
+ ram_block_del_cpr_blocker(mr->ram_block);
}
void vmstate_register_ram_global(MemoryRegion *mr)
diff --git a/migration/savevm.h b/migration/savevm.h
index 9ec96a9..2d5e9c7 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -39,12 +39,13 @@ void qemu_savevm_state_header(QEMUFile *f);
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
-int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
- bool inactivate_disks);
+int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only);
void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
uint64_t *can_postcopy);
void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
uint64_t *can_postcopy);
+int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
+bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
void qemu_savevm_send_open_return_path(QEMUFile *f);
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
@@ -53,6 +54,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f);
void qemu_savevm_send_postcopy_run(QEMUFile *f);
void qemu_savevm_send_postcopy_resume(QEMUFile *f);
void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name);
+void qemu_savevm_maybe_send_switchover_start(QEMUFile *f);
void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
uint16_t len,
@@ -63,11 +65,14 @@ void qemu_savevm_live_state(QEMUFile *f);
int qemu_save_device_state(QEMUFile *f);
int qemu_loadvm_state(QEMUFile *f);
-void qemu_loadvm_state_cleanup(void);
+void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
int qemu_load_device_state(QEMUFile *f);
int qemu_loadvm_approve_switchover(void);
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
- bool in_postcopy, bool inactivate_disks);
+ bool in_postcopy);
+
+bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
+ char *buf, size_t len, Error **errp);
#endif
diff --git a/migration/socket.c b/migration/socket.c
index 9ab89b1..5ec65b8 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -42,24 +42,6 @@ void socket_send_channel_create(QIOTaskFunc f, void *data)
f, data, NULL, NULL);
}
-QIOChannel *socket_send_channel_create_sync(Error **errp)
-{
- QIOChannelSocket *sioc = qio_channel_socket_new();
-
- if (!outgoing_args.saddr) {
- object_unref(OBJECT(sioc));
- error_setg(errp, "Initial sock address not set!");
- return NULL;
- }
-
- if (qio_channel_socket_connect_sync(sioc, outgoing_args.saddr, errp) < 0) {
- object_unref(OBJECT(sioc));
- return NULL;
- }
-
- return QIO_CHANNEL(sioc);
-}
-
struct SocketConnectData {
MigrationState *s;
char *hostname;
diff --git a/migration/socket.h b/migration/socket.h
index 46c233e..04ebbe9 100644
--- a/migration/socket.h
+++ b/migration/socket.h
@@ -22,7 +22,6 @@
#include "qemu/sockets.h"
void socket_send_channel_create(QIOTaskFunc f, void *data);
-QIOChannel *socket_send_channel_create_sync(Error **errp);
void socket_start_incoming_migration(SocketAddress *saddr, Error **errp);
diff --git a/migration/target.c b/migration/target.c
index a6ffa9a..12fd399 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -11,21 +11,21 @@
#include CONFIG_DEVICES
#ifdef CONFIG_VFIO
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-migration.h"
#endif
#ifdef CONFIG_VFIO
void migration_populate_vfio_info(MigrationInfo *info)
{
- if (vfio_mig_active()) {
+ if (vfio_migration_active()) {
info->vfio = g_malloc0(sizeof(*info->vfio));
- info->vfio->transferred = vfio_mig_bytes_transferred();
+ info->vfio->transferred = vfio_migration_bytes_transferred();
}
}
void migration_reset_vfio_bytes_transferred(void)
{
- vfio_reset_bytes_transferred();
+ vfio_migration_reset_bytes_transferred();
}
#else
void migration_populate_vfio_info(MigrationInfo *info)
diff --git a/migration/tls.c b/migration/tls.c
index fa03d91..5cbf952 100644
--- a/migration/tls.c
+++ b/migration/tls.c
@@ -156,6 +156,11 @@ void migration_tls_channel_connect(MigrationState *s,
NULL);
}
+void migration_tls_channel_end(QIOChannel *ioc, Error **errp)
+{
+ qio_channel_tls_bye(QIO_CHANNEL_TLS(ioc), errp);
+}
+
bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc)
{
if (!migrate_tls()) {
diff --git a/migration/tls.h b/migration/tls.h
index 5797d15..58b25e1 100644
--- a/migration/tls.h
+++ b/migration/tls.h
@@ -36,7 +36,7 @@ void migration_tls_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error **errp);
-
+void migration_tls_channel_end(QIOChannel *ioc, Error **errp);
/* Whether the QIO channel requires further TLS handshake? */
bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc);
diff --git a/migration/trace-events b/migration/trace-events
index 0b7c332..c506e11 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -39,12 +39,12 @@ savevm_send_postcopy_run(void) ""
savevm_send_postcopy_resume(void) ""
savevm_send_colo_enable(void) ""
savevm_send_recv_bitmap(char *name) "%s"
+savevm_send_switchover_start(void) ""
savevm_state_setup(void) ""
savevm_state_resume_prepare(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
savevm_state_cleanup(void) ""
-savevm_state_complete_precopy(void) ""
vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_downtime_save(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
@@ -88,6 +88,8 @@ put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
# qemu-file.c
qemu_file_fclose(void) ""
+qemu_file_put_fd(const char *name, int fd, int ret) "ioc %s, fd %d -> status %d"
+qemu_file_get_fd(const char *name, int fd) "ioc %s -> fd %d"
# ram.c
get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
@@ -115,6 +117,7 @@ colo_flush_ram_cache_end(void) ""
save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
+ram_load_start(void) ""
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
@@ -128,21 +131,22 @@ postcopy_preempt_reset_channel(void) ""
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %u"
multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p"
-multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u"
+multifd_recv_unfill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
multifd_recv_new_channel(uint8_t id) "channel %u"
multifd_recv_sync_main(long packet_num) "packet num %ld"
multifd_recv_sync_main_signal(uint8_t id) "channel %u"
multifd_recv_sync_main_wait(uint8_t id) "iter %u"
multifd_recv_terminate_threads(bool error) "error %d"
-multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64
+multifd_recv_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_recv_thread_start(uint8_t id) "%u"
-multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u"
+multifd_send_fill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
+multifd_send_ram_fill(uint8_t id, uint32_t normal, uint32_t zero) "channel %u normal pages %u zero pages %u"
multifd_send_error(uint8_t id) "channel %u"
multifd_send_sync_main(long packet_num) "packet num %ld"
multifd_send_sync_main_signal(uint8_t id) "channel %u"
multifd_send_sync_main_wait(uint8_t id) "channel %u"
multifd_send_terminate_threads(void) ""
-multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64
+multifd_send_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_send_thread_start(uint8_t id) "%u"
multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s"
multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s"
@@ -151,9 +155,9 @@ multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostnam
# migration.c
migrate_set_state(const char *new_state) "new state %s"
-migrate_fd_cleanup(void) ""
+migration_cleanup(void) ""
migrate_error(const char *error_desc) "error=%s"
-migrate_fd_cancel(void) ""
+migration_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")"
migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")"
@@ -191,6 +195,7 @@ migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidt
process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
postcopy_preempt_enabled(bool value) "%d"
+migration_precopy_complete(void) ""
# migration-stats
migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64
@@ -340,6 +345,15 @@ colo_receive_message(const char *msg) "Receive '%s' message"
# colo-failover.c
colo_failover_set_state(const char *new_state) "new state %s"
+# cpr.c
+cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d"
+cpr_delete_fd(const char *name, int id) "%s, id %d"
+cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d"
+cpr_state_save(const char *mode) "%s mode"
+cpr_state_load(const char *mode) "%s mode"
+cpr_transfer_input(const char *path) "%s"
+cpr_transfer_output(const char *path) "%s"
+
# block-dirty-bitmap.c
send_bitmap_header_enter(void) ""
send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64
@@ -377,3 +391,10 @@ migration_block_progression(unsigned percent) "Completed %u%%"
# page_cache.c
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64
migration_pagecache_insert(void) "Error allocating page"
+
+# cpu-throttle.c
+cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%"
+cpu_throttle_dirty_sync(void) ""
+
+# block-active.c
+migration_block_activation(const char *name) "%s"
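Each trace-events line above is turned by tracetool into a trace_<name>() helper whose prototype follows mechanically from the declared signature. As a minimal sketch (the caller context and variable names are illustrative, not taken from the tree), the renamed multifd event would be emitted like this:

/* generated prototype, inferred from the event line above */
void trace_multifd_send_fill(uint8_t id, uint64_t packet_num,
                             uint32_t flags, uint32_t next_packet_size);

/* illustrative call site in the multifd send path */
trace_multifd_send_fill(p->id, packet_num, flags, next_packet_size);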
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index e83bfcc..741a588 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -15,6 +15,7 @@
#include "qemu-file.h"
#include "migration.h"
#include "migration/vmstate.h"
+#include "migration/client-options.h"
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "trace.h"
@@ -314,6 +315,29 @@ const VMStateInfo vmstate_info_uint64 = {
.put = put_uint64,
};
+/* File descriptor communicated via SCM_RIGHTS */
+
+static int get_fd(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field)
+{
+ int32_t *v = pv;
+ *v = qemu_file_get_fd(f);
+ return 0;
+}
+
+static int put_fd(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+ int32_t *v = pv;
+ return qemu_file_put_fd(f, *v);
+}
+
+const VMStateInfo vmstate_info_fd = {
+ .name = "fd",
+ .get = get_fd,
+ .put = put_fd,
+};
+
static int get_nullptr(QEMUFile *f, void *pv, size_t size,
const VMStateField *field)
@@ -338,7 +362,7 @@ static int put_nullptr(QEMUFile *f, void *pv, size_t size,
}
const VMStateInfo vmstate_info_nullptr = {
- .name = "uint64",
+ .name = "nullptr",
.get = get_nullptr,
.put = put_nullptr,
};
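For context, a device could wire the new vmstate_info_fd into its VMSD with a raw VMStateField entry. This is only a sketch under assumptions: ExampleState and the field wiring are hypothetical, built directly on the info type added above rather than on any macro from the tree.

#include "migration/vmstate.h"

typedef struct ExampleState {
    int32_t fd;                        /* descriptor passed via SCM_RIGHTS */
} ExampleState;

static const VMStateDescription vmstate_example = {
    .name = "example",
    .version_id = 1,
    .fields = (const VMStateField[]) {
        {
            .name   = "fd",
            .info   = &vmstate_info_fd,       /* get_fd / put_fd above */
            .size   = sizeof(int32_t),
            .flags  = VMS_SINGLE,
            .offset = offsetof(ExampleState, fd),
        },
        VMSTATE_END_OF_LIST()
    },
};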
diff --git a/migration/vmstate.c b/migration/vmstate.c
index ff5d589..5feaa32 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -15,14 +15,15 @@
#include "migration/vmstate.h"
#include "savevm.h"
#include "qapi/error.h"
-#include "qapi/qmp/json-writer.h"
+#include "qobject/json-writer.h"
#include "qemu-file.h"
#include "qemu/bitops.h"
#include "qemu/error-report.h"
#include "trace.h"
static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
- void *opaque, JSONWriter *vmdesc);
+ void *opaque, JSONWriter *vmdesc,
+ Error **errp);
static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque);
@@ -50,6 +51,36 @@ vmstate_field_exists(const VMStateDescription *vmsd, const VMStateField *field,
return result;
}
+/*
+ * Create a fake nullptr field when a NULL pointer is detected in the
+ * array of a VMS_ARRAY_OF_POINTER VMSD field. It is needed because
+ * the NULL pointer cannot be dereferenced.
+ */
+static const VMStateField *
+vmsd_create_fake_nullptr_field(const VMStateField *field)
+{
+ VMStateField *fake = g_new0(VMStateField, 1);
+
+ /* It can only happen on an array of pointers! */
+ assert(field->flags & VMS_ARRAY_OF_POINTER);
+
+ /* A few of the fake field's properties must match the original's */
+ fake->name = field->name;
+ fake->version_id = field->version_id;
+
+ /* No "field_exists" check is needed: a NULL callback means it always exists */
+ fake->field_exists = NULL;
+
+ /* See vmstate_info_nullptr - use 1 byte to represent nullptr */
+ fake->size = 1;
+ fake->info = &vmstate_info_nullptr;
+ fake->flags = VMS_SINGLE;
+
+ /* None of the remaining fields should matter. */
+
+ return (const VMStateField *)fake;
+}
+
static int vmstate_n_elems(void *opaque, const VMStateField *field)
{
int n_elems = 1;
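The only layout that can reach this helper is an array of pointers to structs where individual slots may be NULL. A sketch of such a layout (Parent, Child and vmstate_child are hypothetical; the flags mirror what an array-of-pointer field carries):

typedef struct Child Child;               /* element type, hypothetical */

typedef struct Parent {
    Child *children[4];                   /* any slot may be NULL */
} Parent;

/* field entry in the parent's VMSD */
{
    .name   = "children",
    .num    = 4,
    .vmsd   = &vmstate_child,             /* hypothetical child VMSD */
    .size   = sizeof(Child *),
    .flags  = VMS_ARRAY | VMS_STRUCT | VMS_ARRAY_OF_POINTER,
    .offset = offsetof(Parent, children),
},

When the save or load loop below hits a NULL slot, it swaps in the fake field so the 1-byte vmstate_info_nullptr placeholder is used instead of dereferencing the pointer.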
@@ -142,23 +173,39 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
}
for (i = 0; i < n_elems; i++) {
void *curr_elem = first_elem + size * i;
+ const VMStateField *inner_field;
if (field->flags & VMS_ARRAY_OF_POINTER) {
curr_elem = *(void **)curr_elem;
}
+
if (!curr_elem && size) {
- /* if null pointer check placeholder and do not follow */
- assert(field->flags & VMS_ARRAY_OF_POINTER);
- ret = vmstate_info_nullptr.get(f, curr_elem, size, NULL);
- } else if (field->flags & VMS_STRUCT) {
- ret = vmstate_load_state(f, field->vmsd, curr_elem,
- field->vmsd->version_id);
- } else if (field->flags & VMS_VSTRUCT) {
- ret = vmstate_load_state(f, field->vmsd, curr_elem,
- field->struct_version_id);
+ /*
+ * If a null pointer is found (which should only happen in
+ * an array of pointers), read the null placeholder and do
+ * not follow the pointer.
+ */
+ inner_field = vmsd_create_fake_nullptr_field(field);
+ } else {
+ inner_field = field;
+ }
+
+ if (inner_field->flags & VMS_STRUCT) {
+ ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
+ inner_field->vmsd->version_id);
+ } else if (inner_field->flags & VMS_VSTRUCT) {
+ ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
+ inner_field->struct_version_id);
} else {
- ret = field->info->get(f, curr_elem, size, field);
+ ret = inner_field->info->get(f, curr_elem, size,
+ inner_field);
+ }
+
+ /* If we used a fake temporary field, free it now */
+ if (inner_field != field) {
+ g_clear_pointer((gpointer *)&inner_field, g_free);
}
+
if (ret >= 0) {
ret = qemu_file_get_error(f);
}
@@ -310,7 +357,7 @@ static void vmsd_desc_field_start(const VMStateDescription *vmsd,
static void vmsd_desc_field_end(const VMStateDescription *vmsd,
JSONWriter *vmdesc,
- const VMStateField *field, size_t size, int i)
+ const VMStateField *field, size_t size)
{
if (!vmdesc) {
return;
@@ -378,37 +425,91 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
int size = vmstate_size(opaque, field);
uint64_t old_offset, written_bytes;
JSONWriter *vmdesc_loop = vmdesc;
+ bool is_prev_null = false;
trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems);
if (field->flags & VMS_POINTER) {
first_elem = *(void **)first_elem;
assert(first_elem || !n_elems || !size);
}
+
for (i = 0; i < n_elems; i++) {
void *curr_elem = first_elem + size * i;
+ const VMStateField *inner_field;
+ bool is_null;
+ int max_elems = n_elems - i;
- vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems);
old_offset = qemu_file_transferred(f);
if (field->flags & VMS_ARRAY_OF_POINTER) {
assert(curr_elem);
curr_elem = *(void **)curr_elem;
}
+
if (!curr_elem && size) {
- /* if null pointer write placeholder and do not follow */
- assert(field->flags & VMS_ARRAY_OF_POINTER);
- ret = vmstate_info_nullptr.put(f, curr_elem, size, NULL,
- NULL);
- } else if (field->flags & VMS_STRUCT) {
- ret = vmstate_save_state(f, field->vmsd, curr_elem,
- vmdesc_loop);
- } else if (field->flags & VMS_VSTRUCT) {
- ret = vmstate_save_state_v(f, field->vmsd, curr_elem,
- vmdesc_loop,
- field->struct_version_id, errp);
+ /*
+ * If a null pointer is found (which should only happen in
+ * an array of pointers), write the null placeholder and do
+ * not follow the pointer.
+ */
+ inner_field = vmsd_create_fake_nullptr_field(field);
+ is_null = true;
+ } else {
+ inner_field = field;
+ is_null = false;
+ }
+
+ /*
+ * This logic only matters when dumping the VM description
+ * (vmdesc).
+ *
+ * Due to the fake nullptr handling above, an array mixing
+ * null and non-null pointers cannot be emitted as one
+ * compressed array entry, because the element types differ
+ * (e.g. struct vs. nullptr). Scan ahead for the next element
+ * whose nullness differs and cap max_elems there, so that
+ * each compressed group stays homogeneous.
+ */
+ if (vmdesc && (field->flags & VMS_ARRAY_OF_POINTER) &&
+ is_null != is_prev_null) {
+
+ is_prev_null = is_null;
+ vmdesc_loop = vmdesc;
+
+ for (int j = i + 1; j < n_elems; j++) {
+ void *elem = *(void **)(first_elem + size * j);
+ bool elem_is_null = !elem && size;
+
+ if (is_null != elem_is_null) {
+ max_elems = j - i;
+ break;
+ }
+ }
+ }
+
+ vmsd_desc_field_start(vmsd, vmdesc_loop, inner_field,
+ i, max_elems);
+
+ if (inner_field->flags & VMS_STRUCT) {
+ ret = vmstate_save_state(f, inner_field->vmsd,
+ curr_elem, vmdesc_loop);
+ } else if (inner_field->flags & VMS_VSTRUCT) {
+ ret = vmstate_save_state_v(f, inner_field->vmsd,
+ curr_elem, vmdesc_loop,
+ inner_field->struct_version_id,
+ errp);
} else {
- ret = field->info->put(f, curr_elem, size, field,
- vmdesc_loop);
+ ret = inner_field->info->put(f, curr_elem, size,
+ inner_field, vmdesc_loop);
+ }
+
+ written_bytes = qemu_file_transferred(f) - old_offset;
+ vmsd_desc_field_end(vmsd, vmdesc_loop, inner_field,
+ written_bytes);
+
+ /* If we used a fake temporary field, free it now */
+ if (is_null) {
+ g_clear_pointer((gpointer *)&inner_field, g_free);
}
+
if (ret) {
error_setg(errp, "Save of field %s/%s failed",
vmsd->name, field->name);
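Continuing the hypothetical Parent example from earlier, the scan above splits a mixed array into homogeneous vmdesc groups rather than one compressed entry (exact JSON keys omitted; the grouping is what the max_elems computation produces):

/*
 * Parent.children = { c0, NULL, NULL, c3 } saves as three groups:
 *
 *   children[0]    -> 1 element,  child struct
 *   children[1..2] -> 2 elements, "nullptr" (1 placeholder byte each)
 *   children[3]    -> 1 element,  child struct
 */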
@@ -418,9 +519,6 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
return ret;
}
- written_bytes = qemu_file_transferred(f) - old_offset;
- vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i);
-
/* Compressed arrays only care about the first element */
if (vmdesc_loop && vmsd_can_compress(field)) {
vmdesc_loop = NULL;
@@ -441,12 +539,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
json_writer_end_array(vmdesc);
}
- ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc);
+ ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp);
if (vmsd->post_save) {
int ps_ret = vmsd->post_save(opaque);
- if (!ret) {
+ if (!ret && ps_ret) {
ret = ps_ret;
+ error_setg(errp, "post-save failed: %s", vmsd->name);
}
}
return ret;
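With this change a failing post_save hook is reported through errp as well as the return value. A minimal sketch of such a hook (the state and failure condition are hypothetical):

static int example_post_save(void *opaque)
{
    ExampleState *s = opaque;

    /* any non-zero return now also fills errp with
     * "post-save failed: <vmsd name>" in the caller */
    return s->fd < 0 ? -EIO : 0;
}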
@@ -518,7 +617,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
}
static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
- void *opaque, JSONWriter *vmdesc)
+ void *opaque, JSONWriter *vmdesc,
+ Error **errp)
{
const VMStateDescription * const *sub = vmsd->subsections;
bool vmdesc_has_subsections = false;
@@ -546,7 +646,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len);
qemu_put_be32(f, vmsdsub->version_id);
- ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc);
+ ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp);
if (ret) {
return ret;
}
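For reference, the subsections iterated here are ordinary VMSDs hung off the parent's .subsections array, each gated by a .needed callback; with errp now threaded through vmstate_save_state_with_err(), a failure inside a subsection surfaces to the top-level caller. A minimal sketch (names and the .needed condition are hypothetical):

static bool example_fd_needed(void *opaque)
{
    ExampleState *s = opaque;
    return s->fd >= 0;                    /* send only when an fd exists */
}

static const VMStateDescription vmstate_example_fd = {
    .name = "example/fd",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = example_fd_needed,
    .fields = (const VMStateField[]) {
        /* ... fd field as sketched earlier ... */
        VMSTATE_END_OF_LIST()
    },
};

/* wired into the parent VMSD */
static const VMStateDescription * const example_subsections[] = {
    &vmstate_example_fd,
    NULL,
};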