aboutsummaryrefslogtreecommitdiff
path: root/migration
diff options
context:
space:
mode:
Diffstat (limited to 'migration')
-rw-r--r--migration/block-dirty-bitmap.c3
-rw-r--r--migration/channel-block.c4
-rw-r--r--migration/colo.c20
-rw-r--r--migration/cpr-transfer.c7
-rw-r--r--migration/cpr.c88
-rw-r--r--migration/dirtyrate.c4
-rw-r--r--migration/file.c2
-rw-r--r--migration/migration-hmp-cmds.c289
-rw-r--r--migration/migration.c274
-rw-r--r--migration/migration.h14
-rw-r--r--migration/multifd-device-state.c10
-rw-r--r--migration/multifd-nocomp.c6
-rw-r--r--migration/multifd-qatzip.c2
-rw-r--r--migration/multifd-qpl.c2
-rw-r--r--migration/multifd-uadk.c2
-rw-r--r--migration/multifd-zero-page.c24
-rw-r--r--migration/multifd-zlib.c2
-rw-r--r--migration/multifd-zstd.c2
-rw-r--r--migration/multifd.c21
-rw-r--r--migration/multifd.h5
-rw-r--r--migration/options.c41
-rw-r--r--migration/options.h1
-rw-r--r--migration/postcopy-ram.c575
-rw-r--r--migration/postcopy-ram.h2
-rw-r--r--migration/qemu-file.c2
-rw-r--r--migration/ram.c238
-rw-r--r--migration/rdma.c195
-rw-r--r--migration/rdma.h5
-rw-r--r--migration/savevm.c128
-rw-r--r--migration/savevm.h1
-rw-r--r--migration/target.c8
-rw-r--r--migration/trace-events9
32 files changed, 1281 insertions, 705 deletions
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index f2c352d..a061aad 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -1248,8 +1248,7 @@ static bool dirty_bitmap_has_postcopy(void *opaque)
static SaveVMHandlers savevm_dirty_bitmap_handlers = {
.save_setup = dirty_bitmap_save_setup,
- .save_live_complete_postcopy = dirty_bitmap_save_complete,
- .save_live_complete_precopy = dirty_bitmap_save_complete,
+ .save_complete = dirty_bitmap_save_complete,
.has_postcopy = dirty_bitmap_has_postcopy,
.state_pending_exact = dirty_bitmap_state_pending,
.state_pending_estimate = dirty_bitmap_state_pending,
diff --git a/migration/channel-block.c b/migration/channel-block.c
index fff8d87..97de5a6 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -123,7 +123,7 @@ qio_channel_block_seek(QIOChannel *ioc,
bioc->offset = offset;
break;
case SEEK_CUR:
- bioc->offset += whence;
+ bioc->offset += offset;
break;
case SEEK_END:
error_setg(errp, "Size of VMstate region is unknown");
@@ -170,7 +170,7 @@ qio_channel_block_set_aio_fd_handler(QIOChannel *ioc,
static void
qio_channel_block_class_init(ObjectClass *klass,
- void *class_data G_GNUC_UNUSED)
+ const void *class_data G_GNUC_UNUSED)
{
QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
diff --git a/migration/colo.c b/migration/colo.c
index c976b3f..e0f713c 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void)
return;
}
/* Notify COLO incoming thread that failover work is finished */
- qemu_sem_post(&mis->colo_incoming_sem);
+ qemu_event_set(&mis->colo_incoming_event);
/* For Secondary VM, jump to incoming co */
if (mis->colo_incoming_co) {
@@ -195,7 +195,7 @@ static void primary_vm_do_failover(void)
}
/* Notify COLO thread that failover work is finished */
- qemu_sem_post(&s->colo_exit_sem);
+ qemu_event_set(&s->colo_exit_event);
}
COLOMode get_colo_mode(void)
@@ -620,8 +620,8 @@ out:
}
/* Hope this not to be too long to wait here */
- qemu_sem_wait(&s->colo_exit_sem);
- qemu_sem_destroy(&s->colo_exit_sem);
+ qemu_event_wait(&s->colo_exit_event);
+ qemu_event_destroy(&s->colo_exit_event);
/*
* It is safe to unregister notifier after failover finished.
@@ -651,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s)
s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST,
colo_checkpoint_notify_timer, NULL);
- qemu_sem_init(&s->colo_exit_sem, 0);
+ qemu_event_init(&s->colo_exit_event, false);
colo_process_checkpoint(s);
bql_lock();
}
@@ -808,11 +808,11 @@ void colo_shutdown(void)
case COLO_MODE_PRIMARY:
s = migrate_get_current();
qemu_event_set(&s->colo_checkpoint_event);
- qemu_sem_post(&s->colo_exit_sem);
+ qemu_event_set(&s->colo_exit_event);
break;
case COLO_MODE_SECONDARY:
mis = migration_incoming_get_current();
- qemu_sem_post(&mis->colo_incoming_sem);
+ qemu_event_set(&mis->colo_incoming_event);
break;
default:
break;
@@ -827,7 +827,7 @@ static void *colo_process_incoming_thread(void *opaque)
Error *local_err = NULL;
rcu_register_thread();
- qemu_sem_init(&mis->colo_incoming_sem, 0);
+ qemu_event_init(&mis->colo_incoming_event, false);
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_COLO);
@@ -923,8 +923,8 @@ out:
}
/* Hope this not to be too long to loop here */
- qemu_sem_wait(&mis->colo_incoming_sem);
- qemu_sem_destroy(&mis->colo_incoming_sem);
+ qemu_event_wait(&mis->colo_incoming_event);
+ qemu_event_destroy(&mis->colo_incoming_event);
rcu_unregister_thread();
return NULL;
diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
index e1f1403..00371d1 100644
--- a/migration/cpr-transfer.c
+++ b/migration/cpr-transfer.c
@@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
MigrationAddress *addr = channel->addr;
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
- addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
+ (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
g_autoptr(QIOChannelSocket) sioc = NULL;
SocketAddress *saddr = &addr->u.socket;
@@ -60,7 +61,9 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
sioc = qio_net_listener_wait_client(listener);
ioc = QIO_CHANNEL(sioc);
- trace_cpr_transfer_input(addr->u.socket.u.q_unix.path);
+ trace_cpr_transfer_input(
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ?
+ addr->u.socket.u.q_unix.path : addr->u.socket.u.fd.str);
qio_channel_set_name(ioc, "cpr-in");
return qemu_file_new_input(ioc);
diff --git a/migration/cpr.c b/migration/cpr.c
index 42c4656..42ad0b0 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -7,25 +7,21 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
+#include "hw/vfio/vfio-device.h"
#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration/options.h"
#include "migration/qemu-file.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
+#include "monitor/monitor.h"
#include "system/runstate.h"
#include "trace.h"
/*************************************************************************/
/* cpr state container for all information to be saved. */
-typedef QLIST_HEAD(CprFdList, CprFd) CprFdList;
-
-typedef struct CprState {
- CprFdList fds;
-} CprState;
-
-static CprState cpr_state;
+CprState cpr_state;
/****************************************************************************/
@@ -95,9 +91,37 @@ int cpr_find_fd(const char *name, int id)
trace_cpr_find_fd(name, id, fd);
return fd;
}
-/*************************************************************************/
-#define CPR_STATE "CprState"
+void cpr_resave_fd(const char *name, int id, int fd)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+ int old_fd = elem ? elem->fd : -1;
+
+ if (old_fd < 0) {
+ cpr_save_fd(name, id, fd);
+ } else if (old_fd != fd) {
+ error_setg(&error_fatal,
+ "internal error: cpr fd '%s' id %d value %d "
+ "already saved with a different value %d",
+ name, id, fd, old_fd);
+ }
+}
+
+int cpr_open_fd(const char *path, int flags, const char *name, int id,
+ Error **errp)
+{
+ int fd = cpr_find_fd(name, id);
+
+ if (fd < 0) {
+ fd = qemu_open(path, flags, errp);
+ if (fd >= 0) {
+ cpr_save_fd(name, id, fd);
+ }
+ }
+ return fd;
+}
+
+/*************************************************************************/
static const VMStateDescription vmstate_cpr_state = {
.name = CPR_STATE,
.version_id = 1,
@@ -105,6 +129,10 @@ static const VMStateDescription vmstate_cpr_state = {
.fields = (VMStateField[]) {
VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next),
VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * const []) {
+ &vmstate_cpr_vfio_devices,
+ NULL
}
};
/*************************************************************************/
@@ -228,3 +256,45 @@ void cpr_state_close(void)
cpr_state_file = NULL;
}
}
+
+bool cpr_incoming_needed(void *opaque)
+{
+ MigMode mode = migrate_mode();
+ return mode == MIG_MODE_CPR_TRANSFER;
+}
+
+/*
+ * cpr_get_fd_param: find a descriptor and return its value.
+ *
+ * @name: CPR name for the descriptor
+ * @fdname: An integer-valued string, or a name passed to a getfd command
+ * @index: CPR index of the descriptor
+ * @errp: returned error message
+ *
+ * If CPR is not being performed, then use @fdname to find the fd.
+ * If CPR is being performed, then ignore @fdname, and look for @name
+ * and @index in CPR state.
+ *
+ * On success returns the fd value, else returns -1.
+ */
+int cpr_get_fd_param(const char *name, const char *fdname, int index,
+ Error **errp)
+{
+ ERRP_GUARD();
+ int fd;
+
+ if (cpr_is_incoming()) {
+ fd = cpr_find_fd(name, index);
+ if (fd < 0) {
+ error_setg(errp, "cannot find saved value for fd %s", fdname);
+ }
+ } else {
+ fd = monitor_fd_param(monitor_cur(), fdname, errp);
+ if (fd >= 0) {
+ cpr_save_fd(name, index, fd);
+ } else {
+ error_prepend(errp, "Could not parse object fd %s:", fdname);
+ }
+ }
+ return fd;
+}
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 4cd1477..986624c 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -14,7 +14,7 @@
#include "qemu/error-report.h"
#include "hw/core/cpu.h"
#include "qapi/error.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
@@ -27,7 +27,7 @@
#include "qobject/qdict.h"
#include "system/kvm.h"
#include "system/runstate.h"
-#include "exec/memory.h"
+#include "system/memory.h"
#include "qemu/xxhash.h"
#include "migration.h"
diff --git a/migration/file.c b/migration/file.c
index 7f11e26..bb8031e 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -6,7 +6,7 @@
*/
#include "qemu/osdep.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 49c26da..cef5608 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -37,29 +37,110 @@ static void migration_global_dump(Monitor *mon)
{
MigrationState *ms = migrate_get_current();
- monitor_printf(mon, "globals:\n");
- monitor_printf(mon, "store-global-state: %s\n",
+ monitor_printf(mon, "Globals:\n");
+ monitor_printf(mon, " store-global-state: %s\n",
ms->store_global_state ? "on" : "off");
- monitor_printf(mon, "only-migratable: %s\n",
+ monitor_printf(mon, " only-migratable: %s\n",
only_migratable ? "on" : "off");
- monitor_printf(mon, "send-configuration: %s\n",
+ monitor_printf(mon, " send-configuration: %s\n",
ms->send_configuration ? "on" : "off");
- monitor_printf(mon, "send-section-footer: %s\n",
+ monitor_printf(mon, " send-section-footer: %s\n",
ms->send_section_footer ? "on" : "off");
- monitor_printf(mon, "send-switchover-start: %s\n",
+ monitor_printf(mon, " send-switchover-start: %s\n",
ms->send_switchover_start ? "on" : "off");
- monitor_printf(mon, "clear-bitmap-shift: %u\n",
+ monitor_printf(mon, " clear-bitmap-shift: %u\n",
ms->clear_bitmap_shift);
}
+static const gchar *format_time_str(uint64_t us)
+{
+ const char *units[] = {"us", "ms", "sec"};
+ int index = 0;
+
+ while (us > 1000) {
+ us /= 1000;
+ if (++index >= (sizeof(units) - 1)) {
+ break;
+ }
+ }
+
+ return g_strdup_printf("%"PRIu64" %s", us, units[index]);
+}
+
+static void migration_dump_blocktime(Monitor *mon, MigrationInfo *info)
+{
+ if (info->has_postcopy_blocktime) {
+ monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n",
+ info->postcopy_blocktime);
+ }
+
+ if (info->has_postcopy_vcpu_blocktime) {
+ uint32List *item = info->postcopy_vcpu_blocktime;
+ const char *sep = "";
+ int count = 0;
+
+ monitor_printf(mon, "Postcopy vCPU Blocktime (ms):\n [");
+
+ while (item) {
+ monitor_printf(mon, "%s%"PRIu32, sep, item->value);
+ item = item->next;
+ /* Each line 10 vcpu results, newline if there's more */
+ sep = ((++count % 10 == 0) && item) ? ",\n " : ", ";
+ }
+ monitor_printf(mon, "]\n");
+ }
+
+ if (info->has_postcopy_latency) {
+ monitor_printf(mon, "Postcopy Latency (ns): %" PRIu64 "\n",
+ info->postcopy_latency);
+ }
+
+ if (info->has_postcopy_non_vcpu_latency) {
+ monitor_printf(mon, "Postcopy non-vCPU Latencies (ns): %" PRIu64 "\n",
+ info->postcopy_non_vcpu_latency);
+ }
+
+ if (info->has_postcopy_vcpu_latency) {
+ uint64List *item = info->postcopy_vcpu_latency;
+ const char *sep = "";
+ int count = 0;
+
+ monitor_printf(mon, "Postcopy vCPU Latencies (ns):\n [");
+
+ while (item) {
+ monitor_printf(mon, "%s%"PRIu64, sep, item->value);
+ item = item->next;
+ /* Each line 10 vcpu results, newline if there's more */
+ sep = ((++count % 10 == 0) && item) ? ",\n " : ", ";
+ }
+ monitor_printf(mon, "]\n");
+ }
+
+ if (info->has_postcopy_latency_dist) {
+ uint64List *item = info->postcopy_latency_dist;
+ int count = 0;
+
+ monitor_printf(mon, "Postcopy Latency Distribution:\n");
+
+ while (item) {
+ g_autofree const gchar *from = format_time_str(1UL << count);
+ g_autofree const gchar *to = format_time_str(1UL << (count + 1));
+
+ monitor_printf(mon, " [ %8s - %8s ]: %10"PRIu64"\n",
+ from, to, item->value);
+ item = item->next;
+ count++;
+ }
+ }
+}
+
void hmp_info_migrate(Monitor *mon, const QDict *qdict)
{
+ bool show_all = qdict_get_try_bool(qdict, "all", false);
MigrationInfo *info;
info = qmp_query_migrate(NULL);
- migration_global_dump(mon);
-
if (info->blocked_reasons) {
strList *reasons = info->blocked_reasons;
monitor_printf(mon, "Outgoing migration blocked:\n");
@@ -70,7 +151,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
}
if (info->has_status) {
- monitor_printf(mon, "Migration status: %s",
+ monitor_printf(mon, "Status: \t\t%s",
MigrationStatus_str(info->status));
if (info->status == MIGRATION_STATUS_FAILED && info->error_desc) {
monitor_printf(mon, " (%s)\n", info->error_desc);
@@ -78,139 +159,133 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "\n");
}
- monitor_printf(mon, "total time: %" PRIu64 " ms\n",
- info->total_time);
- if (info->has_expected_downtime) {
- monitor_printf(mon, "expected downtime: %" PRIu64 " ms\n",
- info->expected_downtime);
- }
- if (info->has_downtime) {
- monitor_printf(mon, "downtime: %" PRIu64 " ms\n",
- info->downtime);
+ if (info->total_time) {
+ monitor_printf(mon, "Time (ms): \t\ttotal=%" PRIu64,
+ info->total_time);
+ if (info->has_setup_time) {
+ monitor_printf(mon, ", setup=%" PRIu64,
+ info->setup_time);
+ }
+ if (info->has_expected_downtime) {
+ monitor_printf(mon, ", exp_down=%" PRIu64,
+ info->expected_downtime);
+ }
+ if (info->has_downtime) {
+ monitor_printf(mon, ", down=%" PRIu64,
+ info->downtime);
+ }
+ monitor_printf(mon, "\n");
}
- if (info->has_setup_time) {
- monitor_printf(mon, "setup: %" PRIu64 " ms\n",
- info->setup_time);
+ }
+
+ if (info->has_socket_address) {
+ SocketAddressList *addr;
+
+ monitor_printf(mon, "Sockets: [\n");
+
+ for (addr = info->socket_address; addr; addr = addr->next) {
+ char *s = socket_uri(addr->value);
+ monitor_printf(mon, "\t%s\n", s);
+ g_free(s);
}
+ monitor_printf(mon, "]\n");
}
if (info->ram) {
- monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n",
- info->ram->transferred >> 10);
- monitor_printf(mon, "throughput: %0.2f mbps\n",
+ g_autofree char *str_psize = size_to_str(info->ram->page_size);
+ g_autofree char *str_total = size_to_str(info->ram->total);
+ g_autofree char *str_transferred = size_to_str(info->ram->transferred);
+ g_autofree char *str_remaining = size_to_str(info->ram->remaining);
+ g_autofree char *str_precopy = size_to_str(info->ram->precopy_bytes);
+ g_autofree char *str_multifd = size_to_str(info->ram->multifd_bytes);
+ g_autofree char *str_postcopy = size_to_str(info->ram->postcopy_bytes);
+
+ monitor_printf(mon, "RAM info:\n");
+ monitor_printf(mon, " Throughput (Mbps): \t%0.2f\n",
info->ram->mbps);
- monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n",
- info->ram->remaining >> 10);
- monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n",
- info->ram->total >> 10);
- monitor_printf(mon, "duplicate: %" PRIu64 " pages\n",
- info->ram->duplicate);
- monitor_printf(mon, "normal: %" PRIu64 " pages\n",
- info->ram->normal);
- monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n",
- info->ram->normal_bytes >> 10);
- monitor_printf(mon, "dirty sync count: %" PRIu64 "\n",
- info->ram->dirty_sync_count);
- monitor_printf(mon, "page size: %" PRIu64 " kbytes\n",
- info->ram->page_size >> 10);
- monitor_printf(mon, "multifd bytes: %" PRIu64 " kbytes\n",
- info->ram->multifd_bytes >> 10);
- monitor_printf(mon, "pages-per-second: %" PRIu64 "\n",
- info->ram->pages_per_second);
+ monitor_printf(mon, " Sizes: \t\tpagesize=%s, total=%s\n",
+ str_psize, str_total);
+ monitor_printf(mon, " Transfers: \t\ttransferred=%s, remain=%s\n",
+ str_transferred, str_remaining);
+ monitor_printf(mon, " Channels: \t\tprecopy=%s, "
+ "multifd=%s, postcopy=%s",
+ str_precopy, str_multifd, str_postcopy);
+
+ if (info->vfio) {
+ g_autofree char *str_vfio = size_to_str(info->vfio->transferred);
+
+ monitor_printf(mon, ", vfio=%s", str_vfio);
+ }
+ monitor_printf(mon, "\n");
+ monitor_printf(mon, " Page Types: \tnormal=%" PRIu64
+ ", zero=%" PRIu64 "\n",
+ info->ram->normal, info->ram->duplicate);
+ monitor_printf(mon, " Page Rates (pps): \ttransfer=%" PRIu64,
+ info->ram->pages_per_second);
if (info->ram->dirty_pages_rate) {
- monitor_printf(mon, "dirty pages rate: %" PRIu64 " pages\n",
+ monitor_printf(mon, ", dirty=%" PRIu64,
info->ram->dirty_pages_rate);
}
+ monitor_printf(mon, "\n");
+
+ monitor_printf(mon, " Others: \t\tdirty_syncs=%" PRIu64,
+ info->ram->dirty_sync_count);
if (info->ram->postcopy_requests) {
- monitor_printf(mon, "postcopy request count: %" PRIu64 "\n",
+ monitor_printf(mon, ", postcopy_req=%" PRIu64,
info->ram->postcopy_requests);
}
- if (info->ram->precopy_bytes) {
- monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n",
- info->ram->precopy_bytes >> 10);
- }
if (info->ram->downtime_bytes) {
- monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n",
- info->ram->downtime_bytes >> 10);
- }
- if (info->ram->postcopy_bytes) {
- monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n",
- info->ram->postcopy_bytes >> 10);
+ monitor_printf(mon, ", downtime_bytes=%" PRIu64,
+ info->ram->downtime_bytes);
}
if (info->ram->dirty_sync_missed_zero_copy) {
- monitor_printf(mon,
- "Zero-copy-send fallbacks happened: %" PRIu64 " times\n",
+ monitor_printf(mon, ", zerocopy_fallbacks=%" PRIu64,
info->ram->dirty_sync_missed_zero_copy);
}
+ monitor_printf(mon, "\n");
}
+ if (!show_all) {
+ goto out;
+ }
+
+ migration_global_dump(mon);
+
if (info->xbzrle_cache) {
- monitor_printf(mon, "cache size: %" PRIu64 " bytes\n",
- info->xbzrle_cache->cache_size);
- monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n",
- info->xbzrle_cache->bytes >> 10);
- monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n",
- info->xbzrle_cache->pages);
- monitor_printf(mon, "xbzrle cache miss: %" PRIu64 " pages\n",
- info->xbzrle_cache->cache_miss);
- monitor_printf(mon, "xbzrle cache miss rate: %0.2f\n",
- info->xbzrle_cache->cache_miss_rate);
- monitor_printf(mon, "xbzrle encoding rate: %0.2f\n",
- info->xbzrle_cache->encoding_rate);
- monitor_printf(mon, "xbzrle overflow: %" PRIu64 "\n",
+ monitor_printf(mon, "XBZRLE: size=%" PRIu64
+ ", transferred=%" PRIu64
+ ", pages=%" PRIu64
+ ", miss=%" PRIu64 "\n"
+ " miss_rate=%0.2f"
+ ", encode_rate=%0.2f"
+ ", overflow=%" PRIu64 "\n",
+ info->xbzrle_cache->cache_size,
+ info->xbzrle_cache->bytes,
+ info->xbzrle_cache->pages,
+ info->xbzrle_cache->cache_miss,
+ info->xbzrle_cache->cache_miss_rate,
+ info->xbzrle_cache->encoding_rate,
info->xbzrle_cache->overflow);
}
if (info->has_cpu_throttle_percentage) {
- monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n",
+ monitor_printf(mon, "CPU Throttle (%%): %" PRIu64 "\n",
info->cpu_throttle_percentage);
}
if (info->has_dirty_limit_throttle_time_per_round) {
- monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n",
+ monitor_printf(mon, "Dirty-limit Throttle (us): %" PRIu64 "\n",
info->dirty_limit_throttle_time_per_round);
}
if (info->has_dirty_limit_ring_full_time) {
- monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n",
+ monitor_printf(mon, "Dirty-limit Ring Full (us): %" PRIu64 "\n",
info->dirty_limit_ring_full_time);
}
- if (info->has_postcopy_blocktime) {
- monitor_printf(mon, "postcopy blocktime: %u\n",
- info->postcopy_blocktime);
- }
-
- if (info->has_postcopy_vcpu_blocktime) {
- Visitor *v;
- char *str;
- v = string_output_visitor_new(false, &str);
- visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime,
- &error_abort);
- visit_complete(v, &str);
- monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str);
- g_free(str);
- visit_free(v);
- }
- if (info->has_socket_address) {
- SocketAddressList *addr;
-
- monitor_printf(mon, "socket address: [\n");
-
- for (addr = info->socket_address; addr; addr = addr->next) {
- char *s = socket_uri(addr->value);
- monitor_printf(mon, "\t%s\n", s);
- g_free(s);
- }
- monitor_printf(mon, "]\n");
- }
-
- if (info->vfio) {
- monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n",
- info->vfio->transferred >> 10);
- }
-
+ migration_dump_blocktime(mon, info);
+out:
qapi_free_MigrationInfo(info);
}
diff --git a/migration/migration.c b/migration/migration.c
index d46e776..10c216d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -95,6 +95,9 @@ enum mig_rp_message_type {
MIG_RP_MSG_MAX
};
+/* Migration channel types */
+enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY };
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
@@ -259,6 +262,24 @@ migration_channels_and_transport_compatible(MigrationAddress *addr,
return true;
}
+static bool
+migration_capabilities_and_transport_compatible(MigrationAddress *addr,
+ Error **errp)
+{
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
+ return migrate_rdma_caps_check(migrate_get_current()->capabilities,
+ errp);
+ }
+
+ return true;
+}
+
+static bool migration_transport_compatible(MigrationAddress *addr, Error **errp)
+{
+ return migration_channels_and_transport_compatible(addr, errp) &&
+ migration_capabilities_and_transport_compatible(addr, errp);
+}
+
static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
@@ -555,22 +576,27 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
}
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
- RAMBlock *rb, ram_addr_t start, uint64_t haddr)
+ RAMBlock *rb, ram_addr_t start, uint64_t haddr,
+ uint32_t tid)
{
void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
bool received = false;
WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
received = ramblock_recv_bitmap_test_byte_offset(rb, start);
- if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
- /*
- * The page has not been received, and it's not yet in the page
- * request list. Queue it. Set the value of element to 1, so that
- * things like g_tree_lookup() will return TRUE (1) when found.
- */
- g_tree_insert(mis->page_requested, aligned, (gpointer)1);
- qatomic_inc(&mis->page_requested_count);
- trace_postcopy_page_req_add(aligned, mis->page_requested_count);
+ if (!received) {
+ if (!g_tree_lookup(mis->page_requested, aligned)) {
+ /*
+ * The page has not been received, and it's not yet in the
+ * page request list. Queue it. Set the value of element
+ * to 1, so that things like g_tree_lookup() will return
+ * TRUE (1) when found.
+ */
+ g_tree_insert(mis->page_requested, aligned, (gpointer)1);
+ qatomic_inc(&mis->page_requested_count);
+ trace_postcopy_page_req_add(aligned, mis->page_requested_count);
+ }
+ mark_postcopy_blocktime_begin(haddr, tid, rb);
}
}
@@ -750,7 +776,7 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
}
/* transport mechanism not suitable for migration? */
- if (!migration_channels_and_transport_compatible(addr, errp)) {
+ if (!migration_transport_compatible(addr, errp)) {
return;
}
@@ -769,14 +795,6 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
}
#ifdef CONFIG_RDMA
} else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
- if (migrate_xbzrle()) {
- error_setg(errp, "RDMA and XBZRLE can't be used together");
- return;
- }
- if (migrate_multifd()) {
- error_setg(errp, "RDMA and multifd can't be used together");
- return;
- }
rdma_start_incoming_migration(&addr->u.rdma, errp);
#endif
} else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
@@ -931,9 +949,8 @@ static void migration_incoming_setup(QEMUFile *f)
{
MigrationIncomingState *mis = migration_incoming_get_current();
- if (!mis->from_src_file) {
- mis->from_src_file = f;
- }
+ assert(!mis->from_src_file);
+ mis->from_src_file = f;
qemu_file_set_blocking(f, false);
}
@@ -985,28 +1002,19 @@ void migration_fd_process_incoming(QEMUFile *f)
migration_incoming_process();
}
-/*
- * Returns true when we want to start a new incoming migration process,
- * false otherwise.
- */
-static bool migration_should_start_incoming(bool main_channel)
+static bool migration_has_main_and_multifd_channels(void)
{
- /* Multifd doesn't start unless all channels are established */
- if (migrate_multifd()) {
- return migration_has_all_channels();
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ if (!mis->from_src_file) {
+ /* main channel not established */
+ return false;
}
- /* Preempt channel only starts when the main channel is created */
- if (migrate_postcopy_preempt()) {
- return main_channel;
+ if (migrate_multifd() && !multifd_recv_all_channels_created()) {
+ return false;
}
- /*
- * For all the rest types of migration, we should only reach here when
- * it's the main channel that's being created, and we should always
- * proceed with this channel.
- */
- assert(main_channel);
+ /* main and all multifd channels are established */
return true;
}
@@ -1015,59 +1023,81 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
QEMUFile *f;
- bool default_channel = true;
+ uint8_t channel;
uint32_t channel_magic = 0;
int ret = 0;
- if (migrate_multifd() && !migrate_mapped_ram() &&
- !migrate_postcopy_ram() &&
- qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
- /*
- * With multiple channels, it is possible that we receive channels
- * out of order on destination side, causing incorrect mapping of
- * source channels on destination side. Check channel MAGIC to
- * decide type of channel. Please note this is best effort, postcopy
- * preempt channel does not send any magic number so avoid it for
- * postcopy live migration. Also tls live migration already does
- * tls handshake while initializing main channel so with tls this
- * issue is not possible.
- */
- ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
- sizeof(channel_magic), errp);
+ if (!migration_has_main_and_multifd_channels()) {
+ if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+ /*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort,
+ * postcopy preempt channel does not send any magic number so
+ * avoid it for postcopy live migration. Also tls live migration
+ * already does tls handshake while initializing main channel so
+ * with tls this issue is not possible.
+ */
+ ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
+ sizeof(channel_magic), errp);
+ if (ret != 0) {
+ return;
+ }
- if (ret != 0) {
+ channel_magic = be32_to_cpu(channel_magic);
+ if (channel_magic == QEMU_VM_FILE_MAGIC) {
+ channel = CH_MAIN;
+ } else if (channel_magic == MULTIFD_MAGIC) {
+ assert(migrate_multifd());
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file &&
+ mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ /* reconnect main channel for postcopy recovery */
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "unknown channel magic: %u", channel_magic);
+ return;
+ }
+ } else if (mis->from_src_file && migrate_multifd()) {
+ /*
+ * Non-peekable channels like tls/file are processed as
+ * multifd channels when multifd is enabled.
+ */
+ channel = CH_MULTIFD;
+ } else if (!mis->from_src_file) {
+ channel = CH_MAIN;
+ } else {
+ error_setg(errp, "non-peekable channel used without multifd");
return;
}
-
- default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
} else {
- default_channel = !mis->from_src_file;
+ assert(migrate_postcopy_preempt());
+ channel = CH_POSTCOPY;
}
if (multifd_recv_setup(errp) != 0) {
return;
}
- if (default_channel) {
+ if (channel == CH_MAIN) {
f = qemu_file_new_input(ioc);
migration_incoming_setup(f);
- } else {
+ } else if (channel == CH_MULTIFD) {
/* Multiple connections */
- assert(migration_needs_multiple_sockets());
- if (migrate_multifd()) {
- multifd_recv_new_channel(ioc, &local_err);
- } else {
- assert(migrate_postcopy_preempt());
- f = qemu_file_new_input(ioc);
- postcopy_preempt_new_channel(mis, f);
- }
+ multifd_recv_new_channel(ioc, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
+ } else if (channel == CH_POSTCOPY) {
+ assert(!mis->postcopy_qemufile_dst);
+ f = qemu_file_new_input(ioc);
+ postcopy_preempt_new_channel(mis, f);
+ return;
}
- if (migration_should_start_incoming(default_channel)) {
+ if (migration_has_main_and_multifd_channels()) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@@ -1084,18 +1114,13 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
*/
bool migration_has_all_channels(void)
{
- MigrationIncomingState *mis = migration_incoming_get_current();
-
- if (!mis->from_src_file) {
+ if (!migration_has_main_and_multifd_channels()) {
return false;
}
- if (migrate_multifd()) {
- return multifd_recv_all_channels_created();
- }
-
- if (migrate_postcopy_preempt()) {
- return mis->postcopy_qemufile_dst != NULL;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) {
+ return false;
}
return true;
@@ -1610,7 +1635,7 @@ void migration_cancel(void)
}
/* If the migration is paused, kick it out of the pause */
if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
- qemu_sem_post(&s->pause_sem);
+ qemu_event_set(&s->pause_event);
}
migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
} while (s->state != MIGRATION_STATUS_CANCELLING);
@@ -2208,7 +2233,7 @@ void qmp_migrate(const char *uri, bool has_channels,
}
/* transport mechanism not suitable for migration? */
- if (!migration_channels_and_transport_compatible(addr, errp)) {
+ if (!migration_transport_compatible(addr, errp)) {
return;
}
@@ -2322,7 +2347,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp)
MigrationStatus_str(s->state));
return;
}
- qemu_sem_post(&s->pause_sem);
+ qemu_event_set(&s->pause_event);
}
int migration_rp_wait(MigrationState *s)
@@ -2707,6 +2732,10 @@ static int postcopy_start(MigrationState *ms, Error **errp)
}
}
+ if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) {
+ return -1;
+ }
+
trace_postcopy_start();
bql_lock();
trace_postcopy_start_set_run();
@@ -2887,21 +2916,18 @@ static bool migration_switchover_prepare(MigrationState *s)
return true;
}
- /* Since leaving this state is not atomic with posting the semaphore
+ /*
+ * Since leaving this state is not atomic with setting the event
* it's possible that someone could have issued multiple migrate_continue
- * and the semaphore is incorrectly positive at this point;
- * the docs say it's undefined to reinit a semaphore that's already
- * init'd, so use timedwait to eat up any existing posts.
+ * and the event is incorrectly set at this point so reset it.
*/
- while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
- /* This block intentionally left blank */
- }
+ qemu_event_reset(&s->pause_event);
/* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */
migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER);
bql_unlock();
- qemu_sem_wait(&s->pause_sem);
+ qemu_event_wait(&s->pause_event);
bql_lock();
/*
@@ -3415,33 +3441,60 @@ static MigIterateState migration_iteration_run(MigrationState *s)
Error *local_err = NULL;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
bool can_switchover = migration_can_switchover(s);
+ bool complete_ready;
+ /* Fast path - get the estimated amount of pending data */
qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
pending_size = must_precopy + can_postcopy;
trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
- if (pending_size < s->threshold_size) {
- qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
- pending_size = must_precopy + can_postcopy;
- trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
+ if (in_postcopy) {
+ /*
+ * Iterate in postcopy until all pending data flushed. Note that
+ * postcopy completion doesn't rely on can_switchover, because when
+ * POSTCOPY_ACTIVE it means switchover already happened.
+ */
+ complete_ready = !pending_size;
+ } else {
+ /*
+ * Exact pending reporting is only needed for precopy. Taking RAM
+ * as example, there'll be no extra dirty information after
+ * postcopy started, so ESTIMATE should always match with EXACT
+ * during postcopy phase.
+ */
+ if (pending_size < s->threshold_size) {
+ qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
+ pending_size = must_precopy + can_postcopy;
+ trace_migrate_pending_exact(pending_size, must_precopy,
+ can_postcopy);
+ }
+
+ /* Should we switch to postcopy now? */
+ if (must_precopy <= s->threshold_size &&
+ can_switchover && qatomic_read(&s->start_postcopy)) {
+ if (postcopy_start(s, &local_err)) {
+ migrate_set_error(s, local_err);
+ error_report_err(local_err);
+ }
+ return MIG_ITERATE_SKIP;
+ }
+
+ /*
+ * For precopy, migration can complete only if:
+ *
+ * (1) Switchover is acknowledged by destination
+ * (2) Pending size is no more than the threshold specified
+ * (which was calculated from expected downtime)
+ */
+ complete_ready = can_switchover && (pending_size <= s->threshold_size);
}
- if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
+ if (complete_ready) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
- /* Still a significant amount to transfer */
- if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
- qatomic_read(&s->start_postcopy)) {
- if (postcopy_start(s, &local_err)) {
- migrate_set_error(s, local_err);
- error_report_err(local_err);
- }
- return MIG_ITERATE_SKIP;
- }
-
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
return MIG_ITERATE_RESUME;
@@ -3866,9 +3919,8 @@ static void *bg_migration_thread(void *opaque)
while (migration_is_active()) {
MigIterateState iter_state = bg_migration_iteration_run(s);
- if (iter_state == MIG_ITERATE_SKIP) {
- continue;
- } else if (iter_state == MIG_ITERATE_BREAK) {
+
+ if (iter_state == MIG_ITERATE_BREAK) {
break;
}
@@ -4016,7 +4068,7 @@ fail:
migration_cleanup(s);
}
-static void migration_class_init(ObjectClass *klass, void *data)
+static void migration_class_init(ObjectClass *klass, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -4033,7 +4085,7 @@ static void migration_instance_finalize(Object *obj)
qemu_mutex_destroy(&ms->qemu_file_lock);
qemu_sem_destroy(&ms->wait_unplug_sem);
qemu_sem_destroy(&ms->rate_limit_sem);
- qemu_sem_destroy(&ms->pause_sem);
+ qemu_event_destroy(&ms->pause_event);
qemu_sem_destroy(&ms->postcopy_pause_sem);
qemu_sem_destroy(&ms->rp_state.rp_sem);
qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
@@ -4048,7 +4100,7 @@ static void migration_instance_init(Object *obj)
ms->state = MIGRATION_STATUS_NONE;
ms->mbps = -1;
ms->pages_per_second = -1;
- qemu_sem_init(&ms->pause_sem, 0);
+ qemu_event_init(&ms->pause_event, false);
qemu_mutex_init(&ms->error_mutex);
migrate_params_init(&ms->parameters);
diff --git a/migration/migration.h b/migration/migration.h
index d53f7ca..01329bf 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -98,9 +98,9 @@ struct MigrationIncomingState {
void (*transport_cleanup)(void *data);
/*
* Used to sync thread creations. Note that we can't create threads in
- * parallel with this sem.
+ * parallel with this event.
*/
- QemuSemaphore thread_sync_sem;
+ QemuEvent thread_sync_event;
/*
* Free at the start of the main state load, set as the main thread finishes
* loading state.
@@ -186,7 +186,7 @@ struct MigrationIncomingState {
/* The coroutine we should enter (back) after failover */
Coroutine *colo_incoming_co;
- QemuSemaphore colo_incoming_sem;
+ QemuEvent colo_incoming_event;
/* Optional load threads pool and its thread exit request flag */
ThreadPool *load_threads;
@@ -379,10 +379,10 @@ struct MigrationState {
QemuSemaphore wait_unplug_sem;
/* Migration is paused due to pause-before-switchover */
- QemuSemaphore pause_sem;
+ QemuEvent pause_event;
- /* The semaphore is used to notify COLO thread that failover is finished */
- QemuSemaphore colo_exit_sem;
+ /* The event is used to notify COLO thread that failover is finished */
+ QemuEvent colo_exit_event;
/* The event is used to notify COLO thread to do checkpoint */
QemuEvent colo_checkpoint_event;
@@ -546,7 +546,7 @@ void migrate_send_rp_shut(MigrationIncomingState *mis,
void migrate_send_rp_pong(MigrationIncomingState *mis,
uint32_t value);
int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb,
- ram_addr_t start, uint64_t haddr);
+ ram_addr_t start, uint64_t haddr, uint32_t tid);
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
RAMBlock *rb, ram_addr_t start);
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
index 94222d0..fce64f0 100644
--- a/migration/multifd-device-state.c
+++ b/migration/multifd-device-state.c
@@ -131,7 +131,7 @@ bool multifd_device_state_supported(void)
static void multifd_device_state_save_thread_data_free(void *opaque)
{
- SaveLiveCompletePrecopyThreadData *data = opaque;
+ SaveCompletePrecopyThreadData *data = opaque;
g_clear_pointer(&data->idstr, g_free);
g_free(data);
@@ -139,7 +139,7 @@ static void multifd_device_state_save_thread_data_free(void *opaque)
static int multifd_device_state_save_thread(void *opaque)
{
- SaveLiveCompletePrecopyThreadData *data = opaque;
+ SaveCompletePrecopyThreadData *data = opaque;
g_autoptr(Error) local_err = NULL;
if (!data->hdlr(data, &local_err)) {
@@ -170,18 +170,18 @@ bool multifd_device_state_save_thread_should_exit(void)
}
void
-multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr,
+multifd_spawn_device_state_save_thread(SaveCompletePrecopyThreadHandler hdlr,
char *idstr, uint32_t instance_id,
void *opaque)
{
- SaveLiveCompletePrecopyThreadData *data;
+ SaveCompletePrecopyThreadData *data;
assert(multifd_device_state_supported());
assert(multifd_send_device_state);
assert(!qatomic_read(&multifd_send_device_state->threads_abort));
- data = g_new(SaveLiveCompletePrecopyThreadData, 1);
+ data = g_new(SaveCompletePrecopyThreadData, 1);
data->hdlr = hdlr;
data->idstr = g_strdup(idstr);
data->instance_id = instance_id;
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
index ffe7525..b48eae3 100644
--- a/migration/multifd-nocomp.c
+++ b/migration/multifd-nocomp.c
@@ -11,12 +11,13 @@
*/
#include "qemu/osdep.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "file.h"
#include "migration-stats.h"
#include "multifd.h"
#include "options.h"
+#include "migration.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
@@ -82,7 +83,6 @@ static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
{
g_free(p->iov);
p->iov = NULL;
- return;
}
static void multifd_ram_prepare_header(MultiFDSendParams *p)
@@ -399,7 +399,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f)
MultiFDSyncReq req;
int ret;
- if (!migrate_multifd()) {
+ if (!migrate_multifd() || migration_in_postcopy()) {
return 0;
}
diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c
index 6a0e989..7419e5d 100644
--- a/migration/multifd-qatzip.c
+++ b/migration/multifd-qatzip.c
@@ -13,7 +13,7 @@
*/
#include "qemu/osdep.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qapi-types-migration.h"
diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
index 88e2344..52902eb 100644
--- a/migration/multifd-qpl.c
+++ b/migration/multifd-qpl.c
@@ -14,7 +14,7 @@
#include "qemu/module.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "multifd.h"
#include "qpl/qpl.h"
diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c
index 6895c1f..fd7cd9b 100644
--- a/migration/multifd-uadk.c
+++ b/migration/multifd-uadk.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include "qemu/module.h"
#include "qapi/error.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "migration.h"
#include "multifd.h"
#include "options.h"
diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
index f1e988a..4cde868 100644
--- a/migration/multifd-zero-page.c
+++ b/migration/multifd-zero-page.c
@@ -12,7 +12,7 @@
#include "qemu/osdep.h"
#include "qemu/cutils.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "migration.h"
#include "migration-stats.h"
#include "multifd.h"
@@ -85,9 +85,27 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p)
{
for (int i = 0; i < p->zero_num; i++) {
void *page = p->host + p->zero[i];
- if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) {
+ bool received =
+ ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]);
+
+ /*
+ * During multifd migration zero page is written to the memory
+ * only if it is migrated more than once.
+ *
+ * It becomes a problem when both multifd & postcopy options are
+ * enabled. If the zero page which was skipped during multifd phase,
+ * is accessed during the postcopy phase of the migration, a page
+ * fault occurs. But this page fault is not served because the
+ * 'receivedmap' says the zero page is already received. Thus the
+ * thread accessing that page may hang.
+ *
+ * When postcopy is enabled, always write the zero page as and when
+ * it is migrated.
+ */
+ if (migrate_postcopy_ram() || received) {
memset(page, 0, multifd_ram_page_size());
- } else {
+ }
+ if (!received) {
ramblock_recv_bitmap_set_offset(p->block, p->zero[i]);
}
}
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 8cf8a26..8820b2a 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include <zlib.h>
#include "qemu/rcu.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qapi/error.h"
#include "migration.h"
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index abed140..3c2dcf7 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -13,7 +13,7 @@
#include "qemu/osdep.h"
#include <zstd.h>
#include "qemu/rcu.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "exec/target_page.h"
#include "qapi/error.h"
#include "migration.h"
diff --git a/migration/multifd.c b/migration/multifd.c
index dfb5189..b255778 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -16,7 +16,7 @@
#include "qemu/rcu.h"
#include "exec/target_page.h"
#include "system/system.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "file.h"
@@ -36,11 +36,6 @@
#include "io/channel-socket.h"
#include "yank_functions.h"
-/* Multiple fd's */
-
-#define MULTIFD_MAGIC 0x11223344U
-#define MULTIFD_VERSION 1
-
typedef struct {
uint32_t magic;
uint32_t version;
@@ -695,6 +690,7 @@ static void *multifd_send_thread(void *opaque)
if (qatomic_load_acquire(&p->pending_job)) {
bool is_device_state = multifd_payload_device_state(p->data);
size_t total_size;
+ int write_flags_masked = 0;
p->flags = 0;
p->iovs_num = 0;
@@ -702,6 +698,9 @@ static void *multifd_send_thread(void *opaque)
if (is_device_state) {
multifd_device_state_send_prepare(p);
+
+ /* Device state packets cannot be sent via zerocopy */
+ write_flags_masked |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
} else {
ret = multifd_send_state->ops->send_prepare(p, &local_err);
if (ret != 0) {
@@ -723,7 +722,8 @@ static void *multifd_send_thread(void *opaque)
&p->data->u.ram, &local_err);
} else {
ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
- NULL, 0, p->write_flags,
+ NULL, 0,
+ p->write_flags & ~write_flags_masked,
&local_err);
}
@@ -1384,6 +1384,13 @@ static void *multifd_recv_thread(void *opaque)
}
if (has_data) {
+ /*
+ * multifd thread should not be active and receive data
+ * when migration is in the Postcopy phase. Two threads
+ * writing the same memory area could easily corrupt
+ * the guest state.
+ */
+ assert(!migration_in_postcopy());
if (is_device_state) {
assert(use_packets);
ret = multifd_device_state_recv(p, &local_err);
diff --git a/migration/multifd.h b/migration/multifd.h
index 2d337e7..9b6d81e 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -49,6 +49,11 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset);
bool multifd_recv(void);
MultiFDRecvData *multifd_get_recv_data(void);
+/* Multiple fd's */
+
+#define MULTIFD_MAGIC 0x11223344U
+#define MULTIFD_VERSION 1
+
/* Multifd Compression flags */
#define MULTIFD_FLAG_SYNC (1 << 0)
diff --git a/migration/options.c b/migration/options.c
index b0ac2ea..4e923a2 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -187,6 +187,8 @@ const Property migration_properties[] = {
DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
DEFINE_PROP_MIG_CAP("x-postcopy-preempt",
MIGRATION_CAPABILITY_POSTCOPY_PREEMPT),
+ DEFINE_PROP_MIG_CAP("postcopy-blocktime",
+ MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME),
DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
@@ -448,6 +450,24 @@ static bool migrate_incoming_started(void)
return !!migration_incoming_get_current()->transport_data;
}
+bool migrate_rdma_caps_check(bool *caps, Error **errp)
+{
+ if (caps[MIGRATION_CAPABILITY_XBZRLE]) {
+ error_setg(errp, "RDMA and XBZRLE can't be used together");
+ return false;
+ }
+ if (caps[MIGRATION_CAPABILITY_MULTIFD]) {
+ error_setg(errp, "RDMA and multifd can't be used together");
+ return false;
+ }
+ if (caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
+ error_setg(errp, "RDMA and postcopy-ram can't be used together");
+ return false;
+ }
+
+ return true;
+}
+
/**
* @migration_caps_check - check capability compatibility
*
@@ -491,11 +511,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
error_setg(errp, "Postcopy is not compatible with ignore-shared");
return false;
}
-
- if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
- error_setg(errp, "Postcopy is not yet compatible with multifd");
- return false;
- }
}
if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
@@ -555,7 +570,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
return false;
}
- if (migrate_incoming_started()) {
+ if (!migrate_postcopy_preempt() && migrate_incoming_started()) {
error_setg(errp,
"Postcopy preempt must be set before incoming starts");
return false;
@@ -563,7 +578,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
}
if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
- if (migrate_incoming_started()) {
+ if (!migrate_multifd() && migrate_incoming_started()) {
error_setg(errp, "Multifd must be set before incoming starts");
return false;
}
@@ -611,6 +626,13 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
}
}
+ /*
+ * On destination side, check the cases that capability is being set
+ * after incoming thread has started.
+ */
+ if (migrate_rdma() && !migrate_rdma_caps_check(new_caps, errp)) {
+ return false;
+ }
return true;
}
@@ -1193,6 +1215,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
dest->tls_hostname = params->tls_hostname->u.s;
}
+ if (params->tls_authz) {
+ assert(params->tls_authz->type == QTYPE_QSTRING);
+ dest->tls_authz = params->tls_authz->u.s;
+ }
+
if (params->has_max_bandwidth) {
dest->max_bandwidth = params->max_bandwidth;
}
diff --git a/migration/options.h b/migration/options.h
index 762be4e..82d8397 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -57,6 +57,7 @@ bool migrate_tls(void);
/* capabilities helpers */
+bool migrate_rdma_caps_check(bool *caps, Error **errp);
bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp);
/* parameters */
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 5d3edfc..45af9a3 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -31,7 +31,7 @@
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"
-#include "exec/ramblock.h"
+#include "system/ramblock.h"
#include "socket.h"
#include "yank_functions.h"
#include "tls.h"
@@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis,
QemuThread *thread, const char *name,
void *(*fn)(void *), int joinable)
{
- qemu_sem_init(&mis->thread_sync_sem, 0);
+ qemu_event_init(&mis->thread_sync_event, false);
qemu_thread_create(thread, name, fn, mis, joinable);
- qemu_sem_wait(&mis->thread_sync_sem);
- qemu_sem_destroy(&mis->thread_sync_sem);
+ qemu_event_wait(&mis->thread_sync_event);
+ qemu_event_destroy(&mis->thread_sync_event);
}
/* Postcopy needs to detect accesses to pages that haven't yet been copied
@@ -110,19 +110,104 @@ void postcopy_thread_create(MigrationIncomingState *mis,
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
+/*
+ * Here we use 24 buckets, which means the last bucket will cover [2^24 us,
+ * 2^25 us) ~= [16, 32) seconds. It should be far enough to record even
+ * extreme (perf-wise broken) 1G pages moving over, which can sometimes
+ * take a few seconds due to various reasons. Anything more than that
+ * might be unsensible to account anymore.
+ */
+#define BLOCKTIME_LATENCY_BUCKET_N (24)
+
+/* All the time records are in unit of nanoseconds */
typedef struct PostcopyBlocktimeContext {
- /* time when page fault initiated per vCPU */
- uint32_t *page_fault_vcpu_time;
- /* page address per vCPU */
- uintptr_t *vcpu_addr;
- uint32_t total_blocktime;
/* blocktime per vCPU */
- uint32_t *vcpu_blocktime;
+ uint64_t *vcpu_blocktime_total;
+ /* count of faults per vCPU */
+ uint64_t *vcpu_faults_count;
+ /*
+ * count of currently blocked faults per vCPU.
+ *
+ * NOTE: Normally there should only be one fault in-progress per vCPU
+ * thread, so logically it _seems_ vcpu_faults_count[] for any vCPU
+ * should be either zero or one. However, there can be reasons we see
+ * >1 faults on the same vCPU thread.
+ *
+ * CASE (1): since the process to resolve faults (ioctl(UFFDIO_COPY),
+ * for example) is done before taking the mutex that protects the
+ * blocktime context, it can happen that we read more than one faulted
+ * addresses per vCPU.
+ *
+ * One example when we can see >1 faulted addresses for one vCPU:
+ *
+ * vcpu1 thread fault thread resolve thread
+ * ============ ============ ==============
+ *
+ * faulted on addr1
+ * read uffd msg (addr1)
+ * MUTEX_LOCK
+ * add entry (cpu1, addr1)
+ * MUTEX_UNLOCK
+ * request remote fault (addr1)
+ * resolve fault (addr1)
+ * addr1 resolved, continue..
+ * faulted on addr2
+ * read uffd msg (addr2)
+ * MUTEX_LOCK
+ * add entry (cpu1, addr2) <--------------- [A]
+ * MUTEX_UNLOCK
+ * MUTEX_LOCK
+ * remove entry (cpu1, addr1)
+ * MUTEX_UNLOCK
+ *
+ * In above case, we may see (cpu1, addr1) and (cpu1, addr2) entries to
+ * appear together at [A], when it gets the lock before the resolve
+ * thread. Use this counter to maintain such case, and only when it
+ * reaches zero we know the vCPU is not blocked anymore.
+ *
+ * CASE (2): theoretically (the author admit to not have verified
+ * this..), one vCPU thread can also generate more than one userfaultfd
+ * message on the same address. It can happen e.g. for whatever reason
+ * the fault got retried before a resolution arrives. In that extremely
+ * rare case, we could also see two (cpu1, addr1) entries.
+ *
+ * In all cases, be prepared with such re-entrancies with this array.
+ *
+ * Using uint8_t should be far enough for now. For example, when
+ * there're only one resolve thread (postcopy ram listening thread),
+ * the max (concurrent fault entries) should be two.
+ */
+ uint8_t *vcpu_faults_current;
+ /*
+ * The hash that contains addr1->[(cpu1,ts1),(cpu2,ts2) ...] mappings.
+ * Each of the entry is a tuple of (CPU index, fault timestamp) showing
+ * that a fault was requested.
+ */
+ GHashTable *vcpu_addr_hash;
+ /*
+ * Each bucket stores the count of faults that were resolved within the
+ * bucket window [2^N us, 2^(N+1) us).
+ */
+ uint64_t latency_buckets[BLOCKTIME_LATENCY_BUCKET_N];
+ /* total blocktime when all vCPUs are stopped */
+ uint64_t total_blocktime;
/* point in time when last page fault was initiated */
- uint32_t last_begin;
+ uint64_t last_begin;
/* number of vCPU are suspended */
int smp_cpus_down;
- uint64_t start_time;
+
+ /*
+ * Fast path for looking up vcpu_index from tid. NOTE: this result
+ * only reflects the vcpu setup when postcopy is running. It may not
+ * always match with the current vcpu setup because vcpus can be hot
+ * attached/detached after migration completes. However this should be
+ * stable when blocktime is using the structure.
+ */
+ GHashTable *tid_to_vcpu_hash;
+ /* Count of non-vCPU faults. This is only for debugging purpose. */
+ uint64_t non_vcpu_faults;
+ /* total blocktime when a non-vCPU thread is stopped */
+ uint64_t non_vcpu_blocktime_total;
/*
* Handler for exit event, necessary for
@@ -131,11 +216,41 @@ typedef struct PostcopyBlocktimeContext {
Notifier exit_notifier;
} PostcopyBlocktimeContext;
+typedef struct {
+ /* The time the fault was triggered */
+ uint64_t fault_time;
+ /*
+ * The vCPU index that was blocked, when cpu==-1, it means it's a
+ * fault from non-vCPU threads.
+ */
+ int cpu;
+} BlocktimeVCPUEntry;
+
+/* Alloc an entry to record a vCPU fault */
+static BlocktimeVCPUEntry *
+blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time)
+{
+ BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1);
+
+ entry->fault_time = fault_time;
+ entry->cpu = cpu;
+
+ return entry;
+}
+
+/* Free a @GList of @BlocktimeVCPUEntry */
+static void blocktime_vcpu_list_free(gpointer data)
+{
+ g_list_free_full(data, g_free);
+}
+
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
- g_free(ctx->page_fault_vcpu_time);
- g_free(ctx->vcpu_addr);
- g_free(ctx->vcpu_blocktime);
+ g_hash_table_destroy(ctx->tid_to_vcpu_hash);
+ g_hash_table_destroy(ctx->vcpu_addr_hash);
+ g_free(ctx->vcpu_blocktime_total);
+ g_free(ctx->vcpu_faults_count);
+ g_free(ctx->vcpu_faults_current);
g_free(ctx);
}
@@ -146,32 +261,65 @@ static void migration_exit_cb(Notifier *n, void *data)
destroy_blocktime_context(ctx);
}
+static GHashTable *blocktime_init_tid_to_vcpu_hash(void)
+{
+ /*
+ * TID as an unsigned int can be directly used as the key. However,
+ * CPU index can NOT be directly used as value, because CPU index can
+ * be 0, which means NULL. Then when lookup we can never know whether
+ * it's 0 or "not found". Hence use an indirection for CPU index.
+ */
+ GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+ NULL, g_free);
+ CPUState *cpu;
+
+ /*
+ * Initialize the tid->cpu_id mapping for lookups. The caller needs to
+ * make sure when reaching here the CPU topology is frozen and will be
+ * stable for the whole blocktime trapping period.
+ */
+ CPU_FOREACH(cpu) {
+ int *value = g_new(int, 1);
+
+ *value = cpu->cpu_index;
+ g_hash_table_insert(table,
+ GUINT_TO_POINTER((uint32_t)cpu->thread_id),
+ value);
+ trace_postcopy_blocktime_tid_cpu_map(cpu->cpu_index, cpu->thread_id);
+ }
+
+ return table;
+}
+
static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
MachineState *ms = MACHINE(qdev_get_machine());
unsigned int smp_cpus = ms->smp.cpus;
PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
- ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
- ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
- ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
- ctx->exit_notifier.notify = migration_exit_cb;
- ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- qemu_add_exit_notifier(&ctx->exit_notifier);
- return ctx;
-}
+ /* Initialize all counters to be zeros */
+ memset(ctx->latency_buckets, 0, sizeof(ctx->latency_buckets));
-static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
-{
- MachineState *ms = MACHINE(qdev_get_machine());
- uint32List *list = NULL;
- int i;
+ ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus);
+ ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus);
+ ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus);
+ ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash();
- for (i = ms->smp.cpus - 1; i >= 0; i--) {
- QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
- }
+ /*
+ * The key (host virtual addresses) will always be gpointer-sized on
+ * either 32bits or 64bits systems, so it'll fit as a direct key.
+ *
+ * The value will be a list of BlocktimeVCPUEntry entries.
+ */
+ ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash,
+ g_direct_equal,
+ NULL,
+ blocktime_vcpu_list_free);
+
+ ctx->exit_notifier.notify = migration_exit_cb;
+ qemu_add_exit_notifier(&ctx->exit_notifier);
- return list;
+ return ctx;
}
/*
@@ -185,18 +333,64 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
MigrationIncomingState *mis = migration_incoming_get_current();
PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+ MachineState *ms = MACHINE(qdev_get_machine());
+ uint64_t latency_total = 0, faults = 0;
+ uint32List *list_blocktime = NULL;
+ uint64List *list_latency = NULL;
+ uint64List *latency_buckets = NULL;
+ int i;
if (!bc) {
return;
}
+ for (i = ms->smp.cpus - 1; i >= 0; i--) {
+ uint64_t latency, total, count;
+
+ /* Convert ns -> ms */
+ QAPI_LIST_PREPEND(list_blocktime,
+ (uint32_t)(bc->vcpu_blocktime_total[i] / SCALE_MS));
+
+ /* The rest in nanoseconds */
+ total = bc->vcpu_blocktime_total[i];
+ latency_total += total;
+ count = bc->vcpu_faults_count[i];
+ faults += count;
+
+ if (count) {
+ latency = total / count;
+ } else {
+ /* No fault detected */
+ latency = 0;
+ }
+
+ QAPI_LIST_PREPEND(list_latency, latency);
+ }
+
+ for (i = BLOCKTIME_LATENCY_BUCKET_N - 1; i >= 0; i--) {
+ QAPI_LIST_PREPEND(latency_buckets, bc->latency_buckets[i]);
+ }
+
+ latency_total += bc->non_vcpu_blocktime_total;
+ faults += bc->non_vcpu_faults;
+
+ info->has_postcopy_non_vcpu_latency = true;
+ info->postcopy_non_vcpu_latency = bc->non_vcpu_faults ?
+ (bc->non_vcpu_blocktime_total / bc->non_vcpu_faults) : 0;
info->has_postcopy_blocktime = true;
- info->postcopy_blocktime = bc->total_blocktime;
+ /* Convert ns -> ms */
+ info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS);
info->has_postcopy_vcpu_blocktime = true;
- info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
+ info->postcopy_vcpu_blocktime = list_blocktime;
+ info->has_postcopy_latency = true;
+ info->postcopy_latency = faults ? (latency_total / faults) : 0;
+ info->has_postcopy_vcpu_latency = true;
+ info->postcopy_vcpu_latency = list_latency;
+ info->has_postcopy_latency_dist = true;
+ info->postcopy_latency_dist = latency_buckets;
}
-static uint32_t get_postcopy_total_blocktime(void)
+static uint64_t get_postcopy_total_blocktime(void)
{
MigrationIncomingState *mis = migration_incoming_get_current();
PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
@@ -300,13 +494,13 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
}
#ifdef UFFD_FEATURE_THREAD_ID
+ /*
+ * Postcopy blocktime conditionally needs THREAD_ID feature (introduced
+ * to Linux in 2017). Always try to enable it when QEMU is compiled
+ * with such environment.
+ */
if (UFFD_FEATURE_THREAD_ID & supported_features) {
asked_features |= UFFD_FEATURE_THREAD_ID;
- if (migrate_postcopy_blocktime()) {
- if (!mis->blocktime_ctx) {
- mis->blocktime_ctx = blocktime_context_new();
- }
- }
}
#endif
@@ -752,8 +946,12 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
pagesize);
}
+/*
+ * NOTE: @tid is only used when postcopy-blocktime feature is enabled, and
+ * also optional: when zero is provided, the fault accounting will be ignored.
+ */
static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
- ram_addr_t start, uint64_t haddr)
+ ram_addr_t start, uint64_t haddr, uint32_t tid)
{
void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
@@ -772,7 +970,7 @@ static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
}
- return migrate_send_rp_req_pages(mis, rb, start, haddr);
+ return migrate_send_rp_req_pages(mis, rb, start, haddr, tid);
}
/*
@@ -793,83 +991,204 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
qemu_ram_get_idstr(rb), rb_offset);
return postcopy_wake_shared(pcfd, client_addr, rb);
}
- postcopy_request_page(mis, rb, aligned_rbo, client_addr);
+ /* TODO: support blocktime tracking */
+ postcopy_request_page(mis, rb, aligned_rbo, client_addr, 0);
return 0;
}
-static int get_mem_fault_cpu_index(uint32_t pid)
+static int blocktime_get_vcpu(PostcopyBlocktimeContext *ctx, uint32_t tid)
{
- CPUState *cpu_iter;
+ int *found;
- CPU_FOREACH(cpu_iter) {
- if (cpu_iter->thread_id == pid) {
- trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
- return cpu_iter->cpu_index;
- }
+ found = g_hash_table_lookup(ctx->tid_to_vcpu_hash, GUINT_TO_POINTER(tid));
+ if (!found) {
+ /*
+ * NOTE: this is possible, because QEMU's non-vCPU threads can
+ * also access a missing page. Or, when KVM async pf is enabled, a
+ * fault can even happen from a kworker..
+ */
+ return -1;
}
- trace_get_mem_fault_cpu_index(-1, pid);
- return -1;
+
+ return *found;
}
-static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
+static uint64_t get_current_ns(void)
{
- int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
- dc->start_time;
- return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
+ return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+}
+
+/*
+ * Inject an (cpu, fault_time) entry into the database, using addr as key.
+ * When cpu==-1, it means it's a non-vCPU fault.
+ */
+static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
+ uintptr_t addr, int cpu, uint64_t time)
+{
+ BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time);
+ GHashTable *table = ctx->vcpu_addr_hash;
+ gpointer key = (gpointer)addr;
+ GList *head, *list;
+ gboolean result;
+
+ head = g_hash_table_lookup(table, key);
+ if (head) {
+ /*
+ * If existed, steal the @head for list operation rather than
+ * freeing it, making sure steal succeeded.
+ */
+ result = g_hash_table_steal(table, key);
+ assert(result == TRUE);
+ }
+
+ /*
+ * Now the key is guaranteed to be absent. Two cases:
+ *
+ * (1) There's no existing entry, list contains the only one. Insert.
+ * (2) There're existing entries, after stealing we own it, prepend the
+ * result and re-insert.
+ */
+ list = g_list_prepend(head, entry);
+ g_hash_table_insert(table, key, list);
+
+ trace_postcopy_blocktime_begin(addr, time, cpu, !!head);
}
/*
- * This function is being called when pagefault occurs. It
- * tracks down vCPU blocking time.
+ * This function is being called when pagefault occurs. It tracks down vCPU
+ * blocking time. It's protected by @page_request_mutex.
*
* @addr: faulted host virtual address
* @ptid: faulted process thread id
* @rb: ramblock appropriate to addr
*/
-static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
- RAMBlock *rb)
+void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+ RAMBlock *rb)
{
- int cpu, already_received;
+ int cpu;
MigrationIncomingState *mis = migration_incoming_get_current();
PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
- uint32_t low_time_offset;
+ uint64_t current;
if (!dc || ptid == 0) {
return;
}
- cpu = get_mem_fault_cpu_index(ptid);
- if (cpu < 0) {
- return;
+
+ /*
+ * The caller should only inject a blocktime entry when the page is
+ * yet missing.
+ */
+ assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
+
+ current = get_current_ns();
+ cpu = blocktime_get_vcpu(dc, ptid);
+
+ if (cpu >= 0) {
+ /* How many faults on this vCPU in total? */
+ dc->vcpu_faults_count[cpu]++;
+
+ /*
+ * Account how many concurrent faults on this vCPU we trapped. See
+ * comments above vcpu_faults_current[] on why it can be more than one.
+ */
+ if (dc->vcpu_faults_current[cpu]++ == 0) {
+ dc->smp_cpus_down++;
+ /*
+ * We use last_begin to cover (1) the 1st fault on this specific
+ * vCPU, but meanwhile (2) the last vCPU that got blocked. It's
+ * only used to calculate system-wide blocktime.
+ */
+ dc->last_begin = current;
+ }
+
+ /* Making sure it won't overflow - it really should never! */
+ assert(dc->vcpu_faults_current[cpu] <= 255);
+ } else {
+ /*
+ * For non-vCPU thread faults, we don't care about tid or cpu index
+ * or time the thread is blocked (e.g., a kworker trying to help
+ * KVM when async_pf=on is OK to be blocked and not affect guest
+ * responsiveness), but we care about latency. Track it with
+ * cpu=-1.
+ *
+ * Note that this will NOT affect blocktime reports on vCPU being
+ * blocked, but only about system-wide latency reports.
+ */
+ dc->non_vcpu_faults++;
}
- low_time_offset = get_low_time_offset(dc);
- if (dc->vcpu_addr[cpu] == 0) {
- qatomic_inc(&dc->smp_cpus_down);
+ blocktime_fault_inject(dc, addr, cpu, current);
+}
+
+static void blocktime_latency_account(PostcopyBlocktimeContext *ctx,
+ uint64_t time_us)
+{
+ /*
+ * Convert time (in us) to bucket index it belongs. Take extra caution
+ * of time_us==0 even if normally rare - when happens put into bucket 0.
+ */
+ int index = time_us ? (63 - clz64(time_us)) : 0;
+
+ assert(index >= 0);
+
+ /* If it's too large, put into top bucket */
+ if (index >= BLOCKTIME_LATENCY_BUCKET_N) {
+ index = BLOCKTIME_LATENCY_BUCKET_N - 1;
}
- qatomic_xchg(&dc->last_begin, low_time_offset);
- qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
- qatomic_xchg(&dc->vcpu_addr[cpu], addr);
+ ctx->latency_buckets[index]++;
+}
+
+typedef struct {
+ PostcopyBlocktimeContext *ctx;
+ uint64_t current;
+ int affected_cpus;
+ int affected_non_cpus;
+} BlockTimeVCPUIter;
+
+static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
+{
+ BlockTimeVCPUIter *iter = user_data;
+ PostcopyBlocktimeContext *ctx = iter->ctx;
+ BlocktimeVCPUEntry *entry = data;
+ uint64_t time_passed;
+ int cpu = entry->cpu;
/*
- * check it here, not at the beginning of the function,
- * due to, check could occur early than bitmap_set in
- * qemu_ufd_copy_ioctl
+ * Time should never go back.. so when the fault is resolved it must be
+ * later than when it was faulted.
*/
- already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
- if (already_received) {
- qatomic_xchg(&dc->vcpu_addr[cpu], 0);
- qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
- qatomic_dec(&dc->smp_cpus_down);
+ assert(iter->current >= entry->fault_time);
+ time_passed = iter->current - entry->fault_time;
+
+ /* Latency buckets are in microseconds */
+ blocktime_latency_account(ctx, time_passed / SCALE_US);
+
+ if (cpu >= 0) {
+ /*
+ * If we resolved all pending faults on one vCPU due to this page
+ * resolution, take a note.
+ */
+ if (--ctx->vcpu_faults_current[cpu] == 0) {
+ ctx->vcpu_blocktime_total[cpu] += time_passed;
+ iter->affected_cpus += 1;
+ }
+ trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
+ } else {
+ iter->affected_non_cpus++;
+ ctx->non_vcpu_blocktime_total += time_passed;
+ /*
+ * We do not maintain how many pending non-vCPU faults because we
+ * do not care about blocktime, only latency.
+ */
+ trace_postcopy_blocktime_end_one(-1, 0);
}
- trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
- cpu, already_received);
}
/*
- * This function just provide calculated blocktime per cpu and trace it.
- * Total blocktime is calculated in mark_postcopy_blocktime_end.
- *
+ * This function just provide calculated blocktime per cpu and trace it.
+ * Total blocktime is calculated in mark_postcopy_blocktime_end. It's
+ * protected by @page_request_mutex.
*
* Assume we have 3 CPU
*
@@ -899,48 +1218,45 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
MachineState *ms = MACHINE(qdev_get_machine());
unsigned int smp_cpus = ms->smp.cpus;
- int i, affected_cpu = 0;
- bool vcpu_total_blocktime = false;
- uint32_t read_vcpu_time, low_time_offset;
+ BlockTimeVCPUIter iter = {
+ .current = get_current_ns(),
+ .affected_cpus = 0,
+ .affected_non_cpus = 0,
+ .ctx = dc,
+ };
+ gpointer key = (gpointer)addr;
+ GHashTable *table;
+ GList *list;
if (!dc) {
return;
}
- low_time_offset = get_low_time_offset(dc);
- /* lookup cpu, to clear it,
- * that algorithm looks straightforward, but it's not
- * optimal, more optimal algorithm is keeping tree or hash
- * where key is address value is a list of */
- for (i = 0; i < smp_cpus; i++) {
- uint32_t vcpu_blocktime = 0;
-
- read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
- if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
- read_vcpu_time == 0) {
- continue;
- }
- qatomic_xchg(&dc->vcpu_addr[i], 0);
- vcpu_blocktime = low_time_offset - read_vcpu_time;
- affected_cpu += 1;
- /* we need to know is that mark_postcopy_end was due to
- * faulted page, another possible case it's prefetched
- * page and in that case we shouldn't be here */
- if (!vcpu_total_blocktime &&
- qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
- vcpu_total_blocktime = true;
- }
- /* continue cycle, due to one page could affect several vCPUs */
- dc->vcpu_blocktime[i] += vcpu_blocktime;
+ table = dc->vcpu_addr_hash;
+ /* the address wasn't tracked at all? */
+ list = g_hash_table_lookup(table, key);
+ if (!list) {
+ return;
}
- qatomic_sub(&dc->smp_cpus_down, affected_cpu);
- if (vcpu_total_blocktime) {
- dc->total_blocktime += low_time_offset - qatomic_fetch_add(
- &dc->last_begin, 0);
+ /*
+ * Loop over the set of vCPUs that got blocked on this addr, do the
+ * blocktime accounting. After that, remove the whole list.
+ */
+ g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter);
+ g_hash_table_remove(table, key);
+
+ /*
+ * If all vCPUs used to be down, and copying this page would free some
+ * vCPUs, then the system-level blocktime ends here.
+ */
+ if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) {
+ dc->total_blocktime += iter.current - dc->last_begin;
}
- trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
- affected_cpu);
+ dc->smp_cpus_down -= iter.affected_cpus;
+
+ trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus,
+ iter.affected_non_cpus);
}
static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
@@ -964,7 +1280,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
trace_postcopy_ram_fault_thread_entry();
rcu_register_thread();
mis->last_rb = NULL; /* last RAMBlock we sent part of */
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
struct pollfd *pfd;
size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
@@ -1068,17 +1384,14 @@ static void *postcopy_ram_fault_thread(void *opaque)
qemu_ram_get_idstr(rb),
rb_offset,
msg.arg.pagefault.feat.ptid);
- mark_postcopy_blocktime_begin(
- (uintptr_t)(msg.arg.pagefault.address),
- msg.arg.pagefault.feat.ptid, rb);
-
retry:
/*
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
*/
ret = postcopy_request_page(mis, rb, rb_offset,
- msg.arg.pagefault.address);
+ msg.arg.pagefault.address,
+ msg.arg.pagefault.feat.ptid);
if (ret) {
/* May be network failure, try to wait for recovery */
postcopy_pause_fault_thread(mis);
@@ -1221,6 +1534,11 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
return -1;
}
+ if (migrate_postcopy_blocktime()) {
+ assert(mis->blocktime_ctx == NULL);
+ mis->blocktime_ctx = blocktime_context_new();
+ }
+
/* Now an eventfd we use to tell the fault-thread to quit */
mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
if (mis->userfault_event_fd == -1) {
@@ -1299,8 +1617,8 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
qemu_cond_signal(&mis->page_request_cond);
}
}
- qemu_mutex_unlock(&mis->page_request_mutex);
mark_postcopy_blocktime_end((uintptr_t)host_addr);
+ qemu_mutex_unlock(&mis->page_request_mutex);
}
return ret;
}
@@ -1430,6 +1748,11 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
{
g_assert_not_reached();
}
+
+void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+ RAMBlock *rb)
+{
+}
#endif
/* ------------------------------------------------------------------------- */
@@ -1716,7 +2039,7 @@ void *postcopy_preempt_thread(void *opaque)
rcu_register_thread();
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
/*
* The preempt channel is established in asynchronous way. Wait
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
index a6df1b2..3852141 100644
--- a/migration/postcopy-ram.h
+++ b/migration/postcopy-ram.h
@@ -196,5 +196,7 @@ void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
void postcopy_preempt_setup(MigrationState *s);
int postcopy_preempt_establish_channel(MigrationState *s);
bool postcopy_is_paused(MigrationStatus status);
+void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+ RAMBlock *rb);
#endif
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 1303a5b..b6ac190 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -561,8 +561,6 @@ void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
}
stat64_add(&mig_stats.qemu_file_transferred, buflen);
-
- return;
}
diff --git a/migration/ram.c b/migration/ram.c
index 424df6d..7208bc1 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -48,7 +48,7 @@
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
-#include "exec/ram_addr.h"
+#include "system/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
@@ -91,6 +91,36 @@
XBZRLECacheStats xbzrle_counters;
+/*
+ * This structure locates a specific location of a guest page. In QEMU,
+ * it's described in a tuple of (ramblock, offset).
+ */
+struct PageLocation {
+ RAMBlock *block;
+ unsigned long offset;
+};
+typedef struct PageLocation PageLocation;
+
+/**
+ * PageLocationHint: describes a hint to a page location
+ *
+ * @valid set if the hint is vaild and to be consumed
+ * @location: the hint content
+ *
+ * In postcopy preempt mode, the urgent channel may provide hints to the
+ * background channel, so that QEMU source can try to migrate whatever is
+ * right after the requested urgent pages.
+ *
+ * This is based on the assumption that the VM (already running on the
+ * destination side) tends to access the memory with spatial locality.
+ * This is also the default behavior of vanilla postcopy (preempt off).
+ */
+struct PageLocationHint {
+ bool valid;
+ PageLocation location;
+};
+typedef struct PageLocationHint PageLocationHint;
+
/* used by the search for pages to send */
struct PageSearchStatus {
/* The migration channel used for a specific host page */
@@ -395,6 +425,13 @@ struct RAMState {
* RAM migration.
*/
unsigned int postcopy_bmap_sync_requested;
+ /*
+ * Page hint during postcopy when preempt mode is on. Return path
+ * thread sets it, while background migration thread consumes it.
+ *
+ * Protected by @bitmap_mutex.
+ */
+ PageLocationHint page_hint;
};
typedef struct RAMState RAMState;
@@ -794,14 +831,24 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
bool ret;
/*
- * Clear dirty bitmap if needed. This _must_ be called before we
- * send any of the page in the chunk because we need to make sure
- * we can capture further page content changes when we sync dirty
- * log the next time. So as long as we are going to send any of
- * the page in the chunk we clear the remote dirty bitmap for all.
- * Clearing it earlier won't be a problem, but too late will.
+ * During the last stage (after source VM stopped), resetting the write
+ * protections isn't needed as we know there will be either (1) no
+ * further writes if migration will complete, or (2) migration fails
+ * at last then tracking isn't needed either.
+ *
+ * Do the same for postcopy due to the same reason.
*/
- migration_clear_memory_region_dirty_bitmap(rb, page);
+ if (!rs->last_stage && !migration_in_postcopy()) {
+ /*
+ * Clear dirty bitmap if needed. This _must_ be called before we
+ * send any of the page in the chunk because we need to make sure
+ * we can capture further page content changes when we sync dirty
+ * log the next time. So as long as we are going to send any of
+ * the page in the chunk we clear the remote dirty bitmap for all.
+ * Clearing it earlier won't be a problem, but too late will.
+ */
+ migration_clear_memory_region_dirty_bitmap(rb, page);
+ }
ret = test_and_clear_bit(page, rb->bmap);
if (ret) {
@@ -811,8 +858,8 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
return ret;
}
-static void dirty_bitmap_clear_section(MemoryRegionSection *section,
- void *opaque)
+static int dirty_bitmap_clear_section(MemoryRegionSection *section,
+ void *opaque)
{
const hwaddr offset = section->offset_within_region;
const hwaddr size = int128_get64(section->size);
@@ -831,6 +878,7 @@ static void dirty_bitmap_clear_section(MemoryRegionSection *section,
}
*cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
bitmap_clear(rb->bmap, start, npages);
+ return 0;
}
/*
@@ -1144,32 +1192,6 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
}
/*
- * @pages: the number of pages written by the control path,
- * < 0 - error
- * > 0 - number of pages written
- *
- * Return true if the pages has been saved, otherwise false is returned.
- */
-static bool control_save_page(PageSearchStatus *pss,
- ram_addr_t offset, int *pages)
-{
- int ret;
-
- ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
- TARGET_PAGE_SIZE);
- if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
- return false;
- }
-
- if (ret == RAM_SAVE_CONTROL_DELAYED) {
- *pages = 1;
- return true;
- }
- *pages = ret;
- return true;
-}
-
-/*
* directly send the page to the stream
*
* Returns the number of pages written.
@@ -1965,7 +1987,13 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
int res;
/* Hand over to RDMA first */
- if (control_save_page(pss, offset, &res)) {
+ if (migrate_rdma()) {
+ res = rdma_control_save_page(pss->pss_channel, pss->block->offset,
+ offset, TARGET_PAGE_SIZE);
+
+ if (res == RAM_SAVE_CONTROL_DELAYED) {
+ res = 1;
+ }
return res;
}
@@ -1976,9 +2004,8 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
}
}
- if (migrate_multifd()) {
- RAMBlock *block = pss->block;
- return ram_save_multifd_page(block, offset);
+ if (migrate_multifd() && !migration_in_postcopy()) {
+ return ram_save_multifd_page(pss->block, offset);
}
return ram_save_page(rs, pss);
@@ -2039,6 +2066,21 @@ static void pss_host_page_finish(PageSearchStatus *pss)
pss->host_page_start = pss->host_page_end = 0;
}
+static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ /* If there's a pending hint not consumed, don't bother */
+ if (hint->valid) {
+ return;
+ }
+
+ /* Provide a hint to the background stream otherwise */
+ hint->location.block = pss->block;
+ hint->location.offset = pss->page;
+ hint->valid = true;
+}
+
/*
* Send an urgent host page specified by `pss'. Need to be called with
* bitmap_mutex held.
@@ -2084,6 +2126,7 @@ out:
/* For urgent requests, flush immediately if sent */
if (sent) {
qemu_fflush(pss->pss_channel);
+ ram_page_hint_update(rs, pss);
}
return ret;
}
@@ -2171,6 +2214,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return (res < 0 ? res : pages);
}
+static bool ram_page_hint_valid(RAMState *rs)
+{
+ /* There's only page hint during postcopy preempt mode */
+ if (!postcopy_preempt_active()) {
+ return false;
+ }
+
+ return rs->page_hint.valid;
+}
+
+static void ram_page_hint_collect(RAMState *rs, RAMBlock **block,
+ unsigned long *page)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ assert(hint->valid);
+
+ *block = hint->location.block;
+ *page = hint->location.offset;
+
+ /* Mark the hint consumed */
+ hint->valid = false;
+}
+
/**
* ram_find_and_save_block: finds a dirty page and sends it to f
*
@@ -2187,6 +2254,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
static int ram_find_and_save_block(RAMState *rs)
{
PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
+ unsigned long next_page;
+ RAMBlock *next_block;
int pages = 0;
/* No dirty page as there is zero RAM */
@@ -2206,22 +2275,31 @@ static int ram_find_and_save_block(RAMState *rs)
rs->last_page = 0;
}
- pss_init(pss, rs->last_seen_block, rs->last_page);
+ if (ram_page_hint_valid(rs)) {
+ ram_page_hint_collect(rs, &next_block, &next_page);
+ } else {
+ next_block = rs->last_seen_block;
+ next_page = rs->last_page;
+ }
+
+ pss_init(pss, next_block, next_page);
while (true){
if (!get_queued_page(rs, pss)) {
/* priority queue empty, so just search for something dirty */
int res = find_dirty_block(rs, pss);
- if (res != PAGE_DIRTY_FOUND) {
- if (res == PAGE_ALL_CLEAN) {
- break;
- } else if (res == PAGE_TRY_AGAIN) {
- continue;
- } else if (res < 0) {
- pages = res;
- break;
- }
+
+ if (res == PAGE_ALL_CLEAN) {
+ break;
+ } else if (res == PAGE_TRY_AGAIN) {
+ continue;
+ } else if (res < 0) {
+ pages = res;
+ break;
}
+
+ /* Otherwise we must have a dirty page to move */
+ assert(res == PAGE_DIRTY_FOUND);
}
pages = ram_save_host_page(rs, pss);
if (pages) {
@@ -2339,6 +2417,13 @@ static void ram_save_cleanup(void *opaque)
ram_state_cleanup(rsp);
}
+static void ram_page_hint_reset(PageLocationHint *hint)
+{
+ hint->location.block = NULL;
+ hint->location.offset = 0;
+ hint->valid = false;
+}
+
static void ram_state_reset(RAMState *rs)
{
int i;
@@ -2351,6 +2436,8 @@ static void ram_state_reset(RAMState *rs)
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_started = false;
+
+ ram_page_hint_reset(&rs->page_hint);
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -3205,6 +3292,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
RAMState *rs = *temp;
int ret = 0;
+ trace_ram_save_complete(rs->migration_dirty_pages, 0);
+
rs->last_stage = !migration_in_colo_state();
WITH_RCU_READ_LOCK_GUARD() {
@@ -3268,6 +3357,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
}
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+ trace_ram_save_complete(rs->migration_dirty_pages, 1);
+
return qemu_fflush(f);
}
@@ -3598,7 +3690,9 @@ static int ram_load_cleanup(void *opaque)
RAMBlock *rb;
RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
- qemu_ram_block_writeback(rb);
+ if (memory_region_is_nonvolatile(rb->mr)) {
+ qemu_ram_block_writeback(rb);
+ }
}
xbzrle_load_cleanup();
@@ -3963,8 +4057,6 @@ static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
/* Skip pages array */
qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
-
- return;
}
static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
@@ -4420,6 +4512,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
return 0;
}
+static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp)
+{
+ int ret;
+
+ if (migrate_multifd()) {
+ /*
+ * When multifd is enabled, source QEMU needs to make sure all the
+ * pages queued before postcopy starts have been flushed.
+ *
+ * The load of these pages must happen before switching to postcopy.
+ * It's because loading of guest pages (so far) in multifd recv
+ * threads is still non-atomic, so the load cannot happen with vCPUs
+ * running on the destination side.
+ *
+ * This flush and sync will guarantee that those pages are loaded
+ * _before_ postcopy starts on the destination. The rationale is,
+ * this happens before VM stops (and before source QEMU sends all
+ * the rest of the postcopy messages). So when the destination QEMU
+ * receives the postcopy messages, it must have received the sync
+ * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH,
+ * or RAM_SAVE_FLAG_EOS), and such message would guarantee that
+ * all previous guest pages queued in the multifd channels are
+ * completely loaded.
+ */
+ ret = multifd_ram_flush_and_sync(f);
+ if (ret < 0) {
+ error_setg(errp, "%s: multifd flush and sync failed", __func__);
+ return false;
+ }
+ }
+
+ qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+ return true;
+}
+
void postcopy_preempt_shutdown_file(MigrationState *s)
{
qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
@@ -4429,8 +4557,7 @@ void postcopy_preempt_shutdown_file(MigrationState *s)
static SaveVMHandlers savevm_ram_handlers = {
.save_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
- .save_live_complete_postcopy = ram_save_complete,
- .save_live_complete_precopy = ram_save_complete,
+ .save_complete = ram_save_complete,
.has_postcopy = ram_has_postcopy,
.state_pending_exact = ram_state_pending_exact,
.state_pending_estimate = ram_state_pending_estimate,
@@ -4439,6 +4566,7 @@ static SaveVMHandlers savevm_ram_handlers = {
.load_setup = ram_load_setup,
.load_cleanup = ram_load_cleanup,
.resume_prepare = ram_resume_prepare,
+ .save_postcopy_prepare = ram_save_postcopy_prepare,
};
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
diff --git a/migration/rdma.c b/migration/rdma.c
index 76fb034..2d839fc 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -30,7 +30,7 @@
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
-#include "exec/memory.h"
+#include "system/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
@@ -768,156 +768,12 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
}
/*
- * As of now, IPv6 over RoCE / iWARP is not supported by linux.
- * We will try the next addrinfo struct, and fail if there are
- * no other valid addresses to bind against.
- *
- * If user is listening on '[::]', then we will not have a opened a device
- * yet and have no way of verifying if the device is RoCE or not.
- *
- * In this case, the source VM will throw an error for ALL types of
- * connections (both IPv4 and IPv6) if the destination machine does not have
- * a regular infiniband network available for use.
- *
- * The only way to guarantee that an error is thrown for broken kernels is
- * for the management software to choose a *specific* interface at bind time
- * and validate what time of hardware it is.
- *
- * Unfortunately, this puts the user in a fix:
- *
- * If the source VM connects with an IPv4 address without knowing that the
- * destination has bound to '[::]' the migration will unconditionally fail
- * unless the management software is explicitly listening on the IPv4
- * address while using a RoCE-based device.
- *
- * If the source VM connects with an IPv6 address, then we're OK because we can
- * throw an error on the source (and similarly on the destination).
- *
- * But in mixed environments, this will be broken for a while until it is fixed
- * inside linux.
- *
- * We do provide a *tiny* bit of help in this function: We can list all of the
- * devices in the system and check to see if all the devices are RoCE or
- * Infiniband.
- *
- * If we detect that we have a *pure* RoCE environment, then we can safely
- * thrown an error even if the management software has specified '[::]' as the
- * bind address.
- *
- * However, if there is are multiple hetergeneous devices, then we cannot make
- * this assumption and the user just has to be sure they know what they are
- * doing.
- *
- * Patches are being reviewed on linux-rdma.
- */
-static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
-{
- /* This bug only exists in linux, to our knowledge. */
-#ifdef CONFIG_LINUX
- struct ibv_port_attr port_attr;
-
- /*
- * Verbs are only NULL if management has bound to '[::]'.
- *
- * Let's iterate through all the devices and see if there any pure IB
- * devices (non-ethernet).
- *
- * If not, then we can safely proceed with the migration.
- * Otherwise, there are no guarantees until the bug is fixed in linux.
- */
- if (!verbs) {
- int num_devices;
- struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
- bool roce_found = false;
- bool ib_found = false;
-
- for (int x = 0; x < num_devices; x++) {
- verbs = ibv_open_device(dev_list[x]);
- /*
- * ibv_open_device() is not documented to set errno. If
- * it does, it's somebody else's doc bug. If it doesn't,
- * the use of errno below is wrong.
- * TODO Find out whether ibv_open_device() sets errno.
- */
- if (!verbs) {
- if (errno == EPERM) {
- continue;
- } else {
- error_setg_errno(errp, errno,
- "could not open RDMA device context");
- return -1;
- }
- }
-
- if (ibv_query_port(verbs, 1, &port_attr)) {
- ibv_close_device(verbs);
- error_setg(errp,
- "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
- ib_found = true;
- } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- roce_found = true;
- }
-
- ibv_close_device(verbs);
-
- }
-
- if (roce_found) {
- if (ib_found) {
- warn_report("migrations may fail:"
- " IPv6 over RoCE / iWARP in linux"
- " is broken. But since you appear to have a"
- " mixed RoCE / IB environment, be sure to only"
- " migrate over the IB fabric until the kernel "
- " fixes the bug.");
- } else {
- error_setg(errp, "RDMA ERROR: "
- "You only have RoCE / iWARP devices in your systems"
- " and your management software has specified '[::]'"
- ", but IPv6 over RoCE / iWARP is not supported in Linux.");
- return -1;
- }
- }
-
- return 0;
- }
-
- /*
- * If we have a verbs context, that means that some other than '[::]' was
- * used by the management software for binding. In which case we can
- * actually warn the user about a potentially broken kernel.
- */
-
- /* IB ports start with 1, not 0 */
- if (ibv_query_port(verbs, 1, &port_attr)) {
- error_setg(errp, "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- error_setg(errp, "RDMA ERROR: "
- "Linux kernel's RoCE / iWARP does not support IPv6 "
- "(but patches on linux-rdma in progress)");
- return -1;
- }
-
-#endif
-
- return 0;
-}
-
-/*
* Figure out which RDMA device corresponds to the requested IP hostname
* Also create the initial connection manager identifiers for opening
* the connection.
*/
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
- Error *err = NULL;
int ret;
struct rdma_addrinfo *res;
char port_str[16];
@@ -953,9 +809,8 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
goto err_resolve_get_addr;
}
- /* Try all addresses, saving the first error in @err */
+ /* Try all addresses, exit loop on first success of resolving address */
for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
- Error **local_errp = err ? NULL : &err;
inet_ntop(e->ai_family,
&((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
@@ -964,25 +819,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
RDMA_RESOLVE_TIMEOUT_MS);
if (ret >= 0) {
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
- error_free(err);
goto route;
}
}
rdma_freeaddrinfo(res);
- if (err) {
- error_propagate(errp, err);
- } else {
- error_setg(errp, "RDMA ERROR: could not resolve address %s",
- rdma->host);
- }
+ error_setg(errp, "RDMA ERROR: could not resolve address %s", rdma->host);
goto err_resolve_get_addr;
route:
@@ -2611,7 +2453,6 @@ err_rdma_source_connect:
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
- Error *err = NULL;
int ret;
struct rdma_cm_id *listen_id;
char ip[40] = "unknown";
@@ -2661,9 +2502,8 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
goto err_dest_init_bind_addr;
}
- /* Try all addresses, saving the first error in @err */
+ /* Try all addresses */
for (e = res; e != NULL; e = e->ai_next) {
- Error **local_errp = err ? NULL : &err;
inet_ntop(e->ai_family,
&((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
@@ -2672,24 +2512,12 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
if (ret < 0) {
continue;
}
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
- error_free(err);
break;
}
rdma_freeaddrinfo(res);
if (!e) {
- if (err) {
- error_propagate(errp, err);
- } else {
- error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
- }
+ error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
goto err_dest_init_bind_addr;
}
@@ -3284,14 +3112,11 @@ err:
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
ram_addr_t offset, size_t size)
{
- if (!migrate_rdma() || migration_in_postcopy()) {
- return RAM_SAVE_CONTROL_NOT_SUPP;
- }
+ assert(migrate_rdma());
int ret = qemu_rdma_save_page(f, block_offset, offset, size);
- if (ret != RAM_SAVE_CONTROL_DELAYED &&
- ret != RAM_SAVE_CONTROL_NOT_SUPP) {
+ if (ret != RAM_SAVE_CONTROL_DELAYED) {
if (ret < 0) {
qemu_file_set_error(f, ret);
}
@@ -3829,7 +3654,7 @@ int rdma_block_notification_handle(QEMUFile *f, const char *name)
int rdma_registration_start(QEMUFile *f, uint64_t flags)
{
- if (!migrate_rdma() || migration_in_postcopy()) {
+ if (!migrate_rdma()) {
return 0;
}
@@ -3861,7 +3686,7 @@ int rdma_registration_stop(QEMUFile *f, uint64_t flags)
RDMAControlHeader head = { .len = 0, .repeat = 1 };
int ret;
- if (!migrate_rdma() || migration_in_postcopy()) {
+ if (!migrate_rdma()) {
return 0;
}
@@ -3985,7 +3810,7 @@ static void qio_channel_rdma_finalize(Object *obj)
}
static void qio_channel_rdma_class_init(ObjectClass *klass,
- void *class_data G_GNUC_UNUSED)
+ const void *class_data G_GNUC_UNUSED)
{
QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
diff --git a/migration/rdma.h b/migration/rdma.h
index f55f28b..f74f16a 100644
--- a/migration/rdma.h
+++ b/migration/rdma.h
@@ -19,7 +19,7 @@
#ifndef QEMU_MIGRATION_RDMA_H
#define QEMU_MIGRATION_RDMA_H
-#include "exec/memory.h"
+#include "system/memory.h"
void rdma_start_outgoing_migration(void *opaque, InetSocketAddress *host_port,
Error **errp);
@@ -33,7 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp);
#define RAM_CONTROL_ROUND 1
#define RAM_CONTROL_FINISH 3
-#define RAM_SAVE_CONTROL_NOT_SUPP -1000
#define RAM_SAVE_CONTROL_DELAYED -2000
#ifdef CONFIG_RDMA
@@ -56,7 +55,7 @@ static inline
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
ram_addr_t offset, size_t size)
{
- return RAM_SAVE_CONTROL_NOT_SUPP;
+ g_assert_not_reached();
}
#endif
#endif
diff --git a/migration/savevm.c b/migration/savevm.c
index ce158c3..fabbeb2 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -48,8 +48,9 @@
#include "qapi/qapi-builtin-visit.h"
#include "qemu/error-report.h"
#include "system/cpus.h"
-#include "exec/memory.h"
+#include "system/memory.h"
#include "exec/target_page.h"
+#include "exec/page-vary.h"
#include "trace.h"
#include "qemu/iov.h"
#include "qemu/job.h"
@@ -265,7 +266,7 @@ typedef struct SaveState {
static SaveState savevm_state = {
.handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
- .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
+ .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL },
.global_section_id = 0,
};
@@ -339,7 +340,7 @@ static int configuration_pre_load(void *opaque)
* predates the variable-target-page-bits support and is using the
* minimum possible value for this CPU.
*/
- state->target_page_bits = qemu_target_page_bits_min();
+ state->target_page_bits = migration_legacy_page_bits();
return 0;
}
@@ -462,8 +463,7 @@ static const VMStateInfo vmstate_info_capability = {
*/
static bool vmstate_target_page_bits_needed(void *opaque)
{
- return qemu_target_page_bits()
- > qemu_target_page_bits_min();
+ return qemu_target_page_bits() > migration_legacy_page_bits();
}
static const VMStateDescription vmstate_target_page_bits = {
@@ -737,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr)
static inline MigrationPriority save_state_priority(SaveStateEntry *se)
{
- if (se->vmsd) {
+ if (se->vmsd && se->vmsd->priority) {
return se->vmsd->priority;
}
return MIG_PRI_DEFAULT;
@@ -1484,63 +1484,112 @@ bool should_send_vmdesc(void)
return !machine->suppress_vmdesc;
}
+static bool qemu_savevm_complete_exists(SaveStateEntry *se)
+{
+ return se->ops && se->ops->save_complete;
+}
+
/*
- * Calls the save_live_complete_postcopy methods
- * causing the last few pages to be sent immediately and doing any associated
- * cleanup.
+ * Invoke the ->save_complete() if necessary.
+ * Returns: 0 if skip the current SE or succeeded, <0 if error happened.
+ */
+static int qemu_savevm_complete(SaveStateEntry *se, QEMUFile *f)
+{
+ int ret;
+
+ if (se->ops->is_active) {
+ if (!se->ops->is_active(se->opaque)) {
+ return 0;
+ }
+ }
+
+ trace_savevm_section_start(se->idstr, se->section_id);
+ save_section_header(f, se, QEMU_VM_SECTION_END);
+ ret = se->ops->save_complete(f, se->opaque);
+ trace_savevm_section_end(se->idstr, se->section_id, ret);
+ save_section_footer(f, se);
+
+ if (ret < 0) {
+ qemu_file_set_error(f, ret);
+ }
+
+ return ret;
+}
+
+/*
+ * Complete saving any postcopy-able devices.
+ *
* Note postcopy also calls qemu_savevm_state_complete_precopy to complete
* all the other devices, but that happens at the point we switch to postcopy.
*/
void qemu_savevm_state_complete_postcopy(QEMUFile *f)
{
SaveStateEntry *se;
- int ret;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_complete_postcopy) {
+ if (!qemu_savevm_complete_exists(se)) {
+ continue;
+ }
+
+ if (qemu_savevm_complete(se, f) < 0) {
+ return;
+ }
+ }
+
+ qemu_put_byte(f, QEMU_VM_EOF);
+ qemu_fflush(f);
+}
+
+bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp)
+{
+ SaveStateEntry *se;
+ bool ret;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (!se->ops || !se->ops->save_postcopy_prepare) {
continue;
}
+
if (se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
}
}
+
trace_savevm_section_start(se->idstr, se->section_id);
- /* Section type */
- qemu_put_byte(f, QEMU_VM_SECTION_END);
- qemu_put_be32(f, se->section_id);
- ret = se->ops->save_live_complete_postcopy(f, se->opaque);
- trace_savevm_section_end(se->idstr, se->section_id, ret);
+ save_section_header(f, se, QEMU_VM_SECTION_PART);
+ ret = se->ops->save_postcopy_prepare(f, se->opaque, errp);
save_section_footer(f, se);
- if (ret < 0) {
- qemu_file_set_error(f, ret);
- return;
+
+ trace_savevm_section_end(se->idstr, se->section_id, ret);
+
+ if (!ret) {
+ assert(*errp);
+ return false;
}
}
- qemu_put_byte(f, QEMU_VM_EOF);
- qemu_fflush(f);
+ return true;
}
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
int64_t start_ts_each, end_ts_each;
SaveStateEntry *se;
- int ret;
bool multifd_device_state = multifd_device_state_supported();
if (multifd_device_state) {
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- SaveLiveCompletePrecopyThreadHandler hdlr;
+ SaveCompletePrecopyThreadHandler hdlr;
if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
se->ops->has_postcopy(se->opaque)) ||
- !se->ops->save_live_complete_precopy_thread) {
+ !se->ops->save_complete_precopy_thread) {
continue;
}
- hdlr = se->ops->save_live_complete_precopy_thread;
+ hdlr = se->ops->save_complete_precopy_thread;
multifd_spawn_device_state_save_thread(hdlr,
se->idstr, se->instance_id,
se->opaque);
@@ -1548,32 +1597,25 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
}
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops ||
- (in_postcopy && se->ops->has_postcopy &&
- se->ops->has_postcopy(se->opaque)) ||
- !se->ops->save_live_complete_precopy) {
+ if (!qemu_savevm_complete_exists(se)) {
continue;
}
- if (se->ops->is_active) {
- if (!se->ops->is_active(se->opaque)) {
- continue;
- }
+ if (in_postcopy && se->ops->has_postcopy &&
+ se->ops->has_postcopy(se->opaque)) {
+ /*
+ * If postcopy will start soon, and if the SE supports
+ * postcopy, then we can skip the SE for the postcopy phase.
+ */
+ continue;
}
start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
- trace_savevm_section_start(se->idstr, se->section_id);
-
- save_section_header(f, se, QEMU_VM_SECTION_END);
-
- ret = se->ops->save_live_complete_precopy(f, se->opaque);
- trace_savevm_section_end(se->idstr, se->section_id, ret);
- save_section_footer(f, se);
- if (ret < 0) {
- qemu_file_set_error(f, ret);
+ if (qemu_savevm_complete(se, f) < 0) {
goto ret_fail_abort_threads;
}
end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
+
trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id,
end_ts_each - start_ts_each);
}
@@ -2045,7 +2087,7 @@ static void *postcopy_ram_listen_thread(void *opaque)
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_POSTCOPY_ACTIVE);
- qemu_sem_post(&mis->thread_sync_sem);
+ qemu_event_set(&mis->thread_sync_event);
trace_postcopy_ram_listen_thread_start();
rcu_register_thread();
diff --git a/migration/savevm.h b/migration/savevm.h
index 138c39a..2d5e9c7 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -45,6 +45,7 @@ void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
uint64_t *can_postcopy);
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
+bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
void qemu_savevm_send_open_return_path(QEMUFile *f);
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
diff --git a/migration/target.c b/migration/target.c
index a6ffa9a..12fd399 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -11,21 +11,21 @@
#include CONFIG_DEVICES
#ifdef CONFIG_VFIO
-#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio-migration.h"
#endif
#ifdef CONFIG_VFIO
void migration_populate_vfio_info(MigrationInfo *info)
{
- if (vfio_mig_active()) {
+ if (vfio_migration_active()) {
info->vfio = g_malloc0(sizeof(*info->vfio));
- info->vfio->transferred = vfio_mig_bytes_transferred();
+ info->vfio->transferred = vfio_migration_bytes_transferred();
}
}
void migration_reset_vfio_bytes_transferred(void)
{
- vfio_reset_bytes_transferred();
+ vfio_migration_reset_bytes_transferred();
}
#else
void migration_populate_vfio_info(MigrationInfo *info)
diff --git a/migration/trace-events b/migration/trace-events
index c506e11..706db97 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -105,6 +105,7 @@ ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%"
ram_postcopy_send_discard_bitmap(void) ""
ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
+ram_save_complete(uint64_t dirty_pages, int done) "dirty=%" PRIu64 ", done=%d"
ram_dirty_bitmap_request(char *str) "%s"
ram_dirty_bitmap_reload_begin(char *str) "%s"
ram_dirty_bitmap_reload_complete(char *str) "%s"
@@ -284,8 +285,6 @@ postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t
postcopy_place_page(void *host_addr) "host=%p"
postcopy_place_page_zero(void *host_addr) "host=%p"
postcopy_ram_enable_notify(void) ""
-mark_postcopy_blocktime_begin(uint64_t addr, void *dd, uint32_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %u, cpu: %d, already_received: %d"
-mark_postcopy_blocktime_end(uint64_t addr, void *dd, uint32_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %u, affected_cpu: %d"
postcopy_pause_fault_thread(void) ""
postcopy_pause_fault_thread_continued(void) ""
postcopy_pause_fast_load(void) ""
@@ -309,8 +308,10 @@ postcopy_preempt_tls_handshake(void) ""
postcopy_preempt_new_channel(void) ""
postcopy_preempt_thread_entry(void) ""
postcopy_preempt_thread_exit(void) ""
-
-get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"
+postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
+postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exist: %d"
+postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu, int affected_non_cpus) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d, affected_non_cpus: %d"
+postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
# exec.c
migration_exec_outgoing(const char *cmd) "cmd=%s"