diff options
Diffstat (limited to 'migration')
-rw-r--r-- | migration/channel-block.c | 2 | ||||
-rw-r--r-- | migration/colo.c | 20 | ||||
-rw-r--r-- | migration/cpr-transfer.c | 7 | ||||
-rw-r--r-- | migration/cpr.c | 36 | ||||
-rw-r--r-- | migration/dirtyrate.c | 4 | ||||
-rw-r--r-- | migration/file.c | 2 | ||||
-rw-r--r-- | migration/migration-hmp-cmds.c | 186 | ||||
-rw-r--r-- | migration/migration.c | 187 | ||||
-rw-r--r-- | migration/migration.h | 12 | ||||
-rw-r--r-- | migration/multifd-nocomp.c | 5 | ||||
-rw-r--r-- | migration/multifd-qatzip.c | 2 | ||||
-rw-r--r-- | migration/multifd-qpl.c | 2 | ||||
-rw-r--r-- | migration/multifd-uadk.c | 2 | ||||
-rw-r--r-- | migration/multifd-zero-page.c | 24 | ||||
-rw-r--r-- | migration/multifd-zlib.c | 2 | ||||
-rw-r--r-- | migration/multifd-zstd.c | 2 | ||||
-rw-r--r-- | migration/multifd.c | 21 | ||||
-rw-r--r-- | migration/multifd.h | 5 | ||||
-rw-r--r-- | migration/options.c | 39 | ||||
-rw-r--r-- | migration/options.h | 1 | ||||
-rw-r--r-- | migration/postcopy-ram.c | 12 | ||||
-rw-r--r-- | migration/ram.c | 206 | ||||
-rw-r--r-- | migration/rdma.c | 195 | ||||
-rw-r--r-- | migration/rdma.h | 5 | ||||
-rw-r--r-- | migration/savevm.c | 47 | ||||
-rw-r--r-- | migration/savevm.h | 1 | ||||
-rw-r--r-- | migration/target.c | 8 |
27 files changed, 569 insertions, 466 deletions
diff --git a/migration/channel-block.c b/migration/channel-block.c index b0477f5..97de5a6 100644 --- a/migration/channel-block.c +++ b/migration/channel-block.c @@ -170,7 +170,7 @@ qio_channel_block_set_aio_fd_handler(QIOChannel *ioc, static void qio_channel_block_class_init(ObjectClass *klass, - void *class_data G_GNUC_UNUSED) + const void *class_data G_GNUC_UNUSED) { QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); diff --git a/migration/colo.c b/migration/colo.c index c976b3f..e0f713c 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void) return; } /* Notify COLO incoming thread that failover work is finished */ - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); /* For Secondary VM, jump to incoming co */ if (mis->colo_incoming_co) { @@ -195,7 +195,7 @@ static void primary_vm_do_failover(void) } /* Notify COLO thread that failover work is finished */ - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); } COLOMode get_colo_mode(void) @@ -620,8 +620,8 @@ out: } /* Hope this not to be too long to wait here */ - qemu_sem_wait(&s->colo_exit_sem); - qemu_sem_destroy(&s->colo_exit_sem); + qemu_event_wait(&s->colo_exit_event); + qemu_event_destroy(&s->colo_exit_event); /* * It is safe to unregister notifier after failover finished. @@ -651,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s) s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, colo_checkpoint_notify_timer, NULL); - qemu_sem_init(&s->colo_exit_sem, 0); + qemu_event_init(&s->colo_exit_event, false); colo_process_checkpoint(s); bql_lock(); } @@ -808,11 +808,11 @@ void colo_shutdown(void) case COLO_MODE_PRIMARY: s = migrate_get_current(); qemu_event_set(&s->colo_checkpoint_event); - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); break; case COLO_MODE_SECONDARY: mis = migration_incoming_get_current(); - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); break; default: break; @@ -827,7 +827,7 @@ static void *colo_process_incoming_thread(void *opaque) Error *local_err = NULL; rcu_register_thread(); - qemu_sem_init(&mis->colo_incoming_sem, 0); + qemu_event_init(&mis->colo_incoming_event, false); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -923,8 +923,8 @@ out: } /* Hope this not to be too long to loop here */ - qemu_sem_wait(&mis->colo_incoming_sem); - qemu_sem_destroy(&mis->colo_incoming_sem); + qemu_event_wait(&mis->colo_incoming_event); + qemu_event_destroy(&mis->colo_incoming_event); rcu_unregister_thread(); return NULL; diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c index e1f1403..00371d1 100644 --- a/migration/cpr-transfer.c +++ b/migration/cpr-transfer.c @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp) MigrationAddress *addr = channel->addr; if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET && - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) { + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX || + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) { g_autoptr(QIOChannelSocket) sioc = NULL; SocketAddress *saddr = &addr->u.socket; @@ -60,7 +61,9 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp) sioc = qio_net_listener_wait_client(listener); ioc = QIO_CHANNEL(sioc); - trace_cpr_transfer_input(addr->u.socket.u.q_unix.path); + trace_cpr_transfer_input( + addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ? + addr->u.socket.u.q_unix.path : addr->u.socket.u.fd.str); qio_channel_set_name(ioc, "cpr-in"); return qemu_file_new_input(ioc); diff --git a/migration/cpr.c b/migration/cpr.c index 42c4656..a50a57e 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -95,6 +95,36 @@ int cpr_find_fd(const char *name, int id) trace_cpr_find_fd(name, id, fd); return fd; } + +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + int old_fd = elem ? elem->fd : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_setg(&error_fatal, + "internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + } +} + +int cpr_open_fd(const char *path, int flags, const char *name, int id, + Error **errp) +{ + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = qemu_open(path, flags, errp); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + /*************************************************************************/ #define CPR_STATE "CprState" @@ -228,3 +258,9 @@ void cpr_state_close(void) cpr_state_file = NULL; } } + +bool cpr_incoming_needed(void *opaque) +{ + MigMode mode = migrate_mode(); + return mode == MIG_MODE_CPR_TRANSFER; +} diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c index 4cd1477..986624c 100644 --- a/migration/dirtyrate.c +++ b/migration/dirtyrate.c @@ -14,7 +14,7 @@ #include "qemu/error-report.h" #include "hw/core/cpu.h" #include "qapi/error.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qemu/rcu_queue.h" #include "qemu/main-loop.h" @@ -27,7 +27,7 @@ #include "qobject/qdict.h" #include "system/kvm.h" #include "system/runstate.h" -#include "exec/memory.h" +#include "system/memory.h" #include "qemu/xxhash.h" #include "migration.h" diff --git a/migration/file.c b/migration/file.c index 7f11e26..bb8031e 100644 --- a/migration/file.c +++ b/migration/file.c @@ -6,7 +6,7 @@ */ #include "qemu/osdep.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "qemu/cutils.h" #include "qemu/error-report.h" #include "qapi/error.h" diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 49c26da..e8a563c 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -37,29 +37,28 @@ static void migration_global_dump(Monitor *mon) { MigrationState *ms = migrate_get_current(); - monitor_printf(mon, "globals:\n"); - monitor_printf(mon, "store-global-state: %s\n", + monitor_printf(mon, "Globals:\n"); + monitor_printf(mon, " store-global-state: %s\n", ms->store_global_state ? "on" : "off"); - monitor_printf(mon, "only-migratable: %s\n", + monitor_printf(mon, " only-migratable: %s\n", only_migratable ? "on" : "off"); - monitor_printf(mon, "send-configuration: %s\n", + monitor_printf(mon, " send-configuration: %s\n", ms->send_configuration ? "on" : "off"); - monitor_printf(mon, "send-section-footer: %s\n", + monitor_printf(mon, " send-section-footer: %s\n", ms->send_section_footer ? "on" : "off"); - monitor_printf(mon, "send-switchover-start: %s\n", + monitor_printf(mon, " send-switchover-start: %s\n", ms->send_switchover_start ? "on" : "off"); - monitor_printf(mon, "clear-bitmap-shift: %u\n", + monitor_printf(mon, " clear-bitmap-shift: %u\n", ms->clear_bitmap_shift); } void hmp_info_migrate(Monitor *mon, const QDict *qdict) { + bool show_all = qdict_get_try_bool(qdict, "all", false); MigrationInfo *info; info = qmp_query_migrate(NULL); - migration_global_dump(mon); - if (info->blocked_reasons) { strList *reasons = info->blocked_reasons; monitor_printf(mon, "Outgoing migration blocked:\n"); @@ -70,7 +69,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) } if (info->has_status) { - monitor_printf(mon, "Migration status: %s", + monitor_printf(mon, "Status: %s", MigrationStatus_str(info->status)); if (info->status == MIGRATION_STATUS_FAILED && info->error_desc) { monitor_printf(mon, " (%s)\n", info->error_desc); @@ -78,107 +77,130 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) monitor_printf(mon, "\n"); } - monitor_printf(mon, "total time: %" PRIu64 " ms\n", - info->total_time); - if (info->has_expected_downtime) { - monitor_printf(mon, "expected downtime: %" PRIu64 " ms\n", - info->expected_downtime); - } - if (info->has_downtime) { - monitor_printf(mon, "downtime: %" PRIu64 " ms\n", - info->downtime); + if (info->total_time) { + monitor_printf(mon, "Time (ms): total=%" PRIu64, + info->total_time); + if (info->has_setup_time) { + monitor_printf(mon, ", setup=%" PRIu64, + info->setup_time); + } + if (info->has_expected_downtime) { + monitor_printf(mon, ", exp_down=%" PRIu64, + info->expected_downtime); + } + if (info->has_downtime) { + monitor_printf(mon, ", down=%" PRIu64, + info->downtime); + } + monitor_printf(mon, "\n"); } - if (info->has_setup_time) { - monitor_printf(mon, "setup: %" PRIu64 " ms\n", - info->setup_time); + } + + if (info->has_socket_address) { + SocketAddressList *addr; + + monitor_printf(mon, "Sockets: [\n"); + + for (addr = info->socket_address; addr; addr = addr->next) { + char *s = socket_uri(addr->value); + monitor_printf(mon, "\t%s\n", s); + g_free(s); } + monitor_printf(mon, "]\n"); } if (info->ram) { - monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n", - info->ram->transferred >> 10); - monitor_printf(mon, "throughput: %0.2f mbps\n", + monitor_printf(mon, "RAM info:\n"); + monitor_printf(mon, " Throughput (Mbps): %0.2f\n", info->ram->mbps); - monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n", - info->ram->remaining >> 10); - monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n", + monitor_printf(mon, " Sizes (KiB): pagesize=%" PRIu64 + ", total=%" PRIu64 ",\n", + info->ram->page_size >> 10, info->ram->total >> 10); - monitor_printf(mon, "duplicate: %" PRIu64 " pages\n", - info->ram->duplicate); - monitor_printf(mon, "normal: %" PRIu64 " pages\n", - info->ram->normal); - monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n", - info->ram->normal_bytes >> 10); - monitor_printf(mon, "dirty sync count: %" PRIu64 "\n", - info->ram->dirty_sync_count); - monitor_printf(mon, "page size: %" PRIu64 " kbytes\n", - info->ram->page_size >> 10); - monitor_printf(mon, "multifd bytes: %" PRIu64 " kbytes\n", - info->ram->multifd_bytes >> 10); - monitor_printf(mon, "pages-per-second: %" PRIu64 "\n", + monitor_printf(mon, " transferred=%" PRIu64 + ", remain=%" PRIu64 ",\n", + info->ram->transferred >> 10, + info->ram->remaining >> 10); + monitor_printf(mon, " precopy=%" PRIu64 + ", multifd=%" PRIu64 + ", postcopy=%" PRIu64, + info->ram->precopy_bytes >> 10, + info->ram->multifd_bytes >> 10, + info->ram->postcopy_bytes >> 10); + + if (info->vfio) { + monitor_printf(mon, ", vfio=%" PRIu64, + info->vfio->transferred >> 10); + } + monitor_printf(mon, "\n"); + + monitor_printf(mon, " Pages: normal=%" PRIu64 ", zero=%" PRIu64 + ", rate_per_sec=%" PRIu64 "\n", + info->ram->normal, + info->ram->duplicate, info->ram->pages_per_second); + monitor_printf(mon, " Others: dirty_syncs=%" PRIu64, + info->ram->dirty_sync_count); if (info->ram->dirty_pages_rate) { - monitor_printf(mon, "dirty pages rate: %" PRIu64 " pages\n", + monitor_printf(mon, ", dirty_pages_rate=%" PRIu64, info->ram->dirty_pages_rate); } if (info->ram->postcopy_requests) { - monitor_printf(mon, "postcopy request count: %" PRIu64 "\n", + monitor_printf(mon, ", postcopy_req=%" PRIu64, info->ram->postcopy_requests); } - if (info->ram->precopy_bytes) { - monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n", - info->ram->precopy_bytes >> 10); - } if (info->ram->downtime_bytes) { - monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n", - info->ram->downtime_bytes >> 10); - } - if (info->ram->postcopy_bytes) { - monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n", - info->ram->postcopy_bytes >> 10); + monitor_printf(mon, ", downtime_ram=%" PRIu64, + info->ram->downtime_bytes); } if (info->ram->dirty_sync_missed_zero_copy) { - monitor_printf(mon, - "Zero-copy-send fallbacks happened: %" PRIu64 " times\n", + monitor_printf(mon, ", zerocopy_fallbacks=%" PRIu64, info->ram->dirty_sync_missed_zero_copy); } + monitor_printf(mon, "\n"); + } + + if (!show_all) { + goto out; } + migration_global_dump(mon); + if (info->xbzrle_cache) { - monitor_printf(mon, "cache size: %" PRIu64 " bytes\n", - info->xbzrle_cache->cache_size); - monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n", - info->xbzrle_cache->bytes >> 10); - monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n", - info->xbzrle_cache->pages); - monitor_printf(mon, "xbzrle cache miss: %" PRIu64 " pages\n", - info->xbzrle_cache->cache_miss); - monitor_printf(mon, "xbzrle cache miss rate: %0.2f\n", - info->xbzrle_cache->cache_miss_rate); - monitor_printf(mon, "xbzrle encoding rate: %0.2f\n", - info->xbzrle_cache->encoding_rate); - monitor_printf(mon, "xbzrle overflow: %" PRIu64 "\n", + monitor_printf(mon, "XBZRLE: size=%" PRIu64 + ", transferred=%" PRIu64 + ", pages=%" PRIu64 + ", miss=%" PRIu64 "\n" + " miss_rate=%0.2f" + ", encode_rate=%0.2f" + ", overflow=%" PRIu64 "\n", + info->xbzrle_cache->cache_size, + info->xbzrle_cache->bytes, + info->xbzrle_cache->pages, + info->xbzrle_cache->cache_miss, + info->xbzrle_cache->cache_miss_rate, + info->xbzrle_cache->encoding_rate, info->xbzrle_cache->overflow); } if (info->has_cpu_throttle_percentage) { - monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n", + monitor_printf(mon, "CPU Throttle (%%): %" PRIu64 "\n", info->cpu_throttle_percentage); } if (info->has_dirty_limit_throttle_time_per_round) { - monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Throttle (us): %" PRIu64 "\n", info->dirty_limit_throttle_time_per_round); } if (info->has_dirty_limit_ring_full_time) { - monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n", + monitor_printf(mon, "Dirty-limit Ring Full (us): %" PRIu64 "\n", info->dirty_limit_ring_full_time); } if (info->has_postcopy_blocktime) { - monitor_printf(mon, "postcopy blocktime: %u\n", + monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n", info->postcopy_blocktime); } @@ -189,28 +211,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime, &error_abort); visit_complete(v, &str); - monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str); + monitor_printf(mon, "Postcopy vCPU Blocktime: %s\n", str); g_free(str); visit_free(v); } - if (info->has_socket_address) { - SocketAddressList *addr; - - monitor_printf(mon, "socket address: [\n"); - - for (addr = info->socket_address; addr; addr = addr->next) { - char *s = socket_uri(addr->value); - monitor_printf(mon, "\t%s\n", s); - g_free(s); - } - monitor_printf(mon, "]\n"); - } - - if (info->vfio) { - monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n", - info->vfio->transferred >> 10); - } +out: qapi_free_MigrationInfo(info); } diff --git a/migration/migration.c b/migration/migration.c index d46e776..4098870 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -95,6 +95,9 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; +/* Migration channel types */ +enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY }; + /* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -259,6 +262,24 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, return true; } +static bool +migration_capabilities_and_transport_compatible(MigrationAddress *addr, + Error **errp) +{ + if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { + return migrate_rdma_caps_check(migrate_get_current()->capabilities, + errp); + } + + return true; +} + +static bool migration_transport_compatible(MigrationAddress *addr, Error **errp) +{ + return migration_channels_and_transport_compatible(addr, errp) && + migration_capabilities_and_transport_compatible(addr, errp); +} + static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) { uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp; @@ -750,7 +771,7 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } /* transport mechanism not suitable for migration? */ - if (!migration_channels_and_transport_compatible(addr, errp)) { + if (!migration_transport_compatible(addr, errp)) { return; } @@ -769,14 +790,6 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } #ifdef CONFIG_RDMA } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { - if (migrate_xbzrle()) { - error_setg(errp, "RDMA and XBZRLE can't be used together"); - return; - } - if (migrate_multifd()) { - error_setg(errp, "RDMA and multifd can't be used together"); - return; - } rdma_start_incoming_migration(&addr->u.rdma, errp); #endif } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) { @@ -931,9 +944,8 @@ static void migration_incoming_setup(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); - if (!mis->from_src_file) { - mis->from_src_file = f; - } + assert(!mis->from_src_file); + mis->from_src_file = f; qemu_file_set_blocking(f, false); } @@ -985,28 +997,19 @@ void migration_fd_process_incoming(QEMUFile *f) migration_incoming_process(); } -/* - * Returns true when we want to start a new incoming migration process, - * false otherwise. - */ -static bool migration_should_start_incoming(bool main_channel) +static bool migration_has_main_and_multifd_channels(void) { - /* Multifd doesn't start unless all channels are established */ - if (migrate_multifd()) { - return migration_has_all_channels(); + MigrationIncomingState *mis = migration_incoming_get_current(); + if (!mis->from_src_file) { + /* main channel not established */ + return false; } - /* Preempt channel only starts when the main channel is created */ - if (migrate_postcopy_preempt()) { - return main_channel; + if (migrate_multifd() && !multifd_recv_all_channels_created()) { + return false; } - /* - * For all the rest types of migration, we should only reach here when - * it's the main channel that's being created, and we should always - * proceed with this channel. - */ - assert(main_channel); + /* main and all multifd channels are established */ return true; } @@ -1015,59 +1018,81 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; QEMUFile *f; - bool default_channel = true; + uint8_t channel; uint32_t channel_magic = 0; int ret = 0; - if (migrate_multifd() && !migrate_mapped_ram() && - !migrate_postcopy_ram() && - qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { - /* - * With multiple channels, it is possible that we receive channels - * out of order on destination side, causing incorrect mapping of - * source channels on destination side. Check channel MAGIC to - * decide type of channel. Please note this is best effort, postcopy - * preempt channel does not send any magic number so avoid it for - * postcopy live migration. Also tls live migration already does - * tls handshake while initializing main channel so with tls this - * issue is not possible. - */ - ret = migration_channel_read_peek(ioc, (void *)&channel_magic, - sizeof(channel_magic), errp); + if (!migration_has_main_and_multifd_channels()) { + if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { + /* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, + * postcopy preempt channel does not send any magic number so + * avoid it for postcopy live migration. Also tls live migration + * already does tls handshake while initializing main channel so + * with tls this issue is not possible. + */ + ret = migration_channel_read_peek(ioc, (void *)&channel_magic, + sizeof(channel_magic), errp); + if (ret != 0) { + return; + } - if (ret != 0) { + channel_magic = be32_to_cpu(channel_magic); + if (channel_magic == QEMU_VM_FILE_MAGIC) { + channel = CH_MAIN; + } else if (channel_magic == MULTIFD_MAGIC) { + assert(migrate_multifd()); + channel = CH_MULTIFD; + } else if (!mis->from_src_file && + mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { + /* reconnect main channel for postcopy recovery */ + channel = CH_MAIN; + } else { + error_setg(errp, "unknown channel magic: %u", channel_magic); + return; + } + } else if (mis->from_src_file && migrate_multifd()) { + /* + * Non-peekable channels like tls/file are processed as + * multifd channels when multifd is enabled. + */ + channel = CH_MULTIFD; + } else if (!mis->from_src_file) { + channel = CH_MAIN; + } else { + error_setg(errp, "non-peekable channel used without multifd"); return; } - - default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); } else { - default_channel = !mis->from_src_file; + assert(migrate_postcopy_preempt()); + channel = CH_POSTCOPY; } if (multifd_recv_setup(errp) != 0) { return; } - if (default_channel) { + if (channel == CH_MAIN) { f = qemu_file_new_input(ioc); migration_incoming_setup(f); - } else { + } else if (channel == CH_MULTIFD) { /* Multiple connections */ - assert(migration_needs_multiple_sockets()); - if (migrate_multifd()) { - multifd_recv_new_channel(ioc, &local_err); - } else { - assert(migrate_postcopy_preempt()); - f = qemu_file_new_input(ioc); - postcopy_preempt_new_channel(mis, f); - } + multifd_recv_new_channel(ioc, &local_err); if (local_err) { error_propagate(errp, local_err); return; } + } else if (channel == CH_POSTCOPY) { + assert(!mis->postcopy_qemufile_dst); + f = qemu_file_new_input(ioc); + postcopy_preempt_new_channel(mis, f); + return; } - if (migration_should_start_incoming(default_channel)) { + if (migration_has_main_and_multifd_channels()) { /* If it's a recovery, we're done */ if (postcopy_try_recover()) { return; @@ -1084,18 +1109,13 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) */ bool migration_has_all_channels(void) { - MigrationIncomingState *mis = migration_incoming_get_current(); - - if (!mis->from_src_file) { + if (!migration_has_main_and_multifd_channels()) { return false; } - if (migrate_multifd()) { - return multifd_recv_all_channels_created(); - } - - if (migrate_postcopy_preempt()) { - return mis->postcopy_qemufile_dst != NULL; + MigrationIncomingState *mis = migration_incoming_get_current(); + if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) { + return false; } return true; @@ -1610,7 +1630,7 @@ void migration_cancel(void) } /* If the migration is paused, kick it out of the pause */ if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); } while (s->state != MIGRATION_STATUS_CANCELLING); @@ -2208,7 +2228,7 @@ void qmp_migrate(const char *uri, bool has_channels, } /* transport mechanism not suitable for migration? */ - if (!migration_channels_and_transport_compatible(addr, errp)) { + if (!migration_transport_compatible(addr, errp)) { return; } @@ -2322,7 +2342,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) MigrationStatus_str(s->state)); return; } - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } int migration_rp_wait(MigrationState *s) @@ -2707,6 +2727,10 @@ static int postcopy_start(MigrationState *ms, Error **errp) } } + if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) { + return -1; + } + trace_postcopy_start(); bql_lock(); trace_postcopy_start_set_run(); @@ -2887,21 +2911,18 @@ static bool migration_switchover_prepare(MigrationState *s) return true; } - /* Since leaving this state is not atomic with posting the semaphore + /* + * Since leaving this state is not atomic with setting the event * it's possible that someone could have issued multiple migrate_continue - * and the semaphore is incorrectly positive at this point; - * the docs say it's undefined to reinit a semaphore that's already - * init'd, so use timedwait to eat up any existing posts. + * and the event is incorrectly set at this point so reset it. */ - while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) { - /* This block intentionally left blank */ - } + qemu_event_reset(&s->pause_event); /* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER); bql_unlock(); - qemu_sem_wait(&s->pause_sem); + qemu_event_wait(&s->pause_event); bql_lock(); /* @@ -4016,7 +4037,7 @@ fail: migration_cleanup(s); } -static void migration_class_init(ObjectClass *klass, void *data) +static void migration_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -4033,7 +4054,7 @@ static void migration_instance_finalize(Object *obj) qemu_mutex_destroy(&ms->qemu_file_lock); qemu_sem_destroy(&ms->wait_unplug_sem); qemu_sem_destroy(&ms->rate_limit_sem); - qemu_sem_destroy(&ms->pause_sem); + qemu_event_destroy(&ms->pause_event); qemu_sem_destroy(&ms->postcopy_pause_sem); qemu_sem_destroy(&ms->rp_state.rp_sem); qemu_sem_destroy(&ms->rp_state.rp_pong_acks); @@ -4048,7 +4069,7 @@ static void migration_instance_init(Object *obj) ms->state = MIGRATION_STATUS_NONE; ms->mbps = -1; ms->pages_per_second = -1; - qemu_sem_init(&ms->pause_sem, 0); + qemu_event_init(&ms->pause_event, false); qemu_mutex_init(&ms->error_mutex); migrate_params_init(&ms->parameters); diff --git a/migration/migration.h b/migration/migration.h index d53f7ca..739289d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -98,9 +98,9 @@ struct MigrationIncomingState { void (*transport_cleanup)(void *data); /* * Used to sync thread creations. Note that we can't create threads in - * parallel with this sem. + * parallel with this event. */ - QemuSemaphore thread_sync_sem; + QemuEvent thread_sync_event; /* * Free at the start of the main state load, set as the main thread finishes * loading state. @@ -186,7 +186,7 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *colo_incoming_co; - QemuSemaphore colo_incoming_sem; + QemuEvent colo_incoming_event; /* Optional load threads pool and its thread exit request flag */ ThreadPool *load_threads; @@ -379,10 +379,10 @@ struct MigrationState { QemuSemaphore wait_unplug_sem; /* Migration is paused due to pause-before-switchover */ - QemuSemaphore pause_sem; + QemuEvent pause_event; - /* The semaphore is used to notify COLO thread that failover is finished */ - QemuSemaphore colo_exit_sem; + /* The event is used to notify COLO thread that failover is finished */ + QemuEvent colo_exit_event; /* The event is used to notify COLO thread to do checkpoint */ QemuEvent colo_checkpoint_event; diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index d0f38b4..b48eae3 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -11,12 +11,13 @@ */ #include "qemu/osdep.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "file.h" #include "migration-stats.h" #include "multifd.h" #include "options.h" +#include "migration.h" #include "qapi/error.h" #include "qemu/cutils.h" #include "qemu/error-report.h" @@ -398,7 +399,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f) MultiFDSyncReq req; int ret; - if (!migrate_multifd()) { + if (!migrate_multifd() || migration_in_postcopy()) { return 0; } diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c index 6a0e989..7419e5d 100644 --- a/migration/multifd-qatzip.c +++ b/migration/multifd-qatzip.c @@ -13,7 +13,7 @@ */ #include "qemu/osdep.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "qapi/qapi-types-migration.h" diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 88e2344..52902eb 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -14,7 +14,7 @@ #include "qemu/module.h" #include "qapi/error.h" #include "qapi/qapi-types-migration.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "multifd.h" #include "qpl/qpl.h" diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 6895c1f..fd7cd9b 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "migration.h" #include "multifd.h" #include "options.h" diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index f1e988a..4cde868 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -12,7 +12,7 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "migration.h" #include "migration-stats.h" #include "multifd.h" @@ -85,9 +85,27 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { + bool received = + ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]); + + /* + * During multifd migration zero page is written to the memory + * only if it is migrated more than once. + * + * It becomes a problem when both multifd & postcopy options are + * enabled. If the zero page which was skipped during multifd phase, + * is accessed during the postcopy phase of the migration, a page + * fault occurs. But this page fault is not served because the + * 'receivedmap' says the zero page is already received. Thus the + * thread accessing that page may hang. + * + * When postcopy is enabled, always write the zero page as and when + * it is migrated. + */ + if (migrate_postcopy_ram() || received) { memset(page, 0, multifd_ram_page_size()); - } else { + } + if (!received) { ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 8cf8a26..8820b2a 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include <zlib.h> #include "qemu/rcu.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qapi/error.h" #include "migration.h" diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index abed140..3c2dcf7 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" #include <zstd.h> #include "qemu/rcu.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "exec/target_page.h" #include "qapi/error.h" #include "migration.h" diff --git a/migration/multifd.c b/migration/multifd.c index dfb5189..b255778 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -16,7 +16,7 @@ #include "qemu/rcu.h" #include "exec/target_page.h" #include "system/system.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "file.h" @@ -36,11 +36,6 @@ #include "io/channel-socket.h" #include "yank_functions.h" -/* Multiple fd's */ - -#define MULTIFD_MAGIC 0x11223344U -#define MULTIFD_VERSION 1 - typedef struct { uint32_t magic; uint32_t version; @@ -695,6 +690,7 @@ static void *multifd_send_thread(void *opaque) if (qatomic_load_acquire(&p->pending_job)) { bool is_device_state = multifd_payload_device_state(p->data); size_t total_size; + int write_flags_masked = 0; p->flags = 0; p->iovs_num = 0; @@ -702,6 +698,9 @@ static void *multifd_send_thread(void *opaque) if (is_device_state) { multifd_device_state_send_prepare(p); + + /* Device state packets cannot be sent via zerocopy */ + write_flags_masked |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } else { ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { @@ -723,7 +722,8 @@ static void *multifd_send_thread(void *opaque) &p->data->u.ram, &local_err); } else { ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, - NULL, 0, p->write_flags, + NULL, 0, + p->write_flags & ~write_flags_masked, &local_err); } @@ -1384,6 +1384,13 @@ static void *multifd_recv_thread(void *opaque) } if (has_data) { + /* + * multifd thread should not be active and receive data + * when migration is in the Postcopy phase. Two threads + * writing the same memory area could easily corrupt + * the guest state. + */ + assert(!migration_in_postcopy()); if (is_device_state) { assert(use_packets); ret = multifd_device_state_recv(p, &local_err); diff --git a/migration/multifd.h b/migration/multifd.h index 2d337e7..9b6d81e 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -49,6 +49,11 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); bool multifd_recv(void); MultiFDRecvData *multifd_get_recv_data(void); +/* Multiple fd's */ + +#define MULTIFD_MAGIC 0x11223344U +#define MULTIFD_VERSION 1 + /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/options.c b/migration/options.c index b0ac2ea..162c72c 100644 --- a/migration/options.c +++ b/migration/options.c @@ -448,6 +448,24 @@ static bool migrate_incoming_started(void) return !!migration_incoming_get_current()->transport_data; } +bool migrate_rdma_caps_check(bool *caps, Error **errp) +{ + if (caps[MIGRATION_CAPABILITY_XBZRLE]) { + error_setg(errp, "RDMA and XBZRLE can't be used together"); + return false; + } + if (caps[MIGRATION_CAPABILITY_MULTIFD]) { + error_setg(errp, "RDMA and multifd can't be used together"); + return false; + } + if (caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, "RDMA and postcopy-ram can't be used together"); + return false; + } + + return true; +} + /** * @migration_caps_check - check capability compatibility * @@ -491,11 +509,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) error_setg(errp, "Postcopy is not compatible with ignore-shared"); return false; } - - if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - error_setg(errp, "Postcopy is not yet compatible with multifd"); - return false; - } } if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { @@ -555,7 +568,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) return false; } - if (migrate_incoming_started()) { + if (!migrate_postcopy_preempt() && migrate_incoming_started()) { error_setg(errp, "Postcopy preempt must be set before incoming starts"); return false; @@ -563,7 +576,7 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - if (migrate_incoming_started()) { + if (!migrate_multifd() && migrate_incoming_started()) { error_setg(errp, "Multifd must be set before incoming starts"); return false; } @@ -611,6 +624,13 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } } + /* + * On destination side, check the cases that capability is being set + * after incoming thread has started. + */ + if (migrate_rdma() && !migrate_rdma_caps_check(new_caps, errp)) { + return false; + } return true; } @@ -1193,6 +1213,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->tls_hostname = params->tls_hostname->u.s; } + if (params->tls_authz) { + assert(params->tls_authz->type == QTYPE_QSTRING); + dest->tls_authz = params->tls_authz->u.s; + } + if (params->has_max_bandwidth) { dest->max_bandwidth = params->max_bandwidth; } diff --git a/migration/options.h b/migration/options.h index 762be4e..82d8397 100644 --- a/migration/options.h +++ b/migration/options.h @@ -57,6 +57,7 @@ bool migrate_tls(void); /* capabilities helpers */ +bool migrate_rdma_caps_check(bool *caps, Error **errp); bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp); /* parameters */ diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 5d3edfc..75fd310 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -31,7 +31,7 @@ #include "qemu/error-report.h" #include "trace.h" #include "hw/boards.h" -#include "exec/ramblock.h" +#include "system/ramblock.h" #include "socket.h" #include "yank_functions.h" #include "tls.h" @@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis, QemuThread *thread, const char *name, void *(*fn)(void *), int joinable) { - qemu_sem_init(&mis->thread_sync_sem, 0); + qemu_event_init(&mis->thread_sync_event, false); qemu_thread_create(thread, name, fn, mis, joinable); - qemu_sem_wait(&mis->thread_sync_sem); - qemu_sem_destroy(&mis->thread_sync_sem); + qemu_event_wait(&mis->thread_sync_event); + qemu_event_destroy(&mis->thread_sync_event); } /* Postcopy needs to detect accesses to pages that haven't yet been copied @@ -964,7 +964,7 @@ static void *postcopy_ram_fault_thread(void *opaque) trace_postcopy_ram_fault_thread_entry(); rcu_register_thread(); mis->last_rb = NULL; /* last RAMBlock we sent part of */ - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); struct pollfd *pfd; size_t pfd_len = 2 + mis->postcopy_remote_fds->len; @@ -1716,7 +1716,7 @@ void *postcopy_preempt_thread(void *opaque) rcu_register_thread(); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); /* * The preempt channel is established in asynchronous way. Wait diff --git a/migration/ram.c b/migration/ram.c index dc909f5..2140785 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -48,7 +48,7 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qmp/qerror.h" #include "trace.h" -#include "exec/ram_addr.h" +#include "system/ram_addr.h" #include "exec/target_page.h" #include "qemu/rcu_queue.h" #include "migration/colo.h" @@ -91,6 +91,36 @@ XBZRLECacheStats xbzrle_counters; +/* + * This structure locates a specific location of a guest page. In QEMU, + * it's described in a tuple of (ramblock, offset). + */ +struct PageLocation { + RAMBlock *block; + unsigned long offset; +}; +typedef struct PageLocation PageLocation; + +/** + * PageLocationHint: describes a hint to a page location + * + * @valid set if the hint is vaild and to be consumed + * @location: the hint content + * + * In postcopy preempt mode, the urgent channel may provide hints to the + * background channel, so that QEMU source can try to migrate whatever is + * right after the requested urgent pages. + * + * This is based on the assumption that the VM (already running on the + * destination side) tends to access the memory with spatial locality. + * This is also the default behavior of vanilla postcopy (preempt off). + */ +struct PageLocationHint { + bool valid; + PageLocation location; +}; +typedef struct PageLocationHint PageLocationHint; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -395,6 +425,13 @@ struct RAMState { * RAM migration. */ unsigned int postcopy_bmap_sync_requested; + /* + * Page hint during postcopy when preempt mode is on. Return path + * thread sets it, while background migration thread consumes it. + * + * Protected by @bitmap_mutex. + */ + PageLocationHint page_hint; }; typedef struct RAMState RAMState; @@ -794,14 +831,22 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, bool ret; /* - * Clear dirty bitmap if needed. This _must_ be called before we - * send any of the page in the chunk because we need to make sure - * we can capture further page content changes when we sync dirty - * log the next time. So as long as we are going to send any of - * the page in the chunk we clear the remote dirty bitmap for all. - * Clearing it earlier won't be a problem, but too late will. + * During the last stage (after source VM stopped), resetting the write + * protections isn't needed as we know there will be either (1) no + * further writes if migration will complete, or (2) migration fails + * at last then tracking isn't needed either. */ - migration_clear_memory_region_dirty_bitmap(rb, page); + if (!rs->last_stage) { + /* + * Clear dirty bitmap if needed. This _must_ be called before we + * send any of the page in the chunk because we need to make sure + * we can capture further page content changes when we sync dirty + * log the next time. So as long as we are going to send any of + * the page in the chunk we clear the remote dirty bitmap for all. + * Clearing it earlier won't be a problem, but too late will. + */ + migration_clear_memory_region_dirty_bitmap(rb, page); + } ret = test_and_clear_bit(page, rb->bmap); if (ret) { @@ -811,8 +856,8 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, return ret; } -static void dirty_bitmap_clear_section(MemoryRegionSection *section, - void *opaque) +static int dirty_bitmap_clear_section(MemoryRegionSection *section, + void *opaque) { const hwaddr offset = section->offset_within_region; const hwaddr size = int128_get64(section->size); @@ -831,6 +876,7 @@ static void dirty_bitmap_clear_section(MemoryRegionSection *section, } *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); bitmap_clear(rb->bmap, start, npages); + return 0; } /* @@ -1144,32 +1190,6 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, } /* - * @pages: the number of pages written by the control path, - * < 0 - error - * > 0 - number of pages written - * - * Return true if the pages has been saved, otherwise false is returned. - */ -static bool control_save_page(PageSearchStatus *pss, - ram_addr_t offset, int *pages) -{ - int ret; - - ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset, - TARGET_PAGE_SIZE); - if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { - return false; - } - - if (ret == RAM_SAVE_CONTROL_DELAYED) { - *pages = 1; - return true; - } - *pages = ret; - return true; -} - -/* * directly send the page to the stream * * Returns the number of pages written. @@ -1965,7 +1985,13 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) int res; /* Hand over to RDMA first */ - if (control_save_page(pss, offset, &res)) { + if (migrate_rdma()) { + res = rdma_control_save_page(pss->pss_channel, pss->block->offset, + offset, TARGET_PAGE_SIZE); + + if (res == RAM_SAVE_CONTROL_DELAYED) { + res = 1; + } return res; } @@ -1976,9 +2002,8 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) } } - if (migrate_multifd()) { - RAMBlock *block = pss->block; - return ram_save_multifd_page(block, offset); + if (migrate_multifd() && !migration_in_postcopy()) { + return ram_save_multifd_page(pss->block, offset); } return ram_save_page(rs, pss); @@ -2039,6 +2064,21 @@ static void pss_host_page_finish(PageSearchStatus *pss) pss->host_page_start = pss->host_page_end = 0; } +static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) +{ + PageLocationHint *hint = &rs->page_hint; + + /* If there's a pending hint not consumed, don't bother */ + if (hint->valid) { + return; + } + + /* Provide a hint to the background stream otherwise */ + hint->location.block = pss->block; + hint->location.offset = pss->page; + hint->valid = true; +} + /* * Send an urgent host page specified by `pss'. Need to be called with * bitmap_mutex held. @@ -2084,6 +2124,7 @@ out: /* For urgent requests, flush immediately if sent */ if (sent) { qemu_fflush(pss->pss_channel); + ram_page_hint_update(rs, pss); } return ret; } @@ -2171,6 +2212,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return (res < 0 ? res : pages); } +static bool ram_page_hint_valid(RAMState *rs) +{ + /* There's only page hint during postcopy preempt mode */ + if (!postcopy_preempt_active()) { + return false; + } + + return rs->page_hint.valid; +} + +static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, + unsigned long *page) +{ + PageLocationHint *hint = &rs->page_hint; + + assert(hint->valid); + + *block = hint->location.block; + *page = hint->location.offset; + + /* Mark the hint consumed */ + hint->valid = false; +} + /** * ram_find_and_save_block: finds a dirty page and sends it to f * @@ -2187,6 +2252,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) static int ram_find_and_save_block(RAMState *rs) { PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + unsigned long next_page; + RAMBlock *next_block; int pages = 0; /* No dirty page as there is zero RAM */ @@ -2206,7 +2273,14 @@ static int ram_find_and_save_block(RAMState *rs) rs->last_page = 0; } - pss_init(pss, rs->last_seen_block, rs->last_page); + if (ram_page_hint_valid(rs)) { + ram_page_hint_collect(rs, &next_block, &next_page); + } else { + next_block = rs->last_seen_block; + next_page = rs->last_page; + } + + pss_init(pss, next_block, next_page); while (true){ if (!get_queued_page(rs, pss)) { @@ -2339,6 +2413,13 @@ static void ram_save_cleanup(void *opaque) ram_state_cleanup(rsp); } +static void ram_page_hint_reset(PageLocationHint *hint) +{ + hint->location.block = NULL; + hint->location.offset = 0; + hint->valid = false; +} + static void ram_state_reset(RAMState *rs) { int i; @@ -2351,6 +2432,8 @@ static void ram_state_reset(RAMState *rs) rs->last_page = 0; rs->last_version = ram_list.version; rs->xbzrle_started = false; + + ram_page_hint_reset(&rs->page_hint); } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -3598,7 +3681,9 @@ static int ram_load_cleanup(void *opaque) RAMBlock *rb; RAMBLOCK_FOREACH_NOT_IGNORED(rb) { - qemu_ram_block_writeback(rb); + if (memory_region_is_nonvolatile(rb->mr)) { + qemu_ram_block_writeback(rb); + } } xbzrle_load_cleanup(); @@ -4418,6 +4503,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) return 0; } +static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp) +{ + int ret; + + if (migrate_multifd()) { + /* + * When multifd is enabled, source QEMU needs to make sure all the + * pages queued before postcopy starts have been flushed. + * + * The load of these pages must happen before switching to postcopy. + * It's because loading of guest pages (so far) in multifd recv + * threads is still non-atomic, so the load cannot happen with vCPUs + * running on the destination side. + * + * This flush and sync will guarantee that those pages are loaded + * _before_ postcopy starts on the destination. The rationale is, + * this happens before VM stops (and before source QEMU sends all + * the rest of the postcopy messages). So when the destination QEMU + * receives the postcopy messages, it must have received the sync + * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH, + * or RAM_SAVE_FLAG_EOS), and such message would guarantee that + * all previous guest pages queued in the multifd channels are + * completely loaded. + */ + ret = multifd_ram_flush_and_sync(f); + if (ret < 0) { + error_setg(errp, "%s: multifd flush and sync failed", __func__); + return false; + } + } + + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + + return true; +} + void postcopy_preempt_shutdown_file(MigrationState *s) { qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); @@ -4437,6 +4558,7 @@ static SaveVMHandlers savevm_ram_handlers = { .load_setup = ram_load_setup, .load_cleanup = ram_load_cleanup, .resume_prepare = ram_resume_prepare, + .save_postcopy_prepare = ram_save_postcopy_prepare, }; static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, diff --git a/migration/rdma.c b/migration/rdma.c index 76fb034..2d839fc 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -30,7 +30,7 @@ #include "qemu/sockets.h" #include "qemu/bitmap.h" #include "qemu/coroutine.h" -#include "exec/memory.h" +#include "system/memory.h" #include <sys/socket.h> #include <netdb.h> #include <arpa/inet.h> @@ -768,156 +768,12 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) } /* - * As of now, IPv6 over RoCE / iWARP is not supported by linux. - * We will try the next addrinfo struct, and fail if there are - * no other valid addresses to bind against. - * - * If user is listening on '[::]', then we will not have a opened a device - * yet and have no way of verifying if the device is RoCE or not. - * - * In this case, the source VM will throw an error for ALL types of - * connections (both IPv4 and IPv6) if the destination machine does not have - * a regular infiniband network available for use. - * - * The only way to guarantee that an error is thrown for broken kernels is - * for the management software to choose a *specific* interface at bind time - * and validate what time of hardware it is. - * - * Unfortunately, this puts the user in a fix: - * - * If the source VM connects with an IPv4 address without knowing that the - * destination has bound to '[::]' the migration will unconditionally fail - * unless the management software is explicitly listening on the IPv4 - * address while using a RoCE-based device. - * - * If the source VM connects with an IPv6 address, then we're OK because we can - * throw an error on the source (and similarly on the destination). - * - * But in mixed environments, this will be broken for a while until it is fixed - * inside linux. - * - * We do provide a *tiny* bit of help in this function: We can list all of the - * devices in the system and check to see if all the devices are RoCE or - * Infiniband. - * - * If we detect that we have a *pure* RoCE environment, then we can safely - * thrown an error even if the management software has specified '[::]' as the - * bind address. - * - * However, if there is are multiple hetergeneous devices, then we cannot make - * this assumption and the user just has to be sure they know what they are - * doing. - * - * Patches are being reviewed on linux-rdma. - */ -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) -{ - /* This bug only exists in linux, to our knowledge. */ -#ifdef CONFIG_LINUX - struct ibv_port_attr port_attr; - - /* - * Verbs are only NULL if management has bound to '[::]'. - * - * Let's iterate through all the devices and see if there any pure IB - * devices (non-ethernet). - * - * If not, then we can safely proceed with the migration. - * Otherwise, there are no guarantees until the bug is fixed in linux. - */ - if (!verbs) { - int num_devices; - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); - bool roce_found = false; - bool ib_found = false; - - for (int x = 0; x < num_devices; x++) { - verbs = ibv_open_device(dev_list[x]); - /* - * ibv_open_device() is not documented to set errno. If - * it does, it's somebody else's doc bug. If it doesn't, - * the use of errno below is wrong. - * TODO Find out whether ibv_open_device() sets errno. - */ - if (!verbs) { - if (errno == EPERM) { - continue; - } else { - error_setg_errno(errp, errno, - "could not open RDMA device context"); - return -1; - } - } - - if (ibv_query_port(verbs, 1, &port_attr)) { - ibv_close_device(verbs); - error_setg(errp, - "RDMA ERROR: Could not query initial IB port"); - return -1; - } - - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { - ib_found = true; - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - roce_found = true; - } - - ibv_close_device(verbs); - - } - - if (roce_found) { - if (ib_found) { - warn_report("migrations may fail:" - " IPv6 over RoCE / iWARP in linux" - " is broken. But since you appear to have a" - " mixed RoCE / IB environment, be sure to only" - " migrate over the IB fabric until the kernel " - " fixes the bug."); - } else { - error_setg(errp, "RDMA ERROR: " - "You only have RoCE / iWARP devices in your systems" - " and your management software has specified '[::]'" - ", but IPv6 over RoCE / iWARP is not supported in Linux."); - return -1; - } - } - - return 0; - } - - /* - * If we have a verbs context, that means that some other than '[::]' was - * used by the management software for binding. In which case we can - * actually warn the user about a potentially broken kernel. - */ - - /* IB ports start with 1, not 0 */ - if (ibv_query_port(verbs, 1, &port_attr)) { - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); - return -1; - } - - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - error_setg(errp, "RDMA ERROR: " - "Linux kernel's RoCE / iWARP does not support IPv6 " - "(but patches on linux-rdma in progress)"); - return -1; - } - -#endif - - return 0; -} - -/* * Figure out which RDMA device corresponds to the requested IP hostname * Also create the initial connection manager identifiers for opening * the connection. */ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) { - Error *err = NULL; int ret; struct rdma_addrinfo *res; char port_str[16]; @@ -953,9 +809,8 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) goto err_resolve_get_addr; } - /* Try all addresses, saving the first error in @err */ + /* Try all addresses, exit loop on first success of resolving address */ for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) { - Error **local_errp = err ? NULL : &err; inet_ntop(e->ai_family, &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); @@ -964,25 +819,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, RDMA_RESOLVE_TIMEOUT_MS); if (ret >= 0) { - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, - local_errp); - if (ret < 0) { - continue; - } - } - error_free(err); goto route; } } rdma_freeaddrinfo(res); - if (err) { - error_propagate(errp, err); - } else { - error_setg(errp, "RDMA ERROR: could not resolve address %s", - rdma->host); - } + error_setg(errp, "RDMA ERROR: could not resolve address %s", rdma->host); goto err_resolve_get_addr; route: @@ -2611,7 +2453,6 @@ err_rdma_source_connect: static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) { - Error *err = NULL; int ret; struct rdma_cm_id *listen_id; char ip[40] = "unknown"; @@ -2661,9 +2502,8 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) goto err_dest_init_bind_addr; } - /* Try all addresses, saving the first error in @err */ + /* Try all addresses */ for (e = res; e != NULL; e = e->ai_next) { - Error **local_errp = err ? NULL : &err; inet_ntop(e->ai_family, &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); @@ -2672,24 +2512,12 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) if (ret < 0) { continue; } - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, - local_errp); - if (ret < 0) { - continue; - } - } - error_free(err); break; } rdma_freeaddrinfo(res); if (!e) { - if (err) { - error_propagate(errp, err); - } else { - error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!"); - } + error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!"); goto err_dest_init_bind_addr; } @@ -3284,14 +3112,11 @@ err: int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size) { - if (!migrate_rdma() || migration_in_postcopy()) { - return RAM_SAVE_CONTROL_NOT_SUPP; - } + assert(migrate_rdma()); int ret = qemu_rdma_save_page(f, block_offset, offset, size); - if (ret != RAM_SAVE_CONTROL_DELAYED && - ret != RAM_SAVE_CONTROL_NOT_SUPP) { + if (ret != RAM_SAVE_CONTROL_DELAYED) { if (ret < 0) { qemu_file_set_error(f, ret); } @@ -3829,7 +3654,7 @@ int rdma_block_notification_handle(QEMUFile *f, const char *name) int rdma_registration_start(QEMUFile *f, uint64_t flags) { - if (!migrate_rdma() || migration_in_postcopy()) { + if (!migrate_rdma()) { return 0; } @@ -3861,7 +3686,7 @@ int rdma_registration_stop(QEMUFile *f, uint64_t flags) RDMAControlHeader head = { .len = 0, .repeat = 1 }; int ret; - if (!migrate_rdma() || migration_in_postcopy()) { + if (!migrate_rdma()) { return 0; } @@ -3985,7 +3810,7 @@ static void qio_channel_rdma_finalize(Object *obj) } static void qio_channel_rdma_class_init(ObjectClass *klass, - void *class_data G_GNUC_UNUSED) + const void *class_data G_GNUC_UNUSED) { QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); diff --git a/migration/rdma.h b/migration/rdma.h index f55f28b..f74f16a 100644 --- a/migration/rdma.h +++ b/migration/rdma.h @@ -19,7 +19,7 @@ #ifndef QEMU_MIGRATION_RDMA_H #define QEMU_MIGRATION_RDMA_H -#include "exec/memory.h" +#include "system/memory.h" void rdma_start_outgoing_migration(void *opaque, InetSocketAddress *host_port, Error **errp); @@ -33,7 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp); #define RAM_CONTROL_ROUND 1 #define RAM_CONTROL_FINISH 3 -#define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 #ifdef CONFIG_RDMA @@ -56,7 +55,7 @@ static inline int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size) { - return RAM_SAVE_CONTROL_NOT_SUPP; + g_assert_not_reached(); } #endif #endif diff --git a/migration/savevm.c b/migration/savevm.c index ce158c3..bb04a45 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -48,8 +48,9 @@ #include "qapi/qapi-builtin-visit.h" #include "qemu/error-report.h" #include "system/cpus.h" -#include "exec/memory.h" +#include "system/memory.h" #include "exec/target_page.h" +#include "exec/page-vary.h" #include "trace.h" #include "qemu/iov.h" #include "qemu/job.h" @@ -265,7 +266,7 @@ typedef struct SaveState { static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), - .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL }, + .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL }, .global_section_id = 0, }; @@ -339,7 +340,7 @@ static int configuration_pre_load(void *opaque) * predates the variable-target-page-bits support and is using the * minimum possible value for this CPU. */ - state->target_page_bits = qemu_target_page_bits_min(); + state->target_page_bits = migration_legacy_page_bits(); return 0; } @@ -462,8 +463,7 @@ static const VMStateInfo vmstate_info_capability = { */ static bool vmstate_target_page_bits_needed(void *opaque) { - return qemu_target_page_bits() - > qemu_target_page_bits_min(); + return qemu_target_page_bits() > migration_legacy_page_bits(); } static const VMStateDescription vmstate_target_page_bits = { @@ -737,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr) static inline MigrationPriority save_state_priority(SaveStateEntry *se) { - if (se->vmsd) { + if (se->vmsd && se->vmsd->priority) { return se->vmsd->priority; } return MIG_PRI_DEFAULT; @@ -1523,6 +1523,39 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f) qemu_fflush(f); } +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp) +{ + SaveStateEntry *se; + bool ret; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->save_postcopy_prepare) { + continue; + } + + if (se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + + trace_savevm_section_start(se->idstr, se->section_id); + + save_section_header(f, se, QEMU_VM_SECTION_PART); + ret = se->ops->save_postcopy_prepare(f, se->opaque, errp); + save_section_footer(f, se); + + trace_savevm_section_end(se->idstr, se->section_id, ret); + + if (!ret) { + assert(*errp); + return false; + } + } + + return true; +} + int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) { int64_t start_ts_each, end_ts_each; @@ -2045,7 +2078,7 @@ static void *postcopy_ram_listen_thread(void *opaque) migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); trace_postcopy_ram_listen_thread_start(); rcu_register_thread(); diff --git a/migration/savevm.h b/migration/savevm.h index 138c39a..2d5e9c7 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -45,6 +45,7 @@ void qemu_savevm_state_pending_exact(uint64_t *must_precopy, void qemu_savevm_state_pending_estimate(uint64_t *must_precopy, uint64_t *can_postcopy); int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy); +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp); void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); void qemu_savevm_send_open_return_path(QEMUFile *f); int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); diff --git a/migration/target.c b/migration/target.c index a6ffa9a..12fd399 100644 --- a/migration/target.c +++ b/migration/target.c @@ -11,21 +11,21 @@ #include CONFIG_DEVICES #ifdef CONFIG_VFIO -#include "hw/vfio/vfio-common.h" +#include "hw/vfio/vfio-migration.h" #endif #ifdef CONFIG_VFIO void migration_populate_vfio_info(MigrationInfo *info) { - if (vfio_mig_active()) { + if (vfio_migration_active()) { info->vfio = g_malloc0(sizeof(*info->vfio)); - info->vfio->transferred = vfio_mig_bytes_transferred(); + info->vfio->transferred = vfio_migration_bytes_transferred(); } } void migration_reset_vfio_bytes_transferred(void) { - vfio_reset_bytes_transferred(); + vfio_migration_reset_bytes_transferred(); } #else void migration_populate_vfio_info(MigrationInfo *info) |