diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2025-01-10 13:39:19 -0500 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2025-01-10 13:39:19 -0500 |
commit | 3214bec13d8d4c40f707d21d8350d04e4123ae97 (patch) | |
tree | aa360d13996fcd7047cd2c70b672b67eb824a5d7 | |
parent | 290f950361e79d43c9a73d063964631107cac851 (diff) | |
parent | a523bc52166c80d8a04d46584f9f3868bd53ef69 (diff) | |
download | qemu-3214bec13d8d4c40f707d21d8350d04e4123ae97.zip qemu-3214bec13d8d4c40f707d21d8350d04e4123ae97.tar.gz qemu-3214bec13d8d4c40f707d21d8350d04e4123ae97.tar.bz2 |
Merge tag 'migration-20250110-pull-request' of https://gitlab.com/farosas/qemu into staging
Migration pull request
- compression:
Shameer's fix for CONFIG_UADK build
Yuan Liu fixes for zero-page, QPL, qatzip
- multifd sync cleanups, prereq. for VFIO and postcopy work
- fixes for 9.2 regressions:
multifd with pre-9.0 -> post-9.1 migrations (#2720)
s390x migration (#2704)
- fix for assertions during paused migrations; rework of
late-block-activate logic (#2395, #686)
- fixes for compressed arrays creation and parsing, mostly affecting
s390x
# -----BEGIN PGP SIGNATURE-----
#
# iQJEBAABCAAuFiEEqhtIsKIjJqWkw2TPx5jcdBvsMZ0FAmeBDgkQHGZhcm9zYXNA
# c3VzZS5kZQAKCRDHmNx0G+wxnSlUEACl31wY+77JxWnBva/eDDwnJ9HiCrqsoqaZ
# YIJJXNlk4lYJWNdZRt6p27exzWrQwm+kWKPECeCakgCMlfhnKCvejGq7iV/fJY4o
# D8hjE3t1htQ8mfblY1+bqzg3Rml59KwXxiqAwvlljbNWdkXruv026dq9vgJMzFhi
# ia043fOO1tYULIoawgmwmLEHnztht0v+ZTZ1v5KQbrH655tpxls/8kHc6v5PXEpA
# 3PSmCrCQh1dPtkYRjuJ9yHyfU+/T8tYwIjrU6VR1wQW7MBNkjtqNudaqAFiuyuqn
# P8gh4rAQrMhA9y+aq6xSoJP8XGkuOHxLQtlNutlmtbcQyZ7JqgLmK9ZLdoPf21sK
# //erV63NoyaciYB9Nk3NXflwroc6zyvo8A584kGNPwBznZOJLESP4SPvVm/nlE29
# vbyq8AWHRjFiqqf6P0ttQLAFkusZJzM1Y9UakF51hyVBX70yfqLG20XXZtIq/aZA
# GbBB2Fo0MIlbmWaur3vLsSzn7B8d++Gl9TTGcK/eIXJ1ANCuCxGv9fbXJQlP5F4I
# 3OAoSmAVJ2eqw4v0+2WMiEa8yUA5drNnDSI3VRkG+0K9jRfHKXki466/QQdGrNw7
# 8GuuzLBNai3gEKbavDU0Be73r982KjXeYXj7RuAkQfm0d4H7tiwtg91Cd1dPKfzh
# mhpmOFJDCg==
# =joNM
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 10 Jan 2025 07:09:45 EST
# gpg: using RSA key AA1B48B0A22326A5A4C364CFC798DC741BEC319D
# gpg: issuer "farosas@suse.de"
# gpg: Good signature from "Fabiano Rosas <farosas@suse.de>" [unknown]
# gpg: aka "Fabiano Almeida Rosas <fabiano.rosas@suse.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: AA1B 48B0 A223 26A5 A4C3 64CF C798 DC74 1BEC 319D
* tag 'migration-20250110-pull-request' of https://gitlab.com/farosas/qemu: (25 commits)
multifd: bugfix for incorrect migration data with qatzip compression
multifd: bugfix for incorrect migration data with QPL compression
multifd: bugfix for migration using compression methods
s390x: Fix CSS migration
migration: Fix arrays of pointers in JSON writer
migration: Dump correct JSON format for nullptr replacement
migration: Rename vmstate_info_nullptr
migration: Fix parsing of s390 stream
migration: Remove unused argument in vmsd_desc_field_end
migration: Add more error handling to analyze-migration.py
migration/block: Rewrite disk activation
migration/block: Fix possible race with block_inactive
migration/block: Apply late-block-active behavior to postcopy
migration/block: Make late-block-active the default
qmp/cont: Only activate disks if migration completed
migration: Add helper to get target runstate
migration/multifd: Fix compat with QEMU < 9.0
migration/multifd: Document the reason to sync for save_setup()
migration/multifd: Cleanup src flushes on condition check
migration/multifd: Remove sync processing on postcopy
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r-- | hw/s390x/s390-virtio-ccw.c | 2 | ||||
-rw-r--r-- | include/migration/misc.h | 4 | ||||
-rw-r--r-- | migration/block-active.c | 94 | ||||
-rw-r--r-- | migration/colo.c | 2 | ||||
-rw-r--r-- | migration/meson.build | 1 | ||||
-rw-r--r-- | migration/migration.c | 136 | ||||
-rw-r--r-- | migration/migration.h | 6 | ||||
-rw-r--r-- | migration/multifd-nocomp.c | 77 | ||||
-rw-r--r-- | migration/multifd-qatzip.c | 1 | ||||
-rw-r--r-- | migration/multifd-qpl.c | 1 | ||||
-rw-r--r-- | migration/multifd-uadk.c | 2 | ||||
-rw-r--r-- | migration/multifd.c | 32 | ||||
-rw-r--r-- | migration/multifd.h | 27 | ||||
-rw-r--r-- | migration/ram.c | 89 | ||||
-rw-r--r-- | migration/ram.h | 28 | ||||
-rw-r--r-- | migration/rdma.h | 7 | ||||
-rw-r--r-- | migration/savevm.c | 46 | ||||
-rw-r--r-- | migration/trace-events | 3 | ||||
-rw-r--r-- | migration/vmstate-types.c | 2 | ||||
-rw-r--r-- | migration/vmstate.c | 151 | ||||
-rw-r--r-- | monitor/qmp-cmds.c | 22 | ||||
-rwxr-xr-x | scripts/analyze-migration.py | 140 |
22 files changed, 601 insertions, 272 deletions
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 8a242cc..38aeba1 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -1244,6 +1244,7 @@ static void ccw_machine_2_9_instance_options(MachineState *machine) s390_cpudef_featoff_greater(12, 1, S390_FEAT_ZPCI); s390_cpudef_featoff_greater(12, 1, S390_FEAT_ADAPTER_INT_SUPPRESSION); s390_cpudef_featoff_greater(12, 1, S390_FEAT_ADAPTER_EVENT_NOTIFICATION); + css_migration_enabled = false; } static void ccw_machine_2_9_class_options(MachineClass *mc) @@ -1256,7 +1257,6 @@ static void ccw_machine_2_9_class_options(MachineClass *mc) ccw_machine_2_10_class_options(mc); compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len); compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); - css_migration_enabled = false; } DEFINE_CCW_MACHINE(2, 9); diff --git a/include/migration/misc.h b/include/migration/misc.h index c0e23fd..67f7ef7 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -104,4 +104,8 @@ bool migration_incoming_postcopy_advised(void); /* True if background snapshot is active */ bool migration_in_bg_snapshot(void); +/* Wrapper for block active/inactive operations */ +bool migration_block_activate(Error **errp); +bool migration_block_inactivate(void); + #endif diff --git a/migration/block-active.c b/migration/block-active.c new file mode 100644 index 0000000..d477cf8 --- /dev/null +++ b/migration/block-active.c @@ -0,0 +1,94 @@ +/* + * Block activation tracking for migration purpose + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2024 Red Hat, Inc. + */ +#include "qemu/osdep.h" +#include "block/block.h" +#include "qapi/error.h" +#include "migration/migration.h" +#include "qemu/error-report.h" +#include "trace.h" + +/* + * Migration-only cache to remember the block layer activation status. + * Protected by BQL. + * + * We need this because.. + * + * - Migration can fail after block devices are invalidated (during + * switchover phase). When that happens, we need to be able to recover + * the block drive status by re-activating them. + * + * - Currently bdrv_inactivate_all() is not safe to be invoked on top of + * invalidated drives (even if bdrv_activate_all() is actually safe to be + * called any time!). It means remembering this could help migration to + * make sure it won't invalidate twice in a row, crashing QEMU. It can + * happen when we migrate a PAUSED VM from host1 to host2, then migrate + * again to host3 without starting it. TODO: a cleaner solution is to + * allow safe invoke of bdrv_inactivate_all() at anytime, like + * bdrv_activate_all(). + * + * For freshly started QEMU, the flag is initialized to TRUE reflecting the + * scenario where QEMU owns block device ownerships. + * + * For incoming QEMU taking a migration stream, the flag is initialized to + * FALSE reflecting that the incoming side doesn't own the block devices, + * not until switchover happens. + */ +static bool migration_block_active; + +/* Setup the disk activation status */ +void migration_block_active_setup(bool active) +{ + migration_block_active = active; +} + +bool migration_block_activate(Error **errp) +{ + ERRP_GUARD(); + + assert(bql_locked()); + + if (migration_block_active) { + trace_migration_block_activation("active-skipped"); + return true; + } + + trace_migration_block_activation("active"); + + bdrv_activate_all(errp); + if (*errp) { + error_report_err(error_copy(*errp)); + return false; + } + + migration_block_active = true; + return true; +} + +bool migration_block_inactivate(void) +{ + int ret; + + assert(bql_locked()); + + if (!migration_block_active) { + trace_migration_block_activation("inactive-skipped"); + return true; + } + + trace_migration_block_activation("inactive"); + + ret = bdrv_inactivate_all(); + if (ret) { + error_report("%s: bdrv_inactivate_all() failed: %d", + __func__, ret); + return false; + } + + migration_block_active = false; + return true; +} diff --git a/migration/colo.c b/migration/colo.c index afc9869..9a8e5fb 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -836,7 +836,7 @@ static void *colo_process_incoming_thread(void *opaque) /* Make sure all file formats throw away their mutable metadata */ bql_lock(); - bdrv_activate_all(&local_err); + migration_block_activate(&local_err); bql_unlock(); if (local_err) { error_report_err(local_err); diff --git a/migration/meson.build b/migration/meson.build index d53cf34..dac687e 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -11,6 +11,7 @@ migration_files = files( system_ss.add(files( 'block-dirty-bitmap.c', + 'block-active.c', 'channel.c', 'channel-block.c', 'cpu-throttle.c', diff --git a/migration/migration.c b/migration/migration.c index df61ca4..2d1da91 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -135,6 +135,21 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } +static RunState migration_get_target_runstate(void) +{ + /* + * When the global state is not migrated, it means we don't know the + * runstate of the src QEMU. We don't have much choice but assuming + * the VM is running. NOTE: this is pretty rare case, so far only Xen + * uses it. + */ + if (!global_state_received()) { + return RUN_STATE_RUNNING; + } + + return global_state_get_runstate(); +} + static bool transport_supports_multi_channels(MigrationAddress *addr) { if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { @@ -723,30 +738,10 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, static void process_incoming_migration_bh(void *opaque) { - Error *local_err = NULL; MigrationIncomingState *mis = opaque; trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter"); - /* If capability late_block_activate is set: - * Only fire up the block code now if we're going to restart the - * VM, else 'cont' will do it. - * This causes file locking to happen; so we don't want it to happen - * unless we really are starting the VM. - */ - if (!migrate_late_block_activate() || - (autostart && (!global_state_received() || - runstate_is_live(global_state_get_runstate())))) { - /* Make sure all file formats throw away their mutable metadata. - * If we get an error here, just don't restart the VM yet. */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - local_err = NULL; - autostart = false; - } - } - /* * This must happen after all error conditions are dealt with and * we're sure the VM is going to be running on this host. @@ -759,10 +754,23 @@ static void process_incoming_migration_bh(void *opaque) dirty_bitmap_mig_before_vm_start(); - if (!global_state_received() || - runstate_is_live(global_state_get_runstate())) { + if (runstate_is_live(migration_get_target_runstate())) { if (autostart) { - vm_start(); + /* + * Block activation is always delayed until VM starts, either + * here (which means we need to start the dest VM right now..), + * or until qmp_cont() later. + * + * We used to have cap 'late-block-activate' but now we do this + * unconditionally, as it has no harm but only benefit. E.g., + * it's not part of migration ABI on the time of disk activation. + * + * Make sure all file formats throw away their mutable + * metadata. If error, don't restart the VM yet. + */ + if (migration_block_activate(NULL)) { + vm_start(); + } } else { runstate_set(RUN_STATE_PAUSED); } @@ -1547,16 +1555,6 @@ static void migrate_fd_cancel(MigrationState *s) } } } - if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) { - Error *local_err = NULL; - - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } else { - s->block_inactive = false; - } - } } void migration_add_notifier_mode(NotifierWithReturn *notify, @@ -1840,6 +1838,12 @@ void qmp_migrate_incoming(const char *uri, bool has_channels, return; } + /* + * Newly setup incoming QEMU. Mark the block active state to reflect + * that the src currently owns the disks. + */ + migration_block_active_setup(false); + once = false; } @@ -2492,7 +2496,6 @@ static int postcopy_start(MigrationState *ms, Error **errp) QIOChannelBuffer *bioc; QEMUFile *fb; uint64_t bandwidth = migrate_max_postcopy_bandwidth(); - bool restart_block = false; int cur_state = MIGRATION_STATUS_ACTIVE; if (migrate_postcopy_preempt()) { @@ -2528,13 +2531,10 @@ static int postcopy_start(MigrationState *ms, Error **errp) goto fail; } - ret = bdrv_inactivate_all(); - if (ret < 0) { - error_setg_errno(errp, -ret, "%s: Failed in bdrv_inactivate_all()", - __func__); + if (!migration_block_inactivate()) { + error_setg(errp, "%s: Failed in bdrv_inactivate_all()", __func__); goto fail; } - restart_block = true; /* * Cause any non-postcopiable, but iterative devices to @@ -2604,8 +2604,6 @@ static int postcopy_start(MigrationState *ms, Error **errp) goto fail_closefb; } - restart_block = false; - /* Now send that blob */ if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) { error_setg(errp, "%s: Failed to send packaged data", __func__); @@ -2650,17 +2648,7 @@ fail_closefb: fail: migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_FAILED); - if (restart_block) { - /* A failure happened early enough that we know the destination hasn't - * accessed block devices, so we're safe to recover. - */ - Error *local_err = NULL; - - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } - } + migration_block_activate(NULL); migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL); bql_unlock(); return -1; @@ -2729,14 +2717,11 @@ static int migration_completion_precopy(MigrationState *s, goto out_unlock; } - /* - * Inactivate disks except in COLO, and track that we have done so in order - * to remember to reactivate them if migration fails or is cancelled. - */ - s->block_inactive = !migrate_colo(); migration_rate_set(RATE_LIMIT_DISABLED); + + /* Inactivate disks except in COLO */ ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - s->block_inactive); + !migrate_colo()); out_unlock: bql_unlock(); return ret; @@ -2761,31 +2746,6 @@ static void migration_completion_postcopy(MigrationState *s) trace_migration_completion_postcopy_end_after_complete(); } -static void migration_completion_failed(MigrationState *s, - int current_active_state) -{ - if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_DEVICE)) { - /* - * If not doing postcopy, vm_start() will be called: let's - * regain control on images. - */ - Error *local_err = NULL; - - bql_lock(); - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } else { - s->block_inactive = false; - } - bql_unlock(); - } - - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); -} - /** * migration_completion: Used by migration_thread when there's not much left. * The caller 'breaks' the loop when this returns. @@ -2839,7 +2799,8 @@ fail: error_free(local_err); } - migration_completion_failed(s, current_active_state); + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); } /** @@ -3269,6 +3230,11 @@ static void migration_iteration_finish(MigrationState *s) case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLING: + /* + * Re-activate the block drives if they're inactivated. Note, COLO + * shouldn't use block_active at all, so it should be no-op there. + */ + migration_block_activate(NULL); if (runstate_is_live(s->vm_old_state)) { if (!runstate_check(RUN_STATE_SHUTDOWN)) { vm_start(); @@ -3842,6 +3808,8 @@ static void migration_instance_init(Object *obj) ms->state = MIGRATION_STATUS_NONE; ms->mbps = -1; ms->pages_per_second = -1; + /* Freshly started QEMU owns all the block devices */ + migration_block_active_setup(true); qemu_sem_init(&ms->pause_sem, 0); qemu_mutex_init(&ms->error_mutex); diff --git a/migration/migration.h b/migration/migration.h index 7b6e718..0df2a18 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -370,9 +370,6 @@ struct MigrationState { /* Flag set once the migration thread is running (and needs joining) */ bool migration_thread_running; - /* Flag set once the migration thread called bdrv_inactivate_all */ - bool block_inactive; - /* Migration is waiting for guest to unplug device */ QemuSemaphore wait_unplug_sem; @@ -556,4 +553,7 @@ void migration_bitmap_sync_precopy(bool last_stage); /* migration/block-dirty-bitmap.c */ void dirty_bitmap_mig_init(void); +/* migration/block-active.c */ +void migration_block_active_setup(bool active); + #endif diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 5519115..1325dba 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -20,6 +20,7 @@ #include "qemu/cutils.h" #include "qemu/error-report.h" #include "trace.h" +#include "qemu-file.h" static MultiFDSendData *multifd_ram_send; @@ -343,8 +344,53 @@ retry: return true; } -int multifd_ram_flush_and_sync(void) +/* + * We have two modes for multifd flushes: + * + * - Per-section mode: this is the legacy way to flush, it requires one + * MULTIFD_FLAG_SYNC message for each RAM_SAVE_FLAG_EOS. + * + * - Per-round mode: this is the modern way to flush, it requires one + * MULTIFD_FLAG_SYNC message only for each round of RAM scan. Normally + * it's paired with a new RAM_SAVE_FLAG_MULTIFD_FLUSH message in network + * based migrations. + * + * One thing to mention is mapped-ram always use the modern way to sync. + */ + +/* Do we need a per-section multifd flush (legacy way)? */ +bool multifd_ram_sync_per_section(void) +{ + if (!migrate_multifd()) { + return false; + } + + if (migrate_mapped_ram()) { + return false; + } + + return migrate_multifd_flush_after_each_section(); +} + +/* Do we need a per-round multifd flush (modern way)? */ +bool multifd_ram_sync_per_round(void) +{ + if (!migrate_multifd()) { + return false; + } + + if (migrate_mapped_ram()) { + return true; + } + + return !migrate_multifd_flush_after_each_section(); +} + +int multifd_ram_flush_and_sync(QEMUFile *f) { + MultiFDSyncReq req; + int ret; + if (!migrate_multifd()) { return 0; } @@ -356,12 +402,37 @@ int multifd_ram_flush_and_sync(void) } } - return multifd_send_sync_main(); + /* File migrations only need to sync with threads */ + req = migrate_mapped_ram() ? MULTIFD_SYNC_LOCAL : MULTIFD_SYNC_ALL; + + ret = multifd_send_sync_main(req); + if (ret) { + return ret; + } + + /* If we don't need to sync with remote at all, nothing else to do */ + if (req == MULTIFD_SYNC_LOCAL) { + return 0; + } + + /* + * Old QEMUs don't understand RAM_SAVE_FLAG_MULTIFD_FLUSH, it relies + * on RAM_SAVE_FLAG_EOS instead. + */ + if (migrate_multifd_flush_after_each_section()) { + return 0; + } + + qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); + qemu_fflush(f); + + return 0; } bool multifd_send_prepare_common(MultiFDSendParams *p) { MultiFDPages_t *pages = &p->data->u.ram; + multifd_send_prepare_header(p); multifd_send_zero_page_detect(p); if (!pages->normal_num) { @@ -369,8 +440,6 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) return false; } - multifd_send_prepare_header(p); - return true; } diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c index 7b68397..6a0e989 100644 --- a/migration/multifd-qatzip.c +++ b/migration/multifd-qatzip.c @@ -373,6 +373,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp) /* Copy each page to its appropriate location. */ for (int i = 0; i < p->normal_num; i++) { memcpy(p->host + p->normal[i], q->out_buf + page_size * i, page_size); + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return 0; } diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index bbe4666..88e2344 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -679,6 +679,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); assert(qpl->zlen[i] <= multifd_ram_page_size()); zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } /* read compressed pages */ diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 6e6a290..6895c1f 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -169,7 +169,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) .src_len = page_size, .dst = buf, /* Set dst_len to double the src in case compressed out >= page_size */ - .dst_len = p->page_size * 2, + .dst_len = page_size * 2, }; if (uadk_data->handle) { diff --git a/migration/multifd.c b/migration/multifd.c index 4f973d7..ab73d6d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -252,9 +252,8 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; - if (!(p->flags & MULTIFD_FLAG_SYNC)) { - ret = multifd_ram_unfill_packet(p, errp); - } + /* Always unfill, old QEMUs (<9.0) send data along with SYNC */ + ret = multifd_ram_unfill_packet(p, errp); trace_multifd_recv_unfill(p->id, p->packet_num, p->flags, p->next_packet_size); @@ -523,11 +522,13 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(void) +int multifd_send_sync_main(MultiFDSyncReq req) { int i; bool flush_zero_copy; + assert(req != MULTIFD_SYNC_NONE); + flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { @@ -543,8 +544,8 @@ int multifd_send_sync_main(void) * We should be the only user so far, so not possible to be set by * others concurrently. */ - assert(qatomic_read(&p->pending_sync) == false); - qatomic_set(&p->pending_sync, true); + assert(qatomic_read(&p->pending_sync) == MULTIFD_SYNC_NONE); + qatomic_set(&p->pending_sync, req); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -635,14 +636,17 @@ static void *multifd_send_thread(void *opaque) */ qatomic_store_release(&p->pending_job, false); } else { + MultiFDSyncReq req = qatomic_read(&p->pending_sync); + /* * If not a normal job, must be a sync request. Note that * pending_sync is a standalone flag (unlike pending_job), so * it doesn't require explicit memory barriers. */ - assert(qatomic_read(&p->pending_sync)); + assert(req != MULTIFD_SYNC_NONE); - if (use_packets) { + /* Only push the SYNC message if it involves a remote sync */ + if (req == MULTIFD_SYNC_ALL) { p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, @@ -654,7 +658,7 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->packet_len); } - qatomic_set(&p->pending_sync, false); + qatomic_set(&p->pending_sync, MULTIFD_SYNC_NONE); qemu_sem_post(&p->sem_sync); } } @@ -1151,9 +1155,13 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - if (!(flags & MULTIFD_FLAG_SYNC)) { - has_data = p->normal_num || p->zero_num; - } + + /* + * Even if it's a SYNC packet, this needs to be set + * because older QEMUs (<9.0) still send data along with + * the SYNC packet. + */ + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } else { /* diff --git a/migration/multifd.h b/migration/multifd.h index 50d58c0..bd785b9 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -19,6 +19,22 @@ typedef struct MultiFDRecvData MultiFDRecvData; typedef struct MultiFDSendData MultiFDSendData; +typedef enum { + /* No sync request */ + MULTIFD_SYNC_NONE = 0, + /* Sync locally on the sender threads without pushing messages */ + MULTIFD_SYNC_LOCAL, + /* + * Sync not only on the sender threads, but also push MULTIFD_FLAG_SYNC + * message to the wire for each iochannel (which is for a remote sync). + * + * When remote sync is used, need to be paired with a follow up + * RAM_SAVE_FLAG_EOS / RAM_SAVE_FLAG_MULTIFD_FLUSH message on the main + * channel. + */ + MULTIFD_SYNC_ALL, +} MultiFDSyncReq; + bool multifd_send_setup(void); void multifd_send_shutdown(void); void multifd_send_channel_created(void); @@ -28,7 +44,7 @@ void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(void); +int multifd_send_sync_main(MultiFDSyncReq req); bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); bool multifd_recv(void); MultiFDRecvData *multifd_get_recv_data(void); @@ -143,7 +159,7 @@ typedef struct { /* multifd flags for each packet */ uint32_t flags; /* - * The sender thread has work to do if either of below boolean is set. + * The sender thread has work to do if either of below field is set. * * @pending_job: a job is pending * @pending_sync: a sync request is pending @@ -152,7 +168,8 @@ typedef struct { * cleared by the multifd sender threads. */ bool pending_job; - bool pending_sync; + MultiFDSyncReq pending_sync; + MultiFDSendData *data; /* thread local variables. No locking required */ @@ -337,7 +354,9 @@ static inline uint32_t multifd_ram_page_count(void) void multifd_ram_save_setup(void); void multifd_ram_save_cleanup(void); -int multifd_ram_flush_and_sync(void); +int multifd_ram_flush_and_sync(QEMUFile *f); +bool multifd_ram_sync_per_round(void); +bool multifd_ram_sync_per_section(void); size_t multifd_ram_payload_size(void); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); diff --git a/migration/ram.c b/migration/ram.c index a60666d..ce28328 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -72,27 +72,6 @@ /* ram save/restore */ /* - * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it - * worked for pages that were filled with the same char. We switched - * it to only search for the zero value. And to avoid confusion with - * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. - * - * RAM_SAVE_FLAG_FULL was obsoleted in 2009. - * - * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1. - */ -#define RAM_SAVE_FLAG_FULL 0x01 -#define RAM_SAVE_FLAG_ZERO 0x02 -#define RAM_SAVE_FLAG_MEM_SIZE 0x04 -#define RAM_SAVE_FLAG_PAGE 0x08 -#define RAM_SAVE_FLAG_EOS 0x10 -#define RAM_SAVE_FLAG_CONTINUE 0x20 -#define RAM_SAVE_FLAG_XBZRLE 0x40 -/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ -#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 -/* We can't use any flag that is bigger than 0x200 */ - -/* * mapped-ram migration supports O_DIRECT, so we need to make sure the * userspace buffer, the IO operation size and the file offset are * aligned according to the underlying device's block size. The first @@ -1323,19 +1302,12 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->page = 0; pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { - if (migrate_multifd() && - (!migrate_multifd_flush_after_each_section() || - migrate_mapped_ram())) { + if (multifd_ram_sync_per_round()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_ram_flush_and_sync(); + int ret = multifd_ram_flush_and_sync(f); if (ret < 0) { return ret; } - - if (!migrate_mapped_ram()) { - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - qemu_fflush(f); - } } /* Hit the end of the list */ @@ -3064,19 +3036,39 @@ static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) migration_ops->ram_save_target_page = ram_save_target_page_legacy; } + /* + * This operation is unfortunate.. + * + * For legacy QEMUs using per-section sync + * ======================================= + * + * This must exist because the EOS below requires the SYNC messages + * per-channel to work. + * + * For modern QEMUs using per-round sync + * ===================================== + * + * Logically such sync is not needed, and recv threads should not run + * until setup ready (using things like channels_ready on src). Then + * we should be all fine. + * + * However even if we add channels_ready to recv side in new QEMUs, old + * QEMU won't have them so this sync will still be needed to make sure + * multifd recv threads won't start processing guest pages early before + * ram_load_setup() is properly done. + * + * Let's stick with this. Fortunately the overhead is low to sync + * during setup because the VM is running, so at least it's not + * accounted as part of downtime. + */ bql_unlock(); - ret = multifd_ram_flush_and_sync(); + ret = multifd_ram_flush_and_sync(f); bql_lock(); if (ret < 0) { error_setg(errp, "%s: multifd synchronization failed", __func__); return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section() - && !migrate_mapped_ram()) { - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - } - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ret = qemu_fflush(f); if (ret < 0) { @@ -3209,9 +3201,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_running()) { - if (migrate_multifd() && migrate_multifd_flush_after_each_section() && - !migrate_mapped_ram()) { - ret = multifd_ram_flush_and_sync(); + if (multifd_ram_sync_per_section()) { + ret = multifd_ram_flush_and_sync(f); if (ret < 0) { return ret; } @@ -3283,9 +3274,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - ret = multifd_ram_flush_and_sync(); - if (ret < 0) { - return ret; + if (multifd_ram_sync_per_section()) { + /* + * Only the old dest QEMU will need this sync, because each EOS + * will require one SYNC message on each channel. + */ + ret = multifd_ram_flush_and_sync(f); + if (ret < 0) { + return ret; + } } if (migrate_mapped_ram()) { @@ -3796,15 +3793,7 @@ int ram_load_postcopy(QEMUFile *f, int channel) TARGET_PAGE_SIZE); } break; - case RAM_SAVE_FLAG_MULTIFD_FLUSH: - multifd_recv_sync_main(); - break; case RAM_SAVE_FLAG_EOS: - /* normal exit */ - if (migrate_multifd() && - migrate_multifd_flush_after_each_section()) { - multifd_recv_sync_main(); - } break; default: error_report("Unknown combination of migration flags: 0x%x" diff --git a/migration/ram.h b/migration/ram.h index 0d1981f..921c39a 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -33,6 +33,34 @@ #include "exec/cpu-common.h" #include "io/channel.h" +/* + * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it + * worked for pages that were filled with the same char. We switched + * it to only search for the zero value. And to avoid confusion with + * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. + * + * RAM_SAVE_FLAG_FULL (0x01) was obsoleted in 2009. + * + * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1. + * + * RAM_SAVE_FLAG_HOOK is only used in RDMA. Whenever this is found in the + * data stream, the flags will be passed to rdma functions in the + * incoming-migration side. + * + * We can't use any flag that is bigger than 0x200, because the flags are + * always assumed to be encoded in a ramblock address offset, which is + * multiple of PAGE_SIZE. Here it means QEMU supports migration with any + * architecture that has PAGE_SIZE>=1K (0x400). + */ +#define RAM_SAVE_FLAG_ZERO 0x002 +#define RAM_SAVE_FLAG_MEM_SIZE 0x004 +#define RAM_SAVE_FLAG_PAGE 0x008 +#define RAM_SAVE_FLAG_EOS 0x010 +#define RAM_SAVE_FLAG_CONTINUE 0x020 +#define RAM_SAVE_FLAG_XBZRLE 0x040 +#define RAM_SAVE_FLAG_HOOK 0x080 +#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 + extern XBZRLECacheStats xbzrle_counters; /* Should be holding either ram_list.mutex, or the RCU lock. */ diff --git a/migration/rdma.h b/migration/rdma.h index a8d27f3..f55f28b 100644 --- a/migration/rdma.h +++ b/migration/rdma.h @@ -33,13 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp); #define RAM_CONTROL_ROUND 1 #define RAM_CONTROL_FINISH 3 -/* - * Whenever this is found in the data stream, the flags - * will be passed to rdma functions in the incoming-migration - * side. - */ -#define RAM_SAVE_FLAG_HOOK 0x80 - #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 diff --git a/migration/savevm.c b/migration/savevm.c index 927b114..c929da1 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1547,15 +1547,16 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, } if (inactivate_disks) { - /* Inactivate before sending QEMU_VM_EOF so that the - * bdrv_activate_all() on the other end won't fail. */ - ret = bdrv_inactivate_all(); - if (ret) { - error_setg(&local_err, "%s: bdrv_inactivate_all() failed (%d)", - __func__, ret); + /* + * Inactivate before sending QEMU_VM_EOF so that the + * bdrv_activate_all() on the other end won't fail. + */ + if (!migration_block_inactivate()) { + error_setg(&local_err, "%s: bdrv_inactivate_all() failed", + __func__); migrate_set_error(ms, local_err); error_report_err(local_err); - qemu_file_set_error(f, ret); + qemu_file_set_error(f, -EFAULT); return ret; } } @@ -2121,7 +2122,6 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) static void loadvm_postcopy_handle_run_bh(void *opaque) { - Error *local_err = NULL; MigrationIncomingState *mis = opaque; trace_vmstate_downtime_checkpoint("dst-postcopy-bh-enter"); @@ -2137,22 +2137,20 @@ static void loadvm_postcopy_handle_run_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-postcopy-bh-announced"); - /* Make sure all file formats throw away their mutable metadata. - * If we get an error here, just don't restart the VM yet. */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - local_err = NULL; - autostart = false; - } - - trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated"); - dirty_bitmap_mig_before_vm_start(); if (autostart) { - /* Hold onto your hats, starting the CPU */ - vm_start(); + /* + * Make sure all file formats throw away their mutable metadata. + * If we get an error here, just don't restart the VM yet. + */ + bool success = migration_block_activate(NULL); + + trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated"); + + if (success) { + vm_start(); + } } else { /* leave it paused and let management decide when to start the CPU */ runstate_set(RUN_STATE_PAUSED); @@ -3192,11 +3190,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, * side of the migration take control of the images. */ if (live && !saved_vm_running) { - ret = bdrv_inactivate_all(); - if (ret) { - error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)", - __func__, ret); - } + migration_block_inactivate(); } } diff --git a/migration/trace-events b/migration/trace-events index bb0e0cc..b82a1c5 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -383,3 +383,6 @@ migration_pagecache_insert(void) "Error allocating page" # cpu-throttle.c cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%" cpu_throttle_dirty_sync(void) "" + +# block-active.c +migration_block_activation(const char *name) "%s" diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c index e83bfcc..d70d573 100644 --- a/migration/vmstate-types.c +++ b/migration/vmstate-types.c @@ -338,7 +338,7 @@ static int put_nullptr(QEMUFile *f, void *pv, size_t size, } const VMStateInfo vmstate_info_nullptr = { - .name = "uint64", + .name = "nullptr", .get = get_nullptr, .put = put_nullptr, }; diff --git a/migration/vmstate.c b/migration/vmstate.c index fa002b2..82bd005 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -51,6 +51,36 @@ vmstate_field_exists(const VMStateDescription *vmsd, const VMStateField *field, return result; } +/* + * Create a fake nullptr field when there's a NULL pointer detected in the + * array of a VMS_ARRAY_OF_POINTER VMSD field. It's needed because we + * can't dereference the NULL pointer. + */ +static const VMStateField * +vmsd_create_fake_nullptr_field(const VMStateField *field) +{ + VMStateField *fake = g_new0(VMStateField, 1); + + /* It can only happen on an array of pointers! */ + assert(field->flags & VMS_ARRAY_OF_POINTER); + + /* Some of fake's properties should match the original's */ + fake->name = field->name; + fake->version_id = field->version_id; + + /* Do not need "field_exists" check as it always exists (which is null) */ + fake->field_exists = NULL; + + /* See vmstate_info_nullptr - use 1 byte to represent nullptr */ + fake->size = 1; + fake->info = &vmstate_info_nullptr; + fake->flags = VMS_SINGLE; + + /* All the rest fields shouldn't matter.. */ + + return (const VMStateField *)fake; +} + static int vmstate_n_elems(void *opaque, const VMStateField *field) { int n_elems = 1; @@ -143,23 +173,39 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, } for (i = 0; i < n_elems; i++) { void *curr_elem = first_elem + size * i; + const VMStateField *inner_field; if (field->flags & VMS_ARRAY_OF_POINTER) { curr_elem = *(void **)curr_elem; } + if (!curr_elem && size) { - /* if null pointer check placeholder and do not follow */ - assert(field->flags & VMS_ARRAY_OF_POINTER); - ret = vmstate_info_nullptr.get(f, curr_elem, size, NULL); - } else if (field->flags & VMS_STRUCT) { - ret = vmstate_load_state(f, field->vmsd, curr_elem, - field->vmsd->version_id); - } else if (field->flags & VMS_VSTRUCT) { - ret = vmstate_load_state(f, field->vmsd, curr_elem, - field->struct_version_id); + /* + * If null pointer found (which should only happen in + * an array of pointers), use null placeholder and do + * not follow. + */ + inner_field = vmsd_create_fake_nullptr_field(field); + } else { + inner_field = field; + } + + if (inner_field->flags & VMS_STRUCT) { + ret = vmstate_load_state(f, inner_field->vmsd, curr_elem, + inner_field->vmsd->version_id); + } else if (inner_field->flags & VMS_VSTRUCT) { + ret = vmstate_load_state(f, inner_field->vmsd, curr_elem, + inner_field->struct_version_id); } else { - ret = field->info->get(f, curr_elem, size, field); + ret = inner_field->info->get(f, curr_elem, size, + inner_field); + } + + /* If we used a fake temp field.. free it now */ + if (inner_field != field) { + g_clear_pointer((gpointer *)&inner_field, g_free); } + if (ret >= 0) { ret = qemu_file_get_error(f); } @@ -311,7 +357,7 @@ static void vmsd_desc_field_start(const VMStateDescription *vmsd, static void vmsd_desc_field_end(const VMStateDescription *vmsd, JSONWriter *vmdesc, - const VMStateField *field, size_t size, int i) + const VMStateField *field, size_t size) { if (!vmdesc) { return; @@ -379,37 +425,89 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, int size = vmstate_size(opaque, field); uint64_t old_offset, written_bytes; JSONWriter *vmdesc_loop = vmdesc; + bool is_prev_null = false; trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems); if (field->flags & VMS_POINTER) { first_elem = *(void **)first_elem; assert(first_elem || !n_elems || !size); } + for (i = 0; i < n_elems; i++) { void *curr_elem = first_elem + size * i; + const VMStateField *inner_field; + bool is_null; + int max_elems = n_elems - i; - vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems); old_offset = qemu_file_transferred(f); if (field->flags & VMS_ARRAY_OF_POINTER) { assert(curr_elem); curr_elem = *(void **)curr_elem; } + if (!curr_elem && size) { - /* if null pointer write placeholder and do not follow */ - assert(field->flags & VMS_ARRAY_OF_POINTER); - ret = vmstate_info_nullptr.put(f, curr_elem, size, NULL, - NULL); - } else if (field->flags & VMS_STRUCT) { - ret = vmstate_save_state(f, field->vmsd, curr_elem, - vmdesc_loop); - } else if (field->flags & VMS_VSTRUCT) { - ret = vmstate_save_state_v(f, field->vmsd, curr_elem, - vmdesc_loop, - field->struct_version_id, errp); + /* + * If null pointer found (which should only happen in + * an array of pointers), use null placeholder and do + * not follow. + */ + inner_field = vmsd_create_fake_nullptr_field(field); + is_null = true; + } else { + inner_field = field; + is_null = false; + } + + /* + * Due to the fake nullptr handling above, if there's mixed + * null/non-null data, it doesn't make sense to emit a + * compressed array representation spanning the entire array + * because the field types will be different (e.g. struct + * vs. nullptr). Search ahead for the next null/non-null element + * and start a new compressed array if found. + */ + if (field->flags & VMS_ARRAY_OF_POINTER && + is_null != is_prev_null) { + + is_prev_null = is_null; + vmdesc_loop = vmdesc; + + for (int j = i + 1; j < n_elems; j++) { + void *elem = *(void **)(first_elem + size * j); + bool elem_is_null = !elem && size; + + if (is_null != elem_is_null) { + max_elems = j - i; + break; + } + } + } + + vmsd_desc_field_start(vmsd, vmdesc_loop, inner_field, + i, max_elems); + + if (inner_field->flags & VMS_STRUCT) { + ret = vmstate_save_state(f, inner_field->vmsd, + curr_elem, vmdesc_loop); + } else if (inner_field->flags & VMS_VSTRUCT) { + ret = vmstate_save_state_v(f, inner_field->vmsd, + curr_elem, vmdesc_loop, + inner_field->struct_version_id, + errp); } else { - ret = field->info->put(f, curr_elem, size, field, - vmdesc_loop); + ret = inner_field->info->put(f, curr_elem, size, + inner_field, vmdesc_loop); + } + + written_bytes = qemu_file_transferred(f) - old_offset; + vmsd_desc_field_end(vmsd, vmdesc_loop, inner_field, + written_bytes); + + /* If we used a fake temp field.. free it now */ + if (inner_field != field) { + g_clear_pointer((gpointer *)&inner_field, g_free); } + if (ret) { error_setg(errp, "Save of field %s/%s failed", vmsd->name, field->name); @@ -419,9 +517,6 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, return ret; } - written_bytes = qemu_file_transferred(f) - old_offset; - vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i); - /* Compressed arrays only care about the first element */ if (vmdesc_loop && vmsd_can_compress(field)) { vmdesc_loop = NULL; diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index 34f2150..1ca44fb 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -31,6 +31,7 @@ #include "qapi/type-helpers.h" #include "hw/mem/memory-device.h" #include "hw/intc/intc.h" +#include "migration/misc.h" NameInfo *qmp_query_name(Error **errp) { @@ -96,21 +97,18 @@ void qmp_cont(Error **errp) } } - /* Continuing after completed migration. Images have been inactivated to - * allow the destination to take control. Need to get control back now. - * - * If there are no inactive block nodes (e.g. because the VM was just - * paused rather than completing a migration), bdrv_inactivate_all() simply - * doesn't do anything. */ - bdrv_activate_all(&local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - if (runstate_check(RUN_STATE_INMIGRATE)) { autostart = 1; } else { + /* + * Continuing after completed migration. Images have been + * inactivated to allow the destination to take control. Need to + * get control back now. + */ + if (!migration_block_activate(&local_err)) { + error_propagate(errp, local_err); + return; + } vm_start(); } } diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py index 8a254a5..8e1fbf4 100755 --- a/scripts/analyze-migration.py +++ b/scripts/analyze-migration.py @@ -65,6 +65,9 @@ class MigrationFile(object): def tell(self): return self.file.tell() + def seek(self, a, b): + return self.file.seek(a, b) + # The VMSD description is at the end of the file, after EOF. Look for # the last NULL byte, then for the beginning brace of JSON. def read_migration_debug_json(self): @@ -272,11 +275,24 @@ class S390StorageAttributes(object): self.section_key = section_key def read(self): + pos = 0 while True: addr_flags = self.file.read64() flags = addr_flags & 0xfff - if (flags & (self.STATTR_FLAG_DONE | self.STATTR_FLAG_EOS)): + + if flags & self.STATTR_FLAG_DONE: + pos = self.file.tell() + continue + elif flags & self.STATTR_FLAG_EOS: return + else: + # No EOS came after DONE, that's OK, but rewind the + # stream because this is not our data. + if pos: + self.file.seek(pos, os.SEEK_SET) + return + raise Exception("Unknown flags %x", flags) + if (flags & self.STATTR_FLAG_ERROR): raise Exception("Error in migration stream") count = self.file.read64() @@ -401,6 +417,28 @@ class VMSDFieldIntLE(VMSDFieldInt): super(VMSDFieldIntLE, self).__init__(desc, file) self.dtype = '<i%d' % self.size +class VMSDFieldNull(VMSDFieldGeneric): + NULL_PTR_MARKER = b'0' + + def __init__(self, desc, file): + super(VMSDFieldNull, self).__init__(desc, file) + + def __repr__(self): + # A NULL pointer is encoded in the stream as a '0' to + # disambiguate from a mere 0x0 value and avoid consumers + # trying to follow the NULL pointer. Displaying '0', 0x30 or + # 0x0 when analyzing the JSON debug stream could become + # confusing, so use an explicit term instead. + return "nullptr" + + def __str__(self): + return self.__repr__() + + def read(self): + super(VMSDFieldNull, self).read() + assert(self.data == self.NULL_PTR_MARKER) + return self.data + class VMSDFieldBool(VMSDFieldGeneric): def __init__(self, desc, file): super(VMSDFieldBool, self).__init__(desc, file) @@ -429,6 +467,9 @@ class VMSDFieldStruct(VMSDFieldGeneric): super(VMSDFieldStruct, self).__init__(desc, file) self.data = collections.OrderedDict() + if 'fields' not in self.desc['struct']: + raise Exception("No fields in struct. VMSD:\n%s" % self.desc) + # When we see compressed array elements, unfold them here new_fields = [] for field in self.desc['struct']['fields']: @@ -461,15 +502,25 @@ class VMSDFieldStruct(VMSDFieldGeneric): field['data'] = reader(field, self.file) field['data'].read() - if 'index' in field: - if field['name'] not in self.data: - self.data[field['name']] = [] - a = self.data[field['name']] - if len(a) != int(field['index']): - raise Exception("internal index of data field unmatched (%d/%d)" % (len(a), int(field['index']))) - a.append(field['data']) + fname = field['name'] + fdata = field['data'] + + # The field could be: + # i) a single data entry, e.g. uint64 + # ii) an array, indicated by it containing the 'index' key + # + # However, the overall data after parsing the whole + # stream, could be a mix of arrays and single data fields, + # all sharing the same field name due to how QEMU breaks + # up arrays with NULL pointers into multiple compressed + # array segments. + if fname not in self.data: + self.data[fname] = fdata + elif type(self.data[fname]) == list: + self.data[fname].append(fdata) else: - self.data[field['name']] = field['data'] + tmp = self.data[fname] + self.data[fname] = [tmp, fdata] if 'subsections' in self.desc['struct']: for subsection in self.desc['struct']['subsections']: @@ -477,6 +528,10 @@ class VMSDFieldStruct(VMSDFieldGeneric): raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell())) name = self.file.readstr() version_id = self.file.read32() + + if not subsection: + raise Exception("Empty description for subsection: %s" % name) + self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0)) self.data[name].read() @@ -535,6 +590,7 @@ vmsd_field_readers = { "bitmap" : VMSDFieldGeneric, "struct" : VMSDFieldStruct, "capability": VMSDFieldCap, + "nullptr": VMSDFieldNull, "unknown" : VMSDFieldGeneric, } @@ -574,10 +630,13 @@ class MigrationDump(object): } self.filename = filename self.vmsd_desc = None + self.vmsd_json = "" - def read(self, desc_only = False, dump_memory = False, write_memory = False): + def read(self, desc_only = False, dump_memory = False, + write_memory = False): # Read in the whole file file = MigrationFile(self.filename) + self.vmsd_json = file.read_migration_debug_json() # File magic data = file.read32() @@ -635,9 +694,11 @@ class MigrationDump(object): file.close() def load_vmsd_json(self, file): - vmsd_json = file.read_migration_debug_json() - self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict) + self.vmsd_desc = json.loads(self.vmsd_json, + object_pairs_hook=collections.OrderedDict) for device in self.vmsd_desc['devices']: + if 'fields' not in device: + raise Exception("vmstate for device %s has no fields" % device['name']) key = (device['name'], device['instance_id']) value = ( VMSDSection, device ) self.section_classes[key] = value @@ -666,31 +727,34 @@ args = parser.parse_args() jsonenc = JSONEncoder(indent=4, separators=(',', ': ')) -if args.extract: - dump = MigrationDump(args.file) +if not any([args.extract, args.dump == "state", args.dump == "desc"]): + raise Exception("Please specify either -x, -d state or -d desc") - dump.read(desc_only = True) - print("desc.json") - f = open("desc.json", "w") - f.truncate() - f.write(jsonenc.encode(dump.vmsd_desc)) - f.close() - - dump.read(write_memory = True) - dict = dump.getDict() - print("state.json") - f = open("state.json", "w") - f.truncate() - f.write(jsonenc.encode(dict)) - f.close() -elif args.dump == "state": +try: dump = MigrationDump(args.file) - dump.read(dump_memory = args.memory) - dict = dump.getDict() - print(jsonenc.encode(dict)) -elif args.dump == "desc": - dump = MigrationDump(args.file) - dump.read(desc_only = True) - print(jsonenc.encode(dump.vmsd_desc)) -else: - raise Exception("Please specify either -x, -d state or -d desc") + + if args.extract: + dump.read(desc_only = True) + + print("desc.json") + f = open("desc.json", "w") + f.truncate() + f.write(jsonenc.encode(dump.vmsd_desc)) + f.close() + + dump.read(write_memory = True) + dict = dump.getDict() + print("state.json") + f = open("state.json", "w") + f.truncate() + f.write(jsonenc.encode(dict)) + f.close() + elif args.dump == "state": + dump.read(dump_memory = args.memory) + dict = dump.getDict() + print(jsonenc.encode(dict)) + elif args.dump == "desc": + dump.read(desc_only = True) + print(jsonenc.encode(dump.vmsd_desc)) +except Exception: + raise Exception("Full JSON dump:\n%s", dump.vmsd_json) |