diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2018-05-17 11:10:12 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2018-05-17 11:10:12 +0100 |
commit | eb7514ae10d187ccf3bfc0f7dc159a4dfde20be4 (patch) | |
tree | 3f35a8a725b5720914644a9f4e99245bfac8445f | |
parent | 61126a8b4bea43212b575169d4140dc403fc7e90 (diff) | |
parent | 8b7bf2badac25c0a52aff1b181ad75fdb304dd0c (diff) | |
download | qemu-eb7514ae10d187ccf3bfc0f7dc159a4dfde20be4.zip qemu-eb7514ae10d187ccf3bfc0f7dc159a4dfde20be4.tar.gz qemu-eb7514ae10d187ccf3bfc0f7dc159a4dfde20be4.tar.bz2 |
Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20180515' into staging
migration/next for 20180515
# gpg: Signature made Tue 15 May 2018 22:54:38 BST
# gpg: using RSA key F487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>"
# gpg: aka "Juan Quintela <quintela@trasno.org>"
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723
* remotes/juanquintela/tags/migration/20180515: (40 commits)
Migration+TLS: Fix crash due to double cleanup
migration: Textual fixups for blocktime
migration: update index field when delete or qsort RDMALocalBlock
migration: update docs
migration/hmp: add migrate_pause command
migration/qmp: add command migrate-pause
migration: introduce lock for to_dst_file
hmp/migration: add migrate_recover command
qmp/migration: new command migrate-recover
migration: init dst in migration_object_init too
migration: final handshake for the resume
migration: setup ramstate for resume
migration: synchronize dirty bitmap for resume
migration: introduce SaveVMHandlers.resume_prepare
migration: new message MIG_RP_MSG_RESUME_ACK
migration: new cmd MIG_CMD_POSTCOPY_RESUME
migration: new message MIG_RP_MSG_RECV_BITMAP
migration: new cmd MIG_CMD_RECV_BITMAP
migration: wakeup dst ram-load-thread for recover
migration: new state "postcopy-recover"
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | docs/devel/migration.rst | 532 | ||||
-rw-r--r-- | hmp-commands.hx | 34 | ||||
-rw-r--r-- | hmp.c | 23 | ||||
-rw-r--r-- | hmp.h | 2 | ||||
-rw-r--r-- | include/migration/register.h | 2 | ||||
-rw-r--r-- | migration/channel.c | 12 | ||||
-rw-r--r-- | migration/exec.c | 9 | ||||
-rw-r--r-- | migration/fd.c | 9 | ||||
-rw-r--r-- | migration/migration.c | 559 | ||||
-rw-r--r-- | migration/migration.h | 22 | ||||
-rw-r--r-- | migration/postcopy-ram.c | 54 | ||||
-rw-r--r-- | migration/ram.c | 500 | ||||
-rw-r--r-- | migration/ram.h | 6 | ||||
-rw-r--r-- | migration/rdma.c | 7 | ||||
-rw-r--r-- | migration/savevm.c | 191 | ||||
-rw-r--r-- | migration/savevm.h | 3 | ||||
-rw-r--r-- | migration/socket.c | 39 | ||||
-rw-r--r-- | migration/socket.h | 7 | ||||
-rw-r--r-- | migration/trace-events | 21 | ||||
-rw-r--r-- | qapi/migration.json | 57 | ||||
-rw-r--r-- | tests/migration-test.c | 85 |
21 files changed, 1864 insertions, 310 deletions
diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst index 9342a8a..40f136f 100644 --- a/docs/devel/migration.rst +++ b/docs/devel/migration.rst @@ -28,11 +28,11 @@ the guest to be stopped. Typically the time that the guest is unresponsive during live migration is the low hundred of milliseconds (notice that this depends on a lot of things). -Types of migration -================== +Transports +========== -Now that we have talked about live migration, there are several ways -to do migration: +The migration stream is normally just a byte stream that can be passed +over any transport. - tcp migration: do the migration using tcp sockets - unix migration: do the migration using unix sockets @@ -40,16 +40,16 @@ to do migration: - fd migration: do the migration using an file descriptor that is passed to QEMU. QEMU doesn't care how this file descriptor is opened. -All these four migration protocols use the same infrastructure to +In addition, support is included for migration using RDMA, which +transports the page data using ``RDMA``, where the hardware takes care of +transporting the pages, and the load on the CPU is much lower. While the +internals of RDMA migration are a bit different, this isn't really visible +outside the RAM migration code. + +All these migration protocols use the same infrastructure to save/restore state devices. This infrastructure is shared with the savevm/loadvm functionality. -State Live Migration -==================== - -This is used for RAM and block devices. It is not yet ported to vmstate. -<Fill more information here> - Common infrastructure ===================== @@ -57,60 +57,75 @@ The files, sockets or fd's that carry the migration stream are abstracted by the ``QEMUFile`` type (see `migration/qemu-file.h`). In most cases this is connected to a subtype of ``QIOChannel`` (see `io/`). + Saving the state of one device ============================== -The state of a device is saved using intermediate buffers. There are -some helper functions to assist this saving. - -There is a new concept that we have to explain here: device state -version. When we migrate a device, we save/load the state as a series -of fields. Some times, due to bugs or new functionality, we need to -change the state to store more/different information. We use the -version to identify each time that we do a change. Each version is -associated with a series of fields saved. The `save_state` always saves -the state as the newer version. But `load_state` sometimes is able to -load state from an older version. - -Legacy way ----------- - -This way is going to disappear as soon as all current users are ported to VMSTATE. - -Each device has to register two functions, one to save the state and -another to load the state back. - -.. code:: c - - int register_savevm(DeviceState *dev, - const char *idstr, - int instance_id, - int version_id, - SaveStateHandler *save_state, - LoadStateHandler *load_state, - void *opaque); - - typedef void SaveStateHandler(QEMUFile *f, void *opaque); - typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); - -The important functions for the device state format are the `save_state` -and `load_state`. Notice that `load_state` receives a version_id -parameter to know what state format is receiving. `save_state` doesn't -have a version_id parameter because it always uses the latest version. +For most devices, the state is saved in a single call to the migration +infrastructure; these are *non-iterative* devices. The data for these +devices is sent at the end of precopy migration, when the CPUs are paused. +There are also *iterative* devices, which contain a very large amount of +data (e.g. RAM or large tables). See the iterative device section below. + +General advice for device developers +------------------------------------ + +- The migration state saved should reflect the device being modelled rather + than the way your implementation works. That way if you change the implementation + later the migration stream will stay compatible. That model may include + internal state that's not directly visible in a register. + +- When saving a migration stream the device code may walk and check + the state of the device. These checks might fail in various ways (e.g. + discovering internal state is corrupt or that the guest has done something bad). + Consider carefully before asserting/aborting at this point, since the + normal response from users is that *migration broke their VM* since it had + apparently been running fine until then. In these error cases, the device + should log a message indicating the cause of error, and should consider + putting the device into an error state, allowing the rest of the VM to + continue execution. + +- The migration might happen at an inconvenient point, + e.g. right in the middle of the guest reprogramming the device, during + guest reboot or shutdown or while the device is waiting for external IO. + It's strongly preferred that migrations do not fail in this situation, + since in the cloud environment migrations might happen automatically to + VMs that the administrator doesn't directly control. + +- If you do need to fail a migration, ensure that sufficient information + is logged to identify what went wrong. + +- The destination should treat an incoming migration stream as hostile + (which we do to varying degrees in the existing code). Check that offsets + into buffers and the like can't cause overruns. Fail the incoming migration + in the case of a corrupted stream like this. + +- Take care with internal device state or behaviour that might become + migration version dependent. For example, the order of PCI capabilities + is required to stay constant across migration. Another example would + be that a special case handled by subsections (see below) might become + much more common if a default behaviour is changed. + +- The state of the source should not be changed or destroyed by the + outgoing migration. Migrations timing out or being failed by + higher levels of management, or failures of the destination host are + not unusual, and in that case the VM is restarted on the source. + Note that the management layer can validly revert the migration + even though the QEMU level of migration has succeeded as long as it + does it before starting execution on the destination. + +- Buses and devices should be able to explicitly specify addresses when + instantiated, and management tools should use those. For example, + when hot adding USB devices it's important to specify the ports + and addresses, since implicit ordering based on the command line order + may be different on the destination. This can result in the + device state being loaded into the wrong device. VMState ------- -The legacy way of saving/loading state of the device had the problem -that we have to maintain two functions in sync. If we did one change -in one of them and not in the other, we would get a failed migration. - -VMState changed the way that state is saved/loaded. Instead of using -a function to save the state and another to load it, it was changed to -a declarative way of what the state consisted of. Now VMState is able -to interpret that definition to be able to load/save the state. As -the state is declared only once, it can't go out of sync in the -save/load functions. +Most device data can be described using the ``VMSTATE`` macros (mostly defined +in ``include/migration/vmstate.h``). An example (from hw/input/pckbd.c) @@ -137,103 +152,99 @@ We registered this with: vmstate_register(NULL, 0, &vmstate_kbd, s); -Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. +For devices that are `qdev` based, we can register the device in the class +init function: -You can search for ``VMSTATE_*`` macros for lots of types used in QEMU in -include/hw/hw.h. - -More about versions -------------------- - -Version numbers are intended for major incompatible changes to the -migration of a device, and using them breaks backwards-migration -compatibility; in general most changes can be made by adding Subsections -(see below) or _TEST macros (see below) which won't break compatibility. - -You can see that there are several version fields: +.. code:: c -- `version_id`: the maximum version_id supported by VMState for that device. -- `minimum_version_id`: the minimum version_id that VMState is able to understand - for that device. -- `minimum_version_id_old`: For devices that were not able to port to vmstate, we can - assign a function that knows how to read this old state. This field is - ignored if there is no `load_state_old` handler. + dc->vmsd = &vmstate_kbd_isa; -So, VMState is able to read versions from minimum_version_id to -version_id. And the function ``load_state_old()`` (if present) is able to -load state from minimum_version_id_old to minimum_version_id. This -function is deprecated and will be removed when no more users are left. +The VMState macros take care of ensuring that the device data section +is formatted portably (normally big endian) and make some compile time checks +against the types of the fields in the structures. -Saving state will always create a section with the 'version_id' value -and thus can't be loaded by any older QEMU. +VMState macros can include other VMStateDescriptions to store substructures +(see ``VMSTATE_STRUCT_``), arrays (``VMSTATE_ARRAY_``) and variable length +arrays (``VMSTATE_VARRAY_``). Various other macros exist for special +cases. -Massaging functions -------------------- +Note that the format on the wire is still very raw; i.e. a VMSTATE_UINT32 +ends up with a 4 byte bigendian representation on the wire; in the future +it might be possible to use a more structured format. -Sometimes, it is not enough to be able to save the state directly -from one structure, we need to fill the correct values there. One -example is when we are using kvm. Before saving the cpu state, we -need to ask kvm to copy to QEMU the state that it is using. And the -opposite when we are loading the state, we need a way to tell kvm to -load the state for the cpu that we have just loaded from the QEMUFile. +Legacy way +---------- -The functions to do that are inside a vmstate definition, and are called: +This way is going to disappear as soon as all current users are ported to VMSTATE; +although converting existing code can be tricky, and thus 'soon' is relative. -- ``int (*pre_load)(void *opaque);`` +Each device has to register two functions, one to save the state and +another to load the state back. - This function is called before we load the state of one device. +.. code:: c -- ``int (*post_load)(void *opaque, int version_id);`` + int register_savevm_live(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveVMHandlers *ops, + void *opaque); - This function is called after we load the state of one device. +Two functions in the ``ops`` structure are the `save_state` +and `load_state` functions. Notice that `load_state` receives a version_id +parameter to know what state format is receiving. `save_state` doesn't +have a version_id parameter because it always uses the latest version. -- ``int (*pre_save)(void *opaque);`` +Note that because the VMState macros still save the data in a raw +format, in many cases it's possible to replace legacy code +with a carefully constructed VMState description that matches the +byte layout of the existing code. - This function is called before we save the state of one device. +Changing migration data structures +---------------------------------- -Example: You can look at hpet.c, that uses the three function to -massage the state that is transferred. - -If you use memory API functions that update memory layout outside -initialization (i.e., in response to a guest action), this is a strong -indication that you need to call these functions in a `post_load` callback. -Examples of such memory API functions are: - - - memory_region_add_subregion() - - memory_region_del_subregion() - - memory_region_set_readonly() - - memory_region_set_enabled() - - memory_region_set_address() - - memory_region_set_alias_offset() +When we migrate a device, we save/load the state as a series +of fields. Sometimes, due to bugs or new functionality, we need to +change the state to store more/different information. Changing the migration +state saved for a device can break migration compatibility unless +care is taken to use the appropriate techniques. In general QEMU tries +to maintain forward migration compatibility (i.e. migrating from +QEMU n->n+1) and there are users who benefit from backward compatibility +as well. Subsections ----------- -The use of version_id allows to be able to migrate from older versions -to newer versions of a device. But not the other way around. This -makes very complicated to fix bugs in stable branches. If we need to -add anything to the state to fix a bug, we have to disable migration -to older versions that don't have that bug-fix (i.e. a new field). +The most common structure change is adding new data, e.g. when adding +a newer form of device, or adding that state that you previously +forgot to migrate. This is best solved using a subsection. -But sometimes, that bug-fix is only needed sometimes, not always. For -instance, if the device is in the middle of a DMA operation, it is -using a specific functionality, .... - -It is impossible to create a way to make migration from any version to -any other version to work. But we can do better than only allowing -migration from older versions to newer ones. For that fields that are -only needed sometimes, we add the idea of subsections. A subsection -is "like" a device vmstate, but with a particularity, it has a Boolean -function that tells if that values are needed to be sent or not. If -this functions returns false, the subsection is not sent. +A subsection is "like" a device vmstate, but with a particularity, it +has a Boolean function that tells if that values are needed to be sent +or not. If this functions returns false, the subsection is not sent. +Subsections have a unique name, that is looked for on the receiving +side. On the receiving side, if we found a subsection for a device that we don't understand, we just fail the migration. If we understand all -the subsections, then we load the state with success. +the subsections, then we load the state with success. There's no check +that a subsection is loaded, so a newer QEMU that knows about a subsection +can (with care) load a stream from an older QEMU that didn't send +the subsection. + +If the new data is only needed in a rare case, then the subsection +can be made conditional on that case and the migration will still +succeed to older QEMUs in most cases. This is OK for data that's +critical, but in some use cases it's preferred that the migration +should succeed even with the data missing. To support this the +subsection can be connected to a device property and from there +to a versioned machine type. One important note is that the post_load() function is called "after" loading all subsections, because a newer subsection could change same -value that it uses. +value that it uses. A flag, and the combination of pre_load and post_load +can be used to detect whether a subsection was loaded, and to +fall back on default behaviour when the subsection isn't present. Example: @@ -288,9 +299,13 @@ save/send this state when we are in the middle of a pio operation not enabled, the values on that fields are garbage and don't need to be sent. +Connecting subsections to properties +------------------------------------ + Using a condition function that checks a 'property' to determine whether -to send a subsection allows backwards migration compatibility when -new subsections are added. +to send a subsection allows backward migration compatibility when +new subsections are added, especially when combined with versioned +machine types. For example: @@ -305,21 +320,7 @@ For example: Now that subsection will not be generated when using an older machine type and the migration stream will be accepted by older -QEMU versions. pre-load functions can be used to initialise state -on the newer version so that they default to suitable values -when loading streams created by older QEMU versions that do not -generate the subsection. - -In some cases subsections are added for data that had been accidentally -omitted by earlier versions; if the missing data causes the migration -process to succeed but the guest to behave badly then it may be better -to send the subsection and cause the migration to explicitly fail -with the unknown subsection error. If the bad behaviour only happens -with certain data values, making the subsection conditional on -the data value (rather than the machine type) allows migrations to succeed -in most cases. In general the preference is to tie the subsection to -the machine type, and allow reliable migrations, unless the behaviour -from omission of the subsection is really bad. +QEMU versions. Not sending existing elements ----------------------------- @@ -328,9 +329,13 @@ Sometimes members of the VMState are no longer needed: - removing them will break migration compatibility - - making them version dependent and bumping the version will break backwards migration compatibility. + - making them version dependent and bumping the version will break backward migration + compatibility. + +Adding a dummy field into the migration stream is normally the best way to preserve +compatibility. -The best way is to: +If the field really does need to be removed then: a) Add a new property/compatibility/function in the same way for subsections above. b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: @@ -342,18 +347,208 @@ The best way is to: ``VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)`` Sometime in the future when we no longer care about the ancient versions these can be killed off. + Note that for backward compatibility it's important to fill in the structure with + data that the destination will understand. + +Any difference in the predicates on the source and destination will end up +with different fields being enabled and data being loaded into the wrong +fields; for this reason conditional fields like this are very fragile. + +Versions +-------- + +Version numbers are intended for major incompatible changes to the +migration of a device, and using them breaks backward-migration +compatibility; in general most changes can be made by adding Subsections +(see above) or _TEST macros (see above) which won't break compatibility. + +Each version is associated with a series of fields saved. The `save_state` always saves +the state as the newer version. But `load_state` sometimes is able to +load state from an older version. + +You can see that there are several version fields: + +- `version_id`: the maximum version_id supported by VMState for that device. +- `minimum_version_id`: the minimum version_id that VMState is able to understand + for that device. +- `minimum_version_id_old`: For devices that were not able to port to vmstate, we can + assign a function that knows how to read this old state. This field is + ignored if there is no `load_state_old` handler. + +VMState is able to read versions from minimum_version_id to +version_id. And the function ``load_state_old()`` (if present) is able to +load state from minimum_version_id_old to minimum_version_id. This +function is deprecated and will be removed when no more users are left. + +There are *_V* forms of many ``VMSTATE_`` macros to load fields for version dependent fields, +e.g. + +.. code:: c + + VMSTATE_UINT16_V(ip_id, Slirp, 2), + +only loads that field for versions 2 and newer. + +Saving state will always create a section with the 'version_id' value +and thus can't be loaded by any older QEMU. + +Massaging functions +------------------- + +Sometimes, it is not enough to be able to save the state directly +from one structure, we need to fill the correct values there. One +example is when we are using kvm. Before saving the cpu state, we +need to ask kvm to copy to QEMU the state that it is using. And the +opposite when we are loading the state, we need a way to tell kvm to +load the state for the cpu that we have just loaded from the QEMUFile. + +The functions to do that are inside a vmstate definition, and are called: + +- ``int (*pre_load)(void *opaque);`` + + This function is called before we load the state of one device. + +- ``int (*post_load)(void *opaque, int version_id);`` + + This function is called after we load the state of one device. + +- ``int (*pre_save)(void *opaque);`` + + This function is called before we save the state of one device. + +Example: You can look at hpet.c, that uses the three function to +massage the state that is transferred. + +The ``VMSTATE_WITH_TMP`` macro may be useful when the migration +data doesn't match the stored device data well; it allows an +intermediate temporary structure to be populated with migration +data and then transferred to the main structure. + +If you use memory API functions that update memory layout outside +initialization (i.e., in response to a guest action), this is a strong +indication that you need to call these functions in a `post_load` callback. +Examples of such memory API functions are: + + - memory_region_add_subregion() + - memory_region_del_subregion() + - memory_region_set_readonly() + - memory_region_set_enabled() + - memory_region_set_address() + - memory_region_set_alias_offset() + +Iterative device migration +-------------------------- + +Some devices, such as RAM, Block storage or certain platform devices, +have large amounts of data that would mean that the CPUs would be +paused for too long if they were sent in one section. For these +devices an *iterative* approach is taken. + +The iterative devices generally don't use VMState macros +(although it may be possible in some cases) and instead use +qemu_put_*/qemu_get_* macros to read/write data to the stream. Specialist +versions exist for high bandwidth IO. + + +An iterative device must provide: + + - A ``save_setup`` function that initialises the data structures and + transmits a first section containing information on the device. In the + case of RAM this transmits a list of RAMBlocks and sizes. + + - A ``load_setup`` function that initialises the data structures on the + destination. + + - A ``save_live_pending`` function that is called repeatedly and must + indicate how much more data the iterative data must save. The core + migration code will use this to determine when to pause the CPUs + and complete the migration. + + - A ``save_live_iterate`` function (called after ``save_live_pending`` + when there is significant data still to be sent). It should send + a chunk of data until the point that stream bandwidth limits tell it + to stop. Each call generates one section. + + - A ``save_live_complete_precopy`` function that must transmit the + last section for the device containing any remaining data. + + - A ``load_state`` function used to load sections generated by + any of the save functions that generate sections. + + - ``cleanup`` functions for both save and load that are called + at the end of migration. + +Note that the contents of the sections for iterative migration tend +to be open-coded by the devices; care should be taken in parsing +the results and structuring the stream to make them easy to validate. + +Device ordering +--------------- + +There are cases in which the ordering of device loading matters; for +example in some systems where a device may assert an interrupt during loading, +if the interrupt controller is loaded later then it might lose the state. + +Some ordering is implicitly provided by the order in which the machine +definition creates devices, however this is somewhat fragile. + +The ``MigrationPriority`` enum provides a means of explicitly enforcing +ordering. Numerically higher priorities are loaded earlier. +The priority is set by setting the ``priority`` field of the top level +``VMStateDescription`` for the device. + +Stream structure +================ + +The stream tries to be word and endian agnostic, allowing migration between hosts +of different characteristics running the same VM. + + - Header + + - Magic + - Version + - VM configuration section + + - Machine type + - Target page bits + - List of sections + Each section contains a device, or one iteration of a device save. + + - section type + - section id + - ID string (First section of each device) + - instance id (First section of each device) + - version id (First section of each device) + - <device data> + - Footer mark + - EOF mark + - VM Description structure + Consisting of a JSON description of the contents for analysis only + +The ``device data`` in each section consists of the data produced +by the code described above. For non-iterative devices they have a single +section; iterative devices have an initial and last section and a set +of parts in between. +Note that there is very little checking by the common code of the integrity +of the ``device data`` contents, that's up to the devices themselves. +The ``footer mark`` provides a little bit of protection for the case where +the receiving side reads more or less data than expected. + +The ``ID string`` is normally unique, having been formed from a bus name +and device address, PCI devices and storage devices hung off PCI controllers +fit this pattern well. Some devices are fixed single instances (e.g. "pc-ram"). +Others (especially either older devices or system devices which for +some reason don't have a bus concept) make use of the ``instance id`` +for otherwise identically named devices. Return path ----------- -In most migration scenarios there is only a single data path that runs -from the source VM to the destination, typically along a single fd (although -possibly with another fd or similar for some fast way of throwing pages across). - -However, some uses need two way communication; in particular the Postcopy -destination needs to be able to request pages on demand from the source. +Only a unidirectional stream is required for normal migration, however a +``return path`` can be created when bidirectional communication is desired. +This is primarily used by postcopy, but is also used to return a success +flag to the source at the end of migration. -For these scenarios there is a 'return path' from the destination to the source; ``qemu_file_get_return_path(QEMUFile* fwdpath)`` gives the QEMUFile* for the return path. @@ -632,3 +827,28 @@ Retro-fitting postcopy to existing clients is possible: identified and the implication understood; for example if the guest memory access is made while holding a lock then all other threads waiting for that lock will also be blocked. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. + diff --git a/hmp-commands.hx b/hmp-commands.hx index 227f7ee..0734fea 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -897,13 +897,14 @@ ETEXI { .name = "migrate", - .args_type = "detach:-d,blk:-b,inc:-i,uri:s", - .params = "[-d] [-b] [-i] uri", + .args_type = "detach:-d,blk:-b,inc:-i,resume:-r,uri:s", + .params = "[-d] [-b] [-i] [-r] uri", .help = "migrate to URI (using -d to not wait for completion)" "\n\t\t\t -b for migration without shared storage with" " full copy of disk\n\t\t\t -i for migration without " "shared storage with incremental copy of disk " - "(base image shared between src and destination)", + "(base image shared between src and destination)" + "\n\t\t\t -r to resume a paused migration", .cmd = hmp_migrate, }, @@ -956,7 +957,34 @@ STEXI @findex migrate_incoming Continue an incoming migration using the @var{uri} (that has the same syntax as the -incoming option). +ETEXI + + { + .name = "migrate_recover", + .args_type = "uri:s", + .params = "uri", + .help = "Continue a paused incoming postcopy migration", + .cmd = hmp_migrate_recover, + }, +STEXI +@item migrate_recover @var{uri} +@findex migrate_recover +Continue a paused incoming postcopy migration using the @var{uri}. +ETEXI + + { + .name = "migrate_pause", + .args_type = "", + .params = "", + .help = "Pause an ongoing migration (postcopy-only)", + .cmd = hmp_migrate_pause, + }, + +STEXI +@item migrate_pause +@findex migrate_pause +Pause an ongoing migration. Currently it only supports postcopy. ETEXI { @@ -1517,6 +1517,25 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_migrate_recover(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + const char *uri = qdict_get_str(qdict, "uri"); + + qmp_migrate_recover(uri, &err); + + hmp_handle_error(mon, &err); +} + +void hmp_migrate_pause(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + + qmp_migrate_pause(&err); + + hmp_handle_error(mon, &err); +} + /* Kept for backwards compatibility */ void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict) { @@ -1929,10 +1948,12 @@ void hmp_migrate(Monitor *mon, const QDict *qdict) bool detach = qdict_get_try_bool(qdict, "detach", false); bool blk = qdict_get_try_bool(qdict, "blk", false); bool inc = qdict_get_try_bool(qdict, "inc", false); + bool resume = qdict_get_try_bool(qdict, "resume", false); const char *uri = qdict_get_str(qdict, "uri"); Error *err = NULL; - qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err); + qmp_migrate(uri, !!blk, blk, !!inc, inc, + false, false, true, resume, &err); if (err) { hmp_handle_error(mon, &err); return; @@ -68,6 +68,8 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict); void hmp_migrate_cancel(Monitor *mon, const QDict *qdict); void hmp_migrate_continue(Monitor *mon, const QDict *qdict); void hmp_migrate_incoming(Monitor *mon, const QDict *qdict); +void hmp_migrate_recover(Monitor *mon, const QDict *qdict); +void hmp_migrate_pause(Monitor *mon, const QDict *qdict); void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict); void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict); void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict); diff --git a/include/migration/register.h b/include/migration/register.h index f6f12f9..d287f4c 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -64,6 +64,8 @@ typedef struct SaveVMHandlers { LoadStateHandler *load_state; int (*load_setup)(QEMUFile *f, void *opaque); int (*load_cleanup)(void *opaque); + /* Called when postcopy migration wants to resume from failure */ + int (*resume_prepare)(MigrationState *s, void *opaque); } SaveVMHandlers; int register_savevm_live(DeviceState *dev, diff --git a/migration/channel.c b/migration/channel.c index c5eaf0f..33e0e9b 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -71,11 +71,21 @@ void migration_channel_connect(MigrationState *s, !object_dynamic_cast(OBJECT(ioc), TYPE_QIO_CHANNEL_TLS)) { migration_tls_channel_connect(s, ioc, hostname, &error); + + if (!error) { + /* tls_channel_connect will call back to this + * function after the TLS handshake, + * so we mustn't call migrate_fd_connect until then + */ + + return; + } } else { QEMUFile *f = qemu_fopen_channel_output(ioc); + qemu_mutex_lock(&s->qemu_file_lock); s->to_dst_file = f; - + qemu_mutex_unlock(&s->qemu_file_lock); } } migrate_fd_connect(s, error); diff --git a/migration/exec.c b/migration/exec.c index 0bc5a42..9d0f82f 100644 --- a/migration/exec.c +++ b/migration/exec.c @@ -65,9 +65,8 @@ void exec_start_incoming_migration(const char *command, Error **errp) } qio_channel_set_name(ioc, "migration-exec-incoming"); - qio_channel_add_watch(ioc, - G_IO_IN, - exec_accept_incoming_migration, - NULL, - NULL); + qio_channel_add_watch_full(ioc, G_IO_IN, + exec_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } diff --git a/migration/fd.c b/migration/fd.c index cd06182..9a380bb 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -66,9 +66,8 @@ void fd_start_incoming_migration(const char *infd, Error **errp) } qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-incoming"); - qio_channel_add_watch(ioc, - G_IO_IN, - fd_accept_incoming_migration, - NULL, - NULL); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } diff --git a/migration/migration.c b/migration/migration.c index 35f2781..05aec2c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -95,6 +95,8 @@ enum mig_rp_message_type { MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */ MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */ + MIG_RP_MSG_RECV_BITMAP, /* send recved_bitmap back to source */ + MIG_RP_MSG_RESUME_ACK, /* tell source that we are ready to resume */ MIG_RP_MSG_MAX }; @@ -104,6 +106,7 @@ enum mig_rp_message_type { dynamic creation of migration */ static MigrationState *current_migration; +static MigrationIncomingState *current_incoming; static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, @@ -119,6 +122,22 @@ void migration_object_init(void) assert(!current_migration); current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION)); + /* + * Init the migrate incoming object as well no matter whether + * we'll use it or not. + */ + assert(!current_incoming); + current_incoming = g_new0(MigrationIncomingState, 1); + current_incoming->state = MIGRATION_STATUS_NONE; + current_incoming->postcopy_remote_fds = + g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD)); + qemu_mutex_init(¤t_incoming->rp_mutex); + qemu_event_init(¤t_incoming->main_thread_load_event, false); + qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0); + qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0); + + init_dirty_bitmap_incoming_migration(); + if (!migration_object_check(current_migration, &err)) { error_report_err(err); exit(1); @@ -149,22 +168,8 @@ MigrationState *migrate_get_current(void) MigrationIncomingState *migration_incoming_get_current(void) { - static bool once; - static MigrationIncomingState mis_current; - - if (!once) { - mis_current.state = MIGRATION_STATUS_NONE; - memset(&mis_current, 0, sizeof(MigrationIncomingState)); - mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE, - sizeof(struct PostCopyFD)); - qemu_mutex_init(&mis_current.rp_mutex); - qemu_event_init(&mis_current.main_thread_load_event, false); - - init_dirty_bitmap_incoming_migration(); - - once = true; - } - return &mis_current; + assert(current_incoming); + return current_incoming; } void migration_incoming_state_destroy(void) @@ -430,7 +435,7 @@ static void migration_incoming_setup(QEMUFile *f) qemu_file_set_blocking(f, false); } -static void migration_incoming_process(void) +void migration_incoming_process(void) { Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL); qemu_coroutine_enter(co); @@ -438,8 +443,34 @@ static void migration_incoming_process(void) void migration_fd_process_incoming(QEMUFile *f) { - migration_incoming_setup(f); - migration_incoming_process(); + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { + /* Resumed from a paused postcopy migration */ + + mis->from_src_file = f; + /* Postcopy has standalone thread to do vm load */ + qemu_file_set_blocking(f, true); + + /* Re-configure the return path */ + mis->to_src_file = qemu_file_get_return_path(f); + + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED, + MIGRATION_STATUS_POSTCOPY_RECOVER); + + /* + * Here, we only wake up the main loading thread (while the + * fault thread will still be waiting), so that we can receive + * commands from source now, and answer it if needed. The + * fault thread will be woken up afterwards until we are sure + * that source is ready to reply to page requests. + */ + qemu_sem_post(&mis->postcopy_pause_sem_dst); + } else { + /* New incoming migration */ + migration_incoming_setup(f); + migration_incoming_process(); + } } void migration_ioc_process_incoming(QIOChannel *ioc) @@ -448,9 +479,10 @@ void migration_ioc_process_incoming(QIOChannel *ioc) if (!mis->from_src_file) { QEMUFile *f = qemu_fopen_channel_input(ioc); - migration_fd_process_incoming(f); + migration_incoming_setup(f); + return; } - /* We still only have a single channel. Nothing to do here yet */ + multifd_recv_new_channel(ioc); } /** @@ -461,7 +493,11 @@ void migration_ioc_process_incoming(QIOChannel *ioc) */ bool migration_has_all_channels(void) { - return true; + bool all_channels; + + all_channels = multifd_recv_all_channels_created(); + + return all_channels; } /* @@ -491,6 +527,53 @@ void migrate_send_rp_pong(MigrationIncomingState *mis, migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf); } +void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis, + char *block_name) +{ + char buf[512]; + int len; + int64_t res; + + /* + * First, we send the header part. It contains only the len of + * idstr, and the idstr itself. + */ + len = strlen(block_name); + buf[0] = len; + memcpy(buf + 1, block_name, len); + + if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { + error_report("%s: MSG_RP_RECV_BITMAP only used for recovery", + __func__); + return; + } + + migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf); + + /* + * Next, we dump the received bitmap to the stream. + * + * TODO: currently we are safe since we are the only one that is + * using the to_src_file handle (fault thread is still paused), + * and it's ok even not taking the mutex. However the best way is + * to take the lock before sending the message header, and release + * the lock after sending the bitmap. + */ + qemu_mutex_lock(&mis->rp_mutex); + res = ramblock_recv_bitmap_send(mis->to_src_file, block_name); + qemu_mutex_unlock(&mis->rp_mutex); + + trace_migrate_send_rp_recv_bitmap(block_name, res); +} + +void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value) +{ + uint32_t buf; + + buf = cpu_to_be32(value); + migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf); +} + MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) { MigrationCapabilityStatusList *head = NULL; @@ -569,6 +652,8 @@ static bool migration_is_setup_or_active(int state) switch (state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: + case MIGRATION_STATUS_POSTCOPY_RECOVER: case MIGRATION_STATUS_SETUP: case MIGRATION_STATUS_PRE_SWITCHOVER: case MIGRATION_STATUS_DEVICE: @@ -649,6 +734,8 @@ static void fill_source_migration_info(MigrationInfo *info) case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_PRE_SWITCHOVER: case MIGRATION_STATUS_DEVICE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: + case MIGRATION_STATUS_POSTCOPY_RECOVER: /* TODO add some postcopy stats */ info->has_status = true; info->has_total_time = true; @@ -1147,6 +1234,7 @@ static void migrate_fd_cleanup(void *opaque) if (s->to_dst_file) { Error *local_err = NULL; + QEMUFile *tmp; trace_migrate_fd_cleanup(); qemu_mutex_unlock_iothread(); @@ -1159,8 +1247,15 @@ static void migrate_fd_cleanup(void *opaque) if (multifd_save_cleanup(&local_err) != 0) { error_report_err(local_err); } - qemu_fclose(s->to_dst_file); + qemu_mutex_lock(&s->qemu_file_lock); + tmp = s->to_dst_file; s->to_dst_file = NULL; + qemu_mutex_unlock(&s->qemu_file_lock); + /* + * Close the file handle without the lock to make sure the + * critical section won't block for long. + */ + qemu_fclose(tmp); } assert((s->state != MIGRATION_STATUS_ACTIVE) && @@ -1388,6 +1483,59 @@ void qmp_migrate_incoming(const char *uri, Error **errp) once = false; } +void qmp_migrate_recover(const char *uri, Error **errp) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) { + error_setg(errp, "Migrate recover can only be run " + "when postcopy is paused."); + return; + } + + if (atomic_cmpxchg(&mis->postcopy_recover_triggered, + false, true) == true) { + error_setg(errp, "Migrate recovery is triggered already"); + return; + } + + /* + * Note that this call will never start a real migration; it will + * only re-setup the migration stream and poke existing migration + * to continue using that newly established channel. + */ + qemu_start_incoming_migration(uri, errp); +} + +void qmp_migrate_pause(Error **errp) +{ + MigrationState *ms = migrate_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); + int ret; + + if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + /* Source side, during postcopy */ + qemu_mutex_lock(&ms->qemu_file_lock); + ret = qemu_file_shutdown(ms->to_dst_file); + qemu_mutex_unlock(&ms->qemu_file_lock); + if (ret) { + error_setg(errp, "Failed to pause source migration"); + } + return; + } + + if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + ret = qemu_file_shutdown(mis->from_src_file); + if (ret) { + error_setg(errp, "Failed to pause destination migration"); + } + return; + } + + error_setg(errp, "migrate-pause is currently only supported " + "during postcopy-active state"); +} + bool migration_is_blocked(Error **errp) { if (qemu_savevm_state_blocked(errp)) { @@ -1402,49 +1550,75 @@ bool migration_is_blocked(Error **errp) return false; } -void qmp_migrate(const char *uri, bool has_blk, bool blk, - bool has_inc, bool inc, bool has_detach, bool detach, - Error **errp) +/* Returns true if continue to migrate, or false if error detected */ +static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, + bool resume, Error **errp) { Error *local_err = NULL; - MigrationState *s = migrate_get_current(); - const char *p; + + if (resume) { + if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) { + error_setg(errp, "Cannot resume if there is no " + "paused migration"); + return false; + } + /* This is a resume, skip init status */ + return true; + } if (migration_is_setup_or_active(s->state) || s->state == MIGRATION_STATUS_CANCELLING || s->state == MIGRATION_STATUS_COLO) { error_setg(errp, QERR_MIGRATION_ACTIVE); - return; + return false; } + if (runstate_check(RUN_STATE_INMIGRATE)) { error_setg(errp, "Guest is waiting for an incoming migration"); - return; + return false; } if (migration_is_blocked(errp)) { - return; + return false; } - if ((has_blk && blk) || (has_inc && inc)) { + if (blk || blk_inc) { if (migrate_use_block() || migrate_use_block_incremental()) { error_setg(errp, "Command options are incompatible with " "current migration capabilities"); - return; + return false; } migrate_set_block_enabled(true, &local_err); if (local_err) { error_propagate(errp, local_err); - return; + return false; } s->must_remove_block_options = true; } - if (has_inc && inc) { + if (blk_inc) { migrate_set_block_incremental(s, true); } migrate_init(s); + return true; +} + +void qmp_migrate(const char *uri, bool has_blk, bool blk, + bool has_inc, bool inc, bool has_detach, bool detach, + bool has_resume, bool resume, Error **errp) +{ + Error *local_err = NULL; + MigrationState *s = migrate_get_current(); + const char *p; + + if (!migrate_prepare(s, has_blk && blk, has_inc && inc, + has_resume && resume, errp)) { + /* Error detected, put into errp */ + return; + } + if (strstart(uri, "tcp:", &p)) { tcp_start_outgoing_migration(s, p, &local_err); #ifdef CONFIG_RDMA @@ -1739,6 +1913,8 @@ static struct rp_cmd_args { [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, + [MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" }, [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, }; @@ -1771,6 +1947,51 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, } } +/* Return true to retry, false to quit */ +static bool postcopy_pause_return_path_thread(MigrationState *s) +{ + trace_postcopy_pause_return_path(); + + qemu_sem_wait(&s->postcopy_pause_rp_sem); + + trace_postcopy_pause_return_path_continued(); + + return true; +} + +static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name) +{ + RAMBlock *block = qemu_ram_block_by_name(block_name); + + if (!block) { + error_report("%s: invalid block name '%s'", __func__, block_name); + return -EINVAL; + } + + /* Fetch the received bitmap and refresh the dirty bitmap */ + return ram_dirty_bitmap_reload(s, block); +} + +static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value) +{ + trace_source_return_path_thread_resume_ack(value); + + if (value != MIGRATION_RESUME_ACK_VALUE) { + error_report("%s: illegal resume_ack value %"PRIu32, + __func__, value); + return -1; + } + + /* Now both sides are active. */ + migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + + /* Notify send thread that time to continue send pages */ + qemu_sem_post(&s->rp_state.rp_sem); + + return 0; +} + /* * Handles messages sent on the return path towards the source VM * @@ -1787,6 +2008,8 @@ static void *source_return_path_thread(void *opaque) int res; trace_source_return_path_thread_entry(); + +retry: while (!ms->rp_state.error && !qemu_file_get_error(rp) && migration_is_setup_or_active(ms->state)) { trace_source_return_path_thread_loop_top(); @@ -1874,23 +2097,61 @@ static void *source_return_path_thread(void *opaque) migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); break; + case MIG_RP_MSG_RECV_BITMAP: + if (header_len < 1) { + error_report("%s: missing block name", __func__); + mark_source_rp_bad(ms); + goto out; + } + /* Format: len (1B) + idstr (<255B). This ends the idstr. */ + buf[buf[0] + 1] = '\0'; + if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) { + mark_source_rp_bad(ms); + goto out; + } + break; + + case MIG_RP_MSG_RESUME_ACK: + tmp32 = ldl_be_p(buf); + if (migrate_handle_rp_resume_ack(ms, tmp32)) { + mark_source_rp_bad(ms); + goto out; + } + break; + default: break; } } - if (qemu_file_get_error(rp)) { + +out: + res = qemu_file_get_error(rp); + if (res) { + if (res == -EIO) { + /* + * Maybe there is something we can do: it looks like a + * network down issue, and we pause for a recovery. + */ + if (postcopy_pause_return_path_thread(ms)) { + /* Reload rp, reset the rest */ + rp = ms->rp_state.from_dst_file; + ms->rp_state.error = false; + goto retry; + } + } + trace_source_return_path_thread_bad_end(); mark_source_rp_bad(ms); } trace_source_return_path_thread_end(); -out: ms->rp_state.from_dst_file = NULL; qemu_fclose(rp); return NULL; } -static int open_return_path_on_source(MigrationState *ms) +static int open_return_path_on_source(MigrationState *ms, + bool create_thread) { ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file); @@ -1899,6 +2160,12 @@ static int open_return_path_on_source(MigrationState *ms) } trace_open_return_path_on_source(); + + if (!create_thread) { + /* We're done */ + return 0; + } + qemu_thread_create(&ms->rp_state.rp_thread, "return path", source_return_path_thread, ms, QEMU_THREAD_JOINABLE); @@ -2238,6 +2505,156 @@ bool migrate_colo_enabled(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO]; } +typedef enum MigThrError { + /* No error detected */ + MIG_THR_ERR_NONE = 0, + /* Detected error, but resumed successfully */ + MIG_THR_ERR_RECOVERED = 1, + /* Detected fatal error, need to exit */ + MIG_THR_ERR_FATAL = 2, +} MigThrError; + +static int postcopy_resume_handshake(MigrationState *s) +{ + qemu_savevm_send_postcopy_resume(s->to_dst_file); + + while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { + qemu_sem_wait(&s->rp_state.rp_sem); + } + + if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + return 0; + } + + return -1; +} + +/* Return zero if success, or <0 for error */ +static int postcopy_do_resume(MigrationState *s) +{ + int ret; + + /* + * Call all the resume_prepare() hooks, so that modules can be + * ready for the migration resume. + */ + ret = qemu_savevm_state_resume_prepare(s); + if (ret) { + error_report("%s: resume_prepare() failure detected: %d", + __func__, ret); + return ret; + } + + /* + * Last handshake with destination on the resume (destination will + * switch to postcopy-active afterwards) + */ + ret = postcopy_resume_handshake(s); + if (ret) { + error_report("%s: handshake failed: %d", __func__, ret); + return ret; + } + + return 0; +} + +/* + * We don't return until we are in a safe state to continue current + * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or + * MIG_THR_ERR_FATAL if unrecovery failure happened. + */ +static MigThrError postcopy_pause(MigrationState *s) +{ + assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); + + while (true) { + QEMUFile *file; + + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_POSTCOPY_PAUSED); + + /* Current channel is possibly broken. Release it. */ + assert(s->to_dst_file); + qemu_mutex_lock(&s->qemu_file_lock); + file = s->to_dst_file; + s->to_dst_file = NULL; + qemu_mutex_unlock(&s->qemu_file_lock); + + qemu_file_shutdown(file); + qemu_fclose(file); + + error_report("Detected IO failure for postcopy. " + "Migration paused."); + + /* + * We wait until things fixed up. Then someone will setup the + * status back for us. + */ + while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { + qemu_sem_wait(&s->postcopy_pause_sem); + } + + if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { + /* Woken up by a recover procedure. Give it a shot */ + + /* + * Firstly, let's wake up the return path now, with a new + * return path channel. + */ + qemu_sem_post(&s->postcopy_pause_rp_sem); + + /* Do the resume logic */ + if (postcopy_do_resume(s) == 0) { + /* Let's continue! */ + trace_postcopy_pause_continued(); + return MIG_THR_ERR_RECOVERED; + } else { + /* + * Something wrong happened during the recovery, let's + * pause again. Pause is always better than throwing + * data away. + */ + continue; + } + } else { + /* This is not right... Time to quit. */ + return MIG_THR_ERR_FATAL; + } + } +} + +static MigThrError migration_detect_error(MigrationState *s) +{ + int ret; + + /* Try to detect any file errors */ + ret = qemu_file_get_error(s->to_dst_file); + + if (!ret) { + /* Everything is fine */ + return MIG_THR_ERR_NONE; + } + + if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) { + /* + * For postcopy, we allow the network to be down for a + * while. After that, it can be continued by a + * recovery phase. + */ + return postcopy_pause(s); + } else { + /* + * For precopy (or postcopy with error outside IO), we fail + * with no time. + */ + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + trace_migration_thread_file_err(); + + /* Time to stop the migration, now. */ + return MIG_THR_ERR_FATAL; + } +} + static void migration_calculate_complete(MigrationState *s) { uint64_t bytes = qemu_ftell(s->to_dst_file); @@ -2394,6 +2811,7 @@ static void *migration_thread(void *opaque) { MigrationState *s = opaque; int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + MigThrError thr_error; rcu_register_thread(); @@ -2443,13 +2861,22 @@ static void *migration_thread(void *opaque) } } - if (qemu_file_get_error(s->to_dst_file)) { - if (migration_is_setup_or_active(s->state)) { - migrate_set_state(&s->state, s->state, - MIGRATION_STATUS_FAILED); - } - trace_migration_thread_file_err(); + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. + */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ break; + } else if (thr_error == MIG_THR_ERR_RECOVERED) { + /* + * Just recovered from a e.g. network failure, reset all + * the local variables. This is important to avoid + * breaking transferred_bytes and bandwidth calculation + */ + s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->iteration_initial_bytes = 0; } current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -2471,6 +2898,9 @@ static void *migration_thread(void *opaque) void migrate_fd_connect(MigrationState *s, Error *error_in) { + int64_t rate_limit; + bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; + s->expected_downtime = s->parameters.downtime_limit; s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); if (error_in) { @@ -2479,12 +2909,21 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - qemu_file_set_blocking(s->to_dst_file, true); - qemu_file_set_rate_limit(s->to_dst_file, - s->parameters.max_bandwidth / XFER_LIMIT_RATIO); + if (resume) { + /* This is a resumed migration */ + rate_limit = INT64_MAX; + } else { + /* This is a fresh new migration */ + rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO; + s->expected_downtime = s->parameters.downtime_limit; + s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); - /* Notify before starting migration thread */ - notifier_list_notify(&migration_state_notifiers, s); + /* Notify before starting migration thread */ + notifier_list_notify(&migration_state_notifiers, s); + } + + qemu_file_set_rate_limit(s->to_dst_file, rate_limit); + qemu_file_set_blocking(s->to_dst_file, true); /* * Open the return path. For postcopy, it is used exclusively. For @@ -2492,15 +2931,22 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) * QEMU uses the return path. */ if (migrate_postcopy_ram() || migrate_use_return_path()) { - if (open_return_path_on_source(s)) { + if (open_return_path_on_source(s, !resume)) { error_report("Unable to open return-path for postcopy"); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); migrate_fd_cleanup(s); return; } } + if (resume) { + /* Wakeup the main migration thread to do the recovery */ + migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, + MIGRATION_STATUS_POSTCOPY_RECOVER); + qemu_sem_post(&s->postcopy_pause_sem); + return; + } + if (multifd_save_setup() != 0) { migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); @@ -2604,9 +3050,13 @@ static void migration_instance_finalize(Object *obj) MigrationParameters *params = &ms->parameters; qemu_mutex_destroy(&ms->error_mutex); + qemu_mutex_destroy(&ms->qemu_file_lock); g_free(params->tls_hostname); g_free(params->tls_creds); qemu_sem_destroy(&ms->pause_sem); + qemu_sem_destroy(&ms->postcopy_pause_sem); + qemu_sem_destroy(&ms->postcopy_pause_rp_sem); + qemu_sem_destroy(&ms->rp_state.rp_sem); error_free(ms->error); } @@ -2636,6 +3086,11 @@ static void migration_instance_init(Object *obj) params->has_x_multifd_channels = true; params->has_x_multifd_page_count = true; params->has_xbzrle_cache_size = true; + + qemu_sem_init(&ms->postcopy_pause_sem, 0); + qemu_sem_init(&ms->postcopy_pause_rp_sem, 0); + qemu_sem_init(&ms->rp_state.rp_sem, 0); + qemu_mutex_init(&ms->qemu_file_lock); } /* diff --git a/migration/migration.h b/migration/migration.h index 7c69598..8f0c821 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -24,6 +24,8 @@ struct PostcopyBlocktimeContext; +#define MIGRATION_RESUME_ACK_VALUE (1) + /* State for the incoming migration */ struct MigrationIncomingState { QEMUFile *from_src_file; @@ -73,6 +75,11 @@ struct MigrationIncomingState { * live migration, to calculate vCPU block time * */ struct PostcopyBlocktimeContext *blocktime_ctx; + + /* notify PAUSED postcopy incoming migrations to try to continue */ + bool postcopy_recover_triggered; + QemuSemaphore postcopy_pause_sem_dst; + QemuSemaphore postcopy_pause_sem_fault; }; MigrationIncomingState *migration_incoming_get_current(void); @@ -107,6 +114,12 @@ struct MigrationState QemuThread thread; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + /* + * Protects to_dst_file pointer. We need to make sure we won't + * yield or hang during the critical section, since this lock will + * be used in OOB command handler. + */ + QemuMutex qemu_file_lock; /* bytes already send at the beggining of current interation */ uint64_t iteration_initial_bytes; @@ -129,6 +142,7 @@ struct MigrationState QEMUFile *from_dst_file; QemuThread rp_thread; bool error; + QemuSemaphore rp_sem; } rp_state; double mbps; @@ -194,12 +208,17 @@ struct MigrationState bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; + + /* Needed by postcopy-pause state */ + QemuSemaphore postcopy_pause_sem; + QemuSemaphore postcopy_pause_rp_sem; }; void migrate_set_state(int *state, int old_state, int new_state); void migration_fd_process_incoming(QEMUFile *f); void migration_ioc_process_incoming(QIOChannel *ioc); +void migration_incoming_process(void); bool migration_has_all_channels(void); @@ -251,6 +270,9 @@ void migrate_send_rp_pong(MigrationIncomingState *mis, uint32_t value); int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname, ram_addr_t start, size_t len); +void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis, + char *block_name); +void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value); void dirty_bitmap_mig_before_vm_start(void); void init_dirty_bitmap_incoming_migration(void); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 8ceeaa2..658b750 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr) affected_cpu); } +static bool postcopy_pause_fault_thread(MigrationIncomingState *mis) +{ + trace_postcopy_pause_fault_thread(); + + qemu_sem_wait(&mis->postcopy_pause_sem_fault); + + trace_postcopy_pause_fault_thread_continued(); + + return true; +} + /* * Handle faults detected by the USERFAULT markings */ @@ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque) break; } + if (!mis->to_src_file) { + /* + * Possibly someone tells us that the return path is + * broken already using the event. We should hold until + * the channel is rebuilt. + */ + if (postcopy_pause_fault_thread(mis)) { + mis->last_rb = NULL; + /* Continue to read the userfaultfd */ + } else { + error_report("%s: paused but don't allow to continue", + __func__); + break; + } + } + if (pfd[1].revents) { uint64_t tmp64 = 0; @@ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque) (uintptr_t)(msg.arg.pagefault.address), msg.arg.pagefault.feat.ptid, rb); +retry: /* * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) */ if (rb != mis->last_rb) { mis->last_rb = rb; - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), - rb_offset, qemu_ram_pagesize(rb)); + ret = migrate_send_rp_req_pages(mis, + qemu_ram_get_idstr(rb), + rb_offset, + qemu_ram_pagesize(rb)); } else { /* Save some space */ - migrate_send_rp_req_pages(mis, NULL, - rb_offset, qemu_ram_pagesize(rb)); + ret = migrate_send_rp_req_pages(mis, + NULL, + rb_offset, + qemu_ram_pagesize(rb)); + } + + if (ret) { + /* May be network failure, try to wait for recovery */ + if (ret == -EIO && postcopy_pause_fault_thread(mis)) { + /* We got reconnected somehow, try to continue */ + mis->last_rb = NULL; + goto retry; + } else { + /* This is a unavoidable fault */ + error_report("%s: migrate_send_rp_req_pages() get %d", + __func__, ret); + break; + } } } diff --git a/migration/ram.c b/migration/ram.c index 912810c..5bcbf7a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -36,6 +36,7 @@ #include "xbzrle.h" #include "ram.h" #include "migration.h" +#include "socket.h" #include "migration/register.h" #include "migration/misc.h" #include "qemu-file.h" @@ -51,6 +52,9 @@ #include "qemu/rcu_queue.h" #include "migration/colo.h" #include "migration/block.h" +#include "sysemu/sysemu.h" +#include "qemu/uuid.h" +#include "savevm.h" /***********************************************************/ /* ram save/restore */ @@ -187,6 +191,70 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) + +/* + * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). + * + * Returns >0 if success with sent bytes, or <0 if error. + */ +int64_t ramblock_recv_bitmap_send(QEMUFile *file, + const char *block_name) +{ + RAMBlock *block = qemu_ram_block_by_name(block_name); + unsigned long *le_bitmap, nbits; + uint64_t size; + + if (!block) { + error_report("%s: invalid block name: %s", __func__, block_name); + return -1; + } + + nbits = block->used_length >> TARGET_PAGE_BITS; + + /* + * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit + * machines we may need 4 more bytes for padding (see below + * comment). So extend it a bit before hand. + */ + le_bitmap = bitmap_new(nbits + BITS_PER_LONG); + + /* + * Always use little endian when sending the bitmap. This is + * required that when source and destination VMs are not using the + * same endianess. (Note: big endian won't work.) + */ + bitmap_to_le(le_bitmap, block->receivedmap, nbits); + + /* Size of the bitmap, in bytes */ + size = nbits / 8; + + /* + * size is always aligned to 8 bytes for 64bit machines, but it + * may not be true for 32bit machines. We need this padding to + * make sure the migration can survive even between 32bit and + * 64bit machines. + */ + size = ROUND_UP(size, 8); + + qemu_put_be64(file, size); + qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); + /* + * Mark as an end, in case the middle part is screwed up due to + * some "misterious" reason. + */ + qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); + qemu_fflush(file); + + free(le_bitmap); + + if (qemu_file_get_error(file)) { + return qemu_file_get_error(file); + } + + return size + sizeof(size); +} + /* * An outstanding page request, on the source, having been received * and queued @@ -432,15 +500,117 @@ exit: /* Multiple fd's */ -struct MultiFDSendParams { +#define MULTIFD_MAGIC 0x11223344U +#define MULTIFD_VERSION 1 + +typedef struct { + uint32_t magic; + uint32_t version; + unsigned char uuid[16]; /* QemuUUID */ + uint8_t id; +} __attribute__((packed)) MultiFDInit_t; + +typedef struct { + /* this fields are not changed once the thread is created */ + /* channel number */ uint8_t id; + /* channel thread name */ char *name; + /* channel thread id */ QemuThread thread; + /* communication channel */ + QIOChannel *c; + /* sem where to wait for more work */ QemuSemaphore sem; + /* this mutex protects the following parameters */ QemuMutex mutex; + /* is this channel thread running */ + bool running; + /* should this thread finish */ bool quit; -}; -typedef struct MultiFDSendParams MultiFDSendParams; +} MultiFDSendParams; + +typedef struct { + /* this fields are not changed once the thread is created */ + /* channel number */ + uint8_t id; + /* channel thread name */ + char *name; + /* channel thread id */ + QemuThread thread; + /* communication channel */ + QIOChannel *c; + /* sem where to wait for more work */ + QemuSemaphore sem; + /* this mutex protects the following parameters */ + QemuMutex mutex; + /* is this channel thread running */ + bool running; + /* should this thread finish */ + bool quit; +} MultiFDRecvParams; + +static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) +{ + MultiFDInit_t msg; + int ret; + + msg.magic = cpu_to_be32(MULTIFD_MAGIC); + msg.version = cpu_to_be32(MULTIFD_VERSION); + msg.id = p->id; + memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid)); + + ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp); + if (ret != 0) { + return -1; + } + return 0; +} + +static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) +{ + MultiFDInit_t msg; + int ret; + + ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp); + if (ret != 0) { + return -1; + } + + be32_to_cpus(&msg.magic); + be32_to_cpus(&msg.version); + + if (msg.magic != MULTIFD_MAGIC) { + error_setg(errp, "multifd: received packet magic %x " + "expected %x", msg.magic, MULTIFD_MAGIC); + return -1; + } + + if (msg.version != MULTIFD_VERSION) { + error_setg(errp, "multifd: received packet version %d " + "expected %d", msg.version, MULTIFD_VERSION); + return -1; + } + + if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) { + char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid); + char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid); + + error_setg(errp, "multifd: received uuid '%s' and expected " + "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id); + g_free(uuid); + g_free(msg_uuid); + return -1; + } + + if (msg.id > migrate_multifd_channels()) { + error_setg(errp, "multifd: received channel version %d " + "expected %d", msg.version, MULTIFD_VERSION); + return -1; + } + + return msg.id; +} struct { MultiFDSendParams *params; @@ -448,11 +618,23 @@ struct { int count; } *multifd_send_state; -static void terminate_multifd_send_threads(Error *errp) +static void multifd_send_terminate_threads(Error *err) { int i; - for (i = 0; i < multifd_send_state->count; i++) { + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); + if (s->state == MIGRATION_STATUS_SETUP || + s->state == MIGRATION_STATUS_PRE_SWITCHOVER || + s->state == MIGRATION_STATUS_DEVICE || + s->state == MIGRATION_STATUS_ACTIVE) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } + } + + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); @@ -470,11 +652,15 @@ int multifd_save_cleanup(Error **errp) if (!migrate_use_multifd()) { return 0; } - terminate_multifd_send_threads(NULL); - for (i = 0; i < multifd_send_state->count; i++) { + multifd_send_terminate_threads(NULL); + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_thread_join(&p->thread); + if (p->running) { + qemu_thread_join(&p->thread); + } + socket_send_channel_destroy(p->c); + p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); g_free(p->name); @@ -490,6 +676,11 @@ int multifd_save_cleanup(Error **errp) static void *multifd_send_thread(void *opaque) { MultiFDSendParams *p = opaque; + Error *local_err = NULL; + + if (multifd_send_initial_packet(p, &local_err) < 0) { + goto out; + } while (true) { qemu_mutex_lock(&p->mutex); @@ -501,9 +692,39 @@ static void *multifd_send_thread(void *opaque) qemu_sem_wait(&p->sem); } +out: + if (local_err) { + multifd_send_terminate_threads(local_err); + } + + qemu_mutex_lock(&p->mutex); + p->running = false; + qemu_mutex_unlock(&p->mutex); + return NULL; } +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) +{ + MultiFDSendParams *p = opaque; + QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task)); + Error *local_err = NULL; + + if (qio_task_propagate_error(task, &local_err)) { + if (multifd_save_cleanup(&local_err) != 0) { + migrate_set_error(migrate_get_current(), local_err); + } + } else { + p->c = QIO_CHANNEL(sioc); + qio_channel_set_delay(p->c, false); + p->running = true; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); + + atomic_inc(&multifd_send_state->count); + } +} + int multifd_save_setup(void) { int thread_count; @@ -515,7 +736,7 @@ int multifd_save_setup(void) thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); - multifd_send_state->count = 0; + atomic_set(&multifd_send_state->count, 0); for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -524,35 +745,32 @@ int multifd_save_setup(void) p->quit = false; p->id = i; p->name = g_strdup_printf("multifdsend_%d", i); - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); - - multifd_send_state->count++; + socket_send_channel_create(multifd_new_send_channel_async, p); } return 0; } -struct MultiFDRecvParams { - uint8_t id; - char *name; - QemuThread thread; - QemuSemaphore sem; - QemuMutex mutex; - bool quit; -}; -typedef struct MultiFDRecvParams MultiFDRecvParams; - struct { MultiFDRecvParams *params; /* number of created threads */ int count; } *multifd_recv_state; -static void terminate_multifd_recv_threads(Error *errp) +static void multifd_recv_terminate_threads(Error *err) { int i; - for (i = 0; i < multifd_recv_state->count; i++) { + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); + if (s->state == MIGRATION_STATUS_SETUP || + s->state == MIGRATION_STATUS_ACTIVE) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } + } + + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; qemu_mutex_lock(&p->mutex); @@ -570,11 +788,15 @@ int multifd_load_cleanup(Error **errp) if (!migrate_use_multifd()) { return 0; } - terminate_multifd_recv_threads(NULL); - for (i = 0; i < multifd_recv_state->count; i++) { + multifd_recv_terminate_threads(NULL); + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_thread_join(&p->thread); + if (p->running) { + qemu_thread_join(&p->thread); + } + object_unref(OBJECT(p->c)); + p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); g_free(p->name); @@ -602,6 +824,10 @@ static void *multifd_recv_thread(void *opaque) qemu_sem_wait(&p->sem); } + qemu_mutex_lock(&p->mutex); + p->running = false; + qemu_mutex_unlock(&p->mutex); + return NULL; } @@ -616,7 +842,7 @@ int multifd_load_setup(void) thread_count = migrate_multifd_channels(); multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); - multifd_recv_state->count = 0; + atomic_set(&multifd_recv_state->count, 0); for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; @@ -625,13 +851,52 @@ int multifd_load_setup(void) p->quit = false; p->id = i; p->name = g_strdup_printf("multifdrecv_%d", i); - qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, - QEMU_THREAD_JOINABLE); - multifd_recv_state->count++; } return 0; } +bool multifd_recv_all_channels_created(void) +{ + int thread_count = migrate_multifd_channels(); + + if (!migrate_use_multifd()) { + return true; + } + + return thread_count == atomic_read(&multifd_recv_state->count); +} + +void multifd_recv_new_channel(QIOChannel *ioc) +{ + MultiFDRecvParams *p; + Error *local_err = NULL; + int id; + + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + return; + } + + p = &multifd_recv_state->params[id]; + if (p->c != NULL) { + error_setg(&local_err, "multifd: received id '%d' already setup'", + id); + multifd_recv_terminate_threads(local_err); + return; + } + p->c = ioc; + object_ref(OBJECT(ioc)); + + p->running = true; + qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, + QEMU_THREAD_JOINABLE); + atomic_inc(&multifd_recv_state->count); + if (multifd_recv_state->count == migrate_multifd_channels()) { + migration_incoming_process(); + } +} + /** * save_page_header: write page header to wire * @@ -1490,7 +1755,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, * CPU resource. */ if (block == rs->last_sent_block && save_page_use_compression(rs)) { - res = compress_page_with_multi_thread(rs, block, offset); + return compress_page_with_multi_thread(rs, block, offset); } return ram_save_page(rs, pss, last_stage); @@ -2226,6 +2491,41 @@ static int ram_init_all(RAMState **rsp) return 0; } +static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) +{ + RAMBlock *block; + uint64_t pages = 0; + + /* + * Postcopy is not using xbzrle/compression, so no need for that. + * Also, since source are already halted, we don't need to care + * about dirty page logging as well. + */ + + RAMBLOCK_FOREACH(block) { + pages += bitmap_count_one(block->bmap, + block->used_length >> TARGET_PAGE_BITS); + } + + /* This may not be aligned with current bitmaps. Recalculate. */ + rs->migration_dirty_pages = pages; + + rs->last_seen_block = NULL; + rs->last_sent_block = NULL; + rs->last_page = 0; + rs->last_version = ram_list.version; + /* + * Disable the bulk stage, otherwise we'll resend the whole RAM no + * matter what we have sent. + */ + rs->ram_bulk_stage = false; + + /* Update RAMState cache of output QEMUFile */ + rs->f = out; + + trace_ram_state_resume_prepare(pages); +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -3100,6 +3400,139 @@ static bool ram_has_postcopy(void *opaque) return migrate_postcopy_ram(); } +/* Sync all the dirty bitmap with destination VM. */ +static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) +{ + RAMBlock *block; + QEMUFile *file = s->to_dst_file; + int ramblock_count = 0; + + trace_ram_dirty_bitmap_sync_start(); + + RAMBLOCK_FOREACH(block) { + qemu_savevm_send_recv_bitmap(file, block->idstr); + trace_ram_dirty_bitmap_request(block->idstr); + ramblock_count++; + } + + trace_ram_dirty_bitmap_sync_wait(); + + /* Wait until all the ramblocks' dirty bitmap synced */ + while (ramblock_count--) { + qemu_sem_wait(&s->rp_state.rp_sem); + } + + trace_ram_dirty_bitmap_sync_complete(); + + return 0; +} + +static void ram_dirty_bitmap_reload_notify(MigrationState *s) +{ + qemu_sem_post(&s->rp_state.rp_sem); +} + +/* + * Read the received bitmap, revert it as the initial dirty bitmap. + * This is only used when the postcopy migration is paused but wants + * to resume from a middle point. + */ +int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) +{ + int ret = -EINVAL; + QEMUFile *file = s->rp_state.from_dst_file; + unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; + uint64_t local_size = nbits / 8; + uint64_t size, end_mark; + + trace_ram_dirty_bitmap_reload_begin(block->idstr); + + if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { + error_report("%s: incorrect state %s", __func__, + MigrationStatus_str(s->state)); + return -EINVAL; + } + + /* + * Note: see comments in ramblock_recv_bitmap_send() on why we + * need the endianess convertion, and the paddings. + */ + local_size = ROUND_UP(local_size, 8); + + /* Add paddings */ + le_bitmap = bitmap_new(nbits + BITS_PER_LONG); + + size = qemu_get_be64(file); + + /* The size of the bitmap should match with our ramblock */ + if (size != local_size) { + error_report("%s: ramblock '%s' bitmap size mismatch " + "(0x%"PRIx64" != 0x%"PRIx64")", __func__, + block->idstr, size, local_size); + ret = -EINVAL; + goto out; + } + + size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); + end_mark = qemu_get_be64(file); + + ret = qemu_file_get_error(file); + if (ret || size != local_size) { + error_report("%s: read bitmap failed for ramblock '%s': %d" + " (size 0x%"PRIx64", got: 0x%"PRIx64")", + __func__, block->idstr, ret, local_size, size); + ret = -EIO; + goto out; + } + + if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { + error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64, + __func__, block->idstr, end_mark); + ret = -EINVAL; + goto out; + } + + /* + * Endianess convertion. We are during postcopy (though paused). + * The dirty bitmap won't change. We can directly modify it. + */ + bitmap_from_le(block->bmap, le_bitmap, nbits); + + /* + * What we received is "received bitmap". Revert it as the initial + * dirty bitmap for this ramblock. + */ + bitmap_complement(block->bmap, block->bmap, nbits); + + trace_ram_dirty_bitmap_reload_complete(block->idstr); + + /* + * We succeeded to sync bitmap for current ramblock. If this is + * the last one to sync, we need to notify the main send thread. + */ + ram_dirty_bitmap_reload_notify(s); + + ret = 0; +out: + free(le_bitmap); + return ret; +} + +static int ram_resume_prepare(MigrationState *s, void *opaque) +{ + RAMState *rs = *(RAMState **)opaque; + int ret; + + ret = ram_dirty_bitmap_sync_all(s, rs); + if (ret) { + return ret; + } + + ram_state_resume_prepare(rs, s->to_dst_file); + + return 0; +} + static SaveVMHandlers savevm_ram_handlers = { .save_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, @@ -3111,6 +3544,7 @@ static SaveVMHandlers savevm_ram_handlers = { .save_cleanup = ram_save_cleanup, .load_setup = ram_load_setup, .load_cleanup = ram_load_cleanup, + .resume_prepare = ram_resume_prepare, }; void ram_mig_init(void) diff --git a/migration/ram.h b/migration/ram.h index 5030be1..d386f4d 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -32,6 +32,7 @@ #include "qemu-common.h" #include "qapi/qapi-types-migration.h" #include "exec/cpu-common.h" +#include "io/channel.h" extern MigrationStats ram_counters; extern XBZRLECacheStats xbzrle_counters; @@ -44,6 +45,8 @@ int multifd_save_setup(void); int multifd_save_cleanup(Error **errp); int multifd_load_setup(void); int multifd_load_cleanup(Error **errp); +bool multifd_recv_all_channels_created(void); +void multifd_recv_new_channel(QIOChannel *ioc); uint64_t ram_pagesize_summary(void); int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len); @@ -63,5 +66,8 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +int64_t ramblock_recv_bitmap_send(QEMUFile *file, + const char *block_name); +int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb); #endif diff --git a/migration/rdma.c b/migration/rdma.c index da474fc..7d233b0 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -708,6 +708,9 @@ static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) memcpy(local->block + block->index, old + (block->index + 1), sizeof(RDMALocalBlock) * (local->nb_blocks - (block->index + 1))); + for (x = block->index; x < local->nb_blocks - 1; x++) { + local->block[x].index--; + } } } else { assert(block == local->block); @@ -3246,6 +3249,10 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) qsort(rdma->local_ram_blocks.block, rdma->local_ram_blocks.nb_blocks, sizeof(RDMALocalBlock), dest_ram_sort_func); + for (i = 0; i < local->nb_blocks; i++) { + local->block[i].index = i; + } + if (rdma->pin_all) { ret = qemu_rdma_reg_whole_ram_blocks(rdma); if (ret) { diff --git a/migration/savevm.c b/migration/savevm.c index e2be02a..4251125 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -80,7 +80,9 @@ enum qemu_vm_cmd { MIG_CMD_POSTCOPY_RAM_DISCARD, /* A list of pages to discard that were previously sent during precopy but are dirty. */ + MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */ + MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ MIG_CMD_MAX }; @@ -97,7 +99,9 @@ static struct mig_cmd_args { [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" }, [MIG_CMD_POSTCOPY_RAM_DISCARD] = { .len = -1, .name = "POSTCOPY_RAM_DISCARD" }, + [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, + [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, }; @@ -956,6 +960,25 @@ void qemu_savevm_send_postcopy_run(QEMUFile *f) qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL); } +void qemu_savevm_send_postcopy_resume(QEMUFile *f) +{ + trace_savevm_send_postcopy_resume(); + qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL); +} + +void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) +{ + size_t len; + char buf[256]; + + trace_savevm_send_recv_bitmap(block_name); + + buf[0] = len = strlen(block_name); + memcpy(buf + 1, block_name, len); + + qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1008,6 +1031,31 @@ void qemu_savevm_state_setup(QEMUFile *f) } } +int qemu_savevm_state_resume_prepare(MigrationState *s) +{ + SaveStateEntry *se; + int ret; + + trace_savevm_state_resume_prepare(); + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->resume_prepare) { + continue; + } + if (se->ops && se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + ret = se->ops->resume_prepare(s, se->opaque); + if (ret < 0) { + return ret; + } + } + + return 0; +} + /* * this function has three return values: * negative: there was one error, and we have -errno. @@ -1564,8 +1612,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, */ static void *postcopy_ram_listen_thread(void *opaque) { - QEMUFile *f = opaque; MigrationIncomingState *mis = migration_incoming_get_current(); + QEMUFile *f = mis->from_src_file; int load_res; migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, @@ -1579,6 +1627,14 @@ static void *postcopy_ram_listen_thread(void *opaque) */ qemu_file_set_blocking(f, true); load_res = qemu_loadvm_state_main(f, mis); + + /* + * This is tricky, but, mis->from_src_file can change after it + * returns, when postcopy recovery happened. In the future, we may + * want a wrapper for the QEMUFile handle. + */ + f = mis->from_src_file; + /* And non-blocking again so we don't block in any cleanup */ qemu_file_set_blocking(f, false); @@ -1668,7 +1724,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) /* Start up the listening thread and wait for it to signal ready */ qemu_sem_init(&mis->listen_thread_sem, 0); qemu_thread_create(&mis->listen_thread, "postcopy/listen", - postcopy_ram_listen_thread, mis->from_src_file, + postcopy_ram_listen_thread, NULL, QEMU_THREAD_DETACHED); qemu_sem_wait(&mis->listen_thread_sem); qemu_sem_destroy(&mis->listen_thread_sem); @@ -1745,6 +1801,31 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) return LOADVM_QUIT; } +static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) +{ + if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { + error_report("%s: illegal resume received", __func__); + /* Don't fail the load, only for this. */ + return 0; + } + + /* + * This means source VM is ready to resume the postcopy migration. + * It's time to switch state and release the fault thread to + * continue service page faults. + */ + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + qemu_sem_post(&mis->postcopy_pause_sem_fault); + + trace_loadvm_postcopy_handle_resume(); + + /* Tell source that "we are ready" */ + migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE); + + return 0; +} + /** * Immediately following this command is a blob of data containing an embedded * chunk of migration stream; read it and load it. @@ -1794,6 +1875,49 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) } /* + * Handle request that source requests for recved_bitmap on + * destination. Payload format: + * + * len (1 byte) + ramblock_name (<255 bytes) + */ +static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis, + uint16_t len) +{ + QEMUFile *file = mis->from_src_file; + RAMBlock *rb; + char block_name[256]; + size_t cnt; + + cnt = qemu_get_counted_string(file, block_name); + if (!cnt) { + error_report("%s: failed to read block name", __func__); + return -EINVAL; + } + + /* Validate before using the data */ + if (qemu_file_get_error(file)) { + return qemu_file_get_error(file); + } + + if (len != cnt + 1) { + error_report("%s: invalid payload length (%d)", __func__, len); + return -EINVAL; + } + + rb = qemu_ram_block_by_name(block_name); + if (!rb) { + error_report("%s: block '%s' not found", __func__, block_name); + return -EINVAL; + } + + migrate_send_rp_recv_bitmap(mis, block_name); + + trace_loadvm_handle_recv_bitmap(block_name); + + return 0; +} + +/* * Process an incoming 'QEMU_VM_COMMAND' * 0 just a normal return * LOADVM_QUIT All good, but exit the loop @@ -1866,6 +1990,12 @@ static int loadvm_process_command(QEMUFile *f) case MIG_CMD_POSTCOPY_RAM_DISCARD: return loadvm_postcopy_ram_handle_discard(mis, len); + + case MIG_CMD_POSTCOPY_RESUME: + return loadvm_postcopy_handle_resume(mis); + + case MIG_CMD_RECV_BITMAP: + return loadvm_handle_recv_bitmap(mis, len); } return 0; @@ -2055,11 +2185,50 @@ void qemu_loadvm_state_cleanup(void) } } +/* Return true if we should continue the migration, or false. */ +static bool postcopy_pause_incoming(MigrationIncomingState *mis) +{ + trace_postcopy_pause_incoming(); + + /* Clear the triggered bit to allow one recovery */ + mis->postcopy_recover_triggered = false; + + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_POSTCOPY_PAUSED); + + assert(mis->from_src_file); + qemu_file_shutdown(mis->from_src_file); + qemu_fclose(mis->from_src_file); + mis->from_src_file = NULL; + + assert(mis->to_src_file); + qemu_file_shutdown(mis->to_src_file); + qemu_mutex_lock(&mis->rp_mutex); + qemu_fclose(mis->to_src_file); + mis->to_src_file = NULL; + qemu_mutex_unlock(&mis->rp_mutex); + + /* Notify the fault thread for the invalidated file handle */ + postcopy_fault_thread_notify(mis); + + error_report("Detected IO failure for postcopy. " + "Migration paused."); + + while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { + qemu_sem_wait(&mis->postcopy_pause_sem_dst); + } + + trace_postcopy_pause_incoming_continued(); + + return true; +} + static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) { uint8_t section_type; int ret = 0; +retry: while (true) { section_type = qemu_get_byte(f); @@ -2104,6 +2273,24 @@ static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) out: if (ret < 0) { qemu_file_set_error(f, ret); + + /* + * Detect whether it is: + * + * 1. postcopy running (after receiving all device data, which + * must be in POSTCOPY_INCOMING_RUNNING state. Note that + * POSTCOPY_INCOMING_LISTENING is still not enough, it's + * still receiving device states). + * 2. network failure (-EIO) + * + * If so, we try to wait for a recovery. + */ + if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && + ret == -EIO && postcopy_pause_incoming(mis)) { + /* Reset f to point to the newly created channel */ + f = mis->from_src_file; + goto retry; + } } return ret; } diff --git a/migration/savevm.h b/migration/savevm.h index cf4f0d3..a5e65b8 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -31,6 +31,7 @@ bool qemu_savevm_state_blocked(Error **errp); void qemu_savevm_state_setup(QEMUFile *f); +int qemu_savevm_state_resume_prepare(MigrationState *s); void qemu_savevm_state_header(QEMUFile *f); int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy); void qemu_savevm_state_cleanup(void); @@ -47,6 +48,8 @@ int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); void qemu_savevm_send_postcopy_advise(QEMUFile *f); void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); +void qemu_savevm_send_postcopy_resume(QEMUFile *f); +void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, uint16_t len, diff --git a/migration/socket.c b/migration/socket.c index 122d8cc..3456eb7 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -28,6 +28,28 @@ #include "trace.h" +struct SocketOutgoingArgs { + SocketAddress *saddr; +} outgoing_args; + +void socket_send_channel_create(QIOTaskFunc f, void *data) +{ + QIOChannelSocket *sioc = qio_channel_socket_new(); + qio_channel_socket_connect_async(sioc, outgoing_args.saddr, + f, data, NULL, NULL); +} + +int socket_send_channel_destroy(QIOChannel *send) +{ + /* Remove channel */ + object_unref(OBJECT(send)); + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } + return 0; +} + static SocketAddress *tcp_build_address(const char *host_port, Error **errp) { SocketAddress *saddr; @@ -95,6 +117,11 @@ static void socket_start_outgoing_migration(MigrationState *s, struct SocketConnectData *data = g_new0(struct SocketConnectData, 1); data->s = s; + + /* in case previous migration leaked it */ + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = saddr; + if (saddr->type == SOCKET_ADDRESS_TYPE_INET) { data->hostname = g_strdup(saddr->u.inet.host); } @@ -106,7 +133,6 @@ static void socket_start_outgoing_migration(MigrationState *s, data, socket_connect_data_free, NULL); - qapi_free_SocketAddress(saddr); } void tcp_start_outgoing_migration(MigrationState *s, @@ -144,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener, qio_net_listener_disconnect(listener); object_unref(OBJECT(listener)); + + if (!migrate_use_multifd()) { + migration_incoming_process(); + } } } @@ -160,9 +190,10 @@ static void socket_start_incoming_migration(SocketAddress *saddr, return; } - qio_net_listener_set_client_func(listener, - socket_accept_incoming_migration, - NULL, NULL); + qio_net_listener_set_client_func_full(listener, + socket_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } void tcp_start_incoming_migration(const char *host_port, Error **errp) diff --git a/migration/socket.h b/migration/socket.h index 6b91e9d..528c3b0 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -16,6 +16,13 @@ #ifndef QEMU_MIGRATION_SOCKET_H #define QEMU_MIGRATION_SOCKET_H + +#include "io/channel.h" +#include "io/task.h" + +void socket_send_channel_create(QIOTaskFunc f, void *data); +int socket_send_channel_destroy(QIOChannel *send); + void tcp_start_incoming_migration(const char *host_port, Error **errp); void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, diff --git a/migration/trace-events b/migration/trace-events index d6be74b..3c798dd 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -12,11 +12,13 @@ loadvm_state_cleanup(void) "" loadvm_handle_cmd_packaged(unsigned int length) "%u" loadvm_handle_cmd_packaged_main(int ret) "%d" loadvm_handle_cmd_packaged_received(int ret) "%d" +loadvm_handle_recv_bitmap(char *s) "%s" loadvm_postcopy_handle_advise(void) "" loadvm_postcopy_handle_listen(void) "" loadvm_postcopy_handle_run(void) "" loadvm_postcopy_handle_run_cpu_sync(void) "" loadvm_postcopy_handle_run_vmstart(void) "" +loadvm_postcopy_handle_resume(void) "" loadvm_postcopy_ram_handle_discard(void) "" loadvm_postcopy_ram_handle_discard_end(void) "" loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud" @@ -34,7 +36,10 @@ savevm_send_open_return_path(void) "" savevm_send_ping(uint32_t val) "0x%x" savevm_send_postcopy_listen(void) "" savevm_send_postcopy_run(void) "" +savevm_send_postcopy_resume(void) "" +savevm_send_recv_bitmap(char *name) "%s" savevm_state_setup(void) "" +savevm_state_resume_prepare(void) "" savevm_state_header(void) "" savevm_state_iterate(void) "" savevm_state_cleanup(void) "" @@ -77,6 +82,13 @@ ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x" ram_postcopy_send_discard_bitmap(void) "" ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p" ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx" +ram_dirty_bitmap_request(char *str) "%s" +ram_dirty_bitmap_reload_begin(char *str) "%s" +ram_dirty_bitmap_reload_complete(char *str) "%s" +ram_dirty_bitmap_sync_start(void) "" +ram_dirty_bitmap_sync_wait(void) "" +ram_dirty_bitmap_sync_complete(void) "" +ram_state_resume_prepare(uint64_t v) "%" PRId64 # migration/migration.c await_return_path_close_on_source_close(void) "" @@ -88,6 +100,7 @@ migrate_fd_cancel(void) "" migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx" migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")" migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d" +migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64 migration_completion_file_err(void) "" migration_completion_postcopy_end(void) "" migration_completion_postcopy_end_after_complete(void) "" @@ -99,6 +112,13 @@ migration_thread_setup_complete(void) "" open_return_path_on_source(void) "" open_return_path_on_source_continue(void) "" postcopy_start(void) "" +postcopy_pause_return_path(void) "" +postcopy_pause_return_path_continued(void) "" +postcopy_pause_fault_thread(void) "" +postcopy_pause_fault_thread_continued(void) "" +postcopy_pause_continued(void) "" +postcopy_pause_incoming(void) "" +postcopy_pause_incoming_continued(void) "" postcopy_start_set_run(void) "" source_return_path_thread_bad_end(void) "" source_return_path_thread_end(void) "" @@ -106,6 +126,7 @@ source_return_path_thread_entry(void) "" source_return_path_thread_loop_top(void) "" source_return_path_thread_pong(uint32_t val) "0x%x" source_return_path_thread_shut(uint32_t val) "0x%x" +source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 migrate_global_state_post_load(const char *state) "loaded state: %s" migrate_global_state_pre_save(const char *state) "saved state: %s" migration_thread_low_pending(uint64_t pending) "%" PRIu64 diff --git a/qapi/migration.json b/qapi/migration.json index f3974c6..3ec418d 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -89,6 +89,10 @@ # # @postcopy-active: like active, but now in postcopy mode. (since 2.5) # +# @postcopy-paused: during postcopy but paused. (since 2.13) +# +# @postcopy-recover: trying to recover from a paused postcopy. (since 2.13) +# # @completed: migration is finished. # # @failed: some error occurred during migration process. @@ -106,7 +110,8 @@ ## { 'enum': 'MigrationStatus', 'data': [ 'none', 'setup', 'cancelling', 'cancelled', - 'active', 'postcopy-active', 'completed', 'failed', 'colo', + 'active', 'postcopy-active', 'postcopy-paused', + 'postcopy-recover', 'completed', 'failed', 'colo', 'pre-switchover', 'device' ] } ## @@ -157,11 +162,13 @@ # error strings. (Since 2.7) # # @postcopy-blocktime: total time when all vCPU were blocked during postcopy -# live migration (Since 2.13) +# live migration. This is only present when the postcopy-blocktime +# migration capability is enabled. (Since 2.13) # -# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU (Since 2.13) +# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU. This is +# only present when the postcopy-blocktime migration capability +# is enabled. (Since 2.13) # - # # Since: 0.14.0 ## @@ -363,7 +370,6 @@ # # @x-multifd: Use more than one fd for migration (since 2.11) # -# # @dirty-bitmaps: If enabled, QEMU will migrate named dirty bitmaps. # (since 2.12) # @@ -1028,6 +1034,8 @@ # @detach: this argument exists only for compatibility reasons and # is ignored by QEMU # +# @resume: resume one paused migration, default "off". (since 2.13) +# # Returns: nothing on success # # Since: 0.14.0 @@ -1049,7 +1057,8 @@ # ## { 'command': 'migrate', - 'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } } + 'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', + '*detach': 'bool', '*resume': 'bool' } } ## # @migrate-incoming: @@ -1183,3 +1192,39 @@ # Since: 2.9 ## { 'command': 'xen-colo-do-checkpoint' } + +## +# @migrate-recover: +# +# Provide a recovery migration stream URI. +# +# @uri: the URI to be used for the recovery of migration stream. +# +# Returns: nothing. +# +# Example: +# +# -> { "execute": "migrate-recover", +# "arguments": { "uri": "tcp:192.168.1.200:12345" } } +# <- { "return": {} } +# +# Since: 2.13 +## +{ 'command': 'migrate-recover', 'data': { 'uri': 'str' }, + 'allow-oob': true } + +## +# @migrate-pause: +# +# Pause a migration. Currently it only supports postcopy. +# +# Returns: nothing. +# +# Example: +# +# -> { "execute": "migrate-pause" } +# <- { "return": {} } +# +# Since: 2.13 +## +{ 'command': 'migrate-pause', 'allow-oob': true } diff --git a/tests/migration-test.c b/tests/migration-test.c index b99661b..3a85446 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -19,9 +19,6 @@ #include "qemu/sockets.h" #include "chardev/char.h" #include "sysemu/sysemu.h" -#include "hw/nvram/chrp_nvram.h" - -#define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */ const unsigned start_address = 1024 * 1024; const unsigned end_address = 100 * 1024 * 1024; @@ -92,36 +89,6 @@ static void init_bootfile_x86(const char *bootpath) fclose(bootfile); } -static void init_bootfile_ppc(const char *bootpath) -{ - FILE *bootfile; - char buf[MIN_NVRAM_SIZE]; - ChrpNvramPartHdr *header = (ChrpNvramPartHdr *)buf; - - memset(buf, 0, MIN_NVRAM_SIZE); - - /* Create a "common" partition in nvram to store boot-command property */ - - header->signature = CHRP_NVPART_SYSTEM; - memcpy(header->name, "common", 6); - chrp_nvram_finish_partition(header, MIN_NVRAM_SIZE); - - /* FW_MAX_SIZE is 4MB, but slof.bin is only 900KB, - * so let's modify memory between 1MB and 100MB - * to do like PC bootsector - */ - - sprintf(buf + 16, - "boot-command=hex .\" _\" begin %x %x do i c@ 1 + i c! 1000 +loop " - ".\" B\" 0 until", end_address, start_address); - - /* Write partition to the NVRAM file */ - - bootfile = fopen(bootpath, "wb"); - g_assert_cmpint(fwrite(buf, MIN_NVRAM_SIZE, 1, bootfile), ==, 1); - fclose(bootfile); -} - /* * Wait for some output in the serial output file, * we get an 'A' followed by an endless string of 'B's @@ -422,12 +389,14 @@ static void test_migrate_start(QTestState **from, QTestState **to, if (access("/sys/module/kvm_hv", F_OK)) { accel = "tcg"; } - init_bootfile_ppc(bootpath); cmd_src = g_strdup_printf("-machine accel=%s -m 256M" " -name source,debug-threads=on" " -serial file:%s/src_serial" - " -drive file=%s,if=pflash,format=raw", - accel, tmpfs, bootpath); + " -prom-env '" + "boot-command=hex .\" _\" begin %x %x " + "do i c@ 1 + i c! 1000 +loop .\" B\" 0 " + "until'", accel, tmpfs, end_address, + start_address); cmd_dst = g_strdup_printf("-machine accel=%s -m 256M" " -name target,debug-threads=on" " -serial file:%s/dest_serial" @@ -536,7 +505,7 @@ static void test_deprecated(void) qtest_quit(from); } -static void test_migrate(void) +static void test_postcopy(void) { char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); QTestState *from, *to; @@ -611,6 +580,45 @@ static void test_baddest(void) test_migrate_end(from, to, false); } +static void test_precopy_unix(void) +{ + char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); + QTestState *from, *to; + + test_migrate_start(&from, &to, uri, false); + + /* We want to pick a speed slow enough that the test completes + * quickly, but that it doesn't complete precopy even on a slow + * machine, so also set the downtime. + */ + /* 1 ms should make it not converge*/ + migrate_set_parameter(from, "downtime-limit", "1"); + /* 1GB/s */ + migrate_set_parameter(from, "max-bandwidth", "1000000000"); + + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + + migrate(from, uri); + + wait_for_migration_pass(from); + + /* 300 ms should converge */ + migrate_set_parameter(from, "downtime-limit", "300"); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + + qtest_qmp_eventwait(to, "RESUME"); + + wait_for_serial("dest_serial"); + wait_for_migration_complete(from); + + test_migrate_end(from, to, true); + g_free(uri); +} + int main(int argc, char **argv) { char template[] = "/tmp/migration-test-XXXXXX"; @@ -630,9 +638,10 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); - qtest_add_func("/migration/postcopy/unix", test_migrate); + qtest_add_func("/migration/postcopy/unix", test_postcopy); qtest_add_func("/migration/deprecated", test_deprecated); qtest_add_func("/migration/bad_dest", test_baddest); + qtest_add_func("/migration/precopy/unix", test_precopy_unix); ret = g_test_run(); |