diff options
author | Anthony Liguori <aliguori@us.ibm.com> | 2013-06-27 08:48:38 -0500 |
---|---|---|
committer | Anthony Liguori <aliguori@us.ibm.com> | 2013-06-27 08:48:38 -0500 |
commit | c394ace828a559be13ec0bde15b476970f186dad (patch) | |
tree | e15ec28f02b7d995171274aa47703244880dd827 | |
parent | 3e5087329489e0beceecf3426f1216619821937f (diff) | |
parent | 60d9222c8f50c3e5dd3df9ee84ddd1d1c4b35389 (diff) | |
download | qemu-c394ace828a559be13ec0bde15b476970f186dad.zip qemu-c394ace828a559be13ec0bde15b476970f186dad.tar.gz qemu-c394ace828a559be13ec0bde15b476970f186dad.tar.bz2 |
Merge remote-tracking branch 'quintela/migration.next' into staging
# By Michael R. Hines (9) and others
# Via Juan Quintela
* quintela/migration.next:
rdma: introduce capability x-rdma-pin-all
rdma: new QEMUFileOps hooks
rdma: introduce qemu_ram_foreach_block()
rdma: export qemu_fflush()
rdma: introduce qemu_file_mode_is_not_valid()
rdma: export throughput w/ MigrationStats QMP
rdma: export yield_until_fd_readable()
rdma: introduce qemu_update_position()
rdma: add documentation
migration: do not overwrite zero pages
Revert "migration: do not sent zero pages in bulk stage"
arch_init/ram_load: add error message for block length mismatch
Message-id: 1372329455-5995-1-git-send-email-quintela@redhat.com
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
-rw-r--r-- | arch_init.c | 42 | ||||
-rw-r--r-- | docs/rdma.txt | 415 | ||||
-rw-r--r-- | exec.c | 9 | ||||
-rw-r--r-- | hmp.c | 2 | ||||
-rw-r--r-- | include/block/coroutine.h | 6 | ||||
-rw-r--r-- | include/exec/cpu-common.h | 5 | ||||
-rw-r--r-- | include/migration/migration.h | 25 | ||||
-rw-r--r-- | include/migration/qemu-file.h | 32 | ||||
-rw-r--r-- | migration.c | 15 | ||||
-rw-r--r-- | qapi-schema.json | 12 | ||||
-rw-r--r-- | qemu-coroutine-io.c | 23 | ||||
-rw-r--r-- | savevm.c | 114 |
12 files changed, 647 insertions, 53 deletions
diff --git a/arch_init.c b/arch_init.c index a8b91ee..ea9ddad 100644 --- a/arch_init.c +++ b/arch_init.c @@ -457,15 +457,10 @@ static int ram_save_block(QEMUFile *f, bool last_stage) bytes_sent = -1; if (is_zero_page(p)) { acct_info.dup_pages++; - if (!ram_bulk_stage) { - bytes_sent = save_block_hdr(f, block, offset, cont, - RAM_SAVE_FLAG_COMPRESS); - qemu_put_byte(f, 0); - bytes_sent++; - } else { - acct_info.skipped_pages++; - bytes_sent = 0; - } + bytes_sent = save_block_hdr(f, block, offset, cont, + RAM_SAVE_FLAG_COMPRESS); + qemu_put_byte(f, 0); + bytes_sent++; } else if (!ram_bulk_stage && migrate_use_xbzrle()) { current_addr = block->offset + offset; bytes_sent = save_xbzrle_page(f, p, current_addr, block, @@ -498,6 +493,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage) static uint64_t bytes_transferred; +void acct_update_position(QEMUFile *f, size_t size, bool zero) +{ + uint64_t pages = size / TARGET_PAGE_SIZE; + if (zero) { + acct_info.dup_pages += pages; + } else { + acct_info.norm_pages += pages; + bytes_transferred += size; + qemu_update_position(f, size); + } +} + static ram_addr_t ram_save_remaining(void) { return migration_dirty_pages; @@ -808,6 +815,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) QTAILQ_FOREACH(block, &ram_list.blocks, next) { if (!strncmp(id, block->idstr, sizeof(id))) { if (block->length != length) { + fprintf(stderr, "Length mismatch: %s: %ld " + "in != " RAM_ADDR_FMT "\n", id, length, + block->length); ret = -EINVAL; goto done; } @@ -837,14 +847,16 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } ch = qemu_get_byte(f); - memset(host, ch, TARGET_PAGE_SIZE); + if (ch != 0 || !is_zero_page(host)) { + memset(host, ch, TARGET_PAGE_SIZE); #ifndef _WIN32 - if (ch == 0 && - (!kvm_enabled() || kvm_has_sync_mmu()) && - getpagesize() <= TARGET_PAGE_SIZE) { - qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); - } + if (ch == 0 && + (!kvm_enabled() || kvm_has_sync_mmu()) && + 
getpagesize() <= TARGET_PAGE_SIZE) { + qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); + } #endif + } } else if (flags & RAM_SAVE_FLAG_PAGE) { void *host; diff --git a/docs/rdma.txt b/docs/rdma.txt new file mode 100644 index 0000000..45a4b1d --- /dev/null +++ b/docs/rdma.txt @@ -0,0 +1,415 @@ +(RDMA: Remote Direct Memory Access) +RDMA Live Migration Specification, Version # 1 +============================================== +Wiki: http://wiki.qemu.org/Features/RDMALiveMigration +Github: git@github.com:hinesmr/qemu.git, 'rdma' branch + +Copyright (C) 2013 Michael R. Hines <mrhines@us.ibm.com> + +An *exhaustive* paper (2010) shows additional performance details +linked on the QEMU wiki above. + +Contents: +========= +* Introduction +* Before running +* Running +* Performance +* RDMA Migration Protocol Description +* Versioning and Capabilities +* QEMUFileRDMA Interface +* Migration of pc.ram +* Error handling +* TODO + +Introduction: +============= + +RDMA helps make your migration more deterministic under heavy load because +of the significantly lower latency and higher throughput over TCP/IP. This is +because the RDMA I/O architecture reduces the number of interrupts and +data copies by bypassing the host networking stack. In particular, a TCP-based +migration, under certain types of memory-bound workloads, may take a more +unpredictable amount of time to complete the migration if the amount of +memory tracked during each live migration iteration round cannot keep pace +with the rate of dirty memory produced by the workload. + +RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA +over Converged Ethernet) as well as Infiniband-based. This implementation of +migration using RDMA is capable of using both technologies because of +the use of the OpenFabrics OFED software stack that abstracts out the +programming model irrespective of the underlying hardware.
+ +Refer to openfabrics.org or your respective RDMA hardware vendor for +an understanding on how to verify that you have the OFED software stack +installed in your environment. You should be able to successfully link +against the "librdmacm" and "libibverbs" libraries and development headers +for a working build of QEMU to run successfully using RDMA Migration. + +BEFORE RUNNING: +=============== + +Use of RDMA during migration requires pinning and registering memory +with the hardware. This means that memory must be physically resident +before the hardware can transmit that memory to another machine. +If this is not acceptable for your application or product, then the use +of RDMA migration may in fact be harmful to co-located VMs or other +software on the machine if there is not sufficient memory available to +relocate the entire footprint of the virtual machine. If so, then the +use of RDMA is discouraged and it is recommended to use standard TCP migration. + +Experimental: Next, decide if you want dynamic page registration. +For example, if you have an 8GB RAM virtual machine, but only 1GB +is in active use, then enabling this feature will cause all 8GB to +be pinned and resident in memory. This feature mostly affects the +bulk-phase round of the migration and can be enabled for extremely +high-performance RDMA hardware using the following command: + +QEMU Monitor Command: +$ migrate_set_capability x-rdma-pin-all on # disabled by default + +Performing this action will cause all 8GB to be pinned, so if that's +not what you want, then please ignore this step altogether. + +On the other hand, this will also significantly speed up the bulk round +of the migration, which can greatly reduce the "total" time of your migration. +Example performance of this using an idle VM in the previous example +can be found in the "Performance" section. 
+ +Note: for very large virtual machines (hundreds of GBs), pinning +*all* of the memory of your virtual machine in the kernel is very expensive and +may extend the initial bulk iteration time by many seconds, +thus extending the total migration time. However, this will not +affect the determinism or predictability of your migration: you will +still gain from the benefits of advanced pinning with RDMA. + +RUNNING: +======== + +First, set the migration speed to match your hardware's capabilities: + +QEMU Monitor Command: +$ migrate_set_speed 40g # or whatever is the MAX of your RDMA device + +Next, on the destination machine, add the following to the QEMU command line: + +qemu ..... -incoming x-rdma:host:port + +Finally, perform the actual migration on the source machine: + +QEMU Monitor Command: +$ migrate -d x-rdma:host:port + +PERFORMANCE +=========== + +Here is a brief summary of total migration time and downtime using RDMA: +Using a 40gbps infiniband link performing a worst-case stress test, +using an 8GB RAM virtual machine: + +Using the following command: +$ apt-get install stress +$ stress --vm-bytes 7500M --vm 1 --vm-keep + +1. Migration throughput: 26 gigabits/second. +2. Downtime (stop time) varies between 15 and 100 milliseconds. + +EFFECTS of memory registration on bulk phase round: + +For example, in the same 8GB RAM example with all 8GB of memory in +active use and the VM itself is completely idle using the same 40 gbps +infiniband link: + +1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps + +These numbers would of course scale up to whatever size virtual machine +you have to migrate using RDMA. + +Enabling this feature does *not* have any measurable effect on +migration *downtime*.
This is because, without this feature, all of the +memory will have already been registered in advance during +the bulk round and does not need to be re-registered during the successive +iteration rounds. + +RDMA Protocol Description: +========================== + +Migration with RDMA is separated into two parts: + +1. The transmission of the pages using RDMA +2. Everything else (a control channel is introduced) + +"Everything else" is transmitted using a formal +protocol now, consisting of infiniband SEND messages. + +An infiniband SEND message is the standard ibverbs +message used by applications of infiniband hardware. +The only difference between a SEND message and an RDMA +message is that SEND messages cause notifications +to be posted to the completion queue (CQ) on the +infiniband receiver side, whereas RDMA messages (used +for pc.ram) do not (to behave like an actual DMA). + +Messages in infiniband require two things: + +1. registration of the memory that will be transmitted +2. (SEND only) work requests to be posted on both + sides of the network before the actual transmission + can occur. + +RDMA messages are much easier to deal with. Once the memory +on the receiver side is registered and pinned, we're +basically done. All that is required is for the sender +side to start dumping bytes onto the link. + +(Memory is not released from pinning until the migration +completes, given that RDMA migrations are very fast.) + +SEND messages require more coordination because the +receiver must have reserved space (using a receive +work request) on the receive queue (RQ) before QEMUFileRDMA +can start using them to carry all the bytes as +a control transport for migration of device state. + +To begin the migration, the initial connection setup is +as follows (migration-rdma.c): + +1. Receiver and Sender are started (command line or libvirt): +2. Both sides post two RQ work requests +3. Receiver does listen() +4. Sender does connect() +5. Receiver accept() +6.
Check versioning and capabilities (described later) + +At this point, we define a control channel on top of SEND messages +which is described by a formal protocol. Each SEND message has a +header portion and a data portion (but together are transmitted +as a single SEND message). + +Header: + * Length (of the data portion, uint32, network byte order) + * Type (what command to perform, uint32, network byte order) + * Repeat (Number of commands in data portion, same type only) + +The 'Repeat' field is here to support future multiple page registrations +in a single message without any need to change the protocol itself +so that the protocol is compatible against multiple versions of QEMU. +Version #1 requires that all server implementations of the protocol must +check this field and register all requests found in the array of commands located +in the data portion and return an equal number of results in the response. +The maximum number of repeats is hard-coded to 4096. This is a conservative +limit based on the maximum size of a SEND message along with empirical +observations on the maximum future benefit of simultaneous page registrations. + +The 'type' field has 10 different command values: + 1. Unused + 2. Error (sent to the source during bad things) + 3. Ready (control-channel is available) + 4. QEMU File (for sending non-live device state) + 5. RAM Blocks request (used right after connection setup) + 6. RAM Blocks result (used right after connection setup) + 7. Compress page (zap zero page and skip registration) + 8. Register request (dynamic chunk registration) + 9. Register result ('rkey' to be used by sender) + 10. Register finished (registration for current iteration finished) + +A single control message, as hinted above, can contain within the data +portion an array of many commands of the same type. If there is more than +one command, then the 'repeat' field will be greater than 1.
+ +After connection setup, messages 5 & 6 are used to exchange ram block +information and optionally pin all the memory if requested by the user. + +After ram block exchange is completed, we have two protocol-level +functions, responsible for communicating control-channel commands +using the above list of values: + +Logically: + +qemu_rdma_exchange_recv(header, expected command type) + +1. We transmit a READY command to let the sender know that + we are *ready* to receive some data bytes on the control channel. +2. Before attempting to receive the expected command, we post another + RQ work request to replace the one we just used up. +3. Block on a CQ event channel and wait for the SEND to arrive. +4. When the send arrives, librdmacm will unblock us. +5. Verify that the command-type and version received match the one we expected. + +qemu_rdma_exchange_send(header, data, optional response header & data): + +1. Block on the CQ event channel waiting for a READY command + from the receiver to tell us that the receiver + is *ready* for us to transmit some new bytes. +2. Optionally: if we are expecting a response from the command + (that we have not yet transmitted), let's post an RQ + work request to receive that data a few moments later. +3. When the READY arrives, librdmacm will + unblock us and we immediately post a RQ work request + to replace the one we just used up. +4. Now, we can actually post the work request to SEND + the requested command type of the header we were asked for. +5. Optionally, if we are expecting a response (as before), + we block again and wait for that response using the additional + work request we previously posted. (This is used to carry + 'Register result' commands #9 back to the sender which + hold the rkey needed to perform RDMA. Note that the virtual address + corresponding to this rkey was already exchanged at the beginning + of the connection (described below).)
+ +All of the remaining command types (not including 'ready') +described above use the aforementioned two functions to do the hard work: + +1. After connection setup, RAMBlock information is exchanged using + this protocol before the actual migration begins. This information includes + a description of each RAMBlock on the server side as well as the virtual addresses + and lengths of each RAMBlock. This is used by the client to determine the + start and stop locations of chunks and how to register them dynamically + before performing the RDMA operations. +2. During runtime, once a 'chunk' becomes full of pages ready to + be sent with RDMA, the registration commands are used to ask the + other side to register the memory for this chunk and respond + with the result (rkey) of the registration. +3. The QEMUFile interfaces also call these functions (described below) + when transmitting non-live state, such as devices or to send + its own protocol information during the migration process. +4. Finally, zero pages are only checked if a page has not yet been registered + using chunk registration (or not checked at all and unconditionally + written if chunk registration is disabled). This is accomplished using + the "Compress" command listed above. If the page *has* been registered + then we check the entire chunk for zero. Only if the entire chunk is + zero, then we send a compress command to zap the page on the other side. + +Versioning and Capabilities +=========================== +Current version of the protocol is version #1. + +The same version applies to both protocol traffic and capabilities +negotiation. (i.e. There is only one version number that is referred to +by all communication). + +librdmacm provides the user with a 'private data' area to be exchanged +at connection-setup time before any infiniband traffic is generated.
+ +Header: + * Version (protocol version validated before send/recv occurs), uint32, network byte order + * Flags (bitwise OR of each capability), uint32, network byte order + +There is no data portion of this header right now, so there is +no length field. The maximum size of the 'private data' section +is only 192 bytes per the Infiniband specification, so it's not +very useful for data anyway. This structure needs to remain small. + +This private data area is a convenient place to check for protocol +versioning because the user does not need to register memory to +transmit a few bytes of version information. + +This is also a convenient place to negotiate capabilities +(like dynamic page registration). + +If the version is invalid, we throw an error. + +If the version is new, we only negotiate the capabilities that the +requested version is able to perform and ignore the rest. + +Currently there is only *one* capability in Version #1: dynamic page registration + +Finally: Negotiation happens with the Flags field: If the primary-VM +sets a flag, but the destination does not support this capability, it +will return a zero-bit for that flag and the primary-VM will understand +that as not being an available capability and will thus disable that +capability on the primary-VM side. + +QEMUFileRDMA Interface: +======================= + +QEMUFileRDMA introduces a couple of new functions: + +1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops) +2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops) + +These two functions are very short and simply use the protocol +described above to deliver bytes without changing the upper-level +users of QEMUFile that depend on a bytestream abstraction. + +Finally, how do we hand off the actual bytes to get_buffer()? + +Again, because we're trying to "fake" a bytestream abstraction +using an analogy not unlike individual UDP frames, we have +to hold on to the bytes received from control-channel's SEND +messages in memory.
+ +Each time we receive a complete "QEMU File" control-channel +message, the bytes from SEND are copied into a small local holding area. + +Then, we return the number of bytes requested by get_buffer() +and leave the remaining bytes in the holding area until get_buffer() +comes around for another pass. + +If the buffer is empty, then we follow the same steps +listed above and issue another "QEMU File" protocol command, +asking for a new SEND message to re-fill the buffer. + +Migration of pc.ram: +==================== + +At the beginning of the migration (migration-rdma.c), +the sender and the receiver populate the list of RAMBlocks +to be registered with each other into a structure. +Then, using the aforementioned protocol, they exchange a +description of these blocks with each other, to be used later +during the iteration of main memory. This description includes +a list of all the RAMBlocks, their offsets and lengths, virtual +addresses and possibly includes pre-registered RDMA keys in case dynamic +page registration was disabled on the server-side, otherwise not. + +Main memory is not migrated with the aforementioned protocol, +but is instead migrated with normal RDMA Write operations. + +Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now). +Chunk size is not dynamic, but it could be in a future implementation. +There's nothing to indicate that this is useful right now. + +When a chunk is full (or a flush() occurs), the memory backed by +the chunk is registered with librdmacm and is pinned in memory on +both sides using the aforementioned protocol. +After pinning, an RDMA Write is generated and transmitted +for the entire chunk. + +Chunks are also transmitted in batches: This means that we +do not request that the hardware signal the completion queue +for the completion of *every* chunk. The current batch size +is about 64 chunks (corresponding to 64 MB of memory). +Only the last chunk in a batch must be signaled.
+This helps keep everything as asynchronous as possible +and helps keep the hardware busy performing RDMA operations. + +Error-handling: +=============== + +Infiniband has what is called a "Reliable, Connected" +link (one of 4 choices). This is the mode that +we use for RDMA migration. + +If a *single* message fails, +the decision is to abort the migration entirely and +clean up all the RDMA descriptors and unregister all +the memory. + +After cleanup, the Virtual Machine is returned to normal +operation the same way that would happen if the TCP +socket is broken during a non-RDMA based migration. + +TODO: +===== +1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be + renamed to 'rdma' after the experimental phase of this work has + completed upstream. +2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits + are not compatible with infiniband memory pinning and will result in + an aborted migration (but with the source VM left unaffected). +3. Use of the recent /proc/<pid>/pagemap would likely speed up + the use of KSM and ballooning while using RDMA. +4. Some form of balloon-device usage tracking would also + help alleviate some issues.
@@ -2630,3 +2630,12 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr) memory_region_is_romd(mr)); } #endif + +void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) +{ + RAMBlock *block; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + func(block->host, block->offset, block->length, opaque); + } +} @@ -169,6 +169,8 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) if (info->has_ram) { monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n", info->ram->transferred >> 10); + monitor_printf(mon, "throughput: %0.2f mbps\n", + info->ram->mbps); monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n", info->ram->remaining >> 10); monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n", diff --git a/include/block/coroutine.h b/include/block/coroutine.h index a978162..377805a 100644 --- a/include/block/coroutine.h +++ b/include/block/coroutine.h @@ -209,4 +209,10 @@ void qemu_co_rwlock_unlock(CoRwlock *lock); */ void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns); +/** + * Yield until a file descriptor becomes readable + * + * Note that this function clobbers the handlers for the file descriptor. 
+ */ +void coroutine_fn yield_until_fd_readable(int fd); #endif /* QEMU_COROUTINE_H */ diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index e061e21..92a4223 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -113,6 +113,11 @@ void cpu_physical_memory_write_rom(hwaddr addr, extern struct MemoryRegion io_mem_rom; extern struct MemoryRegion io_mem_notdirty; +typedef void (RAMBlockIterFunc)(void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque); + +void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque); + #endif #endif /* !CPU_COMMON_H */ diff --git a/include/migration/migration.h b/include/migration/migration.h index e2acec6..f0640e0 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -21,6 +21,7 @@ #include "qapi/error.h" #include "migration/vmstate.h" #include "qapi-types.h" +#include "exec/cpu-common.h" struct MigrationParams { bool blk; @@ -40,6 +41,7 @@ struct MigrationState int state; MigrationParams params; + double mbps; int64_t total_time; int64_t downtime; int64_t expected_downtime; @@ -92,6 +94,8 @@ uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_transferred(void); uint64_t ram_bytes_total(void); +void acct_update_position(QEMUFile *f, size_t size, bool zero); + extern SaveVMHandlers savevm_ram_handlers; uint64_t dup_mig_bytes_transferred(void); @@ -119,6 +123,8 @@ void migrate_add_blocker(Error *reason); */ void migrate_del_blocker(Error *reason); +bool migrate_rdma_pin_all(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); @@ -127,4 +133,23 @@ int migrate_use_xbzrle(void); int64_t migrate_xbzrle_cache_size(void); int64_t xbzrle_cache_resize(int64_t new_size); + +void ram_control_before_iterate(QEMUFile *f, uint64_t flags); +void ram_control_after_iterate(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, 
uint64_t flags); + +/* Whenever this is found in the data stream, the flags + * will be passed to ram_control_load_hook in the incoming-migration + * side. This lets before_ram_iterate/after_ram_iterate add + * transport-specific sections to the RAM migration data. + */ +#define RAM_SAVE_FLAG_HOOK 0x80 + +#define RAM_SAVE_CONTROL_NOT_SUPP -1000 +#define RAM_SAVE_CONTROL_DELAYED -2000 + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size, + int *bytes_sent); + #endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 7519464..0f757fb 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -23,6 +23,7 @@ */ #ifndef QEMU_FILE_H #define QEMU_FILE_H 1 +#include "exec/cpu-common.h" /* This function writes a chunk of data to a file at the given position. * The pos argument can be ignored if the file is only being used for @@ -57,12 +58,40 @@ typedef int (QEMUFileGetFD)(void *opaque); typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, int iovcnt, int64_t pos); +/* + * This function provides hooks around different + * stages of RAM migration. + */ +typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); + +/* + * Constants used by ram_control_* hooks + */ +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_HOOK 2 +#define RAM_CONTROL_FINISH 3 + +/* + * This function allows override of where the RAM page + * is saved (such as RDMA, for example.) 
+ */ +typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, + ram_addr_t block_offset, + ram_addr_t offset, + size_t size, + int *bytes_sent); + typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; QEMUFileGetBufferFunc *get_buffer; QEMUFileCloseFunc *close; QEMUFileGetFD *get_fd; QEMUFileWritevBufferFunc *writev_buffer; + QEMURamHookFunc *before_ram_iterate; + QEMURamHookFunc *after_ram_iterate; + QEMURamHookFunc *hook_ram_load; + QEMURamSaveFunc *save_page; } QEMUFileOps; QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops); @@ -80,6 +109,7 @@ void qemu_put_byte(QEMUFile *f, int v); * The buffer should be available till it is sent asynchronously. */ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size); +bool qemu_file_mode_is_not_valid(const char *mode); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { @@ -93,6 +123,7 @@ void qemu_put_be32(QEMUFile *f, unsigned int v); void qemu_put_be64(QEMUFile *f, uint64_t v); int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size); int qemu_get_byte(QEMUFile *f); +void qemu_update_position(QEMUFile *f, size_t size); static inline unsigned int qemu_get_ubyte(QEMUFile *f) { @@ -110,6 +141,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f); void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate); int64_t qemu_file_get_rate_limit(QEMUFile *f); int qemu_file_get_error(QEMUFile *f); +void qemu_fflush(QEMUFile *f); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/migration.c b/migration.c index 058f9e6..a704d48 100644 --- a/migration.c +++ b/migration.c @@ -66,6 +66,7 @@ MigrationState *migrate_get_current(void) .state = MIG_STATE_SETUP, .bandwidth_limit = MAX_THROTTLE, .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, + .mbps = -1, }; return ¤t_migration; @@ -201,6 +202,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->normal = norm_mig_pages_transferred(); info->ram->normal_bytes = norm_mig_bytes_transferred(); 
info->ram->dirty_pages_rate = s->dirty_pages_rate; + info->ram->mbps = s->mbps; if (blk_mig_active()) { info->has_disk = true; @@ -230,6 +232,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->skipped = skipped_mig_pages_transferred(); info->ram->normal = norm_mig_pages_transferred(); info->ram->normal_bytes = norm_mig_bytes_transferred(); + info->ram->mbps = s->mbps; break; case MIG_STATE_ERROR: info->has_status = true; @@ -473,6 +476,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) max_downtime = (uint64_t)value; } +bool migrate_rdma_pin_all(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -543,6 +555,9 @@ static void *migration_thread(void *opaque) double bandwidth = transferred_bytes / time_spent; max_size = bandwidth * migrate_max_downtime() / 1000000; + s->mbps = time_spent ? (((double) transferred_bytes * 8.0) / + ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1; + DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 "\n", transferred_bytes, time_spent, bandwidth, max_size); diff --git a/qapi-schema.json b/qapi-schema.json index 6cc07c2..a30a728 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -513,12 +513,15 @@ # @dirty-pages-rate: number of pages dirtied by second by the # guest (since 1.3) # +# @mbps: throughput in megabits/sec. 
(since 1.6) +# # Since: 0.14.0 ## { 'type': 'MigrationStats', 'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' , 'duplicate': 'int', 'skipped': 'int', 'normal': 'int', - 'normal-bytes': 'int', 'dirty-pages-rate' : 'int' } } + 'normal-bytes': 'int', 'dirty-pages-rate' : 'int', + 'mbps' : 'number' } } ## # @XBZRLECacheStats @@ -605,10 +608,15 @@ # This feature allows us to minimize migration traffic for certain work # loads, by sending compressed difference of the pages # +# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is +# mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage. +# Disabled by default. Experimental: may (or may not) be renamed after +# further testing is complete. (since 1.6) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle'] } + 'data': ['xbzrle', 'x-rdma-pin-all'] } ## # @MigrationCapabilityStatus diff --git a/qemu-coroutine-io.c b/qemu-coroutine-io.c index e8ad1a4..c4df35a 100644 --- a/qemu-coroutine-io.c +++ b/qemu-coroutine-io.c @@ -63,3 +63,26 @@ qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send) struct iovec iov = { .iov_base = buf, .iov_len = bytes }; return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send); } + +typedef struct { + Coroutine *co; + int fd; +} FDYieldUntilData; + +static void fd_coroutine_enter(void *opaque) +{ + FDYieldUntilData *data = opaque; + qemu_set_fd_handler(data->fd, NULL, NULL, NULL); + qemu_coroutine_enter(data->co, NULL); +} + +void coroutine_fn yield_until_fd_readable(int fd) +{ + FDYieldUntilData data; + + assert(qemu_in_coroutine()); + data.co = qemu_coroutine_self(); + data.fd = fd; + qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data); + qemu_coroutine_yield(); +} @@ -149,34 +149,6 @@ typedef struct QEMUFileSocket QEMUFile *file; } QEMUFileSocket; -typedef struct { - Coroutine *co; - int fd; -} FDYieldUntilData; - -static void fd_coroutine_enter(void *opaque) -{ - FDYieldUntilData *data = opaque; - 
qemu_set_fd_handler(data->fd, NULL, NULL, NULL); - qemu_coroutine_enter(data->co, NULL); -} - -/** - * Yield until a file descriptor becomes readable - * - * Note that this function clobbers the handlers for the file descriptor. - */ -static void coroutine_fn yield_until_fd_readable(int fd) -{ - FDYieldUntilData data; - - assert(qemu_in_coroutine()); - data.co = qemu_coroutine_self(); - data.fd = fd; - qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data); - qemu_coroutine_yield(); -} - static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, int64_t pos) { @@ -477,14 +449,23 @@ static const QEMUFileOps socket_write_ops = { .close = socket_close }; -QEMUFile *qemu_fopen_socket(int fd, const char *mode) +bool qemu_file_mode_is_not_valid(const char *mode) { - QEMUFileSocket *s; - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 'b' || mode[2] != 0) { fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); + return true; + } + + return false; +} + +QEMUFile *qemu_fopen_socket(int fd, const char *mode) +{ + QEMUFileSocket *s; + + if (qemu_file_mode_is_not_valid(mode)) { return NULL; } @@ -503,10 +484,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode) { QEMUFileStdio *s; - if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { - fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); + if (qemu_file_mode_is_not_valid(mode)) { return NULL; } @@ -611,7 +589,7 @@ static inline bool qemu_file_is_writable(QEMUFile *f) * If there is writev_buffer QEMUFileOps it uses it otherwise uses * put_buffer ops. 
*/ -static void qemu_fflush(QEMUFile *f) +void qemu_fflush(QEMUFile *f) { ssize_t ret = 0; @@ -638,6 +616,65 @@ static void qemu_fflush(QEMUFile *f) } } +void ram_control_before_iterate(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->before_ram_iterate) { + ret = f->ops->before_ram_iterate(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } +} + +void ram_control_after_iterate(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->after_ram_iterate) { + ret = f->ops->after_ram_iterate(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } +} + +void ram_control_load_hook(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->hook_ram_load) { + ret = f->ops->hook_ram_load(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } else { + qemu_file_set_error(f, ret); + } +} + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size, int *bytes_sent) +{ + if (f->ops->save_page) { + int ret = f->ops->save_page(f, f->opaque, block_offset, + offset, size, bytes_sent); + + if (ret != RAM_SAVE_CONTROL_DELAYED) { + if (*bytes_sent > 0) { + qemu_update_position(f, *bytes_sent); + } else if (ret < 0) { + qemu_file_set_error(f, ret); + } + } + + return ret; + } + + return RAM_SAVE_CONTROL_NOT_SUPP; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -671,6 +708,11 @@ int qemu_get_fd(QEMUFile *f) return -1; } +void qemu_update_position(QEMUFile *f, size_t size) +{ + f->pos += size; +} + /** Closes the file * * Returns negative error value if any error happened on previous operations or |