diff options
-rw-r--r-- | README.md | 22 | ||||
-rw-r--r-- | lib/migration.c | 6 | ||||
-rw-r--r-- | samples/client.c | 203 | ||||
-rw-r--r-- | samples/server.c | 157 |
4 files changed, 270 insertions, 118 deletions
@@ -124,18 +124,28 @@ client/server model where basic tasks are performed. The server implements a device that can be programmed to trigger interrupts (INTx) to the client. This is done by writing the desired time in seconds since -Epoch. The server then trigger an eventfd-based IRQ and then a message-based +Epoch to BAR0. The server then triggers an eventfd-based IRQ and then a message-based one (in order to demonstrate how it's done when passing of file descriptors -isn't possible/desirable). +isn't possible/desirable). The device also works as memory storage: BAR1 can +be freely written to/read from by the host. + +Since this is a completely made up device, there's no kernel driver (yet). +[Client](./samples/client.c) implements a client that knows how to drive this +particular device (that would normally be QEMU + guest VM + kernel driver). The client excercises all commands in the vfio-user protocol, and then proceeds to perform live migration. The client spawns the destination server (this would -be normally done by `libvirt`) and then migrates the device state, before +be normally done by libvirt) and then migrates the device state, before switching entirely to the destination server. We re-use the source client instead of spawning a destination one as this is something libvirt/QEMU would -normally do. To spice things up, the client programmes the source server to -trigger an interrupt and then quickly migrates to the destination server; the -programmed interrupt is delivered by the destination server. +normally do. + +To spice things up, the client programs the source server to trigger an +interrupt and then migrates to the destination server; the programmed interrupt +is delivered by the destination server. Also, while the device is being live +migrated, the client spawns a thread that constantly writes to BAR1 in a tight +loop. 
This thread emulates the guest VM accessing the device while the main +thread (what would normally be QEMU) is driving the migration. Start the source server as follows (pick whatever you like for `/tmp/vfio-user.sock`): diff --git a/lib/migration.c b/lib/migration.c index 31a8d9d..c4eb6f6 100644 --- a/lib/migration.c +++ b/lib/migration.c @@ -450,6 +450,12 @@ migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count, assert(migr != NULL); assert(buf != NULL); + /* + * FIXME don't call the device callback if the migration state is not in + * pre-copy/stop-and-copy/resuming state, since the behavior is undefined + * in that case. + */ + if (pos + count <= sizeof(struct vfio_device_migration_info)) { ret = migration_region_access_registers(vfu_ctx, buf, count, pos, is_write); diff --git a/samples/client.c b/samples/client.c index 18558b8..e7228ac 100644 --- a/samples/client.c +++ b/samples/client.c @@ -717,20 +717,29 @@ usage(char *path) { basename(path)); } -static void -migrate_from(int sock, int migr_reg_index, void **data, __u64 *len) +/* + * Normally each time the source client (QEMU) would read migration data from + * the device it would send them to the destination client. However, since in + * our sample both the source and the destination client are the same process, + * we simply accumulate the migration data of each iteration and apply it to + * the destination server at the end. + * + * Performs as many migration loops as @nr_iters or until the device has no + * more migration data (pending_bytes is zero), whichever comes first. The + * result of each migration iteration is stored in @migr_iter. @migr_iter must + * have room for at least @nr_iters entries. 
+ * + * @returns the number of iterations performed + */ +static size_t +do_migrate(int sock, int migr_reg_index, size_t nr_iters, + struct iovec *migr_iter) { - __u32 device_state = VFIO_DEVICE_STATE_SAVING; + int ret; __u64 pending_bytes, data_offset, data_size; + size_t i = 0; - /* XXX set device state to stop-and-copy */ - int ret = access_region(sock, migr_reg_index, true, - offsetof(struct vfio_device_migration_info, device_state), - &device_state, sizeof(device_state)); - if (ret < 0) { - errx(EXIT_FAILURE, "failed to write to device state: %s", - strerror(-ret)); - } + assert(nr_iters > 0); /* XXX read pending_bytes */ ret = access_region(sock, migr_reg_index, false, @@ -744,19 +753,7 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len) /* We do expect some migration data. */ assert(pending_bytes > 0); - /* - * The only expectation about pending_bytes is whether it's zero or - * non-zero, therefore it must be considered volatile, even acrosss - * iterantions. In the sample server we know it's static so it's fairly - * straightforward. 
- */ - *len = pending_bytes; - *data = malloc(*len); - if (*data == NULL) { - err(EXIT_FAILURE, "failed to allocate migration buffer"); - } - - while (pending_bytes > 0) { + for (i = 0; i < nr_iters && pending_bytes > 0; i++) { /* XXX read data_offset and data_size */ ret = access_region(sock, migr_reg_index, false, @@ -775,11 +772,17 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len) strerror(-ret)); } - assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= *len); + assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= pending_bytes); + + migr_iter[i].iov_len = data_size; + migr_iter[i].iov_base = malloc(data_size); + if (migr_iter[i].iov_base == NULL) { + err(EXIT_FAILURE, "failed to allocate migration buffer"); + } /* XXX read migration data */ ret = access_region(sock, migr_reg_index, false, data_offset, - (char*)*data + data_offset - sizeof(struct vfio_device_migration_info), + (char*)migr_iter[i].iov_base + data_offset - sizeof(struct vfio_device_migration_info), data_size); if (ret < 0) { errx(EXIT_FAILURE, "failed to read migration data: %s", @@ -800,9 +803,43 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len) strerror(-ret)); } } + return i; +} - /* XXX read device state, migration must have finished now */ - device_state = VFIO_DEVICE_STATE_STOP; +static size_t +migrate_from(int sock, int migr_reg_index, size_t *nr_iters, struct iovec **migr_iters) +{ + __u32 device_state; + int ret; + size_t _nr_iters; + + /* + * FIXME to fully demonstrate live migration we'll need a way to change + * device state while the client is running the migration iteration. One + * way to do that is to have the client randomly modify a big-ish device + * region while running the live migration iterations, and at some point + * stopping to do the stop-and-copy phase. 
It can store in a buffer the + * modifications it makes and then after the device has been migrated it + * should compare the buffer with the migrated device region. + */ + + /* + * TODO The server generates migration data while it's in pre-copy state. + * + * FIXME the server generates 4 rounds of migration data while in pre-copy + * state and 1 while in stop-and-copy state. Don't assume this. + */ + *nr_iters = 5; + *migr_iters = malloc(sizeof(struct iovec) * *nr_iters); + if (*migr_iters == NULL) { + err(EXIT_FAILURE, NULL); + } + + /* + * XXX set device state to pre-copy. This is technically optional but any + * VMM that cares about performance needs this. + */ + device_state = VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING; ret = access_region(sock, migr_reg_index, true, offsetof(struct vfio_device_migration_info, device_state), &device_state, sizeof(device_state)); @@ -810,18 +847,57 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len) errx(EXIT_FAILURE, "failed to write to device state: %s", strerror(-ret)); } + + _nr_iters = do_migrate(sock, migr_reg_index, 4, *migr_iters); + + if (_nr_iters != 4) { + errx(EXIT_FAILURE, + "expected 4 iterations instead of %ld while in pre-copy state\n", + _nr_iters); + } + + printf("setting device state to stop-and-copy\n"); + + device_state = VFIO_DEVICE_STATE_SAVING; + ret = access_region(sock, migr_reg_index, true, + offsetof(struct vfio_device_migration_info, device_state), + &device_state, sizeof(device_state)); + if (ret < 0) { + errx(EXIT_FAILURE, "failed to write to device state: %s", + strerror(-ret)); + } + + _nr_iters += do_migrate(sock, migr_reg_index, 1, (*migr_iters) + _nr_iters); + if (_nr_iters != 5) { + errx(EXIT_FAILURE, + "expected 5 iterations instead of %ld while in stop-and-copy state\n", + _nr_iters); + } + + /* XXX read device state, migration must have finished now */ + device_state = VFIO_DEVICE_STATE_STOP; + ret = access_region(sock, migr_reg_index, true, + offsetof(struct 
vfio_device_migration_info, device_state), + &device_state, sizeof(device_state)); + if (ret < 0) { + errx(EXIT_FAILURE, "failed to write to device state: %s", + strerror(-ret)); + } + + return _nr_iters; } static int migrate_to(char *old_sock_path, int *server_max_fds, - size_t *pgsize, void *migr_data, __u64 migr_data_len, + size_t *pgsize, size_t nr_iters, struct iovec *migr_iters, char *path_to_server, int migr_reg_index) { int ret, sock; char *sock_path; struct stat sb; __u32 device_state = VFIO_DEVICE_STATE_RESUMING; - __u64 data_offset; + __u64 data_offset, data_len; + size_t i; assert(old_sock_path != NULL); @@ -874,33 +950,42 @@ migrate_to(char *old_sock_path, int *server_max_fds, strerror(-ret)); } - /* XXX read data offset */ - ret = access_region(sock, migr_reg_index, false, - offsetof(struct vfio_device_migration_info, data_offset), - &data_offset, sizeof(data_offset)); - if (ret < 0) { - errx(EXIT_FAILURE, "failed to read data offset: %s", strerror(-ret)); - } + for (i = 0; i < nr_iters; i++) { - /* XXX write migration data */ + /* XXX read data offset */ + ret = access_region(sock, migr_reg_index, false, + offsetof(struct vfio_device_migration_info, data_offset), + &data_offset, sizeof(data_offset)); + if (ret < 0) { + errx(EXIT_FAILURE, "failed to read migration data offset: %s", + strerror(-ret)); + } - /* - * TODO write half of migration data via regular write and other half via - * memopy map. 
- */ - ret = access_region(sock, migr_reg_index, true, - data_offset, migr_data, migr_data_len); - if (ret < 0) { - errx(EXIT_FAILURE, "failed to write migration data: %s", - strerror(-ret)); - } + /* XXX write migration data */ - /* XXX write data_size */ - ret = access_region(sock, migr_reg_index, true, - offsetof(struct vfio_device_migration_info, data_size), - &migr_data_len, sizeof migr_data_len); - if (ret < 0) { - errx(EXIT_FAILURE, "failed to write data size: %s", strerror(-ret)); + /* + * TODO write half of migration data via regular write and other half via + * memopy map. + */ + printf("writing migration device data %#llx-%#llx\n", data_offset, + data_offset + migr_iters[i].iov_len - 1); + ret = access_region(sock, migr_reg_index, true, + data_offset, migr_iters[i].iov_base, + migr_iters[i].iov_len); + if (ret < 0) { + errx(EXIT_FAILURE, "failed to write device migration data: %s", + strerror(-ret)); + } + + /* XXX write data_size */ + data_len = migr_iters[i].iov_len; + ret = access_region(sock, migr_reg_index, true, + offsetof(struct vfio_device_migration_info, data_size), + &data_len, sizeof data_len); + if (ret < 0) { + errx(EXIT_FAILURE, "failed to write migration data size: %s", + strerror(-ret)); + } } /* XXX set device state to running */ @@ -953,11 +1038,11 @@ int main(int argc, char *argv[]) struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0}; int opt; time_t t; - void *migr_data; - __u64 migr_data_len; char *path_to_server = NULL; vfu_pci_hdr_t config_space; int migr_reg_index; + struct iovec *migr_iters; + size_t nr_iters; while ((opt = getopt(argc, argv, "h")) != -1) { switch (opt) { @@ -1117,13 +1202,13 @@ int main(int argc, char *argv[]) /* * By sleeping here for 1s after migration finishes on the source server - * (but not yet started on the destination server), the timer should be be + * (but not yet started on the destination server), the timer should be * armed on the destination server for 2-1=1 seconds. 
If we don't sleep * then it will be armed for 2 seconds, which isn't as interesting. */ sleep(1); - migrate_from(sock, migr_reg_index, &migr_data, &migr_data_len); + migrate_from(sock, migr_reg_index, &nr_iters, &migr_iters); /* * Normally the client would now send the device state to the destination @@ -1136,7 +1221,7 @@ int main(int argc, char *argv[]) } sock = migrate_to(argv[optind], &server_max_fds, &pgsize, - migr_data, migr_data_len, path_to_server, migr_reg_index); + nr_iters, migr_iters, path_to_server, migr_reg_index); /* * Now we must reconfigure the destination server. diff --git a/samples/server.c b/samples/server.c index 4c22922..f7a1cd1 100644 --- a/samples/server.c +++ b/samples/server.c @@ -58,7 +58,8 @@ struct dma_regions { struct server_data { time_t bar0; - uint8_t *bar1; + void *bar1; + size_t bar1_size; struct dma_regions regions[NR_DMA_REGIONS]; struct { __u64 pending_bytes; @@ -118,15 +119,26 @@ bar0_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset, } ssize_t -bar1_access(vfu_ctx_t *vfu_ctx UNUSED, UNUSED char * const buf, - UNUSED size_t count, UNUSED loff_t offset, - UNUSED const bool is_write) +bar1_access(vfu_ctx_t *vfu_ctx, char * const buf, + size_t count, loff_t offset, + const bool is_write) { - assert(false); + struct server_data *server_data = vfu_get_private(vfu_ctx); + + if (offset + count > server_data->bar1_size) { + vfu_log(vfu_ctx, LOG_ERR, "bad BAR1 access %#lx-%#lx", + offset, offset + count - 1); + errno = EINVAL; + return -1; + } - /* FIXME assert that only the 2nd page is accessed */ + if (is_write) { + memcpy(server_data->bar1 + offset, buf, count); + } else { + memcpy(buf, server_data->bar1, count); + } - return -ENOTSUP; + return count; } bool irq_triggered = false; @@ -253,8 +265,12 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) switch (state) { case VFU_MIGR_STATE_STOP_AND_COPY: + server_data->migration.pending_bytes = sizeof(time_t); /* FIXME BAR0 region 
size */ + server_data->migration.data_size = 0; + break; + case VFU_MIGR_STATE_PRE_COPY: /* TODO must be less than size of data region in migration region */ - server_data->migration.pending_bytes = sysconf(_SC_PAGESIZE); + server_data->migration.pending_bytes = server_data->bar1_size; break; case VFU_MIGR_STATE_STOP: assert(server_data->migration.pending_bytes == 0); @@ -291,6 +307,10 @@ migration_prepare_data(vfu_ctx_t *vfu_ctx, __u64 *offset, __u64 *size) struct server_data *server_data = vfu_get_private(vfu_ctx); *offset = 0; + /* + * Don't provide all migration data in one go in order to make it a bit + * more interesting. + */ *size = server_data->migration.data_size = MIN(server_data->migration.pending_bytes, server_data->migration.migr_data_len / 4); return 0; } @@ -299,6 +319,8 @@ static size_t migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset) { struct server_data *server_data = vfu_get_private(vfu_ctx); + uint8_t *p; + size_t bar_size; if (server_data->migration.data_size < size) { vfu_log(vfu_ctx, LOG_ERR, "invalid migration data read %#llx-%#llx", @@ -306,13 +328,38 @@ migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset) return -EINVAL; } - /* FIXME implement, client should be able to write any byte range */ - assert((offset == 0 && size >= sizeof server_data->bar0) - || offset >= sizeof server_data->bar0); + /* + * If in pre-copy state we copy BAR1, if in stop-and-copy state we copy + * BAR0. This behavior is purely an artifact of this server implementation + * simply to make it as simple as possible. Note that the client might go + * from state running to stop-and-copy, completely skipping the pre-copy + * state. This is legitimate but we don't support it for now. + * + * FIXME implement transitioning from the running state straight to the + * stop-and-copy state. 
+ */ - if (offset == 0 && size >= sizeof server_data->bar0) { - memcpy(buf, &server_data->bar0, sizeof server_data->bar0); + if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) { + p = server_data->bar1; + bar_size = server_data->bar1_size; + } else if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) { + p = (uint8_t*)&server_data->bar0; + bar_size = sizeof server_data->bar0; + } else { + /* + * Reading from the migration region in any other state is undefined + * (I think). + */ + return 0; } + if (offset > bar_size) { + errno = EINVAL; + return -1; + } + if (offset + size > bar_size) { + size = bar_size - offset; + } + memcpy(buf, p + offset, size); return size; } @@ -324,41 +371,44 @@ migration_write_data(vfu_ctx_t *vfu_ctx, void *data, __u64 size, __u64 offset) assert(server_data != NULL); assert(data != NULL); - if (offset + size > server_data->migration.migr_data_len) { - vfu_log(vfu_ctx, LOG_ERR, "invalid write %#llx-%#llx", - offset, offset + size - 1); - } + /* + * During pre-copy state we save BAR1 and during stop-and-copy state we + * save BAR0. + */ + vfu_log(vfu_ctx, LOG_DEBUG, + "apply device migration data to BAR%d %#llx-%#llx", + offset < server_data->bar1_size ? 
1 : 0, + offset, offset + size - 1); + + if (offset < server_data->bar1_size) { + assert(offset + size <= server_data->bar1_size); /* FIXME */ + memcpy(server_data->bar1 + offset, data, size); + } else { + int ret; - memcpy(server_data->migration.migr_data + offset, data, size); + /* FIXME should be able to write any valid subrange */ + assert(offset - server_data->bar1_size == 0); + assert(size == sizeof server_data->bar0); + + ret = bar0_access(vfu_ctx, data, sizeof server_data->bar0, 0, true); + + assert(ret == (int)size); /* FIXME */ + } return 0; } static int -migration_data_written(vfu_ctx_t *vfu_ctx, __u64 count, __u64 offset) +migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED __u64 count, + UNUSED __u64 offset) { - struct server_data *server_data = vfu_get_private(vfu_ctx); - int ret; - - assert(server_data != NULL); - - if (offset + count > server_data->migration.migr_data_len) { - vfu_log(vfu_ctx, LOG_ERR, "bad migration data range %#llx-%#llx", - offset, offset + count - 1); - return -EINVAL; - } - - if (offset == 0 && count >= sizeof server_data->bar0) { - - /* apply device state */ - /* FIXME must arm timer only after device is resumed!!! */ - ret = bar0_access(vfu_ctx, server_data->migration.migr_data, - sizeof server_data->bar0, 0, true); - if (ret < 0) { - return ret; - } - } + /* + * We apply migration state directly in the migration_write_data callback, + * we don't need to do anything here. We would have to apply migration + * state in this callback if the migration region was memory mappable, in + * which case we wouldn't know when the client wrote migration data. 
+ */ return 0; } @@ -369,10 +419,10 @@ int main(int argc, char *argv[]) bool verbose = false; char opt; struct sigaction act = {.sa_handler = _sa_handler}; + size_t bar1_size = 0x3000; struct server_data server_data = { .migration = { - /* one page so that we can memory map it */ - .migr_data_len = sysconf(_SC_PAGESIZE), + .migr_data_len = bar1_size + sizeof(time_t), .state = VFU_MIGR_STATE_RUNNING } }; @@ -393,12 +443,6 @@ int main(int argc, char *argv[]) errx(EXIT_FAILURE, "missing vfio-user socket path"); } - /* coverity[NEGATIVE_RETURNS] */ - server_data.bar1 = malloc(sysconf(_SC_PAGESIZE)); - if (server_data.bar1 == NULL) { - err(EXIT_FAILURE, "BAR1"); - } - sigemptyset(&act.sa_mask); if (sigaction(SIGALRM, &act, NULL) == -1) { err(EXIT_FAILURE, "failed to register signal handler"); @@ -431,21 +475,29 @@ int main(int argc, char *argv[]) /* * Setup BAR1 to be 3 pages in size where only the first and the last pages - * are mappable. + * are mappable. The client can still mmap the 2nd page, we can't prohibit + * this under Linux. If we really want to prohibit it we have to use + * separate files for the same region. 
*/ if ((fp = tmpfile()) == NULL) { err(EXIT_FAILURE, "failed to create BAR1 file"); } - if (ftruncate(fileno(fp), 0x3000) == -1) { + server_data.bar1_size = bar1_size; + if (ftruncate(fileno(fp), server_data.bar1_size) == -1) { err(EXIT_FAILURE, "failed to truncate BAR1 file"); } + server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(fp), 0); + if (server_data.bar1 == MAP_FAILED) { + err(EXIT_FAILURE, "failed to mmap BAR1"); + } struct iovec mmap_areas[] = { { .iov_base = (void*)0, .iov_len = 0x1000 }, { .iov_base = (void*)0x2000, .iov_len = 0x1000 } }; ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR1_REGION_IDX, - 0x3000, &bar1_access, VFU_REGION_FLAG_RW, - mmap_areas, 2, fileno(fp)); + server_data.bar1_size, &bar1_access, + VFU_REGION_FLAG_RW, mmap_areas, 2, fileno(fp)); if (ret < 0) { err(EXIT_FAILURE, "failed to setup BAR1 region"); } @@ -525,7 +577,6 @@ int main(int argc, char *argv[]) vfu_destroy_ctx(vfu_ctx); free(server_data.migration.migr_data); - free(server_data.bar1); return EXIT_SUCCESS; } |