author    Thanos Makatos <thanos.makatos@nutanix.com>    2021-01-25 17:39:35 +0000
committer GitHub <noreply@github.com>    2021-01-25 17:39:35 +0000
commit    f2fe9d8e4fc4c5eb80dc61cc243a18087cc12ca3 (patch)
tree      06c6a12d428fbfadbcccf0dea780c393f076f6ca /samples
parent    915a5b01f34a61cacef5ae5b50a215d13628b8b0 (diff)
add pre-copy phase in live migration example (#247)
This patch adds a simplistic pre-copy phase to the live migration sample. The end goal is to have a separate thread in the client modify device state while the device is in the pre-copy phase; this will be done in later patches. BAR1 is saved during the pre-copy phase and BAR0 during the stop-and-copy phase, purely for convenience. There are quite a few assumptions and FIXMEs, even in the client code, which we plan to address in future patches.

Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
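For orientation, the source side of the sample now steps the device through the migration device states roughly as sketched below. This is a condensed view of what migrate_from() in samples/client.c does, not a complete program: access_region() is the sample's own helper for accessing a device region over the vfio-user socket, error handling is omitted, and the iteration counts (4 pre-copy rounds, 1 stop-and-copy round) are assumptions baked into this sample rather than part of the protocol.

    __u32 state;

    /* Pre-copy: the device keeps running while producing migration data. */
    state = VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING;
    access_region(sock, migr_reg_index, true,
                  offsetof(struct vfio_device_migration_info, device_state),
                  &state, sizeof(state));
    /* ... per iteration: read pending_bytes, data_offset, data_size, then the data ... */

    /* Stop-and-copy: the device is stopped and its remaining state is drained. */
    state = VFIO_DEVICE_STATE_SAVING;
    access_region(sock, migr_reg_index, true,
                  offsetof(struct vfio_device_migration_info, device_state),
                  &state, sizeof(state));
    /* ... one final iteration in this sample ... */

    /* Source side done: stop the device. */
    state = VFIO_DEVICE_STATE_STOP;
    access_region(sock, migr_reg_index, true,
                  offsetof(struct vfio_device_migration_info, device_state),
                  &state, sizeof(state));

On the destination, migrate_to() mirrors this: it writes VFIO_DEVICE_STATE_RESUMING before replaying the accumulated migration data and VFIO_DEVICE_STATE_RUNNING once the data has been applied.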
Diffstat (limited to 'samples')
-rw-r--r--  samples/client.c  203
-rw-r--r--  samples/server.c  157
2 files changed, 248 insertions, 112 deletions
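For reference while reading the diff, the offsetof() calls below address the header of the VFIO migration region (struct vfio_device_migration_info), which at the time of this commit was defined in linux/vfio.h roughly as follows; the layout is shown for orientation only, and the header actually shipped with your kernel is authoritative.

    struct vfio_device_migration_info {
        __u32 device_state;   /* VFIO_DEVICE_STATE_* flags */
        __u32 reserved;
        __u64 pending_bytes;  /* migration data the device still has to offer */
        __u64 data_offset;    /* offset of the current data window in the region */
        __u64 data_size;      /* size of the current data window */
    };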
diff --git a/samples/client.c b/samples/client.c
index 18558b8..e7228ac 100644
--- a/samples/client.c
+++ b/samples/client.c
@@ -717,20 +717,29 @@ usage(char *path) {
basename(path));
}
-static void
-migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
+/*
+ * Normally, each time the source client (QEMU) read migration data from the
+ * device it would send that data to the destination client. However, since in
+ * our sample both the source and the destination client are the same process,
+ * we simply accumulate the migration data of each iteration and apply it to
+ * the destination server at the end.
+ *
+ * Performs as many migration loops as @nr_iters, or until the device has no
+ * more migration data (pending_bytes is zero), whichever comes first. The
+ * result of each migration iteration is stored in @migr_iter; @migr_iter must
+ * have room for at least @nr_iters entries.
+ *
+ * @returns the number of iterations performed
+ */
+static size_t
+do_migrate(int sock, int migr_reg_index, size_t nr_iters,
+ struct iovec *migr_iter)
{
- __u32 device_state = VFIO_DEVICE_STATE_SAVING;
+ int ret;
__u64 pending_bytes, data_offset, data_size;
+ size_t i = 0;
- /* XXX set device state to stop-and-copy */
- int ret = access_region(sock, migr_reg_index, true,
- offsetof(struct vfio_device_migration_info, device_state),
- &device_state, sizeof(device_state));
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write to device state: %s",
- strerror(-ret));
- }
+ assert(nr_iters > 0);
/* XXX read pending_bytes */
ret = access_region(sock, migr_reg_index, false,
@@ -744,19 +753,7 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
/* We do expect some migration data. */
assert(pending_bytes > 0);
- /*
- * The only expectation about pending_bytes is whether it's zero or
- * non-zero, therefore it must be considered volatile, even acrosss
- * iterantions. In the sample server we know it's static so it's fairly
- * straightforward.
- */
- *len = pending_bytes;
- *data = malloc(*len);
- if (*data == NULL) {
- err(EXIT_FAILURE, "failed to allocate migration buffer");
- }
-
- while (pending_bytes > 0) {
+ for (i = 0; i < nr_iters && pending_bytes > 0; i++) {
/* XXX read data_offset and data_size */
ret = access_region(sock, migr_reg_index, false,
@@ -775,11 +772,17 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
strerror(-ret));
}
- assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= *len);
+ assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= pending_bytes);
+
+ migr_iter[i].iov_len = data_size;
+ migr_iter[i].iov_base = malloc(data_size);
+ if (migr_iter[i].iov_base == NULL) {
+ err(EXIT_FAILURE, "failed to allocate migration buffer");
+ }
/* XXX read migration data */
ret = access_region(sock, migr_reg_index, false, data_offset,
- (char*)*data + data_offset - sizeof(struct vfio_device_migration_info),
+ (char*)migr_iter[i].iov_base + data_offset - sizeof(struct vfio_device_migration_info),
data_size);
if (ret < 0) {
errx(EXIT_FAILURE, "failed to read migration data: %s",
@@ -800,9 +803,43 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
strerror(-ret));
}
}
+ return i;
+}
- /* XXX read device state, migration must have finished now */
- device_state = VFIO_DEVICE_STATE_STOP;
+static size_t
+migrate_from(int sock, int migr_reg_index, size_t *nr_iters, struct iovec **migr_iters)
+{
+ __u32 device_state;
+ int ret;
+ size_t _nr_iters;
+
+ /*
+ * FIXME to fully demonstrate live migration we'll need a way to change
+ * device state while the client is running the migration iteration. One
+ * way to do that is to have the client randomly modify a big-ish device
+ * region while running the live migration iterations, and at some point
+ * stop to do the stop-and-copy phase. It can store in a buffer the
+ * modifications it makes and then after the device has been migrated it
+ * should compare the buffer with the migrated device region.
+ */
+
+ /*
+ * TODO The server generates migration data while it's in pre-copy state.
+ *
+ * FIXME the server generates 4 rounds of migration data while in pre-copy
+ * state and 1 while in stop-and-copy state. Don't assume this.
+ */
+ *nr_iters = 5;
+ *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
+ if (*migr_iters == NULL) {
+ err(EXIT_FAILURE, NULL);
+ }
+
+ /*
+ * XXX set device state to pre-copy. This is technically optional but any
+ * VMM that cares about performance needs this.
+ */
+ device_state = VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING;
ret = access_region(sock, migr_reg_index, true,
offsetof(struct vfio_device_migration_info, device_state),
&device_state, sizeof(device_state));
@@ -810,18 +847,57 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
errx(EXIT_FAILURE, "failed to write to device state: %s",
strerror(-ret));
}
+
+ _nr_iters = do_migrate(sock, migr_reg_index, 4, *migr_iters);
+
+ if (_nr_iters != 4) {
+ errx(EXIT_FAILURE,
+ "expected 4 iterations instead of %ld while in pre-copy state\n",
+ _nr_iters);
+ }
+
+ printf("setting device state to stop-and-copy\n");
+
+ device_state = VFIO_DEVICE_STATE_SAVING;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write to device state: %s",
+ strerror(-ret));
+ }
+
+ _nr_iters += do_migrate(sock, migr_reg_index, 1, (*migr_iters) + _nr_iters);
+ if (_nr_iters != 5) {
+ errx(EXIT_FAILURE,
+ "expected 5 iterations instead of %ld while in stop-and-copy state\n",
+ _nr_iters);
+ }
+
+ /* XXX read device state, migration must have finished now */
+ device_state = VFIO_DEVICE_STATE_STOP;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write to device state: %s",
+ strerror(-ret));
+ }
+
+ return _nr_iters;
}
static int
migrate_to(char *old_sock_path, int *server_max_fds,
- size_t *pgsize, void *migr_data, __u64 migr_data_len,
+ size_t *pgsize, size_t nr_iters, struct iovec *migr_iters,
char *path_to_server, int migr_reg_index)
{
int ret, sock;
char *sock_path;
struct stat sb;
__u32 device_state = VFIO_DEVICE_STATE_RESUMING;
- __u64 data_offset;
+ __u64 data_offset, data_len;
+ size_t i;
assert(old_sock_path != NULL);
@@ -874,33 +950,42 @@ migrate_to(char *old_sock_path, int *server_max_fds,
strerror(-ret));
}
- /* XXX read data offset */
- ret = access_region(sock, migr_reg_index, false,
- offsetof(struct vfio_device_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to read data offset: %s", strerror(-ret));
- }
+ for (i = 0; i < nr_iters; i++) {
- /* XXX write migration data */
+ /* XXX read data offset */
+ ret = access_region(sock, migr_reg_index, false,
+ offsetof(struct vfio_device_migration_info, data_offset),
+ &data_offset, sizeof(data_offset));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to read migration data offset: %s",
+ strerror(-ret));
+ }
- /*
- * TODO write half of migration data via regular write and other half via
- * memopy map.
- */
- ret = access_region(sock, migr_reg_index, true,
- data_offset, migr_data, migr_data_len);
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write migration data: %s",
- strerror(-ret));
- }
+ /* XXX write migration data */
- /* XXX write data_size */
- ret = access_region(sock, migr_reg_index, true,
- offsetof(struct vfio_device_migration_info, data_size),
- &migr_data_len, sizeof migr_data_len);
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write data size: %s", strerror(-ret));
+ /*
+ * TODO write half of the migration data via a regular write and the other
+ * half via a memory map.
+ */
+ printf("writing migration device data %#llx-%#llx\n", data_offset,
+ data_offset + migr_iters[i].iov_len - 1);
+ ret = access_region(sock, migr_reg_index, true,
+ data_offset, migr_iters[i].iov_base,
+ migr_iters[i].iov_len);
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write device migration data: %s",
+ strerror(-ret));
+ }
+
+ /* XXX write data_size */
+ data_len = migr_iters[i].iov_len;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, data_size),
+ &data_len, sizeof data_len);
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write migration data size: %s",
+ strerror(-ret));
+ }
}
/* XXX set device state to running */
@@ -953,11 +1038,11 @@ int main(int argc, char *argv[])
struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
int opt;
time_t t;
- void *migr_data;
- __u64 migr_data_len;
char *path_to_server = NULL;
vfu_pci_hdr_t config_space;
int migr_reg_index;
+ struct iovec *migr_iters;
+ size_t nr_iters;
while ((opt = getopt(argc, argv, "h")) != -1) {
switch (opt) {
@@ -1117,13 +1202,13 @@ int main(int argc, char *argv[])
/*
* By sleeping here for 1s after migration finishes on the source server
- * (but not yet started on the destination server), the timer should be be
+ * (but not yet started on the destination server), the timer should be
* armed on the destination server for 2-1=1 seconds. If we don't sleep
* then it will be armed for 2 seconds, which isn't as interesting.
*/
sleep(1);
- migrate_from(sock, migr_reg_index, &migr_data, &migr_data_len);
+ migrate_from(sock, migr_reg_index, &nr_iters, &migr_iters);
/*
* Normally the client would now send the device state to the destination
@@ -1136,7 +1221,7 @@ int main(int argc, char *argv[])
}
sock = migrate_to(argv[optind], &server_max_fds, &pgsize,
- migr_data, migr_data_len, path_to_server, migr_reg_index);
+ nr_iters, migr_iters, path_to_server, migr_reg_index);
/*
* Now we must reconfigure the destination server.
diff --git a/samples/server.c b/samples/server.c
index 4c22922..f7a1cd1 100644
--- a/samples/server.c
+++ b/samples/server.c
@@ -58,7 +58,8 @@ struct dma_regions {
struct server_data {
time_t bar0;
- uint8_t *bar1;
+ void *bar1;
+ size_t bar1_size;
struct dma_regions regions[NR_DMA_REGIONS];
struct {
__u64 pending_bytes;
@@ -118,15 +119,26 @@ bar0_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset,
}
ssize_t
-bar1_access(vfu_ctx_t *vfu_ctx UNUSED, UNUSED char * const buf,
- UNUSED size_t count, UNUSED loff_t offset,
- UNUSED const bool is_write)
+bar1_access(vfu_ctx_t *vfu_ctx, char * const buf,
+ size_t count, loff_t offset,
+ const bool is_write)
{
- assert(false);
+ struct server_data *server_data = vfu_get_private(vfu_ctx);
+
+ if (offset + count > server_data->bar1_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad BAR1 access %#lx-%#lx",
+ offset, offset + count - 1);
+ errno = EINVAL;
+ return -1;
+ }
- /* FIXME assert that only the 2nd page is accessed */
+ if (is_write) {
+ memcpy(server_data->bar1 + offset, buf, count);
+ } else {
+ memcpy(buf, server_data->bar1, count);
+ }
- return -ENOTSUP;
+ return count;
}
bool irq_triggered = false;
@@ -253,8 +265,12 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
switch (state) {
case VFU_MIGR_STATE_STOP_AND_COPY:
+ server_data->migration.pending_bytes = sizeof(time_t); /* FIXME BAR0 region size */
+ server_data->migration.data_size = 0;
+ break;
+ case VFU_MIGR_STATE_PRE_COPY:
/* TODO must be less than size of data region in migration region */
- server_data->migration.pending_bytes = sysconf(_SC_PAGESIZE);
+ server_data->migration.pending_bytes = server_data->bar1_size;
break;
case VFU_MIGR_STATE_STOP:
assert(server_data->migration.pending_bytes == 0);
@@ -291,6 +307,10 @@ migration_prepare_data(vfu_ctx_t *vfu_ctx, __u64 *offset, __u64 *size)
struct server_data *server_data = vfu_get_private(vfu_ctx);
*offset = 0;
+ /*
+ * Don't provide all migration data in one go in order to make it a bit
+ * more interesting.
+ */
*size = server_data->migration.data_size = MIN(server_data->migration.pending_bytes, server_data->migration.migr_data_len / 4);
return 0;
}
@@ -299,6 +319,8 @@ static size_t
migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
+ uint8_t *p;
+ size_t bar_size;
if (server_data->migration.data_size < size) {
vfu_log(vfu_ctx, LOG_ERR, "invalid migration data read %#llx-%#llx",
@@ -306,13 +328,38 @@ migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset)
return -EINVAL;
}
- /* FIXME implement, client should be able to write any byte range */
- assert((offset == 0 && size >= sizeof server_data->bar0)
- || offset >= sizeof server_data->bar0);
+ /*
+ * If in pre-copy state we copy BAR1; if in stop-and-copy state we copy
+ * BAR0. This behavior is purely an artifact of this server implementation,
+ * chosen to keep it as simple as possible. Note that the client might go
+ * from state running to stop-and-copy, completely skipping the pre-copy
+ * state. This is legitimate but we don't support it for now.
+ *
+ * FIXME implement transitioning from the running state straight to the
+ * stop-and-copy state.
+ */
- if (offset == 0 && size >= sizeof server_data->bar0) {
- memcpy(buf, &server_data->bar0, sizeof server_data->bar0);
+ if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) {
+ p = server_data->bar1;
+ bar_size = server_data->bar1_size;
+ } else if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
+ p = (uint8_t*)&server_data->bar0;
+ bar_size = sizeof server_data->bar0;
+ } else {
+ /*
+ * Reading from the migration region in any other state is undefined
+ * (I think).
+ */
+ return 0;
}
+ if (offset > bar_size) {
+ errno = EINVAL;
+ return -1;
+ }
+ if (offset + size > bar_size) {
+ size = bar_size - offset;
+ }
+ memcpy(buf, p + offset, size);
return size;
}
@@ -324,41 +371,44 @@ migration_write_data(vfu_ctx_t *vfu_ctx, void *data, __u64 size, __u64 offset)
assert(server_data != NULL);
assert(data != NULL);
- if (offset + size > server_data->migration.migr_data_len) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid write %#llx-%#llx",
- offset, offset + size - 1);
- }
+ /*
+ * During pre-copy state we save BAR1 and during stop-and-copy state we
+ * save BAR0.
+ */
+ vfu_log(vfu_ctx, LOG_DEBUG,
+ "apply device migration data to BAR%d %#llx-%#llx",
+ offset < server_data->bar1_size ? 1 : 0,
+ offset, offset + size - 1);
+
+ if (offset < server_data->bar1_size) {
+ assert(offset + size <= server_data->bar1_size); /* FIXME */
+ memcpy(server_data->bar1 + offset, data, size);
+ } else {
+ int ret;
- memcpy(server_data->migration.migr_data + offset, data, size);
+ /* FIXME should be able to write any valid subrange */
+ assert(offset - server_data->bar1_size == 0);
+ assert(size == sizeof server_data->bar0);
+
+ ret = bar0_access(vfu_ctx, data, sizeof server_data->bar0, 0, true);
+
+ assert(ret == (int)size); /* FIXME */
+ }
return 0;
}
static int
-migration_data_written(vfu_ctx_t *vfu_ctx, __u64 count, __u64 offset)
+migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED __u64 count,
+ UNUSED __u64 offset)
{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
- int ret;
-
- assert(server_data != NULL);
-
- if (offset + count > server_data->migration.migr_data_len) {
- vfu_log(vfu_ctx, LOG_ERR, "bad migration data range %#llx-%#llx",
- offset, offset + count - 1);
- return -EINVAL;
- }
-
- if (offset == 0 && count >= sizeof server_data->bar0) {
-
- /* apply device state */
- /* FIXME must arm timer only after device is resumed!!! */
- ret = bar0_access(vfu_ctx, server_data->migration.migr_data,
- sizeof server_data->bar0, 0, true);
- if (ret < 0) {
- return ret;
- }
- }
+ /*
+ * We apply migration state directly in the migration_write_data callback,
+ * so we don't need to do anything here. We would have to apply migration
+ * state in this callback if the migration region was memory mappable, in
+ * which case we wouldn't know when the client wrote migration data.
+ */
return 0;
}
@@ -369,10 +419,10 @@ int main(int argc, char *argv[])
bool verbose = false;
char opt;
struct sigaction act = {.sa_handler = _sa_handler};
+ size_t bar1_size = 0x3000;
struct server_data server_data = {
.migration = {
- /* one page so that we can memory map it */
- .migr_data_len = sysconf(_SC_PAGESIZE),
+ .migr_data_len = bar1_size + sizeof(time_t),
.state = VFU_MIGR_STATE_RUNNING
}
};
@@ -393,12 +443,6 @@ int main(int argc, char *argv[])
errx(EXIT_FAILURE, "missing vfio-user socket path");
}
- /* coverity[NEGATIVE_RETURNS] */
- server_data.bar1 = malloc(sysconf(_SC_PAGESIZE));
- if (server_data.bar1 == NULL) {
- err(EXIT_FAILURE, "BAR1");
- }
-
sigemptyset(&act.sa_mask);
if (sigaction(SIGALRM, &act, NULL) == -1) {
err(EXIT_FAILURE, "failed to register signal handler");
@@ -431,21 +475,29 @@ int main(int argc, char *argv[])
/*
* Setup BAR1 to be 3 pages in size where only the first and the last pages
- * are mappable.
+ * are mappable. The client can still mmap the 2nd page; we can't prohibit
+ * this under Linux. If we really wanted to prohibit it we would have to use
+ * separate files for the same region.
*/
if ((fp = tmpfile()) == NULL) {
err(EXIT_FAILURE, "failed to create BAR1 file");
}
- if (ftruncate(fileno(fp), 0x3000) == -1) {
+ server_data.bar1_size = bar1_size;
+ if (ftruncate(fileno(fp), server_data.bar1_size) == -1) {
err(EXIT_FAILURE, "failed to truncate BAR1 file");
}
+ server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(fp), 0);
+ if (server_data.bar1 == MAP_FAILED) {
+ err(EXIT_FAILURE, "failed to mmap BAR1");
+ }
struct iovec mmap_areas[] = {
{ .iov_base = (void*)0, .iov_len = 0x1000 },
{ .iov_base = (void*)0x2000, .iov_len = 0x1000 }
};
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR1_REGION_IDX,
- 0x3000, &bar1_access, VFU_REGION_FLAG_RW,
- mmap_areas, 2, fileno(fp));
+ server_data.bar1_size, &bar1_access,
+ VFU_REGION_FLAG_RW, mmap_areas, 2, fileno(fp));
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup BAR1 region");
}
@@ -525,7 +577,6 @@ int main(int argc, char *argv[])
vfu_destroy_ctx(vfu_ctx);
free(server_data.migration.migr_data);
- free(server_data.bar1);
return EXIT_SUCCESS;
}