aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md22
-rw-r--r--lib/migration.c6
-rw-r--r--samples/client.c203
-rw-r--r--samples/server.c157
4 files changed, 270 insertions, 118 deletions
diff --git a/README.md b/README.md
index c019250..5d34612 100644
--- a/README.md
+++ b/README.md
@@ -124,18 +124,28 @@ client/server model where basic tasks are performed.
The server implements a device that can be programmed to trigger interrupts
(INTx) to the client. This is done by writing the desired time in seconds since
-Epoch. The server then trigger an eventfd-based IRQ and then a message-based
+Epoch to BAR0. The server then triggers an eventfd-based IRQ and then a message-based
one (in order to demonstrate how it's done when passing of file descriptors
-isn't possible/desirable).
+isn't possible/desirable). The device also works as memory storage: BAR1 can
+be freely written to/read from by the host.
+
+Since this is a completely made up device, there's no kernel driver (yet).
+The [client sample](./samples/client.c) knows how to drive this
+particular device (the role normally played by QEMU + guest VM + kernel driver).
The client excercises all commands in the vfio-user protocol, and then proceeds
to perform live migration. The client spawns the destination server (this would
-be normally done by `libvirt`) and then migrates the device state, before
+be normally done by libvirt) and then migrates the device state, before
switching entirely to the destination server. We re-use the source client
instead of spawning a destination one as this is something libvirt/QEMU would
-normally do. To spice things up, the client programmes the source server to
-trigger an interrupt and then quickly migrates to the destination server; the
-programmed interrupt is delivered by the destination server.
+normally do.
+
+To spice things up, the client programs the source server to trigger an
+interrupt and then migrates to the destination server; the programmed interrupt
+is delivered by the destination server. Also, while the device is being live
+migrated, the client spawns a thread that constantly writes to BAR1 in a tight
+loop. This thread emulates the guest VM accessing the device while the main
+thread (what would normally be QEMU) is driving the migration.
Start the source server as follows (pick whatever you like for
`/tmp/vfio-user.sock`):
diff --git a/lib/migration.c b/lib/migration.c
index 31a8d9d..c4eb6f6 100644
--- a/lib/migration.c
+++ b/lib/migration.c
@@ -450,6 +450,12 @@ migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
assert(migr != NULL);
assert(buf != NULL);
+ /*
+ * FIXME don't call the device callback if the migration state is not in
+ * pre-copy/stop-and-copy/resuming state, since the behavior is undefined
+ * in that case.
+ */
+
if (pos + count <= sizeof(struct vfio_device_migration_info)) {
ret = migration_region_access_registers(vfu_ctx, buf, count,
pos, is_write);
diff --git a/samples/client.c b/samples/client.c
index 18558b8..e7228ac 100644
--- a/samples/client.c
+++ b/samples/client.c
@@ -717,20 +717,29 @@ usage(char *path) {
basename(path));
}
-static void
-migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
+/*
+ * Normally each time the source client (QEMU) would read migration data from
+ * the device it would send them to the destination client. However, since in
+ * our sample both the source and the destination client are the same process,
+ * we simply accumulate the migration data of each iteration and apply it to
+ * the destination server at the end.
+ *
+ * Performs as many migration loops as @nr_iters or until the device has no
+ * more migration data (pending_bytes is zero), whichever comes first. The
+ * result of each migration iteration is stored in @migr_iter. @migr_iter must
+ * have room for at least @nr_iters entries.
+ *
+ * @returns the number of iterations performed
+ */
+static size_t
+do_migrate(int sock, int migr_reg_index, size_t nr_iters,
+ struct iovec *migr_iter)
{
- __u32 device_state = VFIO_DEVICE_STATE_SAVING;
+ int ret;
__u64 pending_bytes, data_offset, data_size;
+ size_t i = 0;
- /* XXX set device state to stop-and-copy */
- int ret = access_region(sock, migr_reg_index, true,
- offsetof(struct vfio_device_migration_info, device_state),
- &device_state, sizeof(device_state));
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write to device state: %s",
- strerror(-ret));
- }
+ assert(nr_iters > 0);
/* XXX read pending_bytes */
ret = access_region(sock, migr_reg_index, false,
@@ -744,19 +753,7 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
/* We do expect some migration data. */
assert(pending_bytes > 0);
- /*
- * The only expectation about pending_bytes is whether it's zero or
- * non-zero, therefore it must be considered volatile, even acrosss
- * iterantions. In the sample server we know it's static so it's fairly
- * straightforward.
- */
- *len = pending_bytes;
- *data = malloc(*len);
- if (*data == NULL) {
- err(EXIT_FAILURE, "failed to allocate migration buffer");
- }
-
- while (pending_bytes > 0) {
+ for (i = 0; i < nr_iters && pending_bytes > 0; i++) {
/* XXX read data_offset and data_size */
ret = access_region(sock, migr_reg_index, false,
@@ -775,11 +772,17 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
strerror(-ret));
}
- assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= *len);
+ assert(data_offset - sizeof(struct vfio_device_migration_info) + data_size <= pending_bytes);
+
+ migr_iter[i].iov_len = data_size;
+ migr_iter[i].iov_base = malloc(data_size);
+ if (migr_iter[i].iov_base == NULL) {
+ err(EXIT_FAILURE, "failed to allocate migration buffer");
+ }
/* XXX read migration data */
ret = access_region(sock, migr_reg_index, false, data_offset,
- (char*)*data + data_offset - sizeof(struct vfio_device_migration_info),
+ (char*)migr_iter[i].iov_base + data_offset - sizeof(struct vfio_device_migration_info),
data_size);
if (ret < 0) {
errx(EXIT_FAILURE, "failed to read migration data: %s",
@@ -800,9 +803,43 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
strerror(-ret));
}
}
+ return i;
+}
- /* XXX read device state, migration must have finished now */
- device_state = VFIO_DEVICE_STATE_STOP;
+static size_t
+migrate_from(int sock, int migr_reg_index, size_t *nr_iters, struct iovec **migr_iters)
+{
+ __u32 device_state;
+ int ret;
+ size_t _nr_iters;
+
+ /*
+ * FIXME to fully demonstrate live migration we'll need a way to change
+ * device state while the client is running the migration iteration. One
+ * way to do that is to have the client randomly modify a big-ish device
+ * region while running the live migration iterations, and at some point
+ * stopping to do the stop-and-copy phase. It can store in a buffer the
+ * modifications it makes and then after the device has been migrated it
+ * should compare the buffer with the migrated device region.
+ */
+
+ /*
+ * TODO The server generates migration data while it's in pre-copy state.
+ *
+ * FIXME the server generates 4 rounds of migration data while in pre-copy
+ * state and 1 while in stop-and-copy state. Don't assume this.
+ */
+ *nr_iters = 5;
+ *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
+ if (*migr_iters == NULL) {
+ err(EXIT_FAILURE, NULL);
+ }
+
+ /*
+ * XXX set device state to pre-copy. This is technically optional but any
+ * VMM that cares about performance needs this.
+ */
+ device_state = VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING;
ret = access_region(sock, migr_reg_index, true,
offsetof(struct vfio_device_migration_info, device_state),
&device_state, sizeof(device_state));
@@ -810,18 +847,57 @@ migrate_from(int sock, int migr_reg_index, void **data, __u64 *len)
errx(EXIT_FAILURE, "failed to write to device state: %s",
strerror(-ret));
}
+
+ _nr_iters = do_migrate(sock, migr_reg_index, 4, *migr_iters);
+
+ if (_nr_iters != 4) {
+ errx(EXIT_FAILURE,
+ "expected 4 iterations instead of %ld while in pre-copy state\n",
+ _nr_iters);
+ }
+
+ printf("setting device state to stop-and-copy\n");
+
+ device_state = VFIO_DEVICE_STATE_SAVING;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write to device state: %s",
+ strerror(-ret));
+ }
+
+ _nr_iters += do_migrate(sock, migr_reg_index, 1, (*migr_iters) + _nr_iters);
+ if (_nr_iters != 5) {
+ errx(EXIT_FAILURE,
+ "expected 5 iterations instead of %ld while in stop-and-copy state\n",
+ _nr_iters);
+ }
+
+ /* XXX read device state, migration must have finished now */
+ device_state = VFIO_DEVICE_STATE_STOP;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write to device state: %s",
+ strerror(-ret));
+ }
+
+ return _nr_iters;
}
static int
migrate_to(char *old_sock_path, int *server_max_fds,
- size_t *pgsize, void *migr_data, __u64 migr_data_len,
+ size_t *pgsize, size_t nr_iters, struct iovec *migr_iters,
char *path_to_server, int migr_reg_index)
{
int ret, sock;
char *sock_path;
struct stat sb;
__u32 device_state = VFIO_DEVICE_STATE_RESUMING;
- __u64 data_offset;
+ __u64 data_offset, data_len;
+ size_t i;
assert(old_sock_path != NULL);
@@ -874,33 +950,42 @@ migrate_to(char *old_sock_path, int *server_max_fds,
strerror(-ret));
}
- /* XXX read data offset */
- ret = access_region(sock, migr_reg_index, false,
- offsetof(struct vfio_device_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to read data offset: %s", strerror(-ret));
- }
+ for (i = 0; i < nr_iters; i++) {
- /* XXX write migration data */
+ /* XXX read data offset */
+ ret = access_region(sock, migr_reg_index, false,
+ offsetof(struct vfio_device_migration_info, data_offset),
+ &data_offset, sizeof(data_offset));
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to read migration data offset: %s",
+ strerror(-ret));
+ }
- /*
- * TODO write half of migration data via regular write and other half via
- * memopy map.
- */
- ret = access_region(sock, migr_reg_index, true,
- data_offset, migr_data, migr_data_len);
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write migration data: %s",
- strerror(-ret));
- }
+ /* XXX write migration data */
- /* XXX write data_size */
- ret = access_region(sock, migr_reg_index, true,
- offsetof(struct vfio_device_migration_info, data_size),
- &migr_data_len, sizeof migr_data_len);
- if (ret < 0) {
- errx(EXIT_FAILURE, "failed to write data size: %s", strerror(-ret));
+ /*
+ * TODO write half of migration data via regular write and other half via
+ * memory map.
+ */
+ printf("writing migration device data %#llx-%#llx\n", data_offset,
+ data_offset + migr_iters[i].iov_len - 1);
+ ret = access_region(sock, migr_reg_index, true,
+ data_offset, migr_iters[i].iov_base,
+ migr_iters[i].iov_len);
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write device migration data: %s",
+ strerror(-ret));
+ }
+
+ /* XXX write data_size */
+ data_len = migr_iters[i].iov_len;
+ ret = access_region(sock, migr_reg_index, true,
+ offsetof(struct vfio_device_migration_info, data_size),
+ &data_len, sizeof data_len);
+ if (ret < 0) {
+ errx(EXIT_FAILURE, "failed to write migration data size: %s",
+ strerror(-ret));
+ }
}
/* XXX set device state to running */
@@ -953,11 +1038,11 @@ int main(int argc, char *argv[])
struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
int opt;
time_t t;
- void *migr_data;
- __u64 migr_data_len;
char *path_to_server = NULL;
vfu_pci_hdr_t config_space;
int migr_reg_index;
+ struct iovec *migr_iters;
+ size_t nr_iters;
while ((opt = getopt(argc, argv, "h")) != -1) {
switch (opt) {
@@ -1117,13 +1202,13 @@ int main(int argc, char *argv[])
/*
* By sleeping here for 1s after migration finishes on the source server
- * (but not yet started on the destination server), the timer should be be
+ * (but not yet started on the destination server), the timer should be
* armed on the destination server for 2-1=1 seconds. If we don't sleep
* then it will be armed for 2 seconds, which isn't as interesting.
*/
sleep(1);
- migrate_from(sock, migr_reg_index, &migr_data, &migr_data_len);
+ migrate_from(sock, migr_reg_index, &nr_iters, &migr_iters);
/*
* Normally the client would now send the device state to the destination
@@ -1136,7 +1221,7 @@ int main(int argc, char *argv[])
}
sock = migrate_to(argv[optind], &server_max_fds, &pgsize,
- migr_data, migr_data_len, path_to_server, migr_reg_index);
+ nr_iters, migr_iters, path_to_server, migr_reg_index);
/*
* Now we must reconfigure the destination server.
diff --git a/samples/server.c b/samples/server.c
index 4c22922..f7a1cd1 100644
--- a/samples/server.c
+++ b/samples/server.c
@@ -58,7 +58,8 @@ struct dma_regions {
struct server_data {
time_t bar0;
- uint8_t *bar1;
+ void *bar1;
+ size_t bar1_size;
struct dma_regions regions[NR_DMA_REGIONS];
struct {
__u64 pending_bytes;
@@ -118,15 +119,26 @@ bar0_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset,
}
ssize_t
-bar1_access(vfu_ctx_t *vfu_ctx UNUSED, UNUSED char * const buf,
- UNUSED size_t count, UNUSED loff_t offset,
- UNUSED const bool is_write)
+bar1_access(vfu_ctx_t *vfu_ctx, char * const buf,
+ size_t count, loff_t offset,
+ const bool is_write)
{
- assert(false);
+ struct server_data *server_data = vfu_get_private(vfu_ctx);
+
+ if (offset + count > server_data->bar1_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad BAR1 access %#lx-%#lx",
+ offset, offset + count - 1);
+ errno = EINVAL;
+ return -1;
+ }
- /* FIXME assert that only the 2nd page is accessed */
+ if (is_write) {
+ memcpy(server_data->bar1 + offset, buf, count);
+ } else {
+ memcpy(buf, server_data->bar1, count);
+ }
- return -ENOTSUP;
+ return count;
}
bool irq_triggered = false;
@@ -253,8 +265,12 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
switch (state) {
case VFU_MIGR_STATE_STOP_AND_COPY:
+ server_data->migration.pending_bytes = sizeof(time_t); /* FIXME BAR0 region size */
+ server_data->migration.data_size = 0;
+ break;
+ case VFU_MIGR_STATE_PRE_COPY:
/* TODO must be less than size of data region in migration region */
- server_data->migration.pending_bytes = sysconf(_SC_PAGESIZE);
+ server_data->migration.pending_bytes = server_data->bar1_size;
break;
case VFU_MIGR_STATE_STOP:
assert(server_data->migration.pending_bytes == 0);
@@ -291,6 +307,10 @@ migration_prepare_data(vfu_ctx_t *vfu_ctx, __u64 *offset, __u64 *size)
struct server_data *server_data = vfu_get_private(vfu_ctx);
*offset = 0;
+ /*
+ * Don't provide all migration data in one go in order to make it a bit
+ * more interesting.
+ */
*size = server_data->migration.data_size = MIN(server_data->migration.pending_bytes, server_data->migration.migr_data_len / 4);
return 0;
}
@@ -299,6 +319,8 @@ static size_t
migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
+ uint8_t *p;
+ size_t bar_size;
if (server_data->migration.data_size < size) {
vfu_log(vfu_ctx, LOG_ERR, "invalid migration data read %#llx-%#llx",
@@ -306,13 +328,38 @@ migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, __u64 size, __u64 offset)
return -EINVAL;
}
- /* FIXME implement, client should be able to write any byte range */
- assert((offset == 0 && size >= sizeof server_data->bar0)
- || offset >= sizeof server_data->bar0);
+ /*
+ * If in pre-copy state we copy BAR1, if in stop-and-copy state we copy
+ * BAR0. This behavior is purely an artifact of this server implementation
+ * simply to make it as simple as possible. Note that the client might go
+ * from state running to stop-and-copy, completely skipping the pre-copy
+ * state. This is legitimate but we don't support it for now.
+ *
+ * FIXME implement transitioning from the running state straight to the
+ * stop-and-copy state.
+ */
- if (offset == 0 && size >= sizeof server_data->bar0) {
- memcpy(buf, &server_data->bar0, sizeof server_data->bar0);
+ if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) {
+ p = server_data->bar1;
+ bar_size = server_data->bar1_size;
+ } else if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
+ p = (uint8_t*)&server_data->bar0;
+ bar_size = sizeof server_data->bar0;
+ } else {
+ /*
+ * Reading from the migration region in any other state is undefined
+ * (I think).
+ */
+ return 0;
}
+ if (offset > bar_size) {
+ errno = EINVAL;
+ return -1;
+ }
+ if (offset + size > bar_size) {
+ size = bar_size - offset;
+ }
+ memcpy(buf, p + offset, size);
return size;
}
@@ -324,41 +371,44 @@ migration_write_data(vfu_ctx_t *vfu_ctx, void *data, __u64 size, __u64 offset)
assert(server_data != NULL);
assert(data != NULL);
- if (offset + size > server_data->migration.migr_data_len) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid write %#llx-%#llx",
- offset, offset + size - 1);
- }
+ /*
+ * During pre-copy state we save BAR1 and during stop-and-copy state we
+ * save BAR0.
+ */
+ vfu_log(vfu_ctx, LOG_DEBUG,
+ "apply device migration data to BAR%d %#llx-%#llx",
+ offset < server_data->bar1_size ? 1 : 0,
+ offset, offset + size - 1);
+
+ if (offset < server_data->bar1_size) {
+ assert(offset + size <= server_data->bar1_size); /* FIXME */
+ memcpy(server_data->bar1 + offset, data, size);
+ } else {
+ int ret;
- memcpy(server_data->migration.migr_data + offset, data, size);
+ /* FIXME should be able to write any valid subrange */
+ assert(offset - server_data->bar1_size == 0);
+ assert(size == sizeof server_data->bar0);
+
+ ret = bar0_access(vfu_ctx, data, sizeof server_data->bar0, 0, true);
+
+ assert(ret == (int)size); /* FIXME */
+ }
return 0;
}
static int
-migration_data_written(vfu_ctx_t *vfu_ctx, __u64 count, __u64 offset)
+migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED __u64 count,
+ UNUSED __u64 offset)
{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
- int ret;
-
- assert(server_data != NULL);
-
- if (offset + count > server_data->migration.migr_data_len) {
- vfu_log(vfu_ctx, LOG_ERR, "bad migration data range %#llx-%#llx",
- offset, offset + count - 1);
- return -EINVAL;
- }
-
- if (offset == 0 && count >= sizeof server_data->bar0) {
-
- /* apply device state */
- /* FIXME must arm timer only after device is resumed!!! */
- ret = bar0_access(vfu_ctx, server_data->migration.migr_data,
- sizeof server_data->bar0, 0, true);
- if (ret < 0) {
- return ret;
- }
- }
+ /*
+ * We apply migration state directly in the migration_write_data callback,
+ * we don't need to do anything here. We would have to apply migration
+ * state in this callback if the migration region was memory mappable, in
+ * which case we wouldn't know when the client wrote migration data.
+ */
return 0;
}
@@ -369,10 +419,10 @@ int main(int argc, char *argv[])
bool verbose = false;
char opt;
struct sigaction act = {.sa_handler = _sa_handler};
+ size_t bar1_size = 0x3000;
struct server_data server_data = {
.migration = {
- /* one page so that we can memory map it */
- .migr_data_len = sysconf(_SC_PAGESIZE),
+ .migr_data_len = bar1_size + sizeof(time_t),
.state = VFU_MIGR_STATE_RUNNING
}
};
@@ -393,12 +443,6 @@ int main(int argc, char *argv[])
errx(EXIT_FAILURE, "missing vfio-user socket path");
}
- /* coverity[NEGATIVE_RETURNS] */
- server_data.bar1 = malloc(sysconf(_SC_PAGESIZE));
- if (server_data.bar1 == NULL) {
- err(EXIT_FAILURE, "BAR1");
- }
-
sigemptyset(&act.sa_mask);
if (sigaction(SIGALRM, &act, NULL) == -1) {
err(EXIT_FAILURE, "failed to register signal handler");
@@ -431,21 +475,29 @@ int main(int argc, char *argv[])
/*
* Setup BAR1 to be 3 pages in size where only the first and the last pages
- * are mappable.
+ * are mappable. The client can still mmap the 2nd page, we can't prohibit
+ * this under Linux. If we really want to prohibit it we have to use
+ * separate files for the same region.
*/
if ((fp = tmpfile()) == NULL) {
err(EXIT_FAILURE, "failed to create BAR1 file");
}
- if (ftruncate(fileno(fp), 0x3000) == -1) {
+ server_data.bar1_size = bar1_size;
+ if (ftruncate(fileno(fp), server_data.bar1_size) == -1) {
err(EXIT_FAILURE, "failed to truncate BAR1 file");
}
+ server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(fp), 0);
+ if (server_data.bar1 == MAP_FAILED) {
+ err(EXIT_FAILURE, "failed to mmap BAR1");
+ }
struct iovec mmap_areas[] = {
{ .iov_base = (void*)0, .iov_len = 0x1000 },
{ .iov_base = (void*)0x2000, .iov_len = 0x1000 }
};
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR1_REGION_IDX,
- 0x3000, &bar1_access, VFU_REGION_FLAG_RW,
- mmap_areas, 2, fileno(fp));
+ server_data.bar1_size, &bar1_access,
+ VFU_REGION_FLAG_RW, mmap_areas, 2, fileno(fp));
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup BAR1 region");
}
@@ -525,7 +577,6 @@ int main(int argc, char *argv[])
vfu_destroy_ctx(vfu_ctx);
free(server_data.migration.migr_data);
- free(server_data.bar1);
return EXIT_SUCCESS;
}