about | summary | refs | log | tree | commit | diff
path: root/samples
diff options
context:
space:
mode:
author William Henderson <william.henderson@nutanix.com> 2023-09-15 16:07:01 +0100
committer GitHub <noreply@github.com> 2023-09-15 16:07:01 +0100
commit 190f85bf9c114bf7c981bb8908394368f84c0c04 (patch)
tree 92273a811fc3a8af74a5f62cec8871f345d6999b /samples
parent 1569a37a54ecb63bd4008708c76339ccf7d06115 (diff)
download libvfio-user-190f85bf9c114bf7c981bb8908394368f84c0c04.zip
libvfio-user-190f85bf9c114bf7c981bb8908394368f84c0c04.tar.gz
libvfio-user-190f85bf9c114bf7c981bb8908394368f84c0c04.tar.bz2
adapt to VFIO live migration v2 (#782)
This commit adapts the vfio-user protocol specification and the libvfio-user implementation to v2 of the VFIO live migration interface, as used in the kernel and QEMU. The differences between v1 and v2 are discussed in this email thread [1], and we slightly differ from upstream VFIO v2 in that instead of transferring data over a new FD, we use the existing UNIX socket with new commands VFIO_USER_MIG_DATA_READ/WRITE. We also don't yet use P2P states. The updated spec was submitted to qemu-devel [2]. [1] https://lore.kernel.org/all/20220130160826.32449-9-yishaih@nvidia.com/ [2] https://lore.kernel.org/all/20230718094150.110183-1-william.henderson@nutanix.com/ Signed-off-by: William Henderson <william.henderson@nutanix.com>
Diffstat (limited to 'samples')
-rw-r--r-- samples/client.c 422
-rw-r--r-- samples/gpio-pci-idio-16.c 54
-rw-r--r-- samples/server.c 207
3 files changed, 325 insertions, 358 deletions
diff --git a/samples/client.c b/samples/client.c
index ed66a30..e8b737f 100644
--- a/samples/client.c
+++ b/samples/client.c
@@ -36,6 +36,7 @@
#include <errno.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
+#include <sys/param.h>
#include <time.h>
#include <err.h>
#include <assert.h>
@@ -63,6 +64,8 @@ static char const *irq_to_str[] = {
[VFU_DEV_REQ_IRQ] = "REQ"
};
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
struct client_dma_region {
/*
* Our DMA regions are one page in size so we only need one bit to mark them as
@@ -121,12 +124,9 @@ send_version(int sock)
"{"
"\"capabilities\":{"
"\"max_msg_fds\":%u,"
- "\"max_data_xfer_size\":%u,"
- "\"migration\":{"
- "\"pgsize\":%ld"
- "}"
+ "\"max_data_xfer_size\":%u"
"}"
- "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE, sysconf(_SC_PAGESIZE));
+ "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE);
cversion.major = LIB_VFIO_USER_MAJOR;
cversion.minor = LIB_VFIO_USER_MINOR;
@@ -225,14 +225,11 @@ send_device_reset(int sock)
}
}
-/* returns whether a VFIO migration capability is found */
-static bool
+static void
get_region_vfio_caps(struct vfio_info_cap_header *header,
struct vfio_region_info_cap_sparse_mmap **sparse)
{
- struct vfio_region_info_cap_type *type;
unsigned int i;
- bool migr = false;
while (true) {
switch (header->id) {
@@ -247,16 +244,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header,
(ull_t)(*sparse)->areas[i].size);
}
break;
- case VFIO_REGION_INFO_CAP_TYPE:
- type = (struct vfio_region_info_cap_type*)header;
- if (type->type != VFIO_REGION_TYPE_MIGRATION ||
- type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) {
- errx(EXIT_FAILURE, "bad region type %d/%d", type->type,
- type->subtype);
- }
- migr = true;
- printf("client: migration region\n");
- break;
default:
errx(EXIT_FAILURE, "bad VFIO cap ID %#x", header->id);
}
@@ -265,7 +252,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header,
}
header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info));
}
- return migr;
}
static void
@@ -281,7 +267,7 @@ do_get_device_region_info(int sock, struct vfio_region_info *region_info,
}
static void
-mmap_sparse_areas(int *fds, struct vfio_region_info *region_info,
+mmap_sparse_areas(int fd, struct vfio_region_info *region_info,
struct vfio_region_info_cap_sparse_mmap *sparse)
{
size_t i;
@@ -293,14 +279,14 @@ mmap_sparse_areas(int *fds, struct vfio_region_info *region_info,
char pathname[PATH_MAX];
char buf[PATH_MAX] = "";
- ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fds[i]);
+ ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fd);
assert(ret != -1 && (size_t)ret < sizeof(pathname));
ret = readlink(pathname, buf, sizeof(buf) - 1);
if (ret == -1) {
- err(EXIT_FAILURE, "failed to resolve file descriptor %d", fds[i]);
+ err(EXIT_FAILURE, "failed to resolve file descriptor %d", fd);
}
addr = mmap(NULL, sparse->areas[i].size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fds[i], region_info->offset +
+ MAP_SHARED, fd, region_info->offset +
sparse->areas[i].offset);
if (addr == MAP_FAILED) {
err(EXIT_FAILURE,
@@ -357,16 +343,15 @@ get_device_region_info(int sock, uint32_t index)
nr_fds);
if (cap_sz) {
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
- if (get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1),
- &sparse)) {
- if (sparse != NULL) {
- assert((index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 2) ||
- (index == VFU_PCI_DEV_MIGR_REGION_IDX && nr_fds == 1));
- assert(nr_fds == sparse->nr_areas);
- mmap_sparse_areas(fds, region_info, sparse);
- }
+ get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1),
+ &sparse);
+
+ if (sparse != NULL) {
+ assert(index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 1);
+ mmap_sparse_areas(fds[0], region_info, sparse);
+ } else {
+ assert(index != VFU_PCI_DEV_BAR1_REGION_IDX);
}
-
}
free(region_info);
}
@@ -399,7 +384,7 @@ get_device_info(int sock, struct vfio_user_device_info *dev_info)
err(EXIT_FAILURE, "failed to get device info");
}
- if (dev_info->num_regions != 10) {
+ if (dev_info->num_regions != 9) {
errx(EXIT_FAILURE, "bad number of device regions %d",
dev_info->num_regions);
}
@@ -484,7 +469,6 @@ access_region(int sock, int region, bool is_write, uint64_t offset,
.iov_len = data_len
}
};
- static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
struct vfio_user_region_access *recv_data;
size_t nr_send_iovecs, recv_data_len;
int op, ret;
@@ -539,6 +523,123 @@ access_region(int sock, int region, bool is_write, uint64_t offset,
return 0;
}
+/*
+ * Requests a device migration state change by sending a
+ * VFIO_USER_DEVICE_FEATURE message with the SET and MIG_DEVICE_STATE flags.
+ *
+ * The reply must echo the request header and payload byte for byte; any
+ * difference terminates the client.
+ *
+ * @sock: socket the request is sent on
+ * @state: migration state to request
+ *
+ * Returns the (non-negative) result of tran_sock_msg_iovec(). Exits the
+ * process if the transport call fails or the reply does not match.
+ */
+static int
+set_migration_state(int sock, uint32_t state)
+{
+    static int msg_id = 0xfab1;
+    struct vfio_user_device_feature req = {
+        .argsz = sizeof(struct vfio_user_device_feature)
+                 + sizeof(struct vfio_user_device_feature_mig_state),
+        .flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+    };
+    struct vfio_user_device_feature_mig_state change_state = {
+        .device_state = state,
+        .data_fd = -1
+    };
+    /*
+     * Slot 0 is deliberately left zeroed.
+     * NOTE(review): presumably tran_sock_msg_iovec() fills it with the
+     * message header -- confirm against the transport code.
+     */
+    struct iovec send_iovecs[3] = {
+        [1] = {
+            .iov_base = &req,
+            .iov_len = sizeof(req)
+        },
+        [2] = {
+            .iov_base = &change_state,
+            .iov_len = sizeof(change_state)
+        }
+    };
+    /*
+     * The reply has a small, fixed size, so a plain array is enough. The
+     * previous alloca() result was tested against NULL, but alloca() does not
+     * report failure that way, so the test was dead code.
+     */
+    char response[sizeof(req) + sizeof(change_state)];
+    int ret;
+
+    /* The file-scope mutex serialises request/reply exchanges on the socket. */
+    pthread_mutex_lock(&mutex);
+    ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_DEVICE_FEATURE,
+                              send_iovecs, 3, NULL, 0, NULL,
+                              response, sizeof(response), NULL, 0);
+    pthread_mutex_unlock(&mutex);
+
+    if (ret < 0) {
+        err(EXIT_FAILURE, "failed to set state: %d", ret);
+    }
+
+    /*
+     * A mismatching reply is a protocol error, not a failed system call, so
+     * use errx(): err() would append an unrelated strerror(errno) text.
+     */
+    if (memcmp(&req, response, sizeof(req)) != 0) {
+        errx(EXIT_FAILURE, "invalid response to set_migration_state (header)");
+    }
+
+    if (memcmp(&change_state, response + sizeof(req),
+               sizeof(change_state)) != 0) {
+        errx(EXIT_FAILURE, "invalid response to set_migration_state (payload)");
+    }
+
+    return ret;
+}
+
+/*
+ * Reads up to len bytes of device migration data from the server with a
+ * VFIO_USER_MIG_DATA_READ request and copies the payload into buf.
+ *
+ * @sock: socket the request is sent on
+ * @buf: destination buffer, at least len bytes long
+ * @len: maximum number of bytes to request
+ *
+ * Returns the number of bytes the server reported in its reply; do_migrate()
+ * treats 0 as "no more data". Exits the process on allocation failure, on
+ * transport failure, or if the server claims to have returned more than len
+ * bytes.
+ */
+static ssize_t
+read_migr_data(int sock, void *buf, size_t len)
+{
+    static int msg_id = 0x6904;
+    struct vfio_user_mig_data req = {
+        .argsz = sizeof(struct vfio_user_mig_data) + len,
+        .size = len
+    };
+    struct iovec send_iovecs[2] = {
+        [1] = {
+            .iov_base = &req,
+            .iov_len = sizeof(req)
+        }
+    };
+    size_t res_len = sizeof(req) + len;
+    struct vfio_user_mig_data *res = calloc(1, res_len);
+    ssize_t ret;
+    ssize_t size;
+
+    /* Checked explicitly so the failure is not lost when NDEBUG is defined. */
+    if (res == NULL) {
+        err(EXIT_FAILURE, "failed to allocate migration data response");
+    }
+
+    pthread_mutex_lock(&mutex);
+    ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_READ,
+                              send_iovecs, 2, NULL, 0, NULL,
+                              res, res_len, NULL, 0);
+    pthread_mutex_unlock(&mutex);
+
+    if (ret < 0) {
+        err(EXIT_FAILURE, "failed to read migration data: %zd", ret);
+    }
+
+    /*
+     * res->size comes from the peer. Both res->data and buf hold only len
+     * bytes, so a larger value must be rejected before it reaches memcpy().
+     */
+    if (res->size > len) {
+        errx(EXIT_FAILURE,
+             "server returned %llu bytes of migration data, only %zu requested",
+             (unsigned long long)res->size, len);
+    }
+
+    memcpy(buf, res->data, res->size);
+    size = res->size;
+
+    free(res);
+
+    return size;
+}
+
+/*
+ * Sends len bytes of migration data from buf to the server in a single
+ * VFIO_USER_MIG_DATA_WRITE message. The reply is received into the request
+ * header itself.
+ *
+ * Returns whatever tran_sock_msg_iovec() returns; callers treat a negative
+ * value as failure.
+ */
+static ssize_t
+write_migr_data(int sock, void *buf, size_t len)
+{
+    static int msg_id = 0x2023;
+    struct vfio_user_mig_data hdr = {
+        .argsz = sizeof(struct vfio_user_mig_data) + len,
+        .size = len
+    };
+    struct iovec iov[3] = { 0 };
+    ssize_t rc;
+
+    /* iov[0] stays zeroed; the header and the payload follow it. */
+    iov[1].iov_base = &hdr;
+    iov[1].iov_len = sizeof(hdr);
+    iov[2].iov_base = buf;
+    iov[2].iov_len = len;
+
+    pthread_mutex_lock(&mutex);
+    rc = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_WRITE,
+                             iov, 3, NULL, 0, NULL,
+                             &hdr, sizeof(hdr), NULL, 0);
+    pthread_mutex_unlock(&mutex);
+
+    return rc;
+}
+
static void
access_bar0(int sock, time_t *t)
{
@@ -712,34 +813,33 @@ static void
get_dirty_bitmap(int sock, struct client_dma_region *dma_region,
bool expect_dirty)
{
- uint64_t bitmap_size = _get_bitmap_size(dma_region->map.size,
- sysconf(_SC_PAGESIZE));
- struct vfio_user_dirty_pages *dirty_pages;
- struct vfio_user_bitmap_range *range;
+ struct vfio_user_device_feature *res;
+ struct vfio_user_device_feature_dma_logging_report *report;
char *bitmap;
- size_t size;
- void *data;
int ret;
- size = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size;
+ uint64_t bitmap_size = get_bitmap_size(dma_region->map.size,
+ sysconf(_SC_PAGESIZE));
- data = calloc(1, size);
+ size_t size = sizeof(*res) + sizeof(*report) + bitmap_size;
+
+ void *data = calloc(1, size);
assert(data != NULL);
- dirty_pages = data;
- dirty_pages->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
- dirty_pages->argsz = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size;
+ res = data;
+ res->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT
+ | VFIO_DEVICE_FEATURE_GET;
+ res->argsz = size;
- range = data + sizeof(*dirty_pages);
- range->iova = dma_region->map.addr;
- range->size = dma_region->map.size;
- range->bitmap.size = bitmap_size;
- range->bitmap.pgsize = sysconf(_SC_PAGESIZE);
+ report = (struct vfio_user_device_feature_dma_logging_report *)(res + 1);
+ report->iova = dma_region->map.addr;
+ report->length = dma_region->map.size;
+ report->page_size = sysconf(_SC_PAGESIZE);
- bitmap = data + sizeof(*dirty_pages) + sizeof(*range);
+ bitmap = data + sizeof(*res) + sizeof(*report);
- ret = tran_sock_msg(sock, 0x99, VFIO_USER_DIRTY_PAGES,
- data, sizeof(*dirty_pages) + sizeof(*range),
+ ret = tran_sock_msg(sock, 0x99, VFIO_USER_DEVICE_FEATURE,
+ data, sizeof(*res) + sizeof(*report),
NULL, data, size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to get dirty page bitmap");
@@ -749,14 +849,14 @@ get_dirty_bitmap(int sock, struct client_dma_region *dma_region,
char dirtied_by_client = (dma_region->flags & CLIENT_DIRTY_DMA_REGION) != 0;
char dirtied = dirtied_by_server | dirtied_by_client;
- printf("client: %s: %#llx-%#llx\t%#x\n", __func__,
- (ull_t)range->iova,
- (ull_t)(range->iova + range->size - 1), dirtied);
-
if (expect_dirty) {
assert(dirtied);
}
+ printf("client: %s: %#llx-%#llx\t%#x\n", __func__,
+ (ull_t)report->iova,
+ (ull_t)(report->iova + report->length - 1), dirtied);
+
free(data);
}
@@ -782,64 +882,32 @@ usage(char *argv0)
* @returns the number of iterations performed
*/
static size_t
-do_migrate(int sock, size_t nr_iters, struct iovec *migr_iter)
+do_migrate(int sock, size_t nr_iters, size_t max_iter_size,
+ struct iovec *migr_iter)
{
- int ret;
- uint64_t pending_bytes, data_offset, data_size;
+ ssize_t ret;
size_t i = 0;
- assert(nr_iters > 0);
-
- /* XXX read pending_bytes */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, pending_bytes),
- &pending_bytes, sizeof(pending_bytes));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read pending_bytes");
- }
-
- for (i = 0; i < nr_iters && pending_bytes > 0; i++) {
-
- /* XXX read data_offset and data_size */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read data_offset");
- }
+ for (i = 0; i < nr_iters; i++) {
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_size),
- &data_size, sizeof(data_size));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read data_size");
- }
+ migr_iter[i].iov_len = max_iter_size;
+ migr_iter[i].iov_base = malloc(migr_iter[i].iov_len);
- migr_iter[i].iov_len = data_size;
- migr_iter[i].iov_base = malloc(data_size);
if (migr_iter[i].iov_base == NULL) {
err(EXIT_FAILURE, "failed to allocate migration buffer");
}
/* XXX read migration data */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- data_offset,
- (char *)migr_iter[i].iov_base, data_size);
+ ret = read_migr_data(sock, migr_iter[i].iov_base, migr_iter[i].iov_len);
if (ret < 0) {
err(EXIT_FAILURE, "failed to read migration data");
}
- /* FIXME send migration data to the destination client process */
+ migr_iter[i].iov_len = ret;
- /*
- * XXX read pending_bytes again to indicate to the server that the
- * migration data have been consumed.
- */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, pending_bytes),
- &pending_bytes, sizeof(pending_bytes));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read pending_bytes");
+ // We know we've finished transferring data when we read 0 bytes.
+ if (ret == 0) {
+ break;
}
}
return i;
@@ -883,11 +951,12 @@ fake_guest(void *arg)
static size_t
migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
- uint32_t *crcp, size_t bar1_size)
+ uint32_t *crcp, size_t bar1_size, size_t max_iter_size)
{
+ size_t expected_data;
uint32_t device_state;
+ size_t iters;
int ret;
- size_t _nr_iters;
pthread_t thread;
struct fake_guest_data fake_guest_data = {
.sock = sock,
@@ -902,7 +971,9 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
err(EXIT_FAILURE, "failed to create pthread");
}
- *nr_iters = 2;
+ expected_data = bar1_size;
+ *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
+ assert(*nr_iters == 12);
*migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
if (*migr_iters == NULL) {
err(EXIT_FAILURE, NULL);
@@ -912,16 +983,15 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
* XXX set device state to pre-copy. This is technically optional but any
* VMM that cares about performance needs this.
*/
- device_state = VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RUNNING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_PRE_COPY;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- _nr_iters = do_migrate(sock, 1, *migr_iters);
- assert(_nr_iters == 1);
+ iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
+ assert(iters == *nr_iters);
+
printf("client: stopping fake guest thread\n");
fake_guest_data.done = true;
__sync_synchronize();
@@ -933,31 +1003,32 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
printf("client: setting device state to stop-and-copy\n");
- device_state = VFIO_DEVICE_STATE_V1_SAVING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_STOP_COPY;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- _nr_iters += do_migrate(sock, 1, (*migr_iters) + _nr_iters);
- if (_nr_iters != 2) {
- errx(EXIT_FAILURE,
- "expected 2 iterations instead of %zu while in stop-and-copy state",
- _nr_iters);
+ expected_data = bar1_size + sizeof(time_t);
+ *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
+ assert(*nr_iters == 13);
+ free(*migr_iters);
+ *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
+ if (*migr_iters == NULL) {
+ err(EXIT_FAILURE, NULL);
}
+ iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
+ assert(iters == *nr_iters);
+
/* XXX read device state, migration must have finished now */
- device_state = VFIO_DEVICE_STATE_V1_STOP;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_STOP;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- return _nr_iters;
+ return iters;
}
static int
@@ -966,11 +1037,11 @@ migrate_to(char *old_sock_path, int *server_max_fds,
struct iovec *migr_iters, char *path_to_server,
uint32_t src_crc, size_t bar1_size)
{
- int ret, sock;
+ ssize_t ret;
+ int sock;
char *sock_path;
struct stat sb;
- uint32_t device_state = VFIO_DEVICE_STATE_V1_RESUMING;
- uint64_t data_offset, data_len;
+ uint32_t device_state = VFIO_USER_DEVICE_STATE_RESUMING;
size_t i;
uint32_t dst_crc;
char buf[bar1_size];
@@ -1020,57 +1091,26 @@ migrate_to(char *old_sock_path, int *server_max_fds,
negotiate(sock, server_max_fds, server_max_data_xfer_size, pgsize);
- /* XXX set device state to resuming */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_RESUMING;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to set device state to resuming");
}
for (i = 0; i < nr_iters; i++) {
-
- /* XXX read data offset */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read migration data offset");
- }
-
/* XXX write migration data */
-
- /*
- * TODO write half of migration data via regular write and other half via
- * memopy map.
- */
- printf("client: writing migration device data %#llx-%#llx\n",
- (ull_t)data_offset,
- (ull_t)(data_offset + migr_iters[i].iov_len - 1));
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- data_offset, migr_iters[i].iov_base,
- migr_iters[i].iov_len);
+ ret = write_migr_data(sock, migr_iters[i].iov_base,
+ migr_iters[i].iov_len);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write device migration data");
}
-
- /* XXX write data_size */
- data_len = migr_iters[i].iov_len;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, data_size),
- &data_len, sizeof(data_len));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to write migration data size");
- }
}
- /* XXX set device state to running */
- device_state = VFIO_DEVICE_STATE_V1_RUNNING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ /* XXX set device state to stop to finish the transfer */
+ device_state = VFIO_USER_DEVICE_STATE_STOP;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
- err(EXIT_FAILURE, "failed to set device state to running");
+ err(EXIT_FAILURE, "failed to set device state to stop");
}
/* validate contents of BAR1 */
@@ -1086,6 +1126,13 @@ migrate_to(char *old_sock_path, int *server_max_fds,
abort();
}
+ /* XXX set device state to running */
+ device_state = VFIO_USER_DEVICE_STATE_RUNNING;
+ ret = set_migration_state(sock, device_state);
+ if (ret < 0) {
+ err(EXIT_FAILURE, "failed to set device state to running");
+ }
+
return sock;
}
@@ -1125,7 +1172,6 @@ int main(int argc, char *argv[])
size_t server_max_data_xfer_size;
size_t pgsize;
int nr_dma_regions;
- struct vfio_user_dirty_pages dirty_pages = {0};
int opt;
time_t t;
char *path_to_server = NULL;
@@ -1135,6 +1181,14 @@ int main(int argc, char *argv[])
uint32_t crc;
size_t bar1_size = 0x3000; /* FIXME get this value from region info */
+ struct vfio_user_device_feature *dirty_pages_feature;
+ struct vfio_user_device_feature_dma_logging_control *dirty_pages_control;
+ size_t dirty_pages_size = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ void *dirty_pages = malloc(dirty_pages_size);
+ dirty_pages_feature = dirty_pages;
+ dirty_pages_control = (void *)(dirty_pages_feature + 1);
+
while ((opt = getopt(argc, argv, "h")) != -1) {
switch (opt) {
case 'h':
@@ -1229,11 +1283,16 @@ int main(int argc, char *argv[])
*/
irq_fd = configure_irqs(sock);
- dirty_pages.argsz = sizeof(dirty_pages);
- dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
- ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
- &dirty_pages, sizeof(dirty_pages),
- NULL, NULL, 0);
+ /* start dirty pages logging */
+ dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_START |
+ VFIO_DEVICE_FEATURE_SET;
+ dirty_pages_control->num_ranges = 0;
+ dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);
+
+ ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
+ dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to start dirty page logging");
}
@@ -1270,11 +1329,16 @@ int main(int argc, char *argv[])
get_dirty_bitmap(sock, &dma_regions[i], i < 2);
}
- dirty_pages.argsz = sizeof(dirty_pages);
- dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
- ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
- &dirty_pages, sizeof(dirty_pages),
- NULL, NULL, 0);
+ /* stop logging dirty pages */
+ dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP |
+ VFIO_DEVICE_FEATURE_SET;
+ dirty_pages_control->num_ranges = 0;
+ dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);
+
+ ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
+ dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to stop dirty page logging");
}
@@ -1316,7 +1380,8 @@ int main(int argc, char *argv[])
err(EXIT_FAILURE, "failed to write to BAR0");
}
- nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size);
+ nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size,
+ MIN(server_max_data_xfer_size, CLIENT_MAX_DATA_XFER_SIZE));
/*
* Normally the client would now send the device state to the destination
@@ -1374,6 +1439,7 @@ int main(int argc, char *argv[])
}
free(dma_regions);
+ free(dirty_pages);
return 0;
}
diff --git a/samples/gpio-pci-idio-16.c b/samples/gpio-pci-idio-16.c
index b50f407..6c4e99b 100644
--- a/samples/gpio-pci-idio-16.c
+++ b/samples/gpio-pci-idio-16.c
@@ -77,49 +77,23 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
return 0;
}
-static uint64_t
-migration_get_pending_bytes(UNUSED vfu_ctx_t *vfu_ctx)
+/*
+ * Migration read callback. The request size is asserted to equal sizeof(pin).
+ * If the pin state is marked dirty, it is copied into buf, the dirty flag is
+ * cleared and sizeof(pin) is returned; otherwise nothing is copied and 0 is
+ * returned.
+ */
+static ssize_t
+migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
+ assert(size == sizeof(pin));
+
if (dirty) {
+ memcpy(buf, &pin, sizeof(pin));
+ dirty = false;
return sizeof(pin);
}
- return 0;
-}
-static int
-migration_prepare_data(UNUSED vfu_ctx_t *vfu_ctx,
- uint64_t *offset, uint64_t *size)
-{
- *offset = 0;
- if (size != NULL) { /* null means resuming */
- *size = sizeof(pin);
- }
return 0;
}
static ssize_t
-migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
+migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
- assert(offset == 0);
- assert(size == sizeof(pin));
- memcpy(buf, &pin, sizeof(pin));
- dirty = false;
- return 0;
-}
-
-static int
-migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, uint64_t count)
-{
- assert(count == sizeof(pin));
- return 0;
-}
-
-static ssize_t
-migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
-{
- assert(offset == 0);
assert(size == sizeof(pin));
memcpy(&pin, buf, sizeof(pin));
return 0;
@@ -145,16 +119,10 @@ main(int argc, char *argv[])
int opt;
struct sigaction act = { .sa_handler = _sa_handler };
vfu_ctx_t *vfu_ctx;
- size_t migr_regs_size = vfu_get_migr_register_area_size();
- size_t migr_data_size = sysconf(_SC_PAGE_SIZE);
- size_t migr_size = migr_regs_size + migr_data_size;
const vfu_migration_callbacks_t migr_callbacks = {
.version = VFU_MIGR_CALLBACKS_VERS,
.transition = &migration_device_state_transition,
- .get_pending_bytes = &migration_get_pending_bytes,
- .prepare_data = &migration_prepare_data,
.read_data = &migration_read_data,
- .data_written = &migration_data_written,
.write_data = &migration_write_data
};
@@ -214,13 +182,7 @@ main(int argc, char *argv[])
}
if (enable_migr) {
- ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size,
- NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to setup migration region");
- }
- ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
- migr_regs_size);
+ ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device migration");
}
diff --git a/samples/server.c b/samples/server.c
index 565974d..5edf674 100644
--- a/samples/server.c
+++ b/samples/server.c
@@ -60,7 +60,7 @@ struct server_data {
size_t bar1_size;
struct dma_regions regions[NR_DMA_REGIONS];
struct {
- uint64_t pending_bytes;
+ uint64_t bytes_transferred;
vfu_migr_state_t state;
} migration;
};
@@ -130,10 +130,6 @@ bar1_access(vfu_ctx_t *vfu_ctx, char * const buf,
}
if (is_write) {
- if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) {
- /* dirty the whole thing */
- server_data->migration.pending_bytes = server_data->bar1_size;
- }
memcpy(server_data->bar1 + offset, buf, count);
} else {
memcpy(buf, server_data->bar1, count);
@@ -322,19 +318,24 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
if (setitimer(ITIMER_REAL, &new, NULL) != 0) {
err(EXIT_FAILURE, "failed to disable timer");
}
- server_data->migration.pending_bytes = server_data->bar1_size + sizeof(time_t); /* FIXME BAR0 region size */
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_PRE_COPY:
- /* TODO must be less than size of data region in migration region */
- server_data->migration.pending_bytes = server_data->bar1_size;
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_STOP:
/* FIXME should gracefully fail */
- assert(server_data->migration.pending_bytes == 0);
+ if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
+ assert(server_data->migration.bytes_transferred ==
+ server_data->bar1_size + sizeof(time_t));
+ }
break;
case VFU_MIGR_STATE_RESUME:
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_RUNNING:
+ assert(server_data->migration.bytes_transferred ==
+ server_data->bar1_size + sizeof(time_t));
ret = arm_timer(vfu_ctx, server_data->bar0);
if (ret < 0) {
return ret;
@@ -347,125 +348,100 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
return 0;
}
-static uint64_t
-migration_get_pending_bytes(vfu_ctx_t *vfu_ctx)
-{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
- return server_data->migration.pending_bytes;
-}
-
-static int
-migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size)
-{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
-
- *offset = 0;
- if (size != NULL) {
- *size = server_data->migration.pending_bytes;
- }
- return 0;
-}
-
+/*
+ * Migration read callback. The migration data is treated as one byte stream:
+ * BAR1 first, followed by BAR0 when in the stop-and-copy state. Each call
+ * copies up to size bytes into buf, starting at migration.bytes_transferred,
+ * and then advances bytes_transferred by the amount copied.
+ *
+ * Returns the number of bytes copied, or 0 if size is 0 or everything has
+ * already been transferred.
+ */
static ssize_t
-migration_read_data(vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
+migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
- if (server_data->migration.state != VFU_MIGR_STATE_PRE_COPY &&
- server_data->migration.state != VFU_MIGR_STATE_STOP_AND_COPY)
- {
- return size;
- }
-
/*
- * For ease of implementation we expect the client to read all migration
- * data in one go; partial reads are not supported. This is allowed by VFIO
- * however we don't yet support it. Similarly, when resuming, partial
- * writes are supported by VFIO, however we don't in this sample.
- *
* If in pre-copy state we copy BAR1, if in stop-and-copy state we copy
* both BAR1 and BAR0. Since we always copy BAR1 in the stop-and-copy state,
* copying BAR1 in the pre-copy state is pointless. Fixing this requires
* more complex state tracking which exceeds the scope of this sample.
*/
- if (offset != 0 || size != server_data->migration.pending_bytes) {
- errno = EINVAL;
- return -1;
- }
+ uint32_t total_to_read = server_data->bar1_size;
- memcpy(buf, server_data->bar1, server_data->bar1_size);
if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
- memcpy(buf + server_data->bar1_size, &server_data->bar0,
- sizeof(server_data->bar0));
+ total_to_read += sizeof(server_data->bar0);
+ }
+
+ if (server_data->migration.bytes_transferred == total_to_read || size == 0) {
+ vfu_log(vfu_ctx, LOG_DEBUG, "no data left to read");
+ return 0;
+ }
+
+ uint32_t read_start = server_data->migration.bytes_transferred;
+ uint32_t read_end = MIN(read_start + size, total_to_read);
+ assert(read_end > read_start);
+
+ uint32_t bytes_read = read_end - read_start;
+
+ uint32_t length_in_bar1 = 0;
+ uint32_t length_in_bar0 = 0;
+
+ /* read bar1, if any */
+ if (read_start < server_data->bar1_size) {
+ length_in_bar1 = MIN(bytes_read, server_data->bar1_size - read_start);
+ memcpy(buf, server_data->bar1 + read_start, length_in_bar1);
+ read_start += length_in_bar1;
+ }
+
+ /* read bar0, if any */
+ if (read_end > server_data->bar1_size) {
+ length_in_bar0 = read_end - read_start;
+ read_start -= server_data->bar1_size;
+ /*
+ * read_start is now a byte offset into bar0, so the address must be
+ * computed on a char pointer; arithmetic on &bar0 itself would scale the
+ * offset by sizeof(bar0) and read past the object for any non-zero offset.
+ */
+ memcpy(buf + length_in_bar1, (char *)&server_data->bar0 + read_start,
+ length_in_bar0);
}
- server_data->migration.pending_bytes = 0;
- return size;
+ server_data->migration.bytes_transferred += bytes_read;
+
+ return bytes_read;
}
+/*
+ * Migration write callback, the counterpart of migration_read_data(). The
+ * incoming byte stream is BAR1 followed by BAR0. Each call stores up to size
+ * bytes from data at stream offset migration.bytes_transferred and then
+ * advances bytes_transferred by the amount stored.
+ *
+ * Returns the number of bytes stored, or 0 if size is 0 or the whole stream
+ * has already been received.
+ */
static ssize_t
-migration_write_data(vfu_ctx_t *vfu_ctx, void *data,
- uint64_t size, uint64_t offset)
+migration_write_data(vfu_ctx_t *vfu_ctx, void *data, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
char *buf = data;
- int ret;
assert(server_data != NULL);
assert(data != NULL);
- if (offset != 0 || size < server_data->bar1_size) {
- vfu_log(vfu_ctx, LOG_DEBUG, "XXX bad migration data write %#llx-%#llx",
- (unsigned long long)offset,
- (unsigned long long)offset + size - 1);
- errno = EINVAL;
- return -1;
- }
+ uint32_t total_to_write = server_data->bar1_size + sizeof(server_data->bar0);
- memcpy(server_data->bar1, buf, server_data->bar1_size);
- buf += server_data->bar1_size;
- size -= server_data->bar1_size;
- if (size == 0) {
+ if (server_data->migration.bytes_transferred == total_to_write || size == 0) {
return 0;
}
- if (size != sizeof(server_data->bar0)) {
- errno = EINVAL;
- return -1;
- }
- memcpy(&server_data->bar0, buf, sizeof(server_data->bar0));
- ret = bar0_access(vfu_ctx, buf, sizeof(server_data->bar0), 0, true);
- assert(ret == (int)size); /* FIXME */
- return 0;
-}
+ uint32_t write_start = server_data->migration.bytes_transferred;
+ uint32_t write_end = MIN(write_start + size, total_to_write); // exclusive
+ assert(write_end > write_start);
+ uint32_t bytes_written = write_end - write_start;
-static int
-migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED uint64_t count)
-{
- /*
- * We apply migration state directly in the migration_write_data callback,
- * so we don't need to do anything here. We would have to apply migration
- * state in this callback if the migration region was memory mappable, in
- * which case we wouldn't know when the client wrote migration data.
- */
+ uint32_t length_in_bar1 = 0;
+ uint32_t length_in_bar0 = 0;
- return 0;
-}
+ /* write to bar1, if any */
+ if (write_start < server_data->bar1_size) {
+ length_in_bar1 = MIN(bytes_written, server_data->bar1_size - write_start);
+ memcpy(server_data->bar1 + write_start, buf, length_in_bar1);
+ write_start += length_in_bar1;
+ }
-static size_t
-nr_pages(size_t size)
-{
- return (size / sysconf(_SC_PAGE_SIZE) +
- (size % sysconf(_SC_PAGE_SIZE) > 1));
-}
+ /* write to bar0, if any */
+ if (write_end > server_data->bar1_size) {
+ length_in_bar0 = write_end - write_start;
+ write_start -= server_data->bar1_size;
+ /*
+ * write_start is now a byte offset into bar0, so the address must be
+ * computed on a char pointer; arithmetic on &bar0 itself would scale the
+ * offset by sizeof(bar0) and write past the object for any non-zero offset.
+ */
+ memcpy((char *)&server_data->bar0 + write_start, buf + length_in_bar1,
+ length_in_bar0);
+ }
-static size_t
-page_align(size_t size)
-{
- return nr_pages(size) * sysconf(_SC_PAGE_SIZE);
+ server_data->migration.bytes_transferred += bytes_written;
+
+ return bytes_written;
}
int main(int argc, char *argv[])
@@ -476,7 +452,6 @@ int main(int argc, char *argv[])
int opt;
struct sigaction act = {.sa_handler = _sa_handler};
const size_t bar1_size = 0x3000;
- size_t migr_regs_size, migr_data_size, migr_size;
struct server_data server_data = {
.migration = {
.state = VFU_MIGR_STATE_RUNNING
@@ -488,10 +463,7 @@ int main(int argc, char *argv[])
const vfu_migration_callbacks_t migr_callbacks = {
.version = VFU_MIGR_CALLBACKS_VERS,
.transition = &migration_device_state_transition,
- .get_pending_bytes = &migration_get_pending_bytes,
- .prepare_data = &migration_prepare_data,
.read_data = &migration_read_data,
- .data_written = &migration_data_written,
.write_data = &migration_write_data
};
@@ -550,9 +522,6 @@ int main(int argc, char *argv[])
* are mappable. The client can still mmap the 2nd page, we can't prohibit
* this under Linux. If we really want to prohibit it we have to use
* separate files for the same region.
- *
- * We choose to use a single file which contains both BAR1 and the migration
- * registers. They could also be completely different files.
*/
if ((tmpfd = mkstemp(template)) == -1) {
err(EXIT_FAILURE, "failed to create backing file");
@@ -562,16 +531,7 @@ int main(int argc, char *argv[])
server_data.bar1_size = bar1_size;
- /*
- * The migration registers aren't memory mappable, so in order to make the
- * rest of the migration region memory mappable we must effectively reserve
- * an entire page.
- */
- migr_regs_size = vfu_get_migr_register_area_size();
- migr_data_size = page_align(bar1_size + sizeof(time_t));
- migr_size = migr_regs_size + migr_data_size;
-
- if (ftruncate(tmpfd, server_data.bar1_size + migr_size) == -1) {
+ if (ftruncate(tmpfd, server_data.bar1_size) == -1) {
err(EXIT_FAILURE, "failed to truncate backing file");
}
server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE,
@@ -591,29 +551,8 @@ int main(int argc, char *argv[])
err(EXIT_FAILURE, "failed to setup BAR1 region");
}
- /* setup migration */
-
- struct iovec migr_mmap_areas[] = {
- [0] = {
- .iov_base = (void *)migr_regs_size,
- .iov_len = migr_data_size
- },
- };
-
- /*
- * The migration region comes after bar1 in the backing file, so offset is
- * server_data.bar1_size.
- */
- ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size,
- NULL, VFU_REGION_FLAG_RW, migr_mmap_areas,
- ARRAY_SIZE(migr_mmap_areas), tmpfd,
- server_data.bar1_size);
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to setup migration region");
- }
-
- ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
- migr_regs_size);
+ ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks);
+
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device migration");
}