author     William Henderson <william.henderson@nutanix.com>  2023-09-15 16:07:01 +0100
committer  GitHub <noreply@github.com>                        2023-09-15 16:07:01 +0100
commit     190f85bf9c114bf7c981bb8908394368f84c0c04 (patch)
tree       92273a811fc3a8af74a5f62cec8871f345d6999b /lib
parent     1569a37a54ecb63bd4008708c76339ccf7d06115 (diff)
adapt to VFIO live migration v2 (#782)
This commit adapts the vfio-user protocol specification and the libvfio-user
implementation to v2 of the VFIO live migration interface, as used in the
kernel and QEMU.

The differences between v1 and v2 are discussed in this email thread [1], and
we slightly differ from upstream VFIO v2 in that instead of transferring data
over a new FD, we use the existing UNIX socket with new commands
VFIO_USER_MIG_DATA_READ/WRITE. We also don't yet use P2P states.

The updated spec was submitted to qemu-devel [2].

[1] https://lore.kernel.org/all/20220130160826.32449-9-yishaih@nvidia.com/
[2] https://lore.kernel.org/all/20230718094150.110183-1-william.henderson@nutanix.com/

Signed-off-by: William Henderson <william.henderson@nutanix.com>
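
For orientation (illustrative only, not part of the commit): the payload that
the two new commands carry over the existing socket, as consumed by
handle_mig_data_read()/handle_mig_data_write() in lib/migration.c below, has
roughly the following shape. The field names are inferred from those handlers
(argsz, size, then size bytes of opaque device state); the authoritative
definition is the one in the updated spec [2].

    struct vfio_user_mig_data {
        uint32_t argsz;  /* size of reply payload the client can accept */
        uint32_t size;   /* number of device-state bytes in data[] */
        uint8_t  data[]; /* opaque migration data produced/consumed by the
                            device's read_data/write_data callbacks */
    };
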
Diffstat (limited to 'lib')
-rw-r--r--  lib/common.h           41
-rw-r--r--  lib/dma.c             242
-rw-r--r--  lib/dma.h               1
-rw-r--r--  lib/libvfio-user.c    506
-rw-r--r--  lib/migration.c       572
-rw-r--r--  lib/migration.h        24
-rw-r--r--  lib/migration_priv.h   86
-rw-r--r--  lib/private.h          14
8 files changed, 783 insertions, 703 deletions
diff --git a/lib/common.h b/lib/common.h
index 07a74a5..40b9b27 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -41,6 +41,7 @@
#include <limits.h>
#include <stdint.h>
#include <unistd.h>
+#include <sys/uio.h>
#define UNUSED __attribute__((unused))
#define EXPORT __attribute__((visibility("default")))
@@ -62,6 +63,20 @@
typedef unsigned long long ull_t;
+static inline int
+ERROR_INT(int err)
+{
+ errno = err;
+ return -1;
+}
+
+static inline void *
+ERROR_PTR(int err)
+{
+ errno = err;
+ return NULL;
+}
+
/* Saturating uint64_t addition. */
static inline uint64_t
satadd_u64(uint64_t a, uint64_t b)
@@ -73,11 +88,21 @@ satadd_u64(uint64_t a, uint64_t b)
/*
* The size, in bytes, of the bitmap that represents the given range with the
* given page size.
+ *
+ * Returns -1 and sets errno if the given page size is invalid for the given
+ * range.
*/
-static inline size_t
-_get_bitmap_size(size_t size, size_t pgsize)
+static inline ssize_t
+get_bitmap_size(size_t region_size, size_t pgsize)
{
- size_t nr_pages = (size / pgsize) + (size % pgsize != 0);
+ if (pgsize == 0) {
+ return ERROR_INT(EINVAL);
+ }
+ if (region_size < pgsize) {
+ return ERROR_INT(EINVAL);
+ }
+
+ size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
return ROUND_UP(nr_pages, sizeof(uint64_t) * CHAR_BIT) / CHAR_BIT;
}
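
A standalone check of the arithmetic above (not part of the patch): a 1 MiB
region tracked at 4 KiB granularity needs 256 bits, rounded up to a multiple
of 64 bits, i.e. a 32-byte bitmap.

    #include <assert.h>
    #include <limits.h>
    #include <stddef.h>

    int
    main(void)
    {
        size_t region_size = 1024 * 1024;  /* 1 MiB */
        size_t pgsize = 4096;              /* 4 KiB dirty-tracking pages */
        size_t nr_pages = region_size / pgsize + (region_size % pgsize != 0);
        /* Round the bit count up to a multiple of 64, then convert to bytes. */
        size_t bitmap_bytes = (nr_pages + 63) / 64 * 64 / CHAR_BIT;

        assert(nr_pages == 256);
        assert(bitmap_bytes == 32);        /* what get_bitmap_size() returns */
        return 0;
    }
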
@@ -107,6 +132,16 @@ close_safely(int *fd)
errno = saved_errno;
}
+static inline void
+iov_free(struct iovec *iov)
+{
+ if (iov->iov_base != NULL) {
+ free(iov->iov_base);
+ iov->iov_base = NULL;
+ }
+ iov->iov_len = 0;
+}
+
#ifdef UNIT_TEST
#define MOCK_DEFINE(f) \
diff --git a/lib/dma.c b/lib/dma.c
index 9ca34d0..10e38ff 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -255,19 +255,6 @@ dma_map_region(dma_controller_t *dma, dma_memory_region_t *region)
return 0;
}
-static ssize_t
-get_bitmap_size(size_t region_size, size_t pgsize)
-{
- if (pgsize == 0) {
- return ERROR_INT(EINVAL);
- }
- if (region_size < pgsize) {
- return ERROR_INT(EINVAL);
- }
-
- return _get_bitmap_size(region_size, pgsize);
-}
-
static int
dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize)
{
@@ -530,28 +517,173 @@ dma_controller_dirty_page_logging_stop(dma_controller_t *dma)
#ifdef DEBUG
static void
log_dirty_bitmap(vfu_ctx_t *vfu_ctx, dma_memory_region_t *region,
- char *bitmap, size_t size)
+ char *bitmap, size_t size, size_t pgsize)
{
size_t i;
size_t count;
for (i = 0, count = 0; i < size; i++) {
count += __builtin_popcount((uint8_t)bitmap[i]);
}
- vfu_log(vfu_ctx, LOG_DEBUG, "dirty pages: get [%p, %p), %zu dirty pages",
+ vfu_log(vfu_ctx, LOG_DEBUG,
+ "dirty pages: get [%p, %p), %zu dirty pages of size %zu",
region->info.iova.iov_base, iov_end(&region->info.iova),
- count);
+ count, pgsize);
}
#endif
+static void
+dirty_page_exchange(uint8_t *outp, uint8_t *bitmap)
+{
+ /*
+ * If no bits are dirty, avoid the atomic exchange. This is obviously
+ * racy, but it's OK: if we miss a dirty bit being set, we'll catch it
+ * the next time around.
+ *
+ * Otherwise, atomically exchange the dirty bits with zero: as we use
+ * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
+ * miss a bit being set after, but again, we'll catch that next time
+ * around.
+ */
+ if (*bitmap == 0) {
+ *outp = 0;
+ } else {
+ uint8_t zero = 0;
+ __atomic_exchange(bitmap, &zero, outp, __ATOMIC_SEQ_CST);
+ }
+}
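
A minimal standalone illustration (not part of the patch) of the GCC builtin's
semantics relied on here: the exchange atomically clears the server's dirty
byte while handing its previous value back to the caller.

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint8_t dirty_byte = 0x42;  /* stands in for region->dirty_bitmap[i] */
        uint8_t zero = 0;
        uint8_t out = 0;

        /* Atomically: out = dirty_byte; dirty_byte = zero. */
        __atomic_exchange(&dirty_byte, &zero, &out, __ATOMIC_SEQ_CST);

        assert(out == 0x42 && dirty_byte == 0);
        return 0;
    }
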
+
+static void
+dirty_page_get_same_pgsize(dma_memory_region_t *region, char *bitmap,
+ size_t bitmap_size)
+{
+ for (size_t i = 0; i < bitmap_size; i++) {
+ dirty_page_exchange((uint8_t *)&bitmap[i], &region->dirty_bitmap[i]);
+ }
+}
+
+static void
+dirty_page_get_extend(dma_memory_region_t *region, char *bitmap,
+ size_t server_bitmap_size, size_t server_pgsize,
+ size_t client_bitmap_size, size_t client_pgsize)
+{
+ /*
+ * The index of the bit in the client bitmap that we are currently
+ * considering. By keeping track of this separately to the for loop, we
+ * allow for one server bit to be repeated for multiple client bytes.
+ */
+ size_t client_bit_idx = 0;
+ size_t server_byte_idx;
+ int server_bit_idx;
+ size_t factor = server_pgsize / client_pgsize;
+
+ /*
+ * Iterate through the bytes of the server bitmap.
+ */
+ for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+ server_byte_idx++) {
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ break;
+ }
+
+ uint8_t out = 0;
+
+ dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+ /*
+ * Iterate through the bits of the server byte, repeating bits to reach
+ * the desired page size.
+ */
+ for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+ uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+ /*
+ * Repeat `factor` times the bit at index `j` of `out`.
+ *
+ * OR the same bit from the server bitmap (`server_bit`) with
+ * `factor` bits in the client bitmap, from `client_bit_idx` to
+ * `end_client_bit_idx`.
+ */
+ for (size_t end_client_bit_idx = client_bit_idx + factor;
+ client_bit_idx < end_client_bit_idx;
+ client_bit_idx++) {
+
+ bitmap[client_bit_idx / CHAR_BIT] |=
+ server_bit << (client_bit_idx % CHAR_BIT);
+ }
+ }
+ }
+}
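
A standalone worked example of the expansion above (not part of the patch):
with the server logging at 8 KiB and the client requesting 4 KiB (factor 2),
a server byte 0x05 (8 KiB pages 0 and 2 dirty) expands to client bits 0, 1, 4
and 5, i.e. client bytes { 0x33, 0x00 }.

    #include <assert.h>
    #include <limits.h>
    #include <stddef.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint8_t server_byte = 0x05;    /* dirty 8 KiB pages 0 and 2 */
        uint8_t client[2] = { 0, 0 };  /* 4 KiB granularity: twice the bits */
        size_t factor = 2;             /* server_pgsize / client_pgsize */
        size_t client_bit_idx = 0;

        for (int server_bit_idx = 0; server_bit_idx < CHAR_BIT;
             server_bit_idx++) {
            uint8_t server_bit = (server_byte >> server_bit_idx) & 1;

            /* Repeat each server bit `factor` times in the client bitmap. */
            for (size_t end = client_bit_idx + factor; client_bit_idx < end;
                 client_bit_idx++) {
                client[client_bit_idx / CHAR_BIT] |=
                    server_bit << (client_bit_idx % CHAR_BIT);
            }
        }

        assert(client[0] == 0x33 && client[1] == 0x00);
        return 0;
    }
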
+
+static void
+dirty_page_get_combine(dma_memory_region_t *region, char *bitmap,
+ size_t server_bitmap_size, size_t server_pgsize,
+ size_t client_bitmap_size, size_t client_pgsize)
+{
+ /*
+ * The index of the bit in the client bitmap that we are currently
+ * considering. By keeping track of this separately to the for loop, we
+ * allow multiple bytes' worth of server bits to be OR'd together to
+ * calculate one client bit.
+ */
+ size_t client_bit_idx = 0;
+ size_t server_byte_idx;
+ int server_bit_idx;
+ size_t factor = client_pgsize / server_pgsize;
+
+ /*
+ * Iterate through the bytes of the server bitmap.
+ */
+ for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+ server_byte_idx++) {
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ break;
+ }
+
+ uint8_t out = 0;
+
+ dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+ /*
+ * Iterate through the bits of the server byte, combining bits to reach
+ * the desired page size.
+ */
+ for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+ uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+ /*
+ * OR `factor` bits of the server bitmap with the same bit at
+ * index `client_bit_idx` in the client bitmap.
+ */
+ bitmap[client_bit_idx / CHAR_BIT] |=
+ server_bit << (client_bit_idx % CHAR_BIT);
+
+ /*
+ * Only move onto the next bit in the client bitmap once we've
+ * OR'd `factor` bits.
+ */
+ if (((server_byte_idx * CHAR_BIT) + server_bit_idx) % factor
+ == factor - 1) {
+ client_bit_idx++;
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ return;
+ }
+ }
+ }
+ }
+}
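
The inverse case as a standalone worked example (not part of the patch): with
the server logging at 4 KiB and the client requesting 8 KiB (factor 2), a
server byte 0x31 (4 KiB pages 0, 4 and 5 dirty) collapses pairwise into client
bits 0 and 2, i.e. 0x05; a client bit is dirty if any server bit it covers is.

    #include <assert.h>
    #include <limits.h>
    #include <stddef.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint8_t server_byte = 0x31;  /* dirty 4 KiB pages 0, 4 and 5 */
        uint8_t client = 0;          /* 8 KiB granularity: half the bits */
        size_t factor = 2;           /* client_pgsize / server_pgsize */
        size_t client_bit_idx = 0;

        for (int server_bit_idx = 0; server_bit_idx < CHAR_BIT;
             server_bit_idx++) {
            uint8_t server_bit = (server_byte >> server_bit_idx) & 1;

            /* OR `factor` consecutive server bits into one client bit. */
            client |= server_bit << client_bit_idx;

            if (server_bit_idx % factor == factor - 1) {
                client_bit_idx++;
            }
        }

        assert(client == 0x05);
        return 0;
    }
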
+
int
dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
- uint64_t len, size_t pgsize, size_t size,
+ uint64_t len, size_t client_pgsize, size_t size,
char *bitmap)
{
dma_memory_region_t *region;
- ssize_t bitmap_size;
+ ssize_t server_bitmap_size;
+ ssize_t client_bitmap_size;
dma_sg_t sg;
- size_t i;
int ret;
assert(dma != NULL);
@@ -574,24 +706,40 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(ENOTSUP);
}
- if (pgsize != dma->dirty_pgsize) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "bad page size %zu", pgsize);
+ /*
+ * If dirty page logging is not enabled, the requested page size is zero,
+ * or the requested page size is not a power of two, return an error.
+ */
+ if (dma->dirty_pgsize == 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "dirty page logging not enabled");
+ return ERROR_INT(EINVAL);
+ }
+ if (client_pgsize == 0 || (client_pgsize & (client_pgsize - 1)) != 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu",
+ client_pgsize);
return ERROR_INT(EINVAL);
}
- bitmap_size = get_bitmap_size(len, pgsize);
- if (bitmap_size < 0) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get bitmap size");
- return bitmap_size;
+ server_bitmap_size = get_bitmap_size(len, dma->dirty_pgsize);
+ if (server_bitmap_size < 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get server bitmap size");
+ return server_bitmap_size;
+ }
+
+ client_bitmap_size = get_bitmap_size(len, client_pgsize);
+ if (client_bitmap_size < 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu",
+ client_pgsize);
+ return client_bitmap_size;
}
/*
* They must be equal because this is how much data the client expects to
* receive.
*/
- if (size != (size_t)bitmap_size) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "bad bitmap size %zu != %zu", size,
- bitmap_size);
+ if (size != (size_t)client_bitmap_size) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client bitmap size %zu != %zu",
+ size, client_bitmap_size);
return ERROR_INT(EINVAL);
}
@@ -602,31 +750,29 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(EINVAL);
}
- for (i = 0; i < (size_t)bitmap_size; i++) {
- uint8_t val = region->dirty_bitmap[i];
- uint8_t *outp = (uint8_t *)&bitmap[i];
-
+ if (client_pgsize == dma->dirty_pgsize) {
+ dirty_page_get_same_pgsize(region, bitmap, client_bitmap_size);
+ } else if (client_pgsize < dma->dirty_pgsize) {
/*
- * If no bits are dirty, avoid the atomic exchange. This is obviously
- * racy, but it's OK: if we miss a dirty bit being set, we'll catch it
- * the next time around.
- *
- * Otherwise, atomically exchange the dirty bits with zero: as we use
- * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
- * miss a bit being set after, but again, we'll catch that next time
- * around.
+ * If the requested page size is less than that used for logging by
+ * the server, the bitmap will need to be extended, repeating bits.
*/
- if (val == 0) {
- *outp = 0;
- } else {
- uint8_t zero = 0;
- __atomic_exchange(&region->dirty_bitmap[i], &zero,
- outp, __ATOMIC_SEQ_CST);
- }
+ dirty_page_get_extend(region, bitmap, server_bitmap_size,
+ dma->dirty_pgsize, client_bitmap_size,
+ client_pgsize);
+ } else {
+ /*
+ * If the requested page size is larger than that used for logging by
+ * the server, the bitmap will need to combine bits with OR, losing
+ * accuracy.
+ */
+ dirty_page_get_combine(region, bitmap, server_bitmap_size,
+ dma->dirty_pgsize, client_bitmap_size,
+ client_pgsize);
}
#ifdef DEBUG
- log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size);
+ log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size, client_pgsize);
#endif
return 0;
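
A caller-side sketch (not part of the patch), assuming only the declarations
shown in this diff (get_bitmap_size() in lib/common.h, the prototype in
lib/dma.h) plus <stdlib.h>: the buffer is sized with get_bitmap_size() for the
caller's own page size, which is exactly what the size check above enforces.

    static int
    fetch_dirty_bitmap(dma_controller_t *dma, vfu_dma_addr_t addr,
                       uint64_t len, size_t client_pgsize,
                       char **bitmap_out, size_t *size_out)
    {
        ssize_t size = get_bitmap_size(len, client_pgsize);
        char *bitmap;

        if (size < 0) {
            return -1;                    /* errno set by get_bitmap_size() */
        }

        bitmap = calloc(1, size);
        if (bitmap == NULL) {
            return -1;
        }

        if (dma_controller_dirty_page_get(dma, addr, len, client_pgsize,
                                          (size_t)size, bitmap) < 0) {
            free(bitmap);
            return -1;
        }

        *bitmap_out = bitmap;
        *size_out = (size_t)size;
        return 0;
    }
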
diff --git a/lib/dma.h b/lib/dma.h
index 9687f49..789904f 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -386,6 +386,7 @@ int
dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
uint64_t len, size_t pgsize, size_t size,
char *bitmap);
+
bool
dma_sg_is_mappable(const dma_controller_t *dma, const dma_sg_t *sg);
diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c
index 271a269..81b0010 100644
--- a/lib/libvfio-user.c
+++ b/lib/libvfio-user.c
@@ -83,21 +83,16 @@ vfu_log(vfu_ctx_t *vfu_ctx, int level, const char *fmt, ...)
}
static size_t
-get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg)
+get_vfio_caps_size(vfu_reg_info_t *reg)
{
- size_t type_size = 0;
size_t sparse_size = 0;
- if (is_migr_reg) {
- type_size = sizeof(struct vfio_region_info_cap_type);
- }
-
if (reg->nr_mmap_areas != 0) {
sparse_size = sizeof(struct vfio_region_info_cap_sparse_mmap)
+ (reg->nr_mmap_areas * sizeof(struct vfio_region_sparse_mmap_area));
}
- return type_size + sparse_size;
+ return sparse_size;
}
/*
@@ -106,7 +101,7 @@ get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg)
* points accordingly.
*/
static int
-dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg,
+dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg,
struct vfio_region_info *vfio_reg, int **fds, size_t *nr_fds)
{
struct vfio_info_cap_header *header;
@@ -120,16 +115,6 @@ dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg,
header = (struct vfio_info_cap_header*)(vfio_reg + 1);
- if (is_migr_reg) {
- type = (struct vfio_region_info_cap_type *)header;
- type->header.id = VFIO_REGION_INFO_CAP_TYPE;
- type->header.version = 1;
- type->header.next = 0;
- type->type = VFIO_REGION_TYPE_MIGRATION;
- type->subtype = VFIO_REGION_SUBTYPE_MIGRATION;
- vfio_reg->cap_offset = sizeof(struct vfio_region_info);
- }
-
if (vfu_reg->mmap_areas != NULL) {
int i, nr_mmap_areas = vfu_reg->nr_mmap_areas;
if (type != NULL) {
@@ -218,14 +203,6 @@ region_access(vfu_ctx_t *vfu_ctx, size_t region, char *buf,
if (ret == -1) {
goto out;
}
- } else if (region == VFU_PCI_DEV_MIGR_REGION_IDX) {
- if (vfu_ctx->migration == NULL) {
- vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
- ret = ERROR_INT(EINVAL);
- goto out;
- }
-
- ret = migration_region_access(vfu_ctx, buf, count, offset, is_write);
} else {
vfu_region_access_cb_t *cb = vfu_ctx->reg_info[region].cb;
@@ -293,8 +270,7 @@ is_valid_region_access(vfu_ctx_t *vfu_ctx, size_t size, uint16_t cmd,
return false;
}
- if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration) &&
- index != VFU_PCI_DEV_MIGR_REGION_IDX)) {
+ if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration))) {
vfu_log(vfu_ctx, LOG_ERR,
"cannot access region %zu while device in stop-and-copy state",
index);
@@ -421,8 +397,7 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
vfu_reg = &vfu_ctx->reg_info[in_info->index];
if (vfu_reg->size > 0) {
- caps_size = get_vfio_caps_size(in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX,
- vfu_reg);
+ caps_size = get_vfio_caps_size(vfu_reg);
}
msg->out.iov.iov_len = MIN(sizeof(*out_info) + caps_size, in_info->argsz);
@@ -457,9 +432,8 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
/* Only actually provide the caps if they fit. */
if (in_info->argsz >= out_info->argsz) {
out_info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
- ret = dev_get_caps(vfu_ctx, vfu_reg,
- in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX,
- out_info, &msg->out.fds, &msg->out.nr_fds);
+ ret = dev_get_caps(vfu_ctx, vfu_reg, out_info, &msg->out.fds,
+ &msg->out.nr_fds);
if (ret < 0) {
return ret;
}
@@ -917,133 +891,320 @@ static int
device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t reason)
{
int ret;
-
+
ret = call_reset_cb(vfu_ctx, reason);
if (ret < 0) {
return ret;
}
if (vfu_ctx->migration != NULL) {
- return handle_device_state(vfu_ctx, vfu_ctx->migration,
- VFIO_DEVICE_STATE_V1_RUNNING, false);
+ migr_state_transition(vfu_ctx->migration,
+ VFIO_USER_DEVICE_STATE_RUNNING);
}
return 0;
}
-static int
-handle_dirty_pages_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+static uint32_t
+device_feature_flags_supported(vfu_ctx_t *vfu_ctx, uint32_t feature)
{
- struct vfio_user_dirty_pages *dirty_pages_in;
- struct vfio_user_dirty_pages *dirty_pages_out;
- struct vfio_user_bitmap_range *range_in;
- struct vfio_user_bitmap_range *range_out;
- size_t argsz;
- int ret;
+ if (vfu_ctx->migration == NULL) {
+ /*
+ * All of the current features require migration.
+ */
+ return 0;
+ }
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_MIGRATION:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+ return VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE;
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+ return VFIO_DEVICE_FEATURE_GET
+ | VFIO_DEVICE_FEATURE_SET
+ | VFIO_DEVICE_FEATURE_PROBE;
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+ return VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PROBE;
+ default:
+ return 0;
+ };
+}
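
A worked example (not part of the patch) of how these masks interact with the
decomposition in handle_device_feature() below, using the flag macros from
<linux/vfio.h> (a uapi recent enough to define the v2 migration feature):

    #include <assert.h>
    #include <linux/vfio.h>

    int
    main(void)
    {
        /* Probing whether GET is supported for the migration feature. */
        __u32 flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_GET
                    | VFIO_DEVICE_FEATURE_MIGRATION;
        __u32 operations = flags & ~VFIO_DEVICE_FEATURE_MASK;
        __u32 supported = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE;

        assert((flags & VFIO_DEVICE_FEATURE_MASK)
               == VFIO_DEVICE_FEATURE_MIGRATION);
        assert((flags & supported) == operations);   /* request accepted */

        /* A SET on the same feature is rejected: SET is not supported. */
        flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIGRATION;
        operations = flags & ~VFIO_DEVICE_FEATURE_MASK;
        assert((flags & supported) != operations);   /* EINVAL path */

        return 0;
    }
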
- dirty_pages_in = msg->in.iov.iov_base;
+static bool
+is_migration_feature(uint32_t feature)
+{
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_MIGRATION:
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+ return true;
+ }
- if (msg->in.iov.iov_len < sizeof(*dirty_pages_in) + sizeof(*range_in) ||
- dirty_pages_in->argsz > SERVER_MAX_DATA_XFER_SIZE ||
- dirty_pages_in->argsz < sizeof(*dirty_pages_out)) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid message size=%zu argsz=%u",
- msg->in.iov.iov_len, dirty_pages_in->argsz);
- return ERROR_INT(EINVAL);
+ return false;
+}
+
+static bool
+is_dma_feature(uint32_t feature)
+{
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+ return true;
}
- range_in = msg->in.iov.iov_base + sizeof(*dirty_pages_in);
+ return false;
+}
- /*
- * range_in is client-controlled, but we only need to protect against
- * overflow here: we'll take MIN() against a validated value next, and
- * dma_controller_dirty_page_get() will validate the actual ->bitmap.size
- * value later, anyway.
+static int
+handle_migration_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg,
+ struct vfio_user_device_feature *req)
+{
+ /*
+ * All supported outgoing data is currently the same size as
+ * struct vfio_user_device_feature_migration.
*/
- argsz = satadd_u64(sizeof(*dirty_pages_out) + sizeof(*range_out),
- range_in->bitmap.size);
+ msg->out.iov.iov_len = sizeof(struct vfio_user_device_feature)
+ + sizeof(struct vfio_user_device_feature_migration);
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
- msg->out.iov.iov_len = MIN(dirty_pages_in->argsz, argsz);
- msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
if (msg->out.iov.iov_base == NULL) {
- return -1;
+ return ERROR_INT(ENOMEM);
}
- dirty_pages_out = msg->out.iov.iov_base;
- memcpy(dirty_pages_out, dirty_pages_in, sizeof(*dirty_pages_out));
- dirty_pages_out->argsz = argsz;
- /*
- * If the reply doesn't fit, reply with just the dirty pages header, giving
- * the needed argsz. Typically this shouldn't happen, as the client knows
- * the needed reply size and has already provided the correct bitmap size.
- */
- if (dirty_pages_in->argsz >= argsz) {
- void *bitmap_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out)
- + sizeof(*range_out);
- range_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out);
- memcpy(range_out, range_in, sizeof(*range_out));
- ret = dma_controller_dirty_page_get(vfu_ctx->dma,
- (vfu_dma_addr_t)(uintptr_t)range_in->iova,
- range_in->size,
- range_in->bitmap.pgsize,
- range_in->bitmap.size, bitmap_out);
- if (ret != 0) {
- ret = errno;
- vfu_log(vfu_ctx, LOG_WARNING,
- "failed to get dirty bitmap from DMA controller: %m");
- free(msg->out.iov.iov_base);
- msg->out.iov.iov_base = NULL;
- msg->out.iov.iov_len = 0;
- return ERROR_INT(ret);
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ sizeof(struct vfio_user_device_feature));
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+ res->argsz = msg->out.iov.iov_len;
+
+ switch (req->flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_MIGRATION: {
+ struct vfio_user_device_feature_migration *mig =
+ (void *)res->data;
+ // FIXME are these always supported? Can we consider them to be
+ // "supported" if said support is just an empty callback?
+ //
+ // We don't need to return RUNNING or ERROR since they are
+ // always supported.
+ mig->flags = VFIO_MIGRATION_STOP_COPY
+ | VFIO_MIGRATION_PRE_COPY;
+ return 0;
}
- } else {
- vfu_log(vfu_ctx, LOG_ERR,
- "dirty pages: get [%#llx, %#llx): buffer too small (%u < %zu)",
- (ull_t)range_in->iova, (ull_t)range_in->iova + range_in->size,
- dirty_pages_in->argsz, argsz);
+
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: {
+ struct vfio_user_device_feature_mig_state *state =
+ (void *)res->data;
+ state->device_state = migration_get_state(vfu_ctx);
+ return 0;
+ }
+
+ default:
+ vfu_log(vfu_ctx, LOG_ERR, "invalid flags for migration GET (%d)",
+ req->flags);
+ return ERROR_INT(EINVAL);
}
+}
- return 0;
+static int
+handle_migration_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature,
+ struct vfio_user_device_feature *res)
+{
+ assert(feature == VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE);
+
+ struct vfio_user_device_feature_mig_state *state = (void *)res->data;
+
+ return migration_set_state(vfu_ctx, state->device_state);
}
static int
-handle_dirty_pages(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+handle_dma_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg,
+ struct vfio_user_device_feature *req)
{
- struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;
- int ret;
+ const size_t header_size = sizeof(struct vfio_user_device_feature)
+ + sizeof(struct vfio_user_device_feature_dma_logging_report);
+
+ struct vfio_user_device_feature_dma_logging_report *rep =
+ (void *)req->data;
+
+ dma_controller_t *dma = vfu_ctx->dma;
+
+ if (dma == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "DMA not enabled for DMA device feature");
+ return ERROR_INT(EINVAL);
+ }
+
+ ssize_t bitmap_size = get_bitmap_size(rep->length, rep->page_size);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+
+ msg->out.iov.iov_len = header_size + bitmap_size;
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, header_size);
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+
+ res->argsz = msg->out.iov.iov_len;
+ char *bitmap = (char *)msg->out.iov.iov_base + header_size;
+
+ int ret = dma_controller_dirty_page_get(dma,
+ (vfu_dma_addr_t) rep->iova,
+ rep->length,
+ rep->page_size,
+ bitmap_size,
+ bitmap);
+
+ if (ret < 0) {
+ iov_free(&msg->out.iov);
+ }
+
+ return ret;
+}
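
To make the reply sizing above concrete (illustrative numbers, not from the
patch): a DMA_LOGGING_REPORT for a 1 MiB range at a 4 KiB page size yields a
32-byte bitmap (see get_bitmap_size() in lib/common.h), so the reply is
sizeof(struct vfio_user_device_feature) +
sizeof(struct vfio_user_device_feature_dma_logging_report) + 32 bytes, and the
client's argsz must cover at least that or the request fails with EINVAL.
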
+
+static int
+handle_dma_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature,
+ struct vfio_user_device_feature *res)
+{
+ dma_controller_t *dma = vfu_ctx->dma;
+
+ assert(dma != NULL);
+
+ if (feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_START) {
+ struct vfio_user_device_feature_dma_logging_control *ctl =
+ (void *)res->data;
+ return dma_controller_dirty_page_logging_start(dma,
+ ctl->page_size);
+ }
+
+ assert(feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP);
+
+ dma_controller_dirty_page_logging_stop(dma);
+ return 0;
+}
+
+static int
+handle_device_feature(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+{
assert(vfu_ctx != NULL);
assert(msg != NULL);
- if (msg->in.iov.iov_len < sizeof(*dirty_pages) ||
- dirty_pages->argsz < sizeof(*dirty_pages)) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid message size %zu", msg->in.iov.iov_len);
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_device_feature)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
return ERROR_INT(EINVAL);
}
- if (vfu_ctx->migration == NULL) {
- vfu_log(vfu_ctx, LOG_ERR, "migration not configured");
- return ERROR_INT(ENOTSUP);
+ struct vfio_user_device_feature *req = msg->in.iov.iov_base;
+
+ uint32_t operations = req->flags & ~VFIO_DEVICE_FEATURE_MASK;
+ uint32_t feature = req->flags & VFIO_DEVICE_FEATURE_MASK;
+
+ uint32_t supported_ops = device_feature_flags_supported(vfu_ctx, feature);
+
+ if ((req->flags & supported_ops) != operations || supported_ops == 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported operation(s), flags=%d",
+ req->flags);
+ return ERROR_INT(EINVAL);
}
- switch (dirty_pages->flags) {
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_START:
- ret = dma_controller_dirty_page_logging_start(vfu_ctx->dma,
- migration_get_pgsize(vfu_ctx->migration));
- break;
+ ssize_t ret;
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP:
- dma_controller_dirty_page_logging_stop(vfu_ctx->dma);
- ret = 0;
- break;
+ switch (operations) {
+ case VFIO_DEVICE_FEATURE_GET: {
+ if (is_migration_feature(feature)) {
+ ret = handle_migration_device_feature_get(vfu_ctx, msg, req);
+ } else if (is_dma_feature(feature)) {
+ ret = handle_dma_device_feature_get(vfu_ctx, msg, req);
+ } else {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for GET",
+ feature);
+ return ERROR_INT(EINVAL);
+ }
+ break;
+ }
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP:
- ret = handle_dirty_pages_get(vfu_ctx, msg);
- break;
+ case VFIO_DEVICE_FEATURE_SET: {
+ msg->out.iov.iov_len = msg->in.iov.iov_len;
- default:
- vfu_log(vfu_ctx, LOG_ERR, "bad flags %#x", dirty_pages->flags);
- ret = ERROR_INT(EINVAL);
- break;
+ if (req->argsz < msg->out.iov.iov_len) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz,
+ msg->out.iov.iov_len);
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ msg->out.iov.iov_len);
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+
+ if (is_migration_feature(feature)) {
+ ret = handle_migration_device_feature_set(vfu_ctx, feature, res);
+ } else if (is_dma_feature(feature)) {
+ ret = handle_dma_device_feature_set(vfu_ctx, feature, res);
+ } else {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for SET",
+ feature);
+ return ERROR_INT(EINVAL);
+ }
+ break;
+ }
+
+ default: {
+ /*
+ * PROBE allows GET/SET to also be set (to specify which operations
+ * we want to probe the feature for), so we only check that PROBE
+ * is set, not that it is the only operation flag set.
+ */
+ if (!(operations & VFIO_DEVICE_FEATURE_PROBE)) {
+ vfu_log(vfu_ctx, LOG_ERR, "no operation specified");
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_len = msg->in.iov.iov_len;
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz,
+ msg->out.iov.iov_len);
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ msg->out.iov.iov_len);
+
+ ret = 0;
+ }
}
return ret;
@@ -1207,13 +1368,16 @@ handle_request(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ret = device_reset(vfu_ctx, VFU_RESET_DEVICE);
break;
- case VFIO_USER_DIRTY_PAGES:
- // FIXME: don't allow migration calls if migration == NULL
- if (vfu_ctx->dma != NULL) {
- ret = handle_dirty_pages(vfu_ctx, msg);
- } else {
- ret = 0;
- }
+ case VFIO_USER_DEVICE_FEATURE:
+ ret = handle_device_feature(vfu_ctx, msg);
+ break;
+
+ case VFIO_USER_MIG_DATA_READ:
+ ret = handle_mig_data_read(vfu_ctx, msg);
+ break;
+
+ case VFIO_USER_MIG_DATA_WRITE:
+ ret = handle_mig_data_write(vfu_ctx, msg);
break;
default:
@@ -1317,7 +1481,8 @@ MOCK_DEFINE(cmd_allowed_when_stopped_and_copying)(uint16_t cmd)
{
return cmd == VFIO_USER_REGION_READ ||
cmd == VFIO_USER_REGION_WRITE ||
- cmd == VFIO_USER_DIRTY_PAGES;
+ cmd == VFIO_USER_DEVICE_FEATURE ||
+ cmd == VFIO_USER_MIG_DATA_READ;
}
bool
@@ -1343,14 +1508,14 @@ static bool
access_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
uint64_t offset)
{
- return access_migration_needs_quiesce(vfu_ctx, region_index, offset)
- || access_is_pci_cap_exp(vfu_ctx, region_index, offset);
+ return access_is_pci_cap_exp(vfu_ctx, region_index, offset);
}
static bool
command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
{
struct vfio_user_region_access *reg;
+ struct vfio_user_device_feature *feature;
if (vfu_ctx->quiesce == NULL) {
return false;
@@ -1364,22 +1529,11 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
case VFIO_USER_DEVICE_RESET:
return true;
- case VFIO_USER_DIRTY_PAGES: {
- struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;
-
- if (msg->in.iov.iov_len < sizeof(*dirty_pages)) {
- return false;
- }
-
- return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP);
- }
-
case VFIO_USER_REGION_WRITE:
if (msg->in.iov.iov_len < sizeof(*reg)) {
/*
* bad request, it will be eventually failed by
* handle_region_access
- *
*/
return false;
}
@@ -1388,8 +1542,23 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
return true;
}
break;
+
+ case VFIO_USER_DEVICE_FEATURE:
+ if (msg->in.iov.iov_len < sizeof(*feature)) {
+ /*
+ * bad request, it will be eventually failed by
+ * handle_region_access
+ */
+ return false;
+ }
+ feature = msg->in.iov.iov_base;
+ if (migration_feature_needs_quiesce(feature)) {
+ return true;
+ }
+ break;
}
+
return false;
}
@@ -1842,38 +2011,6 @@ copyin_mmap_areas(vfu_reg_info_t *reg_info,
return 0;
}
-static bool
-ranges_intersect(size_t off1, size_t size1, size_t off2, size_t size2)
-{
- /*
- * For two ranges to intersect, the start of each range must be before the
- * end of the other range.
- * TODO already defined in lib/pci_caps.c, maybe introduce a file for misc
- * utility functions?
- */
- return (off1 < (off2 + size2) && off2 < (off1 + size1));
-}
-
-static bool
-maps_over_migr_regs(struct iovec *iov)
-{
- return ranges_intersect(0, vfu_get_migr_register_area_size(),
- (size_t)iov->iov_base, iov->iov_len);
-}
-
-static bool
-validate_sparse_mmaps_for_migr_reg(vfu_reg_info_t *reg)
-{
- int i;
-
- for (i = 0; i < reg->nr_mmap_areas; i++) {
- if (maps_over_migr_regs(&reg->mmap_areas[i])) {
- return false;
- }
- }
- return true;
-}
-
EXPORT int
vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
vfu_region_access_cb_t *cb, int flags,
@@ -1919,12 +2056,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
return ERROR_INT(EINVAL);
}
- if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX &&
- size < vfu_get_migr_register_area_size()) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid migration region size %zu", size);
- return ERROR_INT(EINVAL);
- }
-
for (i = 0; i < nr_mmap_areas; i++) {
struct iovec *iov = &mmap_areas[i];
if ((size_t)iov_end(iov) > size) {
@@ -1956,15 +2087,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
}
}
- if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX) {
- if (!validate_sparse_mmaps_for_migr_reg(reg)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "migration registers cannot be memory mapped");
- errno = EINVAL;
- goto err;
- }
- }
-
return 0;
err:
@@ -2044,26 +2166,20 @@ vfu_setup_irq_state_callback(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type,
EXPORT int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
- const vfu_migration_callbacks_t *callbacks,
- uint64_t data_offset)
+ const vfu_migration_callbacks_t *callbacks)
{
int ret = 0;
assert(vfu_ctx != NULL);
assert(callbacks != NULL);
- if (vfu_ctx->reg_info[VFU_PCI_DEV_MIGR_REGION_IDX].size == 0) {
- vfu_log(vfu_ctx, LOG_ERR, "no device migration region");
- return ERROR_INT(EINVAL);
- }
-
if (callbacks->version != VFU_MIGR_CALLBACKS_VERS) {
vfu_log(vfu_ctx, LOG_ERR, "unsupported migration callbacks version %d",
callbacks->version);
return ERROR_INT(EINVAL);
}
- vfu_ctx->migration = init_migration(callbacks, data_offset, &ret);
+ vfu_ctx->migration = init_migration(callbacks, &ret);
if (vfu_ctx->migration == NULL) {
vfu_log(vfu_ctx, LOG_ERR, "failed to initialize device migration");
return ERROR_INT(ret);
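
A registration sketch under the new signature (not part of the patch). The v2
callbacks drop get_pending_bytes/prepare_data and the data_offset argument,
leaving transition, read_data and write_data; the callback shapes below are
assumed from how lib/migration.c invokes them (a flat buffer plus a byte
count, no offset), and struct my_device with its state/state_size/state_pos
fields, as well as migr_transition, are hypothetical device-side code.

    static ssize_t
    migr_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count)
    {
        struct my_device *dev = vfu_get_private(vfu_ctx);
        uint64_t left = dev->state_size - dev->state_pos;
        uint64_t n = count < left ? count : left;

        memcpy(buf, dev->state + dev->state_pos, n);
        dev->state_pos += n;
        return n;                /* 0 once all state has been produced */
    }

    static ssize_t
    migr_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count)
    {
        struct my_device *dev = vfu_get_private(vfu_ctx);

        if (dev->state_pos + count > dev->state_size) {
            return -1;
        }
        memcpy(dev->state + dev->state_pos, buf, count);
        dev->state_pos += count;
        return count;            /* a short write fails the request above */
    }

    static const vfu_migration_callbacks_t migr_callbacks = {
        .version = VFU_MIGR_CALLBACKS_VERS,
        .transition = migr_transition,  /* device state-change handler */
        .read_data = migr_read_data,
        .write_data = migr_write_data,
    };

    /* ... after creating the context: */
    if (vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks) < 0) {
        /* handle the error */
    }
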
diff --git a/lib/migration.c b/lib/migration.c
index 794e7b8..02c29c1 100644
--- a/lib/migration.c
+++ b/lib/migration.c
@@ -39,17 +39,100 @@
#include "private.h"
#include "migration_priv.h"
+/*
+ * This defines valid migration state transitions. Each element in the array
+ * corresponds to a FROM state and each bit of the element to a TO state. If the
+ * bit is set, then the transition is allowed.
+ *
+ * The indices of each state are those in the vfio_user_device_mig_state enum.
+ */
+static const char transitions[VFIO_USER_DEVICE_NUM_STATES] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = 0,
+ [VFIO_USER_DEVICE_STATE_STOP] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_USER_DEVICE_STATE_STOP_COPY) |
+ (1 << VFIO_USER_DEVICE_STATE_RESUMING),
+ [VFIO_USER_DEVICE_STATE_RUNNING] = (1 << VFIO_USER_DEVICE_STATE_STOP) |
+ (1 << VFIO_USER_DEVICE_STATE_PRE_COPY),
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = 1 << VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = 1 << VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = 0,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_USER_DEVICE_STATE_STOP_COPY),
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = 0
+};
+
+/*
+ * The spec dictates that, if no direct transition is allowed, and the
+ * transition is not one of the explicitly disallowed ones (i.e. anything to
+ * ERROR, anything from ERROR, and STOP_COPY -> PRE_COPY), we should take the
+ * shortest allowed path.
+ *
+ * This can be indexed as `next_state[current][target] == next`. If next is
+ * ERROR, then the transition is not allowed.
+ */
+static const uint32_t
+next_state[VFIO_USER_DEVICE_NUM_STATES][VFIO_USER_DEVICE_NUM_STATES] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+ [VFIO_USER_DEVICE_STATE_STOP] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RUNNING] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RESUMING] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+
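
A worked example (not part of the patch) of how migration_set_state() further
down uses this table to honour the shortest-path rule: asking a RUNNING device
for STOP_COPY takes two hops,

    next_state[RUNNING][STOP_COPY] == STOP
    next_state[STOP][STOP_COPY]    == STOP_COPY

so the device's transition callback sees RUNNING -> STOP and then
STOP -> STOP_COPY, while any request involving the unsupported P2P states
resolves to ERROR and fails with EINVAL. (State names abbreviated from
VFIO_USER_DEVICE_STATE_*.)
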
bool
MOCK_DEFINE(vfio_migr_state_transition_is_valid)(uint32_t from, uint32_t to)
{
- return migr_states[from].state & (1 << to);
-}
-
-EXPORT size_t
-vfu_get_migr_register_area_size(void)
-{
- return ROUND_UP(sizeof(struct vfio_user_migration_info),
- sysconf(_SC_PAGE_SIZE));
+ return from < VFIO_USER_DEVICE_NUM_STATES
+ && to < VFIO_USER_DEVICE_NUM_STATES
+ && (transitions[from] & (1 << to)) != 0;
}
/*
@@ -57,16 +140,10 @@ vfu_get_migr_register_area_size(void)
* in vfu_ctx_t.
*/
struct migration *
-init_migration(const vfu_migration_callbacks_t * callbacks,
- uint64_t data_offset, int *err)
+init_migration(const vfu_migration_callbacks_t *callbacks, int *err)
{
struct migration *migr;
- if (data_offset < vfu_get_migr_register_area_size()) {
- *err = EINVAL;
- return NULL;
- }
-
migr = calloc(1, sizeof(*migr));
if (migr == NULL) {
*err = ENOMEM;
@@ -81,15 +158,13 @@ init_migration(const vfu_migration_callbacks_t * callbacks,
migr->pgsize = sysconf(_SC_PAGESIZE);
/* FIXME this should be done in vfu_ctx_realize */
- migr->info.device_state = VFIO_DEVICE_STATE_V1_RUNNING;
- migr->data_offset = data_offset;
+ migr->state = VFIO_USER_DEVICE_STATE_RUNNING;
migr->callbacks = *callbacks;
if (migr->callbacks.transition == NULL ||
- migr->callbacks.get_pending_bytes == NULL ||
- migr->callbacks.prepare_data == NULL ||
migr->callbacks.read_data == NULL ||
- migr->callbacks.write_data == NULL) {
+ migr->callbacks.write_data == NULL ||
+ migr->callbacks.version != VFU_MIGR_CALLBACKS_VERS) {
free(migr);
*err = EINVAL;
return NULL;
@@ -100,35 +175,29 @@ init_migration(const vfu_migration_callbacks_t * callbacks,
void
MOCK_DEFINE(migr_state_transition)(struct migration *migr,
- enum migr_iter_state state)
+ enum vfio_user_device_mig_state state)
{
assert(migr != NULL);
- /* FIXME validate that state transition */
- migr->iter.state = state;
+ migr->state = state;
}
vfu_migr_state_t
-MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t device_state)
+MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t state)
{
- switch (device_state) {
- case VFIO_DEVICE_STATE_V1_STOP:
+ switch (state) {
+ case VFIO_USER_DEVICE_STATE_STOP:
return VFU_MIGR_STATE_STOP;
- case VFIO_DEVICE_STATE_V1_RUNNING:
+ case VFIO_USER_DEVICE_STATE_RUNNING:
return VFU_MIGR_STATE_RUNNING;
- case VFIO_DEVICE_STATE_V1_SAVING:
- /*
- * FIXME How should the device operate during the stop-and-copy
- * phase? Should we only allow the migration data to be read from
- * the migration region? E.g. Access to any other region should be
- * failed? This might be a good question to send to LKML.
- */
+ case VFIO_USER_DEVICE_STATE_STOP_COPY:
return VFU_MIGR_STATE_STOP_AND_COPY;
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- return VFU_MIGR_STATE_PRE_COPY;
- case VFIO_DEVICE_STATE_V1_RESUMING:
+ case VFIO_USER_DEVICE_STATE_RESUMING:
return VFU_MIGR_STATE_RESUME;
+ case VFIO_USER_DEVICE_STATE_PRE_COPY:
+ return VFU_MIGR_STATE_PRE_COPY;
+ default:
+ return -1;
}
- return -1;
}
/**
@@ -165,8 +234,7 @@ MOCK_DEFINE(migr_trans_to_valid_state)(vfu_ctx_t *vfu_ctx, struct migration *mig
return ret;
}
}
- migr->info.device_state = device_state;
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_INITIAL);
+ migr_state_transition(migr, device_state);
return 0;
}
@@ -178,372 +246,176 @@ MOCK_DEFINE(handle_device_state)(vfu_ctx_t *vfu_ctx, struct migration *migr,
uint32_t device_state, bool notify)
{
+ assert(vfu_ctx != NULL);
assert(migr != NULL);
- if (!vfio_migr_state_transition_is_valid(migr->info.device_state,
- device_state)) {
+ if (!vfio_migr_state_transition_is_valid(migr->state, device_state)) {
return ERROR_INT(EINVAL);
}
return migr_trans_to_valid_state(vfu_ctx, migr, device_state, notify);
}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_pending_bytes(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *pending_bytes, bool is_write)
+size_t
+migration_get_state(vfu_ctx_t *vfu_ctx)
{
- assert(migr != NULL);
- assert(pending_bytes != NULL);
+ return vfu_ctx->migration->state;
+}
- if (is_write) {
+ssize_t
+migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state)
+{
+ struct migration *migr = vfu_ctx->migration;
+ uint32_t state;
+ ssize_t ret = 0;
+
+ if (device_state >= VFIO_USER_DEVICE_NUM_STATES) {
return ERROR_INT(EINVAL);
}
+
+ while (migr->state != device_state && ret == 0) {
+ state = next_state[migr->state][device_state];
- if (migr->iter.state == VFIO_USER_MIGR_ITER_STATE_FINISHED) {
- *pending_bytes = 0;
- return 0;
- }
-
- switch (migr->iter.state) {
- case VFIO_USER_MIGR_ITER_STATE_INITIAL:
- case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
- /*
- * FIXME what happens if data haven't been consumed in the previous
- * iteration? Check https://www.spinics.net/lists/kvm/msg228608.html.
- */
- *pending_bytes = migr->iter.pending_bytes = migr->callbacks.get_pending_bytes(vfu_ctx);
-
- if (*pending_bytes == 0) {
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_FINISHED);
- } else {
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_STARTED);
- }
- break;
- case VFIO_USER_MIGR_ITER_STATE_STARTED:
- /*
- * FIXME We might be wrong returning a cached value, check
- * https://www.spinics.net/lists/kvm/msg228608.html
- *
- */
- *pending_bytes = migr->iter.pending_bytes;
- break;
- default:
+ if (state == VFIO_USER_DEVICE_STATE_ERROR) {
return ERROR_INT(EINVAL);
- }
- return 0;
-}
+ }
-/*
- * FIXME reading or writing migration registers with the wrong device state or
- * out of sequence is undefined, but should not result in EINVAL, it should
- * simply be ignored. However this way it's easier to catch development errors.
- * Make this behavior conditional.
- */
+ ret = handle_device_state(vfu_ctx, migr, state, true);
+ };
+
+ return ret;
+}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_offset_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
- bool is_write)
+ssize_t
+handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
{
- int ret = 0;
-
- assert(migr != NULL);
+ assert(vfu_ctx != NULL);
+ assert(msg != NULL);
- if (is_write) {
- vfu_log(vfu_ctx, LOG_ERR, "data_offset is RO when saving");
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
return ERROR_INT(EINVAL);
}
- switch (migr->iter.state) {
- case VFIO_USER_MIGR_ITER_STATE_STARTED:
- ret = migr->callbacks.prepare_data(vfu_ctx, &migr->iter.offset,
- &migr->iter.size);
- if (ret != 0) {
- return ret;
- }
- /*
- * FIXME must first read data_offset and then data_size. They way we've
- * implemented it now, if data_size is read before data_offset we
- * transition to state VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED without
- * calling callbacks.prepare_data, which is wrong. Maybe we need
- * separate states for data_offset and data_size.
- */
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED);
- break;
- case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
- /*
- * data_offset is invariant during a save iteration.
- */
- break;
- default:
- vfu_log(vfu_ctx, LOG_ERR,
- "reading data_offset out of sequence is undefined");
+ struct migration *migr = vfu_ctx->migration;
+ struct vfio_user_mig_data *req = msg->in.iov.iov_base;
+
+ if (vfu_ctx->migration == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
return ERROR_INT(EINVAL);
}
- return 0;
-}
-
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_offset(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *offset, bool is_write)
-{
- int ret;
-
- assert(migr != NULL);
- assert(offset != NULL);
-
- switch (migr->info.device_state) {
- case VFIO_DEVICE_STATE_V1_SAVING:
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- ret = handle_data_offset_when_saving(vfu_ctx, migr, is_write);
- if (ret == 0 && !is_write) {
- *offset = migr->iter.offset + migr->data_offset;
- }
- return ret;
- case VFIO_DEVICE_STATE_V1_RESUMING:
- if (is_write) {
- /* TODO writing to read-only registers should be simply ignored */
- vfu_log(vfu_ctx, LOG_ERR, "bad write to migration data_offset");
- return ERROR_INT(EINVAL);
- }
- ret = migr->callbacks.prepare_data(vfu_ctx, offset, NULL);
- if (ret != 0) {
- return ret;
- }
- *offset += migr->data_offset;
- return 0;
+ if (migr->state != VFIO_USER_DEVICE_STATE_PRE_COPY
+ && migr->state != VFIO_USER_DEVICE_STATE_STOP_COPY) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad migration state to read data: %d",
+ migr->state);
+ return ERROR_INT(EINVAL);
}
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR,
- "bad access to migration data_offset in state %s",
- migr_states[migr->info.device_state].name);
- return ERROR_INT(EINVAL);
-}
-
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
-static ssize_t
-handle_data_size_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
- bool is_write)
-{
- assert(migr != NULL);
- if (is_write) {
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR, "data_size is RO when saving");
+ if (req->size > vfu_ctx->client_max_data_xfer_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)",
+ req->size, vfu_ctx->client_max_data_xfer_size);
return ERROR_INT(EINVAL);
}
- if (migr->iter.state != VFIO_USER_MIGR_ITER_STATE_STARTED &&
- migr->iter.state != VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED) {
- vfu_log(vfu_ctx, LOG_ERR,
- "reading data_size ouf of sequence is undefined");
+ if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)",
+ req->argsz, sizeof(struct vfio_user_mig_data) + req->size);
return ERROR_INT(EINVAL);
}
- return 0;
-}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_size_when_resuming(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t size, bool is_write)
-{
- assert(migr != NULL);
+ msg->out.iov.iov_len = msg->in.iov.iov_len + req->size;
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
- if (is_write) {
- return migr->callbacks.data_written(vfu_ctx, size);
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
}
- return 0;
-}
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
-static ssize_t
-handle_data_size(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *size, bool is_write)
-{
- int ret;
+ struct vfio_user_mig_data *res = msg->out.iov.iov_base;
- assert(vfu_ctx != NULL);
- assert(size != NULL);
-
- switch (migr->info.device_state){
- case VFIO_DEVICE_STATE_V1_SAVING:
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- ret = handle_data_size_when_saving(vfu_ctx, migr, is_write);
- if (ret == 0 && !is_write) {
- *size = migr->iter.size;
- }
+ ssize_t ret = migr->callbacks.read_data(vfu_ctx, &res->data, req->size);
+
+ if (ret < 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "read_data callback failed, errno=%d", errno);
+ iov_free(&msg->out.iov);
return ret;
- case VFIO_DEVICE_STATE_V1_RESUMING:
- return handle_data_size_when_resuming(vfu_ctx, migr, *size, is_write);
}
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR, "bad access to data_size");
- return ERROR_INT(EINVAL);
+
+ res->size = ret;
+ res->argsz = sizeof(struct vfio_user_mig_data) + ret;
+
+ return 0;
}
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
ssize_t
-MOCK_DEFINE(migration_region_access_registers)(vfu_ctx_t *vfu_ctx, char *buf,
- size_t count, loff_t pos,
- bool is_write)
+handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
{
+ assert(vfu_ctx != NULL);
+ assert(msg != NULL);
+
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
+ return ERROR_INT(EINVAL);
+ }
+
struct migration *migr = vfu_ctx->migration;
- int ret;
- uint32_t *device_state, old_device_state;
+ struct vfio_user_mig_data *req = msg->in.iov.iov_base;
- assert(migr != NULL);
+ if (vfu_ctx->migration == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
+ return ERROR_INT(EINVAL);
+ }
- switch (pos) {
- case offsetof(struct vfio_user_migration_info, device_state):
- if (count != sizeof(migr->info.device_state)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad device_state access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- device_state = (uint32_t *)buf;
- if (!is_write) {
- *device_state = migr->info.device_state;
- return 0;
- }
- old_device_state = migr->info.device_state;
- vfu_log(vfu_ctx, LOG_DEBUG,
- "migration: transitioning from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
-
- ret = handle_device_state(vfu_ctx, migr, *device_state, true);
- if (ret == 0) {
- vfu_log(vfu_ctx, LOG_DEBUG,
- "migration: transitioned from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
- } else {
- vfu_log(vfu_ctx, LOG_ERR,
- "migration: failed to transition from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
- }
- break;
- case offsetof(struct vfio_user_migration_info, pending_bytes):
- if (count != sizeof(migr->info.pending_bytes)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad pending_bytes access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_pending_bytes(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- case offsetof(struct vfio_user_migration_info, data_offset):
- if (count != sizeof(migr->info.data_offset)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad data_offset access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_data_offset(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- case offsetof(struct vfio_user_migration_info, data_size):
- if (count != sizeof(migr->info.data_size)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad data_size access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_data_size(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- default:
- vfu_log(vfu_ctx, LOG_ERR,
- "bad migration region register offset %#llx",
- (ull_t)pos);
+ if (migr->state != VFIO_USER_DEVICE_STATE_RESUMING) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad migration state to write data: %d",
+ migr->state);
return ERROR_INT(EINVAL);
}
- return ret;
-}
-ssize_t
-migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
- loff_t pos, bool is_write)
-{
- struct migration *migr = vfu_ctx->migration;
- ssize_t ret;
+ if (req->size > vfu_ctx->client_max_data_xfer_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)",
+ req->size, vfu_ctx->client_max_data_xfer_size);
+ return ERROR_INT(EINVAL);
+ }
- assert(migr != NULL);
- assert(buf != NULL);
+ if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)",
+ req->argsz, sizeof(struct vfio_user_mig_data) + req->size);
+ return ERROR_INT(EINVAL);
+ }
- /*
- * FIXME don't call the device callback if the migration state is in not in
- * pre-copy/stop-and-copy/resuming state, since the behavior is undefined
- * in that case.
- */
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data) + req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "short write (%d < %ld)",
+ req->argsz, sizeof(struct vfio_user_mig_data) + req->size);
+ return ERROR_INT(EINVAL);
+ }
- if (pos + count <= sizeof(struct vfio_user_migration_info)) {
- ret = migration_region_access_registers(vfu_ctx, buf, count,
- pos, is_write);
- if (ret != 0) {
- return ret;
- }
- } else {
-
- if (pos < (loff_t)migr->data_offset) {
- /*
- * TODO we can simply ignore the access to that part and handle
- * any access to the data region properly.
- */
- vfu_log(vfu_ctx, LOG_WARNING,
- "bad access to dead space %#llx - %#llx in migration region",
- (ull_t)pos,
- (ull_t)(pos + count - 1));
- return ERROR_INT(EINVAL);
- }
+ ssize_t ret = migr->callbacks.write_data(vfu_ctx, &req->data, req->size);
- pos -= migr->data_offset;
- if (is_write) {
- ret = migr->callbacks.write_data(vfu_ctx, buf, count, pos);
- if (ret < 0) {
- return -1;
- }
- } else {
- /*
- * FIXME <linux/vfio.h> says:
- *
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- *
- * Does this mean that partial reads are not allowed?
- */
- ret = migr->callbacks.read_data(vfu_ctx, buf, count, pos);
- if (ret < 0) {
- return -1;
- }
- }
+ if (ret < 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "write_data callback failed, errno=%d",
+ errno);
+ return ret;
+ } else if (ret != req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration data partial write of size=%ld",
+ ret);
+ return ERROR_INT(EINVAL);
}
- return count;
+ return 0;
}
bool
MOCK_DEFINE(device_is_stopped_and_copying)(struct migration *migr)
{
- return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_SAVING;
+ return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP_COPY;
}
bool
MOCK_DEFINE(device_is_stopped)(struct migration *migr)
{
- return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_STOP;
+ return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP;
}
size_t
@@ -569,17 +441,11 @@ migration_set_pgsize(struct migration *migr, size_t pgsize)
}
bool
-access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
- uint64_t offset)
+migration_feature_needs_quiesce(struct vfio_user_device_feature *feature)
{
- /*
- * Writing to the migration state register with an unaligned access won't
- * trigger this check but that's not a problem because
- * migration_region_access_registers will fail the access.
- */
- return region_index == VFU_PCI_DEV_MIGR_REGION_IDX
- && vfu_ctx->migration != NULL
- && offset == offsetof(struct vfio_user_migration_info, device_state);
+ return ((feature->flags &
+ (VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE)) != 0)
+ && !(feature->flags & VFIO_DEVICE_FEATURE_PROBE);
}
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/migration.h b/lib/migration.h
index 26fd744..928a7e5 100644
--- a/lib/migration.h
+++ b/lib/migration.h
@@ -45,12 +45,19 @@
#include "private.h"
struct migration *
-init_migration(const vfu_migration_callbacks_t *callbacks,
- uint64_t data_offset, int *err);
+init_migration(const vfu_migration_callbacks_t *callbacks, int *err);
+
+size_t
+migration_get_state(vfu_ctx_t *vfu_ctx);
+
+ssize_t
+migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state);
ssize_t
-migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
- loff_t pos, bool is_write);
+handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg);
+
+ssize_t
+handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg);
bool
migration_available(vfu_ctx_t *vfu_ctx);
@@ -65,6 +72,12 @@ migration_get_pgsize(struct migration *migr);
int
migration_set_pgsize(struct migration *migr, size_t pgsize);
+uint64_t
+migration_get_flags(struct migration *migr);
+
+MOCK_DECLARE(void, migr_state_transition, struct migration *migr,
+ enum vfio_user_device_mig_state state);
+
MOCK_DECLARE(bool, vfio_migr_state_transition_is_valid, uint32_t from,
uint32_t to);
@@ -72,8 +85,7 @@ MOCK_DECLARE(ssize_t, handle_device_state, vfu_ctx_t *vfu_ctx,
struct migration *migr, uint32_t device_state, bool notify);
bool
-access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
- uint64_t offset);
+migration_feature_needs_quiesce(struct vfio_user_device_feature *feature);
#endif /* LIB_VFIO_USER_MIGRATION_H */
diff --git a/lib/migration_priv.h b/lib/migration_priv.h
index d5643af..83c5f7e 100644
--- a/lib/migration_priv.h
+++ b/lib/migration_priv.h
@@ -33,94 +33,12 @@
#include <linux/vfio.h>
-/*
- * FSM to simplify saving device state.
- */
-enum migr_iter_state {
- VFIO_USER_MIGR_ITER_STATE_INITIAL,
- VFIO_USER_MIGR_ITER_STATE_STARTED,
- VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED,
- VFIO_USER_MIGR_ITER_STATE_FINISHED
-};
-
struct migration {
- /*
- * TODO if the user provides an FD then should mmap it and use the migration
- * registers in the file
- */
- struct vfio_user_migration_info info;
+ enum vfio_user_device_mig_state state;
size_t pgsize;
vfu_migration_callbacks_t callbacks;
- uint64_t data_offset;
-
- /*
- * This is only for the saving state. The resuming state is simpler so we
- * don't need it.
- */
- struct {
- enum migr_iter_state state;
- uint64_t pending_bytes;
- uint64_t offset;
- uint64_t size;
- } iter;
-};
-
-struct migr_state_data {
- uint32_t state;
- const char *name;
-};
-
-#define VFIO_DEVICE_STATE_V1_ERROR (VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RESUMING)
-
-/* valid migration state transitions */
-static const struct migr_state_data migr_states[(VFIO_DEVICE_STATE_MASK + 1)] = {
- [VFIO_DEVICE_STATE_V1_STOP] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING),
- .name = "stopped"
- },
- [VFIO_DEVICE_STATE_V1_RUNNING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << (VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING)) |
- (1 << VFIO_DEVICE_STATE_V1_RESUMING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "running"
- },
- [VFIO_DEVICE_STATE_V1_SAVING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "stop-and-copy"
- },
- [VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "pre-copy"
- },
- [VFIO_DEVICE_STATE_V1_RESUMING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_RESUMING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "resuming"
- }
};
-MOCK_DECLARE(ssize_t, migration_region_access_registers, vfu_ctx_t *vfu_ctx,
- char *buf, size_t count, loff_t pos, bool is_write);
-
-MOCK_DECLARE(void, migr_state_transition, struct migration *migr,
- enum migr_iter_state state);
-
MOCK_DECLARE(vfu_migr_state_t, migr_state_vfio_to_vfu, uint32_t device_state);
MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx,
@@ -129,4 +47,4 @@ MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx,
#endif
-/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
\ No newline at end of file
diff --git a/lib/private.h b/lib/private.h
index fdd804f..6e0170e 100644
--- a/lib/private.h
+++ b/lib/private.h
@@ -195,20 +195,6 @@ typedef struct ioeventfd {
LIST_ENTRY(ioeventfd) entry;
} ioeventfd_t;
-static inline int
-ERROR_INT(int err)
-{
- errno = err;
- return -1;
-}
-
-static inline void *
-ERROR_PTR(int err)
-{
- errno = err;
- return NULL;
-}
-
int
consume_fd(int *fds, size_t nr_fds, size_t index);