diff options
author | Thanos Makatos <thanos.makatos@nutanix.com> | 2020-11-02 04:16:08 -0500 |
---|---|---|
committer | Thanos Makatos <thanos.makatos@nutanix.com> | 2020-11-05 10:49:29 -0500 |
commit | 3459c17659584501747823071dd6adbd4a2e0cd3 (patch) | |
tree | edbe765fbe46c3d14cb0ece345ee45eb5025a444 /lib | |
parent | 41914861c6bde1fd6e369cbbe28a5b3df5dd6b67 (diff) | |
download | libvfio-user-3459c17659584501747823071dd6adbd4a2e0cd3.zip libvfio-user-3459c17659584501747823071dd6adbd4a2e0cd3.tar.gz libvfio-user-3459c17659584501747823071dd6adbd4a2e0cd3.tar.bz2 |
experimental support for running the device state and copy iteration protocol
Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libmuser.c | 304 | ||||
-rw-r--r-- | lib/libmuser_pci.c | 1 | ||||
-rw-r--r-- | lib/muser.h | 59 |
3 files changed, 358 insertions, 6 deletions
diff --git a/lib/libmuser.c b/lib/libmuser.c index 2d0805f..23c1f6a 100644 --- a/lib/libmuser.c +++ b/lib/libmuser.c @@ -87,6 +87,13 @@ typedef struct { int efds[0]; /* XXX must be last */ } lm_irqs_t; +enum migration_iteration_state { + VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL, + VFIO_USER_MIGRATION_ITERATION_STATE_STARTED, + VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED, + VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED +}; + struct lm_ctx { void *pvt; dma_controller_t *dma; @@ -112,7 +119,16 @@ struct lm_ctx { int client_max_fds; - size_t migration_pgsize; + struct { + struct vfio_device_migration_info info; + size_t pgsize; + lm_migration_callbacks_t callbacks; + struct { + enum migration_iteration_state state; + __u64 offset; + __u64 size; + } iter; + } migration; lm_irqs_t irqs; /* XXX must be last */ }; @@ -507,7 +523,7 @@ set_version(lm_ctx_t *lm_ctx, int sock) } ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true, - &lm_ctx->client_max_fds, &lm_ctx->migration_pgsize); + &lm_ctx->client_max_fds, &lm_ctx->migration.pgsize); if (ret < 0) { lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret)); goto out; @@ -520,7 +536,7 @@ set_version(lm_ctx_t *lm_ctx, int sock) ret = -EINVAL; goto out; } - if (lm_ctx->migration_pgsize == 0) { + if (lm_ctx->migration.pgsize == 0) { lm_log(lm_ctx, LM_ERR, "bad migration page size"); ret = -EINVAL; goto out; @@ -528,7 +544,7 @@ set_version(lm_ctx_t *lm_ctx, int sock) /* FIXME need to check max_fds */ - lm_ctx->migration_pgsize = MIN(lm_ctx->migration_pgsize, + lm_ctx->migration.pgsize = MIN(lm_ctx->migration.pgsize, sysconf(_SC_PAGESIZE)); out: free(server_caps); @@ -1541,6 +1557,259 @@ handle_pci_config_space_access(lm_ctx_t *lm_ctx, char *buf, size_t count, return count; } +/* valid migration state transitions */ +__u32 migration_states[VFIO_DEVICE_STATE_MASK] = { + [VFIO_DEVICE_STATE_STOP] = 1 << VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = /* running */ + (1 << 
VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) | + (1 << VFIO_DEVICE_STATE_RESUMING), + [VFIO_DEVICE_STATE_SAVING] = /* stop-and-copy */ + (1 << VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING] = /* pre-copy */ + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RESUMING] = /* resuming */ + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_RESUMING) +}; + +static bool +_migration_state_transition_is_valid(__u32 from, __u32 to) +{ + return migration_states[from] & (1 << to); +} + +static ssize_t +handle_migration_device_state(lm_ctx_t *lm_ctx, __u32 *device_state, + bool is_write) { + + int ret; + + assert(lm_ctx != NULL); + assert(device_state != NULL); + + if (!is_write) { + *device_state = lm_ctx->migration.info.device_state; + return 0; + } + + if (*device_state & ~VFIO_DEVICE_STATE_MASK) { + return -EINVAL; + } + + if (!_migration_state_transition_is_valid(lm_ctx->migration.info.device_state, + *device_state)) { + return -EINVAL; + } + + switch (*device_state) { + case VFIO_DEVICE_STATE_STOP: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP); + break; + case VFIO_DEVICE_STATE_RUNNING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_START); + break; + case VFIO_DEVICE_STATE_SAVING: + /* + * FIXME How should the device operate during the stop-and-copy + * phase? Should we only allow the migration data to be read from + * the migration region? E.g. Access to any other region should be + * failed? This might be a good question to send to LKML. 
+ */ + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP_AND_COPY); + break; + case VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_PRE_COPY); + break; + case VFIO_DEVICE_STATE_RESUMING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_RESUME); + break; + default: + ret = -EINVAL; + } + + if (ret == 0) { + lm_ctx->migration.info.device_state = *device_state; + } + + return ret; +} + +static ssize_t +handle_migration_pending_bytes(lm_ctx_t *lm_ctx, __u64 *pending_bytes, + bool is_write) +{ + assert(lm_ctx != NULL); + assert(pending_bytes != NULL); + + if (is_write) { + return -EINVAL; + } + + if (lm_ctx->migration.iter.state == VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED) { + *pending_bytes = 0; + return 0; + } + + *pending_bytes = lm_ctx->migration.callbacks.get_pending_bytes(lm_ctx->pvt); + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL: + case VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED: + /* + * FIXME what happens if data haven't been consumed in the previous + * iteration? Ask on LKML. + */ + if (*pending_bytes == 0) { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED; + } else { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_STARTED; + } + break; + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + /* + * Repeated reads of pending_bytes should not have any side effects. + * FIXME does it have to be the same as the previous value? Can it + * increase or even decrease? I suppose it can't be lower than + * data_size? Ask on LKML. 
+ */ + break; + default: + return -EINVAL; + } + return 0; +} + +static ssize_t +handle_migration_data_offset(lm_ctx_t *lm_ctx, __u64 *offset, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(offset != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* + * FIXME it's not clear whether these registers can be accessed in + * other parts of the iteration, need clarification on the + * following: + * + * Read on data_offset and data_size should return the offset and + * size of the current buffer if the user application reads + * data_offset and data_size more than once here. + */ + return -EINVAL; + } + + ret = lm_ctx->migration.callbacks.prepare_data(lm_ctx->pvt, + &lm_ctx->migration.iter.offset, + &lm_ctx->migration.iter.size); + if (ret < 0) { + return ret; + } + + *offset = lm_ctx->migration.iter.offset + sizeof(struct vfio_device_migration_info); + + return ret; +} + +static ssize_t +handle_migration_data_size(lm_ctx_t *lm_ctx, __u64 *size, bool is_write) +{ + assert(lm_ctx != NULL); + assert(size != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* FIXME see comment in handle_migration_data_offset */ + return -EINVAL; + } + + *size = lm_ctx->migration.iter.size; + + return 0; +} + +static ssize_t +handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count, + loff_t pos, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(buf != NULL); + + if (pos + count > lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size) { + lm_log(lm_ctx, LM_ERR, "read %#x-%#x past end of migration region", + pos, pos + count - 1); + return -EINVAL; + } + switch (pos) { + case offsetof(struct vfio_device_migration_info, device_state): + if (count != sizeof(lm_ctx->migration.info.device_state)) { + return 
-EINVAL; + } + ret = handle_migration_device_state(lm_ctx, (__u32*)buf, + is_write); + break; + case offsetof(struct vfio_device_migration_info, pending_bytes): + if (count != sizeof(lm_ctx->migration.info.pending_bytes)) { + return -EINVAL; + } + ret = handle_migration_pending_bytes(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_offset): + if (count != sizeof(lm_ctx->migration.info.data_offset)) { + return -EINVAL; + } + ret = handle_migration_data_offset(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_size): + if (count != sizeof(lm_ctx->migration.info.data_size)) { + return -EINVAL; + } + ret = handle_migration_data_size(lm_ctx, (__u64*)buf, is_write); + break; + default: + if (is_write) { + /* FIXME how do we handle the offset? */ + ret = lm_ctx->migration.callbacks.write_data(lm_ctx->pvt, + buf, count); + } else { + ret = lm_ctx->migration.callbacks.read_data(lm_ctx->pvt, + buf, count, + pos - sizeof(struct vfio_device_migration_info)); + } + } + + if (ret == 0) { + ret = count; + } + return ret; +} + static ssize_t do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) { @@ -1569,6 +1838,11 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) is_write); } + if (idx == LM_DEV_MIGRATION_REG_IDX) { + return handle_migration_region_access(lm_ctx, buf, count, offset, + is_write); + } + /* * Checking whether a callback exists might sound expensive however this * code is not performance critical. 
This works well when we don't expect a @@ -2098,7 +2372,7 @@ handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, goto out; } *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get); - *iovecs = malloc(*nr_iovecs); + *iovecs = malloc(*nr_iovecs * sizeof(struct iovec)); if (*iovecs == NULL) { ret = -errno; goto out; @@ -2154,7 +2428,7 @@ handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { ret = dma_controller_dirty_page_logging_start(lm_ctx->dma, - lm_ctx->migration_pgsize); + lm_ctx->migration.pgsize); } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma); } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { @@ -2188,9 +2462,15 @@ process_request(lm_ctx_t *lm_ctx) assert(lm_ctx != NULL); + if (lm_ctx->pci_info.reg_info[LM_DEV_CFG_REG_IDX].size > 0 && + lm_ctx->migration.info.device_state == VFIO_DEVICE_STATE_STOP) { + return -ESHUTDOWN; + } + nr_fds = lm_ctx->client_max_fds; fds = alloca(nr_fds * sizeof(int)); + /* FIXME get request shouldn't set errno, it should return it as -errno */ ret = transports_ops[lm_ctx->trans].get_request(lm_ctx, &hdr, fds, &nr_fds); if (unlikely(ret < 0)) { if (ret == -EAGAIN || ret == -EWOULDBLOCK) { @@ -2581,6 +2861,18 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) if (migr_reg->size < sizeof(struct vfio_device_migration_info)) { return -EINVAL; } + + /* FIXME this should be done in lm_ctx_run or poll */ + lm_ctx->migration.info.device_state = VFIO_DEVICE_STATE_RUNNING; + + lm_ctx->migration.callbacks = dev_info->migration_callbacks; + if (lm_ctx->migration.callbacks.transition == NULL || + lm_ctx->migration.callbacks.get_pending_bytes == NULL || + lm_ctx->migration.callbacks.prepare_data == NULL || + lm_ctx->migration.callbacks.read_data == NULL || + lm_ctx->migration.callbacks.write_data == NULL) { + return 
-EINVAL; + } } return 0; diff --git a/lib/libmuser_pci.c b/lib/libmuser_pci.c index 78d65fa..711acc4 100644 --- a/lib/libmuser_pci.c +++ b/lib/libmuser_pci.c @@ -314,6 +314,7 @@ muser_is_pci_hdr_access(loff_t pos) return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF; } +/* FIXME this function is misleading, remove it */ int muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, loff_t *pos, bool is_write, diff --git a/lib/muser.h b/lib/muser.h index 375be0e..949680c 100644 --- a/lib/muser.h +++ b/lib/muser.h @@ -139,6 +139,8 @@ typedef struct { /* * Callback function that is called when the region is read or written. + * Note that the memory of the region is owned by the user, except for the + * standard header (first 64 bytes) of the PCI configuration space. */ lm_region_access_t *fn; @@ -260,6 +262,62 @@ typedef enum { #define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF +/* + * FIXME the names of migration callback functions are probably far too long, + * but for now it helps with the implementation. + */ +typedef int (lm_migration_callback_t)(void *pvt); + +typedef enum { + LM_MIGR_STATE_STOP, + LM_MIGR_STATE_START, + LM_MIGR_STATE_STOP_AND_COPY, + LM_MIGR_STATE_PRE_COPY, + LM_MIGR_STATE_RESUME +} lm_migr_state_t; + +typedef struct { + + /* migration state transition callback */ + /* TODO rename to lm_migration_state_transition_callback */ + /* FIXME maybe we should create a single callback and pass the state? */ + int (*transition)(void *pvt, lm_migr_state_t state); + + /* Callbacks for saving device state */ + + /* + * Function that is called to retrieve pending migration data. If migration + * data were previously made available (function prepare_data has been + * called) then calling this function signifies that they have been read + * (e.g. migration data can be discarded). If the function returns 0 then + * migration has finished and this function won't be called again. 
+ */
 + __u64 (*get_pending_bytes)(void *pvt);
 + + /* + * Function that is called to instruct the device to prepare migration data. + * The function must return only after migration data are available at the + * specified offset. + */ + int (*prepare_data)(void *pvt, __u64 *offset, __u64 *size); + + /* + * Function that is called to read migration data. offset and size can + * be any subrange of the offset and size previously returned by + * prepare_data. The function must return the amount of data read. This + * function can be called even if the migration data can be memory mapped. + * + * Does this mean that reading data_offset/data_size updates the values? + */ + size_t (*read_data)(void *pvt, void *buf, __u64 count, __u64 offset); + + /* Callback for restoring device state */ + + /* Function that is called for writing previously stored device state. */ + size_t (*write_data)(void *pvt, void *data, __u64 size); + +} lm_migration_callbacks_t; + /** * Device information structure, used to create the lm_ctx. * To be filled and passed to lm_ctx_create() @@ -326,6 +384,7 @@ typedef struct { int nr_caps; lm_cap_t **caps; + lm_migration_callbacks_t migration_callbacks; } lm_dev_info_t; |