author    | Thanos Makatos <thanos.makatos@nutanix.com> | 2020-10-29 17:57:08 -0400
committer | Thanos Makatos <thanos.makatos@nutanix.com> | 2020-10-29 17:57:08 -0400
commit    | fe27b18d7c20064281633eac541752e6ef6e8ada (patch)
tree      | 7ab968a73ab30a1bb00de3d735df8e719ee177a1
parent    | 05a1d6d9bc63370e59fe7fd7c5e8acd57249e315 (diff)
support for live migration region and dirty page logging
This patch adds support for the live migration region and dirty page logging,
following the VFIO interface. Live migration is NOT yet functional, as handling
accesses to the migration region is not yet implemented. Currently the live
migration region is fixed at index 9 simply to keep the implementation simple.
Dirty page logging is simplified by requiring each queried IOVA range to match
an entire DMA region exactly.
Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
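
Note: the dirty page tracking added below keeps one bit per page at the
negotiated page-size granularity, rounding both the page count and the byte
count up (see _get_bitmap_size() and _dma_mark_dirty() in lib/dma.c and
lib/dma.h). A minimal standalone sketch of that arithmetic, with illustrative
names and offsets taken relative to the start of a DMA region:

    #include <errno.h>
    #include <limits.h>
    #include <stddef.h>
    #include <sys/types.h>

    /*
     * Bytes needed for a dirty bitmap covering region_size bytes at pgsize
     * granularity: one bit per page, rounding both divisions up.
     */
    static ssize_t
    dirty_bitmap_size(size_t region_size, size_t pgsize)
    {
        if (pgsize == 0 || region_size < pgsize) {
            return -EINVAL;
        }
        size_t nr_pages = region_size / pgsize + (region_size % pgsize != 0);
        return nr_pages / CHAR_BIT + (nr_pages % CHAR_BIT != 0);
    }

    /* Mark the pages backing [offset, offset + len) of a region as dirty. */
    static void
    mark_dirty(char *bitmap, size_t pgsize, size_t offset, size_t len)
    {
        size_t start = offset / pgsize;
        size_t end = start + len / pgsize + (len % pgsize != 0) - 1;
        size_t i;

        for (i = start; i <= end; i++) {
            bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
        }
    }

For example, a 1 MiB region at 4 KiB granularity is 256 pages, i.e. a 32-byte
bitmap.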
-rw-r--r-- | lib/dma.c        | 123
-rw-r--r-- | lib/dma.h        |  85
-rw-r--r-- | lib/libmuser.c   | 469
-rw-r--r-- | lib/muser.h      |  24
-rw-r--r-- | lib/muser_priv.h |  29
-rw-r--r-- | lib/vfio_user.h  |  60
-rw-r--r-- | samples/client.c | 252
-rw-r--r-- | samples/server.c |  23
8 files changed, 855 insertions, 210 deletions
@@ -81,6 +81,7 @@ dma_controller_create(lm_ctx_t *lm_ctx, int max_regions) dma->max_regions = max_regions; dma->nregions = 0; memset(dma->regions, 0, max_regions * sizeof(dma->regions[0])); + dma->dirty_pgsize = 0; return dma; } @@ -106,6 +107,10 @@ _dma_controller_do_remove_region(dma_controller_t *dma, } } +/* + * FIXME no longer used. Also, it doesn't work for addresses that span two + * DMA regions. + */ bool dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, size_t size) @@ -328,7 +333,7 @@ dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len) int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { int idx; int cnt = 0; @@ -348,6 +353,9 @@ _dma_addr_sg_split(const dma_controller_t *dma, sg[cnt].region = idx; sg[cnt].offset = dma_addr - region->dma_addr; sg[cnt].length = region_len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } } cnt++; @@ -376,4 +384,117 @@ out: return cnt; } +ssize_t _get_bitmap_size(size_t region_size, size_t pgsize) +{ + if (pgsize == 0) { + return -EINVAL; + } + if (region_size < pgsize) { + return -EINVAL; + } + size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0); + return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0); +} + +int dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize) +{ + int i; + + assert(dma != NULL); + + if (pgsize == 0) { + return -EINVAL; + } + + if (dma->dirty_pgsize > 0) { + if (dma->dirty_pgsize != pgsize) { + return -EINVAL; + } + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + dma_memory_region_t *region = &dma->regions[i]; + ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + region->dirty_bitmap = calloc(bitmap_size, sizeof(char)); + if (region->dirty_bitmap == NULL) { + int j, ret = -errno; + for (j = 0; j < i; j++) { + free(region->dirty_bitmap); + region->dirty_bitmap = NULL; + } + return ret; + } + } + dma->dirty_pgsize = pgsize; + return 0; +} + +int dma_controller_dirty_page_logging_stop(dma_controller_t *dma) +{ + int i; + + assert(dma != NULL); + + if (dma->dirty_pgsize == 0) { + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + free(dma->regions[i].dirty_bitmap); + dma->regions[i].dirty_bitmap = NULL; + } + dma->dirty_pgsize = 0; + return 0; +} + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data) +{ + int ret; + ssize_t bitmap_size; + dma_sg_t sg; + dma_memory_region_t *region; + + assert(dma != NULL); + assert(data != NULL); + + /* + * FIXME for now we support IOVAs that match exactly the DMA region. This + * is purely for simplifying the implementation. We MUST allow arbitrary + * IOVAs. + */ + ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE); + if (ret != 1 || sg.dma_addr != addr || sg.length != len) { + return -ENOTSUP; + } + + if (pgsize != dma->dirty_pgsize) { + return -EINVAL; + } + + bitmap_size = _get_bitmap_size(len, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + + /* + * FIXME they must be equal because this is how much data the client + * expects to receive. 
+ */ + if (size != (size_t)bitmap_size) { + return -EINVAL; + } + + region = &dma->regions[sg.region]; + + *data = region->dirty_bitmap; + + return 0; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ @@ -32,6 +32,11 @@ #define DMA_DMA_H /* + * FIXME check whether DMA regions must be page aligned. If so then the + * implementation can be greatly simpified. + */ + +/* * This library emulates a DMA controller for a device emulation application to * perform DMA operations on a foreign memory space. * @@ -82,12 +87,14 @@ typedef struct { off_t offset; // File offset void *virt_addr; // Virtual address of this region int refcnt; // Number of users of this region + char *dirty_bitmap; // Dirty page bitmap } dma_memory_region_t; typedef struct { int max_regions; int nregions; struct lm_ctx *lm_ctx; + size_t dirty_pgsize; // Dirty page granularity dma_memory_region_t regions[0]; } dma_controller_t; @@ -118,7 +125,59 @@ dma_controller_remove_region(dma_controller_t *dma, int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); + +static bool +_dma_should_mark_dirty(const dma_controller_t *dma, int prot) +{ + assert(dma != NULL); + + return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0; +} + +static size_t +_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset) +{ + return (offset - base_addr) / pgsize; +} + +static size_t +_get_pgend(size_t pgsize, uint64_t len, size_t start) +{ + return start + (len / pgsize) + (len % pgsize != 0) - 1; +} + +static void +_dma_bitmap_get_pgrange(const dma_controller_t *dma, + const dma_memory_region_t *region, + const dma_sg_t *sg, size_t *start, size_t *end) +{ + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(start != NULL); + assert(end != NULL); + + *start = _get_pgstart(dma->dirty_pgsize, region->dma_addr, sg->offset); + *end = _get_pgend(dma->dirty_pgsize, sg->length, *start); +} + +static void +_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, + dma_sg_t *sg) +{ + size_t i, start, end; + + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(region->dirty_bitmap != NULL); + + _dma_bitmap_get_pgrange(dma, region, sg, &start, &end); + for (i = start; i <= end; i++) { + region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT); + } +} /* Takes a linear dma address span and returns a sg list suitable for DMA. * A single linear dma address span may need to be split into multiple @@ -134,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma, static inline int dma_addr_to_sg(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { static __thread int region_hint; int cnt; @@ -150,10 +209,13 @@ dma_addr_to_sg(const dma_controller_t *dma, sg->region = region_hint; sg->offset = dma_addr - region->dma_addr; sg->length = len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } return 1; } // Slow path: search through regions. 
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg); + cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot); if (likely(cnt > 0)) { region_hint = sg->region; } @@ -186,6 +248,7 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov, return 0; } +/* FIXME useless define */ #define UNUSED __attribute__((unused)) static inline void @@ -215,12 +278,12 @@ dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg, } static inline void * -dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len) +dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot) { dma_sg_t sg; struct iovec iov; - if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 && + if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 && dma_map_sg(dma, &sg, &iov, 1) == 0) { return iov.iov_base; } @@ -239,12 +302,22 @@ dma_unmap_addr(dma_controller_t *dma, }; int r; - r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1); + r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE); assert(r == 1); dma_unmap_sg(dma, &sg, &iov, 1); } +int +dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize); + +int +dma_controller_dirty_page_logging_stop(dma_controller_t *dma); + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data); + bool dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, size_t size); diff --git a/lib/libmuser.c b/lib/libmuser.c index 547a318..0aa3443 100644 --- a/lib/libmuser.c +++ b/lib/libmuser.c @@ -112,6 +112,8 @@ struct lm_ctx { int client_max_fds; + size_t migration_pgsize; + lm_irqs_t irqs; /* XXX must be last */ }; @@ -270,14 +272,20 @@ __free_s(char **p) } int -send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, - enum vfio_user_command cmd, void *data, int len, +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, int *fds, int count) { int ret; struct vfio_user_header hdr = {.msg_id = msg_id}; - struct iovec iov[2]; struct msghdr msg; + size_t i; + + if (nr_iovecs == 0) { + iovecs = alloca(sizeof(*iovecs)); + nr_iovecs = 1; + } memset(&msg, 0, sizeof(msg)); @@ -288,23 +296,15 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, hdr.flags.type = VFIO_USER_F_TYPE_COMMAND; } - if (data != NULL && len == 0) { - return -EINVAL; - } - - hdr.msg_size = sizeof(hdr) + len; + iovecs[0].iov_base = &hdr; + iovecs[0].iov_len = sizeof(hdr); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); - msg.msg_iovlen = 1; - - if (data != NULL) { - msg.msg_iovlen++; - iov[1].iov_base = data; - iov[1].iov_len = len; + for (i = 0; i < nr_iovecs; i++) { + hdr.msg_size += iovecs[i].iov_len; } - msg.msg_iov = iov; + msg.msg_iovlen = nr_iovecs; + msg.msg_iov = iovecs; if (fds != NULL) { size_t size = count * sizeof *fds; @@ -329,26 +329,43 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, } int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count) { + + struct iovec iovecs[2] = { + [1] = { + .iov_base = data, + .iov_len = data_len + } + }; + return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs, + ARRAY_SIZE(iovecs), fds, count); +} + +int send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, char *caps) { int ret; - char *data __attribute__((__cleanup__(__free_s))) = NULL; + char *data; - ret = asprintf(&data, "{version: 
{\"major\": %d, \"minor\": %d}, capabilities: %s}", + ret = asprintf(&data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}", major, minor, caps != NULL ? caps : "{}"); if (ret == -1) { - data = NULL; return -1; } - - return send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, - ret, NULL, 0); + ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, + ret, NULL, 0); + free(data); + return ret; } int recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, - uint16_t *msg_id, void *data, int *len) + uint16_t *msg_id, void *data, size_t *len) { int ret; @@ -388,7 +405,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, if (ret < 0) { return ret; } - if (*len != ret) { /* FIXME we should allow receiving less */ + if (*len != (size_t)ret) { /* FIXME we should allow receiving less */ return -EINVAL; } *len = ret; @@ -398,7 +415,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, int recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, - int *max_fds) + int *max_fds, size_t *pgsize) { int ret; struct vfio_user_header hdr; @@ -424,23 +441,23 @@ recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, /* FIXME use proper parsing */ ret = sscanf(data, - "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d}}", - major, minor, max_fds); - if (ret != 3) { + "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}", + major, minor, max_fds, pgsize); + if (ret != 4) { return -EINVAL; } return 0; } int -send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, - void *send_data, int send_len, - int *send_fds, int fd_count, - struct vfio_user_header *hdr, - void *recv_data, int recv_len) -{ - int ret = send_vfio_user_msg(sock, msg_id, false, cmd, send_data, send_len, - send_fds, fd_count); +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs, + send_fds, fd_count); if (ret < 0) { return ret; } @@ -450,6 +467,24 @@ send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len); } +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + struct iovec iovecs[2] = { + [1] = { + .iov_base = send_data, + .iov_len = send_len + } + }; + return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs, + ARRAY_SIZE(iovecs), send_fds, fd_count, + hdr, recv_data, recv_len); +} + static int set_version(lm_ctx_t *lm_ctx, int sock) { @@ -458,7 +493,8 @@ set_version(lm_ctx_t *lm_ctx, int sock) uint16_t msg_id = 0; char *server_caps; - ret = asprintf(&server_caps, "{max_fds: %d}", MAX_FDS); + ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}", + MAX_FDS, sysconf(_SC_PAGESIZE)); if (ret == -1) { return -ENOMEM; } @@ -471,7 +507,7 @@ set_version(lm_ctx_t *lm_ctx, int sock) } ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true, - &lm_ctx->client_max_fds); + &lm_ctx->client_max_fds, &lm_ctx->migration_pgsize); if (ret < 0) { lm_log(lm_ctx, LM_DBG, "failed 
to receive version: %s", strerror(-ret)); goto out; @@ -482,7 +518,18 @@ set_version(lm_ctx_t *lm_ctx, int sock) LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN, client_mj, client_mn); ret = -EINVAL; + goto out; + } + if (lm_ctx->migration_pgsize == 0) { + lm_log(lm_ctx, LM_ERR, "bad migration page size"); + ret = -EINVAL; + goto out; } + + /* FIXME need to check max_fds */ + + lm_ctx->migration_pgsize = MIN(lm_ctx->migration_pgsize, + sysconf(_SC_PAGESIZE)); out: free(server_caps); return ret; @@ -932,23 +979,33 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) * points accordingly. */ static int -dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, +dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index, struct vfio_region_info **vfio_reg, bool is_kernel) { + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type = NULL; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct lm_sparse_mmap_areas *mmap_areas; int nr_mmap_areas, i; - size_t sparse_size; + size_t type_size = 0; + size_t sparse_size = 0; + size_t cap_size; ssize_t ret; void *cap_ptr; - if (lm_reg->mmap_areas == NULL) { - lm_log(lm_ctx, LM_DBG, "bad mmap_areas\n"); - return -EINVAL; + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type_size = sizeof(struct vfio_region_info_cap_type); + } + + if (lm_reg->mmap_areas != NULL) { + nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; + sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); } - nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; - sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + cap_size = type_size + sparse_size; + if (cap_size == 0) { + return 0; + } /* * If vfio_reg does not have enough space to accommodate sparse info then @@ -956,54 +1013,79 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, * is only for kernel/muser.ko, where the request comes from kernel/vfio. 
*/ - if ((*vfio_reg)->argsz < sparse_size + sizeof(**vfio_reg) && is_kernel) { + if ((*vfio_reg)->argsz < cap_size + sizeof(**vfio_reg) && is_kernel) { lm_log(lm_ctx, LM_DBG, "vfio_reg too small=%d\n", (*vfio_reg)->argsz); - (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg); + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); (*vfio_reg)->cap_offset = 0; return 0; } - sparse = calloc(1, sparse_size); - if (sparse == NULL) + /* TODO deosn't need to be calloc, we overwrite it entirely */ + header = calloc(1, cap_size); + if (header == NULL) { return -ENOMEM; - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->header.next = 0; - sparse->nr_areas = nr_mmap_areas; + } - lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, - sparse_size, nr_mmap_areas); - mmap_areas = lm_reg->mmap_areas; - for (i = 0; i < nr_mmap_areas; i++) { - sparse->areas[i].offset = mmap_areas->areas[i].start; - sparse->areas[i].size = mmap_areas->areas[i].size; - lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, - i, sparse->areas[i].offset, sparse->areas[i].size); + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type = (struct vfio_region_info_cap_type*)header; + type->header.id = VFIO_REGION_INFO_CAP_TYPE; + type->header.version = 1; + type->header.next = 0; + type->type = VFIO_REGION_TYPE_MIGRATION; + type->subtype = VFIO_REGION_SUBTYPE_MIGRATION; + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); } + if (lm_reg->mmap_areas != NULL) { + if (type != NULL) { + type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type); + sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1); + } else { + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + } + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->header.next = 0; + sparse->nr_areas = nr_mmap_areas; + + lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, + sparse_size, nr_mmap_areas); + mmap_areas = lm_reg->mmap_areas; + for (i = 0; i < nr_mmap_areas; i++) { + sparse->areas[i].offset = mmap_areas->areas[i].start; + sparse->areas[i].size = mmap_areas->areas[i].size; + lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } + } + + /* + * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is + * memory-mappable in general, not only if it supports sparse mmap. 
+ */ (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; - (*vfio_reg)->cap_offset = sizeof(**vfio_reg); if (is_kernel) { /* write the sparse mmap cap info to vfio-client user pages */ - ret = write(lm_ctx->conn_fd, sparse, sparse_size); - if (ret != (ssize_t)sparse_size) { - free(sparse); + ret = write(lm_ctx->conn_fd, header, cap_size); + if (ret != (ssize_t)cap_size) { + free(header); return -EIO; } } else { - (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg); + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz); if (*vfio_reg == NULL) { - free(sparse); + free(header); return -ENOMEM; } cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset; - memcpy(cap_ptr, sparse, sparse_size); + memcpy(cap_ptr, header, cap_size); } - free(sparse); + free(header); return 0; } @@ -1073,11 +1155,10 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg, (*vfio_reg)->flags = lm_reg->flags; (*vfio_reg)->size = lm_reg->size; - if (lm_reg->mmap_areas != NULL) { - err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg, is_kernel); - if (err) { - return err; - } + err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg, + is_kernel); + if (err) { + return err; } lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu " @@ -1089,8 +1170,9 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg, } static long -dev_get_info(struct vfio_device_info *dev_info) +dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info) { + assert(lm_ctx != NULL); assert(dev_info != NULL); // Ensure provided argsz is sufficiently big. @@ -1114,7 +1196,7 @@ do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data) assert(lm_ctx != NULL); switch (cmd_ioctl->vfio_cmd) { case VFIO_DEVICE_GET_INFO: - err = dev_get_info(&cmd_ioctl->data.dev_info); + err = dev_get_info(lm_ctx, &cmd_ioctl->data.dev_info); break; case VFIO_DEVICE_GET_REGION_INFO: reg_info = &cmd_ioctl->data.reg_info; @@ -1737,7 +1819,7 @@ static int handle_device_get_info(lm_ctx_t *lm_ctx, return -errno; } - ret = dev_get_info(dev_info); + ret = dev_get_info(lm_ctx, dev_info); if (ret < 0) { return ret; } @@ -1919,7 +2001,7 @@ handle_device_reset(lm_ctx_t *lm_ctx) static int handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, - void **data, int *len) + void **data, size_t *len) { struct vfio_user_region_access region_access; struct muser_cmd muser_cmd = {}; @@ -1973,6 +2055,105 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, return 0; } +static int +handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + int size, ret; + size_t i; + struct vfio_iommu_type1_dirty_bitmap_get *ranges; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap); + if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) { + return -EINVAL; + } + ranges = malloc(size); + if (ranges == NULL) { + return -errno; + } + ret = recv(lm_ctx->conn_fd, ranges, size, 0); + if (ret == -1) { + ret = -errno; + goto out; + } + if (ret != size) { + ret = -EINVAL; + goto out; + } + *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + *iovecs = malloc(*nr_iovecs); + if (*iovecs == NULL) { + ret = -errno; + goto out; + } + + for (i = 1; i < *nr_iovecs; i++) { + struct 
vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */ + ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size, + r->bitmap.pgsize, r->bitmap.size, + (char**)&((*iovecs)[i].iov_base)); + if (ret != 0) { + goto out; + } + (*iovecs)[i].iov_len = r->bitmap.size; + } +out: + if (ret != 0) { + if (*iovecs != NULL) { + free(*iovecs); + *iovecs = NULL; + } + } + free(ranges); + return ret; +} + +static int +handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap; + int ret; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) { + lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size); + return -EINVAL; + } + + /* FIXME must also check argsz */ + + ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0); + if (ret == -1) { + return -errno; + } + if ((size_t)ret < sizeof dirty_bitmap) { + return -EINVAL; + } + + if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + ret = dma_controller_dirty_page_logging_start(lm_ctx->dma, + lm_ctx->migration_pgsize); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs); + } else { + ret = -EINVAL; + } + + return ret; +} + /* * FIXME return value is messed up, sometimes we return -1 and set errno while * other times we return -errno. Fix. @@ -1988,9 +2169,9 @@ process_request(lm_ctx_t *lm_ctx) struct vfio_irq_info irq_info; struct vfio_device_info dev_info; struct vfio_region_info *dev_reg_info = NULL; - void *data = NULL; - bool free_data = false; - int len = 0; + struct iovec _iovecs[2] = {0}, *iovecs = NULL; + size_t nr_iovecs = 0; + bool free_iovec_data = true; assert(lm_ctx != NULL); @@ -2034,6 +2215,12 @@ process_request(lm_ctx_t *lm_ctx) return -EINVAL; } + /* FIXME in most of the following function we check that hdr.count is >= + * than the command-specific struct and there is an additional recv(2) for + * that data. We should eliminate duplicating this common code and move it + * here. 
+ */ + switch (hdr.cmd) { case VFIO_USER_DMA_MAP: case VFIO_USER_DMA_UNMAP: @@ -2044,23 +2231,28 @@ process_request(lm_ctx_t *lm_ctx) case VFIO_USER_DEVICE_GET_INFO: ret = handle_device_get_info(lm_ctx, &hdr, &dev_info); if (ret == 0) { - data = &dev_info; - len = dev_info.argsz; + _iovecs[1].iov_base = &dev_info; + _iovecs[1].iov_len = dev_info.argsz; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_GET_REGION_INFO: ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info); if (ret == 0) { - data = dev_reg_info; - len = dev_reg_info->argsz; - free_data = true; + _iovecs[1].iov_base = dev_reg_info; + _iovecs[1].iov_len = dev_reg_info->argsz; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_GET_IRQ_INFO: ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info); if (ret == 0) { - data = &irq_info; - len = sizeof irq_info; + _iovecs[1].iov_base = &irq_info; + _iovecs[1].iov_len = sizeof irq_info; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_SET_IRQS: @@ -2068,12 +2260,20 @@ process_request(lm_ctx_t *lm_ctx) break; case VFIO_USER_REGION_READ: case VFIO_USER_REGION_WRITE: - ret = handle_region_access(lm_ctx, &hdr, &data, &len); - free_data = true; + iovecs = _iovecs; + ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base, + &iovecs[1].iov_len); + nr_iovecs = 2; break; case VFIO_USER_DEVICE_RESET: ret = handle_device_reset(lm_ctx); break; + case VFIO_USER_DIRTY_PAGES: + ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs); + if (ret >= 0) { + free_iovec_data = false; + } + break; default: lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd); return -EINVAL; @@ -2083,14 +2283,23 @@ process_request(lm_ctx_t *lm_ctx) * TODO: In case of error during command handling set errno respectively * in the reply message. */ - ret = send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, - 0, data, len, NULL, 0); + if (ret < 0) { + assert(false); /* FIXME */ + } + ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, + 0, iovecs, nr_iovecs, NULL, 0); if (unlikely(ret < 0)) { lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n", strerror(-ret)); } - if (free_data) { - free(data); + if (iovecs != NULL && iovecs != _iovecs) { + if (free_iovec_data) { + size_t i; + for (i = 0; i < nr_iovecs; i++) { + free(iovecs[i].iov_base); + } + } + free(iovecs); } return ret; @@ -2190,8 +2399,9 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex) irq_info.subindex = subindex; ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, - VFIO_USER_VM_INTERRUPT, &irq_info, - sizeof(irq_info), NULL, 0, NULL, NULL, 0); + VFIO_USER_VM_INTERRUPT, + &irq_info, sizeof irq_info, + NULL, 0, NULL, NULL, 0); if (ret < 0) { errno = -ret; return -1; @@ -2291,6 +2501,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) { lm_reg_info_t *cfg_reg; const lm_reg_info_t zero_reg = { 0 }; + lm_reg_info_t *migr_reg; int i; assert(lm_ctx != NULL); @@ -2347,6 +2558,16 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF; } + /* + * Check the migration region. 
+ */ + migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX]; + if (migr_reg->size > 0) { + if (migr_reg->size < sizeof(struct vfio_device_migration_info)) { + return -EINVAL; + } + } + return 0; err: @@ -2529,13 +2750,15 @@ lm_get_region_info(lm_ctx_t *lm_ctx) inline int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, - uint32_t len, dma_sg_t *sg, int max_sg) + uint32_t len, dma_sg_t *sg, int max_sg, int prot) { + assert(lm_ctx != NULL); + if (unlikely(lm_ctx->unmap_dma == NULL)) { errno = EINVAL; return -1; } - return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg); + return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot); } inline int @@ -2581,62 +2804,54 @@ lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id) } int -lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data) +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) { struct vfio_user_dma_region_access *dma_recv; - struct vfio_user_dma_region_access dma_send = { - .addr = addr, - .count = count - }; - int recv_size = sizeof(*dma_recv) + count; + struct vfio_user_dma_region_access dma_send; + int recv_size; int msg_id = 1, ret; - if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) { - lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu doest not " - "exists", addr, count); - return -ENOENT; - } + assert(lm_ctx != NULL); + assert(sg != NULL); + + recv_size = sizeof(*dma_recv) + sg->length; dma_recv = calloc(recv_size, 1); if (dma_recv == NULL) { return -ENOMEM; } - dma_recv->addr = addr; - dma_recv->count = count; + dma_send.addr = sg->dma_addr; + dma_send.count = sg->length; ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ, - &dma_send, sizeof(dma_send), NULL, 0, NULL, + &dma_send, sizeof dma_send, NULL, 0, NULL, dma_recv, recv_size); - memcpy(data, dma_recv->data, count); + memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */ free(dma_recv); return ret; } int -lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data) +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) { struct vfio_user_dma_region_access *dma_send, dma_recv; - int send_size = sizeof(*dma_send) + count; + int send_size = sizeof(*dma_send) + sg->length; int msg_id = 1, ret; - if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) { - lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu does not " - "exists", addr, count); - return -ENOENT; - } + assert(lm_ctx != NULL); + assert(sg != NULL); dma_send = calloc(send_size, 1); if (dma_send == NULL) { return -ENOMEM; } - dma_send->addr = addr; - dma_send->count = count; - memcpy(dma_send->data, data, count); - + dma_send->addr = sg->dma_addr; + dma_send->count = sg->length; + memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! */ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE, - dma_send, send_size, NULL, 0, NULL, &dma_recv, - sizeof(dma_recv)); + dma_send, send_size, + NULL, 0, NULL, &dma_recv, sizeof(dma_recv)); free(dma_send); return ret; diff --git a/lib/muser.h b/lib/muser.h index 3c24a7a..375be0e 100644 --- a/lib/muser.h +++ b/lib/muser.h @@ -159,6 +159,7 @@ enum { LM_DEV_NUM_IRQS }; +/* FIXME these are PCI regions */ enum { LM_DEV_BAR0_REG_IDX, LM_DEV_BAR1_REG_IDX, @@ -169,7 +170,15 @@ enum { LM_DEV_ROM_REG_IDX, LM_DEV_CFG_REG_IDX, LM_DEV_VGA_REG_IDX, - LM_DEV_NUM_REGS = 9 + /* + * FIXME this really belong here, but simplifies implementation for now. A + * migration region can exist for non-PCI devices (can its index be + * anything?). 
In any case, we should allow the user to define custom regions + * at will, by fixing the migration region in that position we don't allow + * this. + */ + LM_DEV_MIGRATION_REG_IDX, + LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */ }; typedef struct { @@ -426,6 +435,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); * @len: size of memory to be mapped * @sg: array that receives the scatter/gather entries to be mapped * @max_sg: maximum number of elements in above array + * @prot: protection as define in <sys/mman.h> * * @returns the number of scatter/gather entries created on success, and on * failure: @@ -435,7 +445,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); */ int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); /** * Maps a list scatter/gather entries from the guest's physical address space @@ -488,23 +498,21 @@ lm_get_region(loff_t pos, size_t count, loff_t *off); * Read from the dma region exposed by the client. * * @lm_ctx: the libmuser context - * @addr: dma address exposed by the client - * @count: size of the data to read + * @sg: a DMA segment obtained from dma_addr_to_sg * @data: data buffer to read into */ int -lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data); +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); /** * Write to the dma region exposed by the client. * * @lm_ctx: the libmuser context - * @addr: dma address exposed by the client - * @count: size of the data to write + * @sg: a DMA segment obtained from dma_addr_to_sg * @data: data buffer to write */ int -lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data); +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); /* * Advanced stuff. 
diff --git a/lib/muser_priv.h b/lib/muser_priv.h index 8d07b2c..c45a8f3 100644 --- a/lib/muser_priv.h +++ b/lib/muser_priv.h @@ -48,13 +48,21 @@ uint64_t region_to_offset(uint32_t region); int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count); + +int send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, - enum vfio_user_command cmd, void *data, int len, int *fds, - int count); + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count); + int recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, - uint16_t *msg_id, void *data, int *len); + uint16_t *msg_id, void *data, size_t *len); int send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, @@ -62,14 +70,21 @@ send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, int recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, - int *max_fds); + int *max_fds, size_t *pgsize); + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); int send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, - void *send_data, int send_len, - int *send_fds, int fd_count, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, struct vfio_user_header *hdr, - void *recv_data, int recv_len); + void *recv_data, size_t recv_len); #endif /* MUSER_PRIV_H */ diff --git a/lib/vfio_user.h b/lib/vfio_user.h index 890fc43..19f751a 100644 --- a/lib/vfio_user.h +++ b/lib/vfio_user.h @@ -36,6 +36,7 @@ #include <inttypes.h> #include <linux/vfio.h> +#include <linux/version.h> enum vfio_user_command { VFIO_USER_VERSION = 1, @@ -51,6 +52,7 @@ enum vfio_user_command { VFIO_USER_DMA_WRITE = 11, VFIO_USER_VM_INTERRUPT = 12, VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, VFIO_USER_MAX, }; @@ -102,6 +104,64 @@ struct vfio_user_irq_info { uint32_t subindex; } __attribute__((packed)); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) + +/* copied from <linux/vfio.h> */ + +#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? 
\ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 *data; /* one bit per page */ +}; + +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#endif + #endif /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/samples/client.c b/samples/client.c index 925335d..5ff79cd 100644 --- a/samples/client.c +++ b/samples/client.c @@ -39,6 +39,7 @@ #include <time.h> #include <err.h> #include <assert.h> +//#include <sys/uio.h> #include "../lib/muser.h" #include "../lib/muser_priv.h" @@ -66,13 +67,16 @@ init_sock(const char *path) } static int -set_version(int sock, int client_max_fds, int *server_max_fds) +set_version(int sock, int client_max_fds, int *server_max_fds, size_t *pgsize) { int ret, mj, mn; uint16_t msg_id; char *client_caps = NULL; - ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds); + assert(server_max_fds != NULL); + assert(pgsize != NULL); + + ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds, pgsize); if (ret < 0) { fprintf(stderr, "failed to receive version from server: %s\n", strerror(-ret)); @@ -85,7 +89,8 @@ set_version(int sock, int client_max_fds, int *server_max_fds) goto out; } - ret = asprintf(&client_caps, "{max_fds: %d}", client_max_fds); + ret = asprintf(&client_caps, "{max_fds: %d, migration: {pgsize: %lu}}", + client_max_fds, sysconf(_SC_PAGESIZE)); if (ret == -1) { client_caps = NULL; ret = -ENOMEM; /* FIXME */ @@ -115,14 +120,64 @@ send_device_reset(int sock) } static int +get_region_vfio_caps(int sock, size_t cap_sz) +{ + struct vfio_info_cap_header *header, *_header; + struct vfio_region_info_cap_type *type; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i, ret; + + header = _header = calloc(cap_sz, 1); + if (header == NULL) { + return -ENOMEM; + } + + ret = recv(sock, header, cap_sz, 0); + if (ret < 0) { + err(EXIT_FAILURE, "failed to receive VFIO cap info"); + } + assert(ret == cap_sz); + + while (true) { + switch (header->id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__, + sparse->nr_areas); + for (i = 0; i < sparse->nr_areas; i++) { + fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } + break; + case VFIO_REGION_INFO_CAP_TYPE: + type = (struct vfio_region_info_cap_type*)header; + if (type->type != VFIO_REGION_TYPE_MIGRATION || + type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) { + fprintf(stderr, "bad region type %d/%d\n", type->type, + type->subtype); + exit(EXIT_FAILURE); + } + printf("migration region\n"); + break; + default: + fprintf(stderr, "bad VFIO 
cap ID %#x\n", header->id); + exit(EXIT_FAILURE); + } + if (header->next == 0) { + break; + } + header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info)); + } + free(_header); +} + +static int get_device_region_info(int sock, struct vfio_device_info *client_dev_info) { struct vfio_region_info region_info; - struct vfio_region_info_cap_sparse_mmap *sparse; struct vfio_user_header hdr; uint16_t msg_id = 0; size_t cap_sz; - int regsz = sizeof(region_info); int i, ret; msg_id = 1; @@ -133,8 +188,9 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info) msg_id++; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_REGION_INFO, - ®ion_info, regsz, NULL, 0, NULL, - ®ion_info, regsz); + ®ion_info, sizeof region_info, + NULL, 0, NULL, + ®ion_info, sizeof(region_info)); if (ret < 0) { fprintf(stderr, "failed to get device region info: %s\n", strerror(-ret)); @@ -146,44 +202,26 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info) "cap_sz %d\n", __func__, i, region_info.offset, region_info.flags, region_info.size, cap_sz); if (cap_sz) { - int j; - - sparse = calloc(cap_sz, 1); - if (sparse == NULL) { - return -ENOMEM; - } - - ret = recv(sock, sparse, cap_sz, 0); - if (ret < 0) { - ret = -errno; - fprintf(stderr, "%s: failed to receive sparse cap info: %s\n", - __func__, strerror(-ret)); - free(sparse); + ret = get_region_vfio_caps(sock, cap_sz); + if (ret != 0) { return ret; } - fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__, - sparse->nr_areas); - for (j = 0; j < sparse->nr_areas; j++) { - fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__, - j, sparse->areas[j].offset, sparse->areas[j].size); - } - free(sparse); } } + return 0; } static int get_device_info(int sock, struct vfio_device_info *dev_info) { struct vfio_user_header hdr; - int dev_info_sz = sizeof(*dev_info); uint16_t msg_id; int ret; - dev_info->argsz = dev_info_sz; + dev_info->argsz = sizeof(*dev_info); msg_id = 1; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_INFO, - dev_info, dev_info_sz, NULL, 0, NULL, - dev_info, dev_info_sz); + dev_info, sizeof(*dev_info), NULL, 0, NULL, + dev_info, sizeof(*dev_info)); if (ret < 0) { fprintf(stderr, "failed to get device info: %s\n", strerror(-ret)); return ret; @@ -197,30 +235,35 @@ static int get_device_info(int sock, struct vfio_device_info *dev_info) static int configure_irqs(int sock) { - int i, size; - int ret; + int i, ret; + size_t size; struct vfio_irq_set irq_set; - struct vfio_user_irq_info irq_info; + struct vfio_user_irq_info vfio_user_irq_info; struct vfio_user_header hdr; uint16_t msg_id = 1; int irq_fd; uint64_t val; + struct iovec iovecs[2]; - for (i = 0; i < LM_DEV_NUM_IRQS; i++) { - struct vfio_irq_info irq_info = {.argsz = sizeof irq_info, .index = i}; + for (i = 0; i < LM_DEV_NUM_IRQS; i++) { /* TODO move body of loop into function */ int size; + struct vfio_irq_info vfio_irq_info = { + .argsz = sizeof vfio_irq_info, + .index = i + }; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_IRQ_INFO, - &irq_info, sizeof irq_info, NULL, 0, NULL, - &irq_info, sizeof irq_info); + &vfio_irq_info, sizeof vfio_irq_info, + NULL, 0, NULL, + &vfio_irq_info, sizeof vfio_irq_info); if (ret < 0) { fprintf(stderr, "failed to get %s info: %s\n", irq_to_str[i], strerror(-ret)); return ret; } - if (irq_info.count > 0) { + if (vfio_irq_info.count > 0) { printf("IRQ %s: count=%d flags=%#x\n", - irq_to_str[i], irq_info.count, 
irq_info.flags); + irq_to_str[i], vfio_irq_info.count, vfio_irq_info.flags); } } @@ -258,14 +301,16 @@ configure_irqs(int sock) printf("INTx triggered!\n"); msg_id++; - size = sizeof(irq_info); - ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &irq_info, &size); + + size = sizeof(vfio_user_irq_info); + ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &vfio_user_irq_info, + &size); if (ret < 0) { - fprintf(stderr, "failed to recieve IRQ message: %s\n", strerror(-ret)); + fprintf(stderr, "failed to receive IRQ message: %s\n", strerror(-ret)); return ret; } - if (irq_info.subindex >= irq_set.count) { - fprintf(stderr, "bad IRQ %d, max=%d\n", irq_info.subindex, + if (vfio_user_irq_info.subindex >= irq_set.count) { + fprintf(stderr, "bad IRQ %d, max=%d\n", vfio_user_irq_info.subindex, irq_set.count); return -ENOENT; } @@ -305,7 +350,10 @@ access_bar0(int sock) fprintf(stderr, "failed to write to BAR0: %s\n", strerror(-ret)); return ret; } - assert(region_access.count == sizeof data.t); + if (region_access.count != sizeof data.t) { + fprintf(stderr, "bad written data length %d\n", region_access.count); + return -EINVAL; + } printf("wrote to BAR0: %ld\n", data.t); @@ -334,7 +382,8 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions, { struct vfio_user_dma_region_access dma_access; struct vfio_user_header hdr; - int ret, size = sizeof(dma_access), i; + int ret, i; + size_t size = sizeof(dma_access); uint16_t msg_id; void *data; @@ -371,9 +420,10 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions, dma_access.count = 0; ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_DMA_WRITE, - &dma_access, sizeof(dma_access), NULL, 0); + &dma_access, sizeof dma_access, NULL, 0); if (ret < 0) { - fprintf(stderr, "failed to send reply of DMA write: %m\n"); + fprintf(stderr, "failed to send reply of DMA write: %s\n", + strerror(-ret)); } out: @@ -386,7 +436,8 @@ static int handle_dma_read(int sock, struct vfio_user_dma_region *dma_regions, { struct vfio_user_dma_region_access dma_access, *response; struct vfio_user_header hdr; - int ret, size = sizeof(dma_access), i, response_sz; + int ret, i, response_sz; + size_t size = sizeof(dma_access); uint16_t msg_id; void *data; @@ -449,6 +500,56 @@ static int handle_dma_io(int sock, struct vfio_user_dma_region *dma_regions, return 0; } +static int +get_dirty_bitmaps(int sock, struct vfio_user_dma_region *dma_regions, + int nr_dma_regions) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0}; + struct vfio_iommu_type1_dirty_bitmap_get bitmaps[2]; + int ret, i; + struct iovec iovecs[4] = { + [1] = { + .iov_base = &dirty_bitmap, + .iov_len = sizeof dirty_bitmap + } + }; + struct vfio_user_header hdr = {0}; + char data[ARRAY_SIZE(bitmaps)]; + + assert(dma_regions != NULL); + assert(nr_dma_regions >= ARRAY_SIZE(bitmaps)); + + for (i = 0; i < ARRAY_SIZE(bitmaps); i++) { + bitmaps[i].iova = dma_regions[i].addr; + bitmaps[i].size = dma_regions[i].size; + bitmaps[i].bitmap.size = 1; /* FIXME calculate based on page and IOVA size, don't hardcode */ + bitmaps[i].bitmap.pgsize = sysconf(_SC_PAGESIZE); + iovecs[(i + 2)].iov_base = &bitmaps[i]; /* FIXME the +2 is because iovecs[0] is the vfio_user_header and iovecs[1] is vfio_iommu_type1_dirty_bitmap */ + iovecs[(i + 2)].iov_len = sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + } + + /* + * FIXME there should be at least two IOVAs. Send single message for two + * IOVAs and ensure only one bit is set in first IOVA. 
+ */ + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + ret = _send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + iovecs, ARRAY_SIZE(iovecs), + NULL, 0, + &hdr, data, ARRAY_SIZE(data)); + if (ret != 0) { + fprintf(stderr, "failed to start dirty page logging: %s\n", + strerror(-ret)); + return ret; + } + + for (i = 0; i < ARRAY_SIZE(bitmaps); i++) { + printf("%#x-%#x\t%hhu\n", bitmaps[i].iova, + bitmaps[i].iova + bitmaps[i].size - 1, data[i]); + } + return 0; +} + int main(int argc, char *argv[]) { int ret, sock; @@ -461,7 +562,9 @@ int main(int argc, char *argv[]) int fd; const int client_max_fds = 32; int server_max_fds; + size_t pgsize; int nr_dma_regions; + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0}; if (argc != 2) { fprintf(stderr, "usage: %s /path/to/socket\n", argv[0]); @@ -478,7 +581,7 @@ int main(int argc, char *argv[]) * The server proposes version upon connection, we need to send back the * version the version we support. */ - if ((ret = set_version(sock, client_max_fds, &server_max_fds)) < 0) { + if ((ret = set_version(sock, client_max_fds, &server_max_fds, &pgsize)) < 0) { return ret; } @@ -503,7 +606,7 @@ int main(int argc, char *argv[]) /* * XXX VFIO_USER_DMA_MAP * - * Tell the server we have some DMA regions it can access. Each DMA regions + * Tell the server we have some DMA regions it can access. Each DMA region * is accompanied by a file descriptor, so let's create more (2x) DMA * regions that can fit in a message that can be handled by the server. */ @@ -531,10 +634,10 @@ int main(int argc, char *argv[]) for (i = 0; i < nr_dma_regions / server_max_fds; i++, msg_id++) { ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_MAP, - dma_regions + (i * server_max_fds), - sizeof *dma_regions * server_max_fds, - dma_region_fds + (i * server_max_fds), - server_max_fds, NULL, NULL, 0); + dma_regions + (i * server_max_fds), + sizeof(*dma_regions) * server_max_fds, + dma_region_fds + (i * server_max_fds), + server_max_fds, NULL, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to map DMA regions: %s\n", strerror(-ret)); return ret; @@ -553,6 +656,17 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } + + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + &dirty_bitmap, sizeof dirty_bitmap, + NULL, 0, NULL, NULL, 0); + if (ret != 0) { + fprintf(stderr, "failed to start dirty page logging: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + /* * XXX VFIO_USER_DEVICE_GET_IRQ_INFO and VFIO_IRQ_SET_ACTION_TRIGGER * Query interrupts, configure an eventfd to be associated with INTx, and @@ -570,6 +684,23 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } + ret = get_dirty_bitmaps(sock, dma_regions, nr_dma_regions); + if (ret < 0) { + fprintf(stderr, "failed to receive dirty bitmaps: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + &dirty_bitmap, sizeof dirty_bitmap, + NULL, 0, NULL, NULL, 0); + if (ret != 0) { + fprintf(stderr, "failed to stop dirty page logging: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + /* * FIXME now that region read/write works, change the server implementation * to trigger an interrupt after N seconds, where N is the value written to @@ -577,7 +708,11 @@ int main(int argc, char *argv[]) */ /* BAR1 can be memory mapped and read directly */ - /* TODO implement the following: write a value in BAR1, a server 
timer will increase it every second (SIGALARM) */ + + /* + * TODO implement the following: write a value in BAR1, a server timer will + * increase it every second (SIGALARM) + */ /* * XXX VFIO_USER_DMA_UNMAP @@ -585,8 +720,7 @@ int main(int argc, char *argv[]) * unmap the first group of the DMA regions */ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_UNMAP, - dma_regions, - sizeof *dma_regions * server_max_fds, + dma_regions, sizeof *dma_regions * server_max_fds, NULL, 0, NULL, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to unmap DMA regions: %s\n", strerror(-ret)); diff --git a/samples/server.c b/samples/server.c index 782ccdc..9f4d3b1 100644 --- a/samples/server.c +++ b/samples/server.c @@ -39,6 +39,7 @@ #include <time.h> #include <assert.h> #include <openssl/md5.h> +#include <sys/mman.h> #include "../lib/muser.h" @@ -146,17 +147,30 @@ void get_md5sum(char *buf, int len, char *md5sum) return; } +/* + * FIXME this function does DMA write/read using messages. This should be done + * on a region that is not memory mappable or an area of a region that is not + * sparsely memory mappable. We should also have a test where the server does + * DMA directly on the client memory. + */ static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data) { int count = 4096; char buf[count], md5sum1[MD5_DIGEST_LENGTH], md5sum2[MD5_DIGEST_LENGTH]; int i, ret; + dma_sg_t sg; + + assert(lm_ctx != NULL); + + ret = lm_addr_to_sg(lm_ctx, server_data->regions[0].addr, count, &sg, + 1, PROT_WRITE); + assert(ret == 1); /* FIXME */ memset(buf, 'A', count); get_md5sum(buf, count, md5sum1); printf("%s: WRITE addr %#lx count %llu\n", __func__, server_data->regions[0].addr, count); - ret = lm_dma_write(lm_ctx, server_data->regions[0].addr, count, buf); + ret = lm_dma_write(lm_ctx, &sg, buf); if (ret < 0) { fprintf(stderr, "lm_dma_write failed: %s\n", strerror(-ret)); return ret; @@ -165,7 +179,7 @@ static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data) memset(buf, 0, count); printf("%s: READ addr %#lx count %llu\n", __func__, server_data->regions[0].addr, count); - ret = lm_dma_read(lm_ctx, server_data->regions[0].addr, count, buf); + ret = lm_dma_read(lm_ctx, &sg, buf); if (ret < 0) { fprintf(stderr, "lm_dma_read failed: %s\n", strerror(-ret)); return ret; @@ -253,6 +267,11 @@ int main(int argc, char *argv[]) .mmap_areas = sparse_areas, .map = map_area }, + .reg_info[LM_DEV_MIGRATION_REG_IDX] = { /* migration region */ + .flags = LM_REG_FLAG_RW, + .size = sysconf(_SC_PAGESIZE), + .mmap_areas = sparse_areas, + }, .irq_count[LM_DEV_INTX_IRQ_IDX] = 1, }, .uuid = argv[optind], |
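
Note: the new lib/dma.c entry points above are driven by handle_dirty_pages()
in lib/libmuser.c in response to VFIO_USER_DIRTY_PAGES; a device implementation
does not call them directly. A rough sketch of the intended call order, assuming
an already-populated dma_controller_t, a 4 KiB negotiated page size, and an
IOVA range that matches a whole DMA region (error handling elided, names
dirty_page_round/addr/len are illustrative):

    #include <limits.h>     /* CHAR_BIT */
    #include "dma.h"        /* library-internal dirty page API */

    static int
    dirty_page_round(dma_controller_t *dma, dma_addr_t addr, int len)
    {
        size_t pgsize = 4096;  /* must match the pgsize negotiated in set_version() */
        size_t nr_pages = len / pgsize + (len % pgsize != 0);
        size_t bitmap_sz = nr_pages / CHAR_BIT + (nr_pages % CHAR_BIT != 0);
        char *bitmap;

        /* VFIO_IOMMU_DIRTY_PAGES_FLAG_START: allocate one bitmap per region */
        dma_controller_dirty_page_logging_start(dma, pgsize);

        /* ... DMA mapped via dma_addr_to_sg(..., PROT_WRITE) marks bits ... */

        /* VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP: borrows the region's bitmap */
        dma_controller_dirty_page_get(dma, addr, len, pgsize, bitmap_sz, &bitmap);

        /* VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP: free the bitmaps */
        return dma_controller_dirty_page_logging_stop(dma);
    }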