diff options
-rw-r--r-- | lib/dma.c | 123 | ||||
-rw-r--r-- | lib/dma.h | 85 | ||||
-rw-r--r-- | lib/libmuser.c | 469 | ||||
-rw-r--r-- | lib/muser.h | 24 | ||||
-rw-r--r-- | lib/muser_priv.h | 29 | ||||
-rw-r--r-- | lib/vfio_user.h | 60 | ||||
-rw-r--r-- | samples/client.c | 252 | ||||
-rw-r--r-- | samples/server.c | 23 |
8 files changed, 855 insertions, 210 deletions
@@ -81,6 +81,7 @@ dma_controller_create(lm_ctx_t *lm_ctx, int max_regions) dma->max_regions = max_regions; dma->nregions = 0; memset(dma->regions, 0, max_regions * sizeof(dma->regions[0])); + dma->dirty_pgsize = 0; return dma; } @@ -106,6 +107,10 @@ _dma_controller_do_remove_region(dma_controller_t *dma, } } +/* + * FIXME no longer used. Also, it doesn't work for addresses that span two + * DMA regions. + */ bool dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, size_t size) @@ -328,7 +333,7 @@ dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len) int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { int idx; int cnt = 0; @@ -348,6 +353,9 @@ _dma_addr_sg_split(const dma_controller_t *dma, sg[cnt].region = idx; sg[cnt].offset = dma_addr - region->dma_addr; sg[cnt].length = region_len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, &sg[cnt]); + } } cnt++; @@ -376,4 +384,117 @@ out: return cnt; } +ssize_t _get_bitmap_size(size_t region_size, size_t pgsize) +{ + if (pgsize == 0) { + return -EINVAL; + } + if (region_size < pgsize) { + return -EINVAL; + } + size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0); + return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0); +} + +int dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize) +{ + int i; + + assert(dma != NULL); + + if (pgsize == 0) { + return -EINVAL; + } + + if (dma->dirty_pgsize > 0) { + if (dma->dirty_pgsize != pgsize) { + return -EINVAL; + } + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + dma_memory_region_t *region = &dma->regions[i]; + ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + region->dirty_bitmap = calloc(bitmap_size, sizeof(char)); + if (region->dirty_bitmap == NULL) { + int j, ret = -errno; + for (j = 0; j < i; j++) { 
+ free(dma->regions[j].dirty_bitmap); + dma->regions[j].dirty_bitmap = NULL; + } + return ret; + } + } + dma->dirty_pgsize = pgsize; + return 0; +} + +int dma_controller_dirty_page_logging_stop(dma_controller_t *dma) +{ + int i; + + assert(dma != NULL); + + if (dma->dirty_pgsize == 0) { + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + free(dma->regions[i].dirty_bitmap); + dma->regions[i].dirty_bitmap = NULL; + } + dma->dirty_pgsize = 0; + return 0; +} + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data) +{ + int ret; + ssize_t bitmap_size; + dma_sg_t sg; + dma_memory_region_t *region; + + assert(dma != NULL); + assert(data != NULL); + + /* + * FIXME for now we support IOVAs that match exactly the DMA region. This + * is purely for simplifying the implementation. We MUST allow arbitrary + * IOVAs. + */ + ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE); + if (ret != 1 || sg.dma_addr != addr || sg.length != len) { + return -ENOTSUP; + } + + if (pgsize != dma->dirty_pgsize) { + return -EINVAL; + } + + bitmap_size = _get_bitmap_size(len, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + + /* + * FIXME they must be equal because this is how much data the client + * expects to receive. + */ + if (size != (size_t)bitmap_size) { + return -EINVAL; + } + + region = &dma->regions[sg.region]; + + *data = region->dirty_bitmap; + + return 0; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ @@ -32,6 +32,11 @@ #define DMA_DMA_H /* + * FIXME check whether DMA regions must be page aligned. If so then the + * implementation can be greatly simplified. + */ + +/* * This library emulates a DMA controller for a device emulation application to * perform DMA operations on a foreign memory space. 
* @@ -82,12 +87,14 @@ typedef struct { off_t offset; // File offset void *virt_addr; // Virtual address of this region int refcnt; // Number of users of this region + char *dirty_bitmap; // Dirty page bitmap } dma_memory_region_t; typedef struct { int max_regions; int nregions; struct lm_ctx *lm_ctx; + size_t dirty_pgsize; // Dirty page granularity dma_memory_region_t regions[0]; } dma_controller_t; @@ -118,7 +125,59 @@ dma_controller_remove_region(dma_controller_t *dma, int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); + +static bool +_dma_should_mark_dirty(const dma_controller_t *dma, int prot) +{ + assert(dma != NULL); + + return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0; +} + +static size_t +_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset) +{ + return (offset - base_addr) / pgsize; +} + +static size_t +_get_pgend(size_t pgsize, uint64_t len, size_t start) +{ + return start + (len / pgsize) + (len % pgsize != 0) - 1; +} + +static void +_dma_bitmap_get_pgrange(const dma_controller_t *dma, + const dma_memory_region_t *region, + const dma_sg_t *sg, size_t *start, size_t *end) +{ + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(start != NULL); + assert(end != NULL); + + *start = _get_pgstart(dma->dirty_pgsize, 0, sg->offset); + *end = _get_pgend(dma->dirty_pgsize, sg->length + sg->offset % dma->dirty_pgsize, *start); +} + +static void +_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, + dma_sg_t *sg) +{ + size_t i, start, end; + + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(region->dirty_bitmap != NULL); + + _dma_bitmap_get_pgrange(dma, region, sg, &start, &end); + for (i = start; i <= end; i++) { + region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT); + } +} /* Takes a linear dma address span and returns a sg list suitable for DMA. 
* A single linear dma address span may need to be split into multiple @@ -134,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma, static inline int dma_addr_to_sg(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { static __thread int region_hint; int cnt; @@ -150,10 +209,13 @@ dma_addr_to_sg(const dma_controller_t *dma, sg->region = region_hint; sg->offset = dma_addr - region->dma_addr; sg->length = len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } return 1; } // Slow path: search through regions. - cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg); + cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot); if (likely(cnt > 0)) { region_hint = sg->region; } @@ -186,6 +248,7 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov, return 0; } +/* FIXME useless define */ #define UNUSED __attribute__((unused)) static inline void @@ -215,12 +278,12 @@ dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg, } static inline void * -dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len) +dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot) { dma_sg_t sg; struct iovec iov; - if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 && + if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 && dma_map_sg(dma, &sg, &iov, 1) == 0) { return iov.iov_base; } @@ -239,12 +302,22 @@ dma_unmap_addr(dma_controller_t *dma, }; int r; - r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1); + r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE); assert(r == 1); dma_unmap_sg(dma, &sg, &iov, 1); } +int +dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize); + +int +dma_controller_dirty_page_logging_stop(dma_controller_t *dma); + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data); + bool 
dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, size_t size); diff --git a/lib/libmuser.c b/lib/libmuser.c index 547a318..0aa3443 100644 --- a/lib/libmuser.c +++ b/lib/libmuser.c @@ -112,6 +112,8 @@ struct lm_ctx { int client_max_fds; + size_t migration_pgsize; + lm_irqs_t irqs; /* XXX must be last */ }; @@ -270,14 +272,20 @@ __free_s(char **p) } int -send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, - enum vfio_user_command cmd, void *data, int len, +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, int *fds, int count) { int ret; struct vfio_user_header hdr = {.msg_id = msg_id}; - struct iovec iov[2]; struct msghdr msg; + size_t i; + + if (nr_iovecs == 0) { + iovecs = alloca(sizeof(*iovecs)); + nr_iovecs = 1; + } memset(&msg, 0, sizeof(msg)); @@ -288,23 +296,15 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, hdr.flags.type = VFIO_USER_F_TYPE_COMMAND; } - if (data != NULL && len == 0) { - return -EINVAL; - } - - hdr.msg_size = sizeof(hdr) + len; + iovecs[0].iov_base = &hdr; + iovecs[0].iov_len = sizeof(hdr); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); - msg.msg_iovlen = 1; - - if (data != NULL) { - msg.msg_iovlen++; - iov[1].iov_base = data; - iov[1].iov_len = len; + for (i = 0; i < nr_iovecs; i++) { + hdr.msg_size += iovecs[i].iov_len; } - msg.msg_iov = iov; + msg.msg_iovlen = nr_iovecs; + msg.msg_iov = iovecs; if (fds != NULL) { size_t size = count * sizeof *fds; @@ -329,26 +329,43 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, } int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count) { + + struct iovec iovecs[2] = { + [1] = { + .iov_base = data, + .iov_len = data_len + } + }; + return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs, + ARRAY_SIZE(iovecs), fds, count); +} + +int send_version(int 
sock, int major, int minor, uint16_t msg_id, bool is_reply, char *caps) { int ret; - char *data __attribute__((__cleanup__(__free_s))) = NULL; + char *data; - ret = asprintf(&data, "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}", + ret = asprintf(&data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}", major, minor, caps != NULL ? caps : "{}"); if (ret == -1) { - data = NULL; return -1; } - - return send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, - ret, NULL, 0); + ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, + ret, NULL, 0); + free(data); + return ret; } int recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, - uint16_t *msg_id, void *data, int *len) + uint16_t *msg_id, void *data, size_t *len) { int ret; @@ -388,7 +405,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, if (ret < 0) { return ret; } - if (*len != ret) { /* FIXME we should allow receiving less */ + if (*len != (size_t)ret) { /* FIXME we should allow receiving less */ return -EINVAL; } *len = ret; @@ -398,7 +415,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, int recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, - int *max_fds) + int *max_fds, size_t *pgsize) { int ret; struct vfio_user_header hdr; @@ -424,23 +441,23 @@ recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, /* FIXME use proper parsing */ ret = sscanf(data, - "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d}}", - major, minor, max_fds); - if (ret != 3) { + "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}", + major, minor, max_fds, pgsize); + if (ret != 4) { return -EINVAL; } return 0; } int -send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, - void *send_data, int send_len, - int *send_fds, int fd_count, - struct 
vfio_user_header *hdr, - void *recv_data, int recv_len) -{ - int ret = send_vfio_user_msg(sock, msg_id, false, cmd, send_data, send_len, - send_fds, fd_count); +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs, + send_fds, fd_count); if (ret < 0) { return ret; } @@ -450,6 +467,24 @@ send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len); } +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + struct iovec iovecs[2] = { + [1] = { + .iov_base = send_data, + .iov_len = send_len + } + }; + return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs, + ARRAY_SIZE(iovecs), send_fds, fd_count, + hdr, recv_data, recv_len); +} + static int set_version(lm_ctx_t *lm_ctx, int sock) { @@ -458,7 +493,8 @@ set_version(lm_ctx_t *lm_ctx, int sock) uint16_t msg_id = 0; char *server_caps; - ret = asprintf(&server_caps, "{max_fds: %d}", MAX_FDS); + ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}", + MAX_FDS, sysconf(_SC_PAGESIZE)); if (ret == -1) { return -ENOMEM; } @@ -471,7 +507,7 @@ set_version(lm_ctx_t *lm_ctx, int sock) } ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true, - &lm_ctx->client_max_fds); + &lm_ctx->client_max_fds, &lm_ctx->migration_pgsize); if (ret < 0) { lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret)); goto out; @@ -482,7 +518,18 @@ set_version(lm_ctx_t *lm_ctx, int sock) LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN, client_mj, client_mn); ret = -EINVAL; + goto out; + } + if 
(lm_ctx->migration_pgsize == 0) { + lm_log(lm_ctx, LM_ERR, "bad migration page size"); + ret = -EINVAL; + goto out; } + + /* FIXME need to check max_fds */ + + lm_ctx->migration_pgsize = MIN(lm_ctx->migration_pgsize, + sysconf(_SC_PAGESIZE)); out: free(server_caps); return ret; @@ -932,23 +979,33 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) * points accordingly. */ static int -dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, +dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index, struct vfio_region_info **vfio_reg, bool is_kernel) { + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type = NULL; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct lm_sparse_mmap_areas *mmap_areas; int nr_mmap_areas, i; - size_t sparse_size; + size_t type_size = 0; + size_t sparse_size = 0; + size_t cap_size; ssize_t ret; void *cap_ptr; - if (lm_reg->mmap_areas == NULL) { - lm_log(lm_ctx, LM_DBG, "bad mmap_areas\n"); - return -EINVAL; + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type_size = sizeof(struct vfio_region_info_cap_type); + } + + if (lm_reg->mmap_areas != NULL) { + nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; + sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); } - nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; - sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + cap_size = type_size + sparse_size; + if (cap_size == 0) { + return 0; + } /* * If vfio_reg does not have enough space to accommodate sparse info then @@ -956,54 +1013,79 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, * is only for kernel/muser.ko, where the request comes from kernel/vfio. 
*/ - if ((*vfio_reg)->argsz < sparse_size + sizeof(**vfio_reg) && is_kernel) { + if ((*vfio_reg)->argsz < cap_size + sizeof(**vfio_reg) && is_kernel) { lm_log(lm_ctx, LM_DBG, "vfio_reg too small=%d\n", (*vfio_reg)->argsz); - (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg); + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); (*vfio_reg)->cap_offset = 0; return 0; } - sparse = calloc(1, sparse_size); - if (sparse == NULL) + /* TODO deosn't need to be calloc, we overwrite it entirely */ + header = calloc(1, cap_size); + if (header == NULL) { return -ENOMEM; - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->header.next = 0; - sparse->nr_areas = nr_mmap_areas; + } - lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, - sparse_size, nr_mmap_areas); - mmap_areas = lm_reg->mmap_areas; - for (i = 0; i < nr_mmap_areas; i++) { - sparse->areas[i].offset = mmap_areas->areas[i].start; - sparse->areas[i].size = mmap_areas->areas[i].size; - lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, - i, sparse->areas[i].offset, sparse->areas[i].size); + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type = (struct vfio_region_info_cap_type*)header; + type->header.id = VFIO_REGION_INFO_CAP_TYPE; + type->header.version = 1; + type->header.next = 0; + type->type = VFIO_REGION_TYPE_MIGRATION; + type->subtype = VFIO_REGION_SUBTYPE_MIGRATION; + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); } + if (lm_reg->mmap_areas != NULL) { + if (type != NULL) { + type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type); + sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1); + } else { + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + } + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->header.next = 0; + sparse->nr_areas = nr_mmap_areas; + + 
lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, + sparse_size, nr_mmap_areas); + mmap_areas = lm_reg->mmap_areas; + for (i = 0; i < nr_mmap_areas; i++) { + sparse->areas[i].offset = mmap_areas->areas[i].start; + sparse->areas[i].size = mmap_areas->areas[i].size; + lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } + } + + /* + * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is + * memory-mappable in general, not only if it supports sparse mmap. + */ (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; - (*vfio_reg)->cap_offset = sizeof(**vfio_reg); if (is_kernel) { /* write the sparse mmap cap info to vfio-client user pages */ - ret = write(lm_ctx->conn_fd, sparse, sparse_size); - if (ret != (ssize_t)sparse_size) { - free(sparse); + ret = write(lm_ctx->conn_fd, header, cap_size); + if (ret != (ssize_t)cap_size) { + free(header); return -EIO; } } else { - (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg); + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz); if (*vfio_reg == NULL) { - free(sparse); + free(header); return -ENOMEM; } cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset; - memcpy(cap_ptr, sparse, sparse_size); + memcpy(cap_ptr, header, cap_size); } - free(sparse); + free(header); return 0; } @@ -1073,11 +1155,10 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg, (*vfio_reg)->flags = lm_reg->flags; (*vfio_reg)->size = lm_reg->size; - if (lm_reg->mmap_areas != NULL) { - err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg, is_kernel); - if (err) { - return err; - } + err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg, + is_kernel); + if (err) { + return err; } lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu " @@ -1089,8 +1170,9 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info 
**vfio_reg, } static long -dev_get_info(struct vfio_device_info *dev_info) +dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info) { + assert(lm_ctx != NULL); assert(dev_info != NULL); // Ensure provided argsz is sufficiently big. @@ -1114,7 +1196,7 @@ do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data) assert(lm_ctx != NULL); switch (cmd_ioctl->vfio_cmd) { case VFIO_DEVICE_GET_INFO: - err = dev_get_info(&cmd_ioctl->data.dev_info); + err = dev_get_info(lm_ctx, &cmd_ioctl->data.dev_info); break; case VFIO_DEVICE_GET_REGION_INFO: reg_info = &cmd_ioctl->data.reg_info; @@ -1737,7 +1819,7 @@ static int handle_device_get_info(lm_ctx_t *lm_ctx, return -errno; } - ret = dev_get_info(dev_info); + ret = dev_get_info(lm_ctx, dev_info); if (ret < 0) { return ret; } @@ -1919,7 +2001,7 @@ handle_device_reset(lm_ctx_t *lm_ctx) static int handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, - void **data, int *len) + void **data, size_t *len) { struct vfio_user_region_access region_access; struct muser_cmd muser_cmd = {}; @@ -1973,6 +2055,105 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, return 0; } +static int +handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + int size, ret; + size_t i; + struct vfio_iommu_type1_dirty_bitmap_get *ranges; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap); + if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) { + return -EINVAL; + } + ranges = malloc(size); + if (ranges == NULL) { + return -errno; + } + ret = recv(lm_ctx->conn_fd, ranges, size, 0); + if (ret == -1) { + ret = -errno; + goto out; + } + if (ret != size) { + ret = -EINVAL; + goto out; + } + *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + *iovecs = 
calloc(*nr_iovecs, sizeof(**iovecs)); + if (*iovecs == NULL) { + ret = -errno; + goto out; + } + + for (i = 1; i < *nr_iovecs; i++) { + struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */ + ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size, + r->bitmap.pgsize, r->bitmap.size, + (char**)&((*iovecs)[i].iov_base)); + if (ret != 0) { + goto out; + } + (*iovecs)[i].iov_len = r->bitmap.size; + } +out: + if (ret != 0) { + if (*iovecs != NULL) { + free(*iovecs); + *iovecs = NULL; + } + } + free(ranges); + return ret; +} + +static int +handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap; + int ret; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) { + lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size); + return -EINVAL; + } + + /* FIXME must also check argsz */ + + ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0); + if (ret == -1) { + return -errno; + } + if ((size_t)ret < sizeof dirty_bitmap) { + return -EINVAL; + } + + if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + ret = dma_controller_dirty_page_logging_start(lm_ctx->dma, + lm_ctx->migration_pgsize); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs); + } else { + ret = -EINVAL; + } + + return ret; +} + /* * FIXME return value is messed up, sometimes we return -1 and set errno while * other times we return -errno. Fix. 
@@ -1988,9 +2169,9 @@ process_request(lm_ctx_t *lm_ctx) struct vfio_irq_info irq_info; struct vfio_device_info dev_info; struct vfio_region_info *dev_reg_info = NULL; - void *data = NULL; - bool free_data = false; - int len = 0; + struct iovec _iovecs[2] = {0}, *iovecs = NULL; + size_t nr_iovecs = 0; + bool free_iovec_data = true; assert(lm_ctx != NULL); @@ -2034,6 +2215,12 @@ process_request(lm_ctx_t *lm_ctx) return -EINVAL; } + /* FIXME in most of the following function we check that hdr.count is >= + * than the command-specific struct and there is an additional recv(2) for + * that data. We should eliminate duplicating this common code and move it + * here. + */ + switch (hdr.cmd) { case VFIO_USER_DMA_MAP: case VFIO_USER_DMA_UNMAP: @@ -2044,23 +2231,28 @@ process_request(lm_ctx_t *lm_ctx) case VFIO_USER_DEVICE_GET_INFO: ret = handle_device_get_info(lm_ctx, &hdr, &dev_info); if (ret == 0) { - data = &dev_info; - len = dev_info.argsz; + _iovecs[1].iov_base = &dev_info; + _iovecs[1].iov_len = dev_info.argsz; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_GET_REGION_INFO: ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info); if (ret == 0) { - data = dev_reg_info; - len = dev_reg_info->argsz; - free_data = true; + _iovecs[1].iov_base = dev_reg_info; + _iovecs[1].iov_len = dev_reg_info->argsz; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_GET_IRQ_INFO: ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info); if (ret == 0) { - data = &irq_info; - len = sizeof irq_info; + _iovecs[1].iov_base = &irq_info; + _iovecs[1].iov_len = sizeof irq_info; + iovecs = _iovecs; + nr_iovecs = 2; } break; case VFIO_USER_DEVICE_SET_IRQS: @@ -2068,12 +2260,20 @@ process_request(lm_ctx_t *lm_ctx) break; case VFIO_USER_REGION_READ: case VFIO_USER_REGION_WRITE: - ret = handle_region_access(lm_ctx, &hdr, &data, &len); - free_data = true; + iovecs = _iovecs; + ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base, + 
&iovecs[1].iov_len); + nr_iovecs = 2; break; case VFIO_USER_DEVICE_RESET: ret = handle_device_reset(lm_ctx); break; + case VFIO_USER_DIRTY_PAGES: + ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs); + if (ret >= 0) { + free_iovec_data = false; + } + break; default: lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd); return -EINVAL; @@ -2083,14 +2283,23 @@ process_request(lm_ctx_t *lm_ctx) * TODO: In case of error during command handling set errno respectively * in the reply message. */ - ret = send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, - 0, data, len, NULL, 0); + if (ret < 0) { + assert(false); /* FIXME */ + } + ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, + 0, iovecs, nr_iovecs, NULL, 0); if (unlikely(ret < 0)) { lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n", strerror(-ret)); } - if (free_data) { - free(data); + if (iovecs != NULL && iovecs != _iovecs) { + if (free_iovec_data) { + size_t i; + for (i = 0; i < nr_iovecs; i++) { + free(iovecs[i].iov_base); + } + } + free(iovecs); } return ret; @@ -2190,8 +2399,9 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex) irq_info.subindex = subindex; ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, - VFIO_USER_VM_INTERRUPT, &irq_info, - sizeof(irq_info), NULL, 0, NULL, NULL, 0); + VFIO_USER_VM_INTERRUPT, + &irq_info, sizeof irq_info, + NULL, 0, NULL, NULL, 0); if (ret < 0) { errno = -ret; return -1; @@ -2291,6 +2501,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) { lm_reg_info_t *cfg_reg; const lm_reg_info_t zero_reg = { 0 }; + lm_reg_info_t *migr_reg; int i; assert(lm_ctx != NULL); @@ -2347,6 +2558,16 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF; } + /* + * Check the migration region. 
+ */ + migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX]; + if (migr_reg->size > 0) { + if (migr_reg->size < sizeof(struct vfio_device_migration_info)) { + return -EINVAL; + } + } + return 0; err: @@ -2529,13 +2750,15 @@ lm_get_region_info(lm_ctx_t *lm_ctx) inline int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, - uint32_t len, dma_sg_t *sg, int max_sg) + uint32_t len, dma_sg_t *sg, int max_sg, int prot) { + assert(lm_ctx != NULL); + if (unlikely(lm_ctx->unmap_dma == NULL)) { errno = EINVAL; return -1; } - return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg); + return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot); } inline int @@ -2581,62 +2804,54 @@ lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id) } int -lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data) +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) { struct vfio_user_dma_region_access *dma_recv; - struct vfio_user_dma_region_access dma_send = { - .addr = addr, - .count = count - }; - int recv_size = sizeof(*dma_recv) + count; + struct vfio_user_dma_region_access dma_send; + int recv_size; int msg_id = 1, ret; - if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) { - lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu doest not " - "exists", addr, count); - return -ENOENT; - } + assert(lm_ctx != NULL); + assert(sg != NULL); + + recv_size = sizeof(*dma_recv) + sg->length; dma_recv = calloc(recv_size, 1); if (dma_recv == NULL) { return -ENOMEM; } - dma_recv->addr = addr; - dma_recv->count = count; + dma_send.addr = sg->dma_addr; + dma_send.count = sg->length; ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ, - &dma_send, sizeof(dma_send), NULL, 0, NULL, + &dma_send, sizeof dma_send, NULL, 0, NULL, dma_recv, recv_size); - memcpy(data, dma_recv->data, count); + memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */ free(dma_recv); return ret; } int -lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t 
addr, size_t count, void *data) +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) { struct vfio_user_dma_region_access *dma_send, dma_recv; - int send_size = sizeof(*dma_send) + count; + int send_size = sizeof(*dma_send) + sg->length; int msg_id = 1, ret; - if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) { - lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu does not " - "exists", addr, count); - return -ENOENT; - } + assert(lm_ctx != NULL); + assert(sg != NULL); dma_send = calloc(send_size, 1); if (dma_send == NULL) { return -ENOMEM; } - dma_send->addr = addr; - dma_send->count = count; - memcpy(dma_send->data, data, count); - + dma_send->addr = sg->dma_addr; + dma_send->count = sg->length; + memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! */ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE, - dma_send, send_size, NULL, 0, NULL, &dma_recv, - sizeof(dma_recv)); + dma_send, send_size, + NULL, 0, NULL, &dma_recv, sizeof(dma_recv)); free(dma_send); return ret; diff --git a/lib/muser.h b/lib/muser.h index 3c24a7a..375be0e 100644 --- a/lib/muser.h +++ b/lib/muser.h @@ -159,6 +159,7 @@ enum { LM_DEV_NUM_IRQS }; +/* FIXME these are PCI regions */ enum { LM_DEV_BAR0_REG_IDX, LM_DEV_BAR1_REG_IDX, @@ -169,7 +170,15 @@ enum { LM_DEV_ROM_REG_IDX, LM_DEV_CFG_REG_IDX, LM_DEV_VGA_REG_IDX, - LM_DEV_NUM_REGS = 9 + /* + * FIXME this really belong here, but simplifies implementation for now. A + * migration region can exist for non-PCI devices (can its index be + * anything?). In any case, we should allow the user to define custom regions + * at will, by fixing the migration region in that position we don't allow + * this. 
+ */ + LM_DEV_MIGRATION_REG_IDX, + LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */ }; typedef struct { @@ -426,6 +435,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); * @len: size of memory to be mapped * @sg: array that receives the scatter/gather entries to be mapped * @max_sg: maximum number of elements in above array + * @prot: protection as define in <sys/mman.h> * * @returns the number of scatter/gather entries created on success, and on * failure: @@ -435,7 +445,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); */ int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); /** * Maps a list scatter/gather entries from the guest's physical address space @@ -488,23 +498,21 @@ lm_get_region(loff_t pos, size_t count, loff_t *off); * Read from the dma region exposed by the client. * * @lm_ctx: the libmuser context - * @addr: dma address exposed by the client - * @count: size of the data to read + * @sg: a DMA segment obtained from dma_addr_to_sg * @data: data buffer to read into */ int -lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data); +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); /** * Write to the dma region exposed by the client. * * @lm_ctx: the libmuser context - * @addr: dma address exposed by the client - * @count: size of the data to write + * @sg: a DMA segment obtained from dma_addr_to_sg * @data: data buffer to write */ int -lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data); +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); /* * Advanced stuff. 
diff --git a/lib/muser_priv.h b/lib/muser_priv.h index 8d07b2c..c45a8f3 100644 --- a/lib/muser_priv.h +++ b/lib/muser_priv.h @@ -48,13 +48,21 @@ uint64_t region_to_offset(uint32_t region); int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count); + +int send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, - enum vfio_user_command cmd, void *data, int len, int *fds, - int count); + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count); + int recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, - uint16_t *msg_id, void *data, int *len); + uint16_t *msg_id, void *data, size_t *len); int send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, @@ -62,14 +70,21 @@ send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, int recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, - int *max_fds); + int *max_fds, size_t *pgsize); + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); int send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, - void *send_data, int send_len, - int *send_fds, int fd_count, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, struct vfio_user_header *hdr, - void *recv_data, int recv_len); + void *recv_data, size_t recv_len); #endif /* MUSER_PRIV_H */ diff --git a/lib/vfio_user.h b/lib/vfio_user.h index 890fc43..19f751a 100644 --- a/lib/vfio_user.h +++ b/lib/vfio_user.h @@ -36,6 +36,7 @@ #include <inttypes.h> #include <linux/vfio.h> +#include <linux/version.h> enum vfio_user_command { VFIO_USER_VERSION = 1, @@ -51,6 +52,7 @@ enum vfio_user_command { VFIO_USER_DMA_WRITE = 11, VFIO_USER_VM_INTERRUPT = 12, 
VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, VFIO_USER_MAX, }; @@ -102,6 +104,64 @@ struct vfio_user_irq_info { uint32_t subindex; } __attribute__((packed)); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) + +/* copied from <linux/vfio.h> */ + +#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? \ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 *data; /* one bit per page */ +}; + +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#endif + #endif /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/samples/client.c b/samples/client.c index 925335d..5ff79cd 100644 --- a/samples/client.c +++ 
b/samples/client.c @@ -39,6 +39,7 @@ #include <time.h> #include <err.h> #include <assert.h> +//#include <sys/uio.h> #include "../lib/muser.h" #include "../lib/muser_priv.h" @@ -66,13 +67,16 @@ init_sock(const char *path) } static int -set_version(int sock, int client_max_fds, int *server_max_fds) +set_version(int sock, int client_max_fds, int *server_max_fds, size_t *pgsize) { int ret, mj, mn; uint16_t msg_id; char *client_caps = NULL; - ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds); + assert(server_max_fds != NULL); + assert(pgsize != NULL); + + ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds, pgsize); if (ret < 0) { fprintf(stderr, "failed to receive version from server: %s\n", strerror(-ret)); @@ -85,7 +89,8 @@ set_version(int sock, int client_max_fds, int *server_max_fds) goto out; } - ret = asprintf(&client_caps, "{max_fds: %d}", client_max_fds); + ret = asprintf(&client_caps, "{max_fds: %d, migration: {pgsize: %lu}}", + client_max_fds, sysconf(_SC_PAGESIZE)); if (ret == -1) { client_caps = NULL; ret = -ENOMEM; /* FIXME */ @@ -115,14 +120,64 @@ send_device_reset(int sock) } static int +get_region_vfio_caps(int sock, size_t cap_sz) +{ + struct vfio_info_cap_header *header, *_header; + struct vfio_region_info_cap_type *type; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i, ret; + + header = _header = calloc(cap_sz, 1); + if (header == NULL) { + return -ENOMEM; + } + + ret = recv(sock, header, cap_sz, 0); + if (ret < 0) { + err(EXIT_FAILURE, "failed to receive VFIO cap info"); + } + assert(ret == cap_sz); + + while (true) { + switch (header->id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__, + sparse->nr_areas); + for (i = 0; i < sparse->nr_areas; i++) { + fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } + break; + case 
VFIO_REGION_INFO_CAP_TYPE: + type = (struct vfio_region_info_cap_type*)header; + if (type->type != VFIO_REGION_TYPE_MIGRATION || + type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) { + fprintf(stderr, "bad region type %d/%d\n", type->type, + type->subtype); + exit(EXIT_FAILURE); + } + printf("migration region\n"); + break; + default: + fprintf(stderr, "bad VFIO cap ID %#x\n", header->id); + exit(EXIT_FAILURE); + } + if (header->next == 0) { + break; + } + header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info)); + } + free(_header); +} + +static int get_device_region_info(int sock, struct vfio_device_info *client_dev_info) { struct vfio_region_info region_info; - struct vfio_region_info_cap_sparse_mmap *sparse; struct vfio_user_header hdr; uint16_t msg_id = 0; size_t cap_sz; - int regsz = sizeof(region_info); int i, ret; msg_id = 1; @@ -133,8 +188,9 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info) msg_id++; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_REGION_INFO, - &region_info, regsz, NULL, 0, NULL, - &region_info, regsz); + &region_info, sizeof region_info, + NULL, 0, NULL, + &region_info, sizeof(region_info)); if (ret < 0) { fprintf(stderr, "failed to get device region info: %s\n", strerror(-ret)); @@ -146,44 +202,26 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info) "cap_sz %d\n", __func__, i, region_info.offset, region_info.flags, region_info.size, cap_sz); if (cap_sz) { - int j; - - sparse = calloc(cap_sz, 1); - if (sparse == NULL) { - return -ENOMEM; - } - - ret = recv(sock, sparse, cap_sz, 0); - if (ret < 0) { - ret = -errno; - fprintf(stderr, "%s: failed to receive sparse cap info: %s\n", - __func__, strerror(-ret)); - free(sparse); + ret = get_region_vfio_caps(sock, cap_sz); + if (ret != 0) { return ret; } - fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__, - sparse->nr_areas); - for (j = 0; j < sparse->nr_areas; j++) { -
fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__, - j, sparse->areas[j].offset, sparse->areas[j].size); - } - free(sparse); } } + return 0; } static int get_device_info(int sock, struct vfio_device_info *dev_info) { struct vfio_user_header hdr; - int dev_info_sz = sizeof(*dev_info); uint16_t msg_id; int ret; - dev_info->argsz = dev_info_sz; + dev_info->argsz = sizeof(*dev_info); msg_id = 1; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_INFO, - dev_info, dev_info_sz, NULL, 0, NULL, - dev_info, dev_info_sz); + dev_info, sizeof(*dev_info), NULL, 0, NULL, + dev_info, sizeof(*dev_info)); if (ret < 0) { fprintf(stderr, "failed to get device info: %s\n", strerror(-ret)); return ret; @@ -197,30 +235,35 @@ static int get_device_info(int sock, struct vfio_device_info *dev_info) static int configure_irqs(int sock) { - int i, size; - int ret; + int i, ret; + size_t size; struct vfio_irq_set irq_set; - struct vfio_user_irq_info irq_info; + struct vfio_user_irq_info vfio_user_irq_info; struct vfio_user_header hdr; uint16_t msg_id = 1; int irq_fd; uint64_t val; + struct iovec iovecs[2]; - for (i = 0; i < LM_DEV_NUM_IRQS; i++) { - struct vfio_irq_info irq_info = {.argsz = sizeof irq_info, .index = i}; + for (i = 0; i < LM_DEV_NUM_IRQS; i++) { /* TODO move body of loop into function */ int size; + struct vfio_irq_info vfio_irq_info = { + .argsz = sizeof vfio_irq_info, + .index = i + }; ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_IRQ_INFO, - &irq_info, sizeof irq_info, NULL, 0, NULL, - &irq_info, sizeof irq_info); + &vfio_irq_info, sizeof vfio_irq_info, + NULL, 0, NULL, + &vfio_irq_info, sizeof vfio_irq_info); if (ret < 0) { fprintf(stderr, "failed to get %s info: %s\n", irq_to_str[i], strerror(-ret)); return ret; } - if (irq_info.count > 0) { + if (vfio_irq_info.count > 0) { printf("IRQ %s: count=%d flags=%#x\n", - irq_to_str[i], irq_info.count, irq_info.flags); + irq_to_str[i], vfio_irq_info.count, vfio_irq_info.flags); } } @@ 
-258,14 +301,16 @@ configure_irqs(int sock) printf("INTx triggered!\n"); msg_id++; - size = sizeof(irq_info); - ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &irq_info, &size); + + size = sizeof(vfio_user_irq_info); + ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &vfio_user_irq_info, + &size); if (ret < 0) { - fprintf(stderr, "failed to recieve IRQ message: %s\n", strerror(-ret)); + fprintf(stderr, "failed to receive IRQ message: %s\n", strerror(-ret)); return ret; } - if (irq_info.subindex >= irq_set.count) { - fprintf(stderr, "bad IRQ %d, max=%d\n", irq_info.subindex, + if (vfio_user_irq_info.subindex >= irq_set.count) { + fprintf(stderr, "bad IRQ %d, max=%d\n", vfio_user_irq_info.subindex, irq_set.count); return -ENOENT; } @@ -305,7 +350,10 @@ access_bar0(int sock) fprintf(stderr, "failed to write to BAR0: %s\n", strerror(-ret)); return ret; } - assert(region_access.count == sizeof data.t); + if (region_access.count != sizeof data.t) { + fprintf(stderr, "bad written data length %d\n", region_access.count); + return -EINVAL; + } printf("wrote to BAR0: %ld\n", data.t); @@ -334,7 +382,8 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions, { struct vfio_user_dma_region_access dma_access; struct vfio_user_header hdr; - int ret, size = sizeof(dma_access), i; + int ret, i; + size_t size = sizeof(dma_access); uint16_t msg_id; void *data; @@ -371,9 +420,10 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions, dma_access.count = 0; ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_DMA_WRITE, - &dma_access, sizeof(dma_access), NULL, 0); + &dma_access, sizeof dma_access, NULL, 0); if (ret < 0) { - fprintf(stderr, "failed to send reply of DMA write: %m\n"); + fprintf(stderr, "failed to send reply of DMA write: %s\n", + strerror(-ret)); } out: @@ -386,7 +436,8 @@ static int handle_dma_read(int sock, struct vfio_user_dma_region *dma_regions, { struct vfio_user_dma_region_access dma_access, *response; 
struct vfio_user_header hdr; - int ret, size = sizeof(dma_access), i, response_sz; + int ret, i, response_sz; + size_t size = sizeof(dma_access); uint16_t msg_id; void *data; @@ -449,6 +500,56 @@ static int handle_dma_io(int sock, struct vfio_user_dma_region *dma_regions, return 0; } +static int +get_dirty_bitmaps(int sock, struct vfio_user_dma_region *dma_regions, + int nr_dma_regions) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0}; + struct vfio_iommu_type1_dirty_bitmap_get bitmaps[2]; + int ret, i; + struct iovec iovecs[4] = { + [1] = { + .iov_base = &dirty_bitmap, + .iov_len = sizeof dirty_bitmap + } + }; + struct vfio_user_header hdr = {0}; + char data[ARRAY_SIZE(bitmaps)]; + + assert(dma_regions != NULL); + assert(nr_dma_regions >= ARRAY_SIZE(bitmaps)); + + for (i = 0; i < ARRAY_SIZE(bitmaps); i++) { + bitmaps[i].iova = dma_regions[i].addr; + bitmaps[i].size = dma_regions[i].size; + bitmaps[i].bitmap.size = 1; /* FIXME calculate based on page and IOVA size, don't hardcode */ + bitmaps[i].bitmap.pgsize = sysconf(_SC_PAGESIZE); + iovecs[(i + 2)].iov_base = &bitmaps[i]; /* FIXME the +2 is because iovecs[0] is the vfio_user_header and iovecs[1] is vfio_iommu_type1_dirty_bitmap */ + iovecs[(i + 2)].iov_len = sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + } + + /* + * FIXME there should be at least two IOVAs. Send single message for two + * IOVAs and ensure only one bit is set in first IOVA. 
+ */ + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + ret = _send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + iovecs, ARRAY_SIZE(iovecs), + NULL, 0, + &hdr, data, ARRAY_SIZE(data)); + if (ret != 0) { + fprintf(stderr, "failed to start dirty page logging: %s\n", + strerror(-ret)); + return ret; + } + + for (i = 0; i < ARRAY_SIZE(bitmaps); i++) { + printf("%#x-%#x\t%hhu\n", bitmaps[i].iova, + bitmaps[i].iova + bitmaps[i].size - 1, data[i]); + } + return 0; +} + int main(int argc, char *argv[]) { int ret, sock; @@ -461,7 +562,9 @@ int main(int argc, char *argv[]) int fd; const int client_max_fds = 32; int server_max_fds; + size_t pgsize; int nr_dma_regions; + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0}; if (argc != 2) { fprintf(stderr, "usage: %s /path/to/socket\n", argv[0]); @@ -478,7 +581,7 @@ int main(int argc, char *argv[]) * The server proposes version upon connection, we need to send back the * version the version we support. */ - if ((ret = set_version(sock, client_max_fds, &server_max_fds)) < 0) { + if ((ret = set_version(sock, client_max_fds, &server_max_fds, &pgsize)) < 0) { return ret; } @@ -503,7 +606,7 @@ int main(int argc, char *argv[]) /* * XXX VFIO_USER_DMA_MAP * - * Tell the server we have some DMA regions it can access. Each DMA regions + * Tell the server we have some DMA regions it can access. Each DMA region * is accompanied by a file descriptor, so let's create more (2x) DMA * regions that can fit in a message that can be handled by the server. 
*/ @@ -531,10 +634,10 @@ int main(int argc, char *argv[]) for (i = 0; i < nr_dma_regions / server_max_fds; i++, msg_id++) { ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_MAP, - dma_regions + (i * server_max_fds), - sizeof *dma_regions * server_max_fds, - dma_region_fds + (i * server_max_fds), - server_max_fds, NULL, NULL, 0); + dma_regions + (i * server_max_fds), + sizeof(*dma_regions) * server_max_fds, + dma_region_fds + (i * server_max_fds), + server_max_fds, NULL, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to map DMA regions: %s\n", strerror(-ret)); return ret; @@ -553,6 +656,17 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } + + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + &dirty_bitmap, sizeof dirty_bitmap, + NULL, 0, NULL, NULL, 0); + if (ret != 0) { + fprintf(stderr, "failed to start dirty page logging: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + /* * XXX VFIO_USER_DEVICE_GET_IRQ_INFO and VFIO_IRQ_SET_ACTION_TRIGGER * Query interrupts, configure an eventfd to be associated with INTx, and @@ -570,6 +684,23 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } + ret = get_dirty_bitmaps(sock, dma_regions, nr_dma_regions); + if (ret < 0) { + fprintf(stderr, "failed to receive dirty bitmaps: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + + dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES, + &dirty_bitmap, sizeof dirty_bitmap, + NULL, 0, NULL, NULL, 0); + if (ret != 0) { + fprintf(stderr, "failed to stop dirty page logging: %s\n", + strerror(-ret)); + exit(EXIT_FAILURE); + } + /* * FIXME now that region read/write works, change the server implementation * to trigger an interrupt after N seconds, where N is the value written to @@ -577,7 +708,11 @@ int main(int argc, char *argv[]) */ /* BAR1 can be memory mapped and read directly */ - /* TODO implement the following: write 
a value in BAR1, a server timer will increase it every second (SIGALARM) */ + + /* + * TODO implement the following: write a value in BAR1, a server timer will + * increase it every second (SIGALARM) + */ /* * XXX VFIO_USER_DMA_UNMAP @@ -585,8 +720,7 @@ int main(int argc, char *argv[]) * unmap the first group of the DMA regions */ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_UNMAP, - dma_regions, - sizeof *dma_regions * server_max_fds, + dma_regions, sizeof *dma_regions * server_max_fds, NULL, 0, NULL, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to unmap DMA regions: %s\n", strerror(-ret)); diff --git a/samples/server.c b/samples/server.c index 782ccdc..9f4d3b1 100644 --- a/samples/server.c +++ b/samples/server.c @@ -39,6 +39,7 @@ #include <time.h> #include <assert.h> #include <openssl/md5.h> +#include <sys/mman.h> #include "../lib/muser.h" @@ -146,17 +147,30 @@ void get_md5sum(char *buf, int len, char *md5sum) return; } +/* + * FIXME this function does DMA write/read using messages. This should be done + * on a region that is not memory mappable or an area of a region that is not + * sparsely memory mappable. We should also have a test where the server does + * DMA directly on the client memory. 
+ */ static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data) { int count = 4096; char buf[count], md5sum1[MD5_DIGEST_LENGTH], md5sum2[MD5_DIGEST_LENGTH]; int i, ret; + dma_sg_t sg; + + assert(lm_ctx != NULL); + + ret = lm_addr_to_sg(lm_ctx, server_data->regions[0].addr, count, &sg, + 1, PROT_WRITE); + assert(ret == 1); /* FIXME */ memset(buf, 'A', count); get_md5sum(buf, count, md5sum1); printf("%s: WRITE addr %#lx count %llu\n", __func__, server_data->regions[0].addr, count); - ret = lm_dma_write(lm_ctx, server_data->regions[0].addr, count, buf); + ret = lm_dma_write(lm_ctx, &sg, buf); if (ret < 0) { fprintf(stderr, "lm_dma_write failed: %s\n", strerror(-ret)); return ret; @@ -165,7 +179,7 @@ static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data) memset(buf, 0, count); printf("%s: READ addr %#lx count %llu\n", __func__, server_data->regions[0].addr, count); - ret = lm_dma_read(lm_ctx, server_data->regions[0].addr, count, buf); + ret = lm_dma_read(lm_ctx, &sg, buf); if (ret < 0) { fprintf(stderr, "lm_dma_read failed: %s\n", strerror(-ret)); return ret; @@ -253,6 +267,11 @@ int main(int argc, char *argv[]) .mmap_areas = sparse_areas, .map = map_area }, + .reg_info[LM_DEV_MIGRATION_REG_IDX] = { /* migration region */ + .flags = LM_REG_FLAG_RW, + .size = sysconf(_SC_PAGESIZE), + .mmap_areas = sparse_areas, + }, .irq_count[LM_DEV_INTX_IRQ_IDX] = 1, }, .uuid = argv[optind], |