aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/dma.c123
-rw-r--r--lib/dma.h85
-rw-r--r--lib/libmuser.c469
-rw-r--r--lib/muser.h24
-rw-r--r--lib/muser_priv.h29
-rw-r--r--lib/vfio_user.h60
-rw-r--r--samples/client.c252
-rw-r--r--samples/server.c23
8 files changed, 855 insertions, 210 deletions
diff --git a/lib/dma.c b/lib/dma.c
index 2204669..191482b 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -81,6 +81,7 @@ dma_controller_create(lm_ctx_t *lm_ctx, int max_regions)
dma->max_regions = max_regions;
dma->nregions = 0;
memset(dma->regions, 0, max_regions * sizeof(dma->regions[0]));
+ dma->dirty_pgsize = 0;
return dma;
}
@@ -106,6 +107,10 @@ _dma_controller_do_remove_region(dma_controller_t *dma,
}
}
+/*
+ * FIXME no longer used. Also, it doesn't work for addresses that span two
+ * DMA regions.
+ */
bool
dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
size_t size)
@@ -328,7 +333,7 @@ dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len)
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
int idx;
int cnt = 0;
@@ -348,6 +353,9 @@ _dma_addr_sg_split(const dma_controller_t *dma,
sg[cnt].region = idx;
sg[cnt].offset = dma_addr - region->dma_addr;
sg[cnt].length = region_len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
}
cnt++;
@@ -376,4 +384,117 @@ out:
return cnt;
}
+/*
+ * Returns the number of bytes needed for a dirty bitmap that tracks
+ * region_size bytes at a granularity of pgsize bytes (one bit per page,
+ * rounded up to whole bytes). Returns -EINVAL if pgsize is zero or if
+ * region_size is smaller than a single page.
+ */
+ssize_t _get_bitmap_size(size_t region_size, size_t pgsize)
+{
+    size_t pages, bytes;
+
+    if (pgsize == 0 || region_size < pgsize) {
+        return -EINVAL;
+    }
+
+    /* Round up to whole pages first ... */
+    pages = region_size / pgsize;
+    if (region_size % pgsize != 0) {
+        pages++;
+    }
+
+    /* ... then round the bit count up to whole bytes. */
+    bytes = pages / CHAR_BIT;
+    if (pages % CHAR_BIT != 0) {
+        bytes++;
+    }
+    return bytes;
+}
+
+/*
+ * Starts dirty page logging at a granularity of pgsize bytes by allocating a
+ * zeroed dirty bitmap for every DMA region currently registered.
+ *
+ * Returns 0 on success, including when logging is already active with the
+ * same page size. Returns -EINVAL if pgsize is zero, if logging is already
+ * active with a different page size, or if a region is smaller than pgsize.
+ * Returns -errno if a bitmap cannot be allocated. On failure every bitmap
+ * allocated by this call is released again and logging stays disabled.
+ */
+int dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize)
+{
+    int i, ret;
+
+    assert(dma != NULL);
+
+    if (pgsize == 0) {
+        return -EINVAL;
+    }
+
+    if (dma->dirty_pgsize > 0) {
+        if (dma->dirty_pgsize != pgsize) {
+            return -EINVAL;
+        }
+        return 0;
+    }
+
+    for (i = 0; i < dma->nregions; i++) {
+        dma_memory_region_t *region = &dma->regions[i];
+        ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize);
+
+        if (bitmap_size < 0) {
+            ret = bitmap_size;
+            goto undo;
+        }
+        region->dirty_bitmap = calloc(bitmap_size, sizeof(char));
+        if (region->dirty_bitmap == NULL) {
+            ret = -errno;
+            goto undo;
+        }
+    }
+    dma->dirty_pgsize = pgsize;
+    return 0;
+
+undo:
+    /*
+     * Release the bitmaps of the regions that were set up before the failing
+     * one (index i); the failing region itself holds no bitmap.
+     */
+    while (i-- > 0) {
+        free(dma->regions[i].dirty_bitmap);
+        dma->regions[i].dirty_bitmap = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Stops dirty page logging: frees the dirty bitmap of every registered region
+ * and resets the logging page size to zero. Does nothing when logging is not
+ * active. Always returns 0.
+ */
+int dma_controller_dirty_page_logging_stop(dma_controller_t *dma)
+{
+    int idx = 0;
+
+    assert(dma != NULL);
+
+    if (dma->dirty_pgsize != 0) {
+        while (idx < dma->nregions) {
+            dma_memory_region_t *region = &dma->regions[idx];
+
+            free(region->dirty_bitmap);
+            region->dirty_bitmap = NULL;
+            idx++;
+        }
+        dma->dirty_pgsize = 0;
+    }
+    return 0;
+}
+
+/*
+ * Looks up the dirty bitmap for the IOVA range [addr, addr + len).
+ *
+ * The range must resolve to exactly one scatter/gather entry that starts at
+ * addr and is len bytes long, otherwise -ENOTSUP is returned. pgsize must
+ * equal the page size logging was started with and size must equal the bitmap
+ * size in bytes for len at that page size, otherwise -EINVAL is returned.
+ *
+ * On success returns 0 and stores in *data a pointer to the region's own
+ * bitmap (not a copy).
+ */
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+                              size_t pgsize, size_t size, char **data)
+{
+    dma_sg_t sg;
+    ssize_t expected;
+
+    assert(dma != NULL);
+    assert(data != NULL);
+
+    /*
+     * FIXME for now we support IOVAs that match exactly the DMA region. This
+     * is purely for simplifying the implementation. We MUST allow arbitrary
+     * IOVAs.
+     */
+    if (dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE) != 1) {
+        return -ENOTSUP;
+    }
+    if (sg.dma_addr != addr || sg.length != len) {
+        return -ENOTSUP;
+    }
+
+    if (dma->dirty_pgsize != pgsize) {
+        return -EINVAL;
+    }
+
+    expected = _get_bitmap_size(len, pgsize);
+    if (expected < 0) {
+        return expected;
+    }
+
+    /*
+     * FIXME they must be equal because this is how much data the client
+     * expects to receive.
+     */
+    if ((size_t)expected != size) {
+        return -EINVAL;
+    }
+
+    *data = dma->regions[sg.region].dirty_bitmap;
+
+    return 0;
+}
+
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/dma.h b/lib/dma.h
index 1713bc5..7715b89 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -32,6 +32,11 @@
#define DMA_DMA_H
/*
+ * FIXME check whether DMA regions must be page aligned. If so then the
+ * implementation can be greatly simplified.
+ */
+
+/*
* This library emulates a DMA controller for a device emulation application to
* perform DMA operations on a foreign memory space.
*
@@ -82,12 +87,14 @@ typedef struct {
off_t offset; // File offset
void *virt_addr; // Virtual address of this region
int refcnt; // Number of users of this region
+ char *dirty_bitmap; // Dirty page bitmap
} dma_memory_region_t;
typedef struct {
int max_regions;
int nregions;
struct lm_ctx *lm_ctx;
+ size_t dirty_pgsize; // Dirty page granularity
dma_memory_region_t regions[0];
} dma_controller_t;
@@ -118,7 +125,59 @@ dma_controller_remove_region(dma_controller_t *dma,
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
+
+/*
+ * Tells whether an access with protection prot must be recorded in the dirty
+ * bitmaps: true only for accesses that include PROT_WRITE while dirty page
+ * logging is active (dirty_pgsize is non-zero).
+ */
+static bool
+_dma_should_mark_dirty(const dma_controller_t *dma, int prot)
+{
+    assert(dma != NULL);
+
+    if ((prot & PROT_WRITE) != PROT_WRITE) {
+        return false;
+    }
+    return dma->dirty_pgsize > 0;
+}
+
+/*
+ * Returns the index of the pgsize-sized page that contains offset, counting
+ * pages from base_addr. pgsize must be non-zero.
+ */
+static size_t
+_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset)
+{
+    uint64_t distance = offset - base_addr;
+
+    return distance / pgsize;
+}
+
+/*
+ * Returns the index of the last page of a span of len bytes whose first page
+ * is start, i.e. start plus the number of pgsize-sized pages needed to hold
+ * len bytes (rounded up), minus one. pgsize must be non-zero.
+ */
+static size_t
+_get_pgend(size_t pgsize, uint64_t len, size_t start)
+{
+    uint64_t pages = len / pgsize;
+
+    if (len % pgsize != 0) {
+        pages++;
+    }
+    return start + pages - 1;
+}
+
+/*
+ * Computes the inclusive range [*start, *end] of dirty bitmap page indices
+ * covered by the scatter/gather entry sg, at the controller's dirty page
+ * granularity.
+ *
+ * sg->offset is already relative to the start of the region (it is set to
+ * dma_addr - region->dma_addr when the entry is built), so it must not be
+ * rebased against region->dma_addr a second time. The range also accounts
+ * for an access that begins part-way through a page and therefore reaches
+ * into one more page than its length alone would suggest.
+ *
+ * The caller must ensure that logging is active (dma->dirty_pgsize > 0) and
+ * that sg->length is non-zero, because an empty inclusive range cannot be
+ * expressed.
+ */
+static void
+_dma_bitmap_get_pgrange(const dma_controller_t *dma,
+                        const dma_memory_region_t *region,
+                        const dma_sg_t *sg, size_t *start, size_t *end)
+{
+    uint64_t in_page;
+
+    assert(dma != NULL);
+    assert(region != NULL);
+    assert(sg != NULL);
+    assert(start != NULL);
+    assert(end != NULL);
+
+    /* Bytes between the start of the first page and the first byte touched. */
+    in_page = (uint64_t)sg->offset % dma->dirty_pgsize;
+
+    *start = _get_pgstart(dma->dirty_pgsize, 0, sg->offset);
+    *end = _get_pgend(dma->dirty_pgsize, in_page + (uint64_t)sg->length,
+                      *start);
+}
+
+/*
+ * Sets the dirty bit of every page of region that the scatter/gather entry
+ * sg touches. region->dirty_bitmap must have been allocated, i.e. dirty page
+ * logging must be active. A zero-length entry marks nothing.
+ */
+static void
+_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
+                dma_sg_t *sg)
+{
+    size_t i, start, end;
+
+    assert(dma != NULL);
+    assert(region != NULL);
+    assert(sg != NULL);
+    assert(region->dirty_bitmap != NULL);
+
+    if (sg->length == 0) {
+        /* Nothing is accessed; the inclusive range would underflow. */
+        return;
+    }
+
+    _dma_bitmap_get_pgrange(dma, region, sg, &start, &end);
+    for (i = start; i <= end; i++) {
+        region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
+    }
+}
/* Takes a linear dma address span and returns a sg list suitable for DMA.
* A single linear dma address span may need to be split into multiple
@@ -134,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma,
static inline int
dma_addr_to_sg(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
static __thread int region_hint;
int cnt;
@@ -150,10 +209,13 @@ dma_addr_to_sg(const dma_controller_t *dma,
sg->region = region_hint;
sg->offset = dma_addr - region->dma_addr;
sg->length = len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
return 1;
}
// Slow path: search through regions.
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg);
+ cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot);
if (likely(cnt > 0)) {
region_hint = sg->region;
}
@@ -186,6 +248,7 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov,
return 0;
}
+/* FIXME useless define */
#define UNUSED __attribute__((unused))
static inline void
@@ -215,12 +278,12 @@ dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg,
}
static inline void *
-dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len)
+dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot)
{
dma_sg_t sg;
struct iovec iov;
- if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 &&
+ if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 &&
dma_map_sg(dma, &sg, &iov, 1) == 0) {
return iov.iov_base;
}
@@ -239,12 +302,22 @@ dma_unmap_addr(dma_controller_t *dma,
};
int r;
- r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1);
+ r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE);
assert(r == 1);
dma_unmap_sg(dma, &sg, &iov, 1);
}
+/*
+ * Starts dirty page logging with a granularity of pgsize bytes, allocating a
+ * dirty bitmap for every registered DMA region. Returns 0 on success (also
+ * when logging is already active with the same pgsize) and a negative errno
+ * value on failure, e.g. -EINVAL if pgsize is zero or differs from the page
+ * size logging was already started with.
+ */
+int
+dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize);
+
+/*
+ * Stops dirty page logging and frees the dirty bitmaps of all regions. Does
+ * nothing if logging is not active. Always returns 0.
+ */
+int
+dma_controller_dirty_page_logging_stop(dma_controller_t *dma);
+
+/*
+ * Retrieves the dirty bitmap for the IOVA range [addr, addr + len). The range
+ * currently has to match a DMA region exactly (-ENOTSUP otherwise). pgsize
+ * must equal the page size logging was started with and size must equal the
+ * bitmap size in bytes for len at that page size (-EINVAL otherwise). On
+ * success returns 0 and stores a pointer to the region's own bitmap, not a
+ * copy, in *data.
+ */
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+                              size_t pgsize, size_t size, char **data);
+
bool
dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
size_t size);
diff --git a/lib/libmuser.c b/lib/libmuser.c
index 547a318..0aa3443 100644
--- a/lib/libmuser.c
+++ b/lib/libmuser.c
@@ -112,6 +112,8 @@ struct lm_ctx {
int client_max_fds;
+ size_t migration_pgsize;
+
lm_irqs_t irqs; /* XXX must be last */
};
@@ -270,14 +272,20 @@ __free_s(char **p)
}
int
-send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
- enum vfio_user_command cmd, void *data, int len,
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
int *fds, int count)
{
int ret;
struct vfio_user_header hdr = {.msg_id = msg_id};
- struct iovec iov[2];
struct msghdr msg;
+ size_t i;
+
+ if (nr_iovecs == 0) {
+ iovecs = alloca(sizeof(*iovecs));
+ nr_iovecs = 1;
+ }
memset(&msg, 0, sizeof(msg));
@@ -288,23 +296,15 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
hdr.flags.type = VFIO_USER_F_TYPE_COMMAND;
}
- if (data != NULL && len == 0) {
- return -EINVAL;
- }
-
- hdr.msg_size = sizeof(hdr) + len;
+ iovecs[0].iov_base = &hdr;
+ iovecs[0].iov_len = sizeof(hdr);
- iov[0].iov_base = &hdr;
- iov[0].iov_len = sizeof(hdr);
- msg.msg_iovlen = 1;
-
- if (data != NULL) {
- msg.msg_iovlen++;
- iov[1].iov_base = data;
- iov[1].iov_len = len;
+ for (i = 0; i < nr_iovecs; i++) {
+ hdr.msg_size += iovecs[i].iov_len;
}
- msg.msg_iov = iov;
+ msg.msg_iovlen = nr_iovecs;
+ msg.msg_iov = iovecs;
if (fds != NULL) {
size_t size = count * sizeof *fds;
@@ -329,26 +329,43 @@ send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
}
int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+                   enum vfio_user_command cmd,
+                   void *data, size_t data_len,
+                   int *fds, size_t count)
+{
+    /*
+     * Convenience wrapper around _send_vfio_user_msg() for a message with a
+     * single payload buffer. Slot 0 of the I/O vector is left zeroed because
+     * _send_vfio_user_msg() points it at the message header; slot 1 carries
+     * the caller's payload.
+     */
+    struct iovec iovecs[2] = {0};
+
+    iovecs[1].iov_base = data;
+    iovecs[1].iov_len = data_len;
+
+    return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs,
+                               ARRAY_SIZE(iovecs), fds, count);
+}
+
+int
send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
char *caps)
{
int ret;
- char *data __attribute__((__cleanup__(__free_s))) = NULL;
+ char *data;
- ret = asprintf(&data, "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}",
+ ret = asprintf(&data,
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}",
major, minor, caps != NULL ? caps : "{}");
if (ret == -1) {
- data = NULL;
return -1;
}
-
- return send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data,
- ret, NULL, 0);
+ ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data,
+ ret, NULL, 0);
+ free(data);
+ return ret;
}
int
recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
- uint16_t *msg_id, void *data, int *len)
+ uint16_t *msg_id, void *data, size_t *len)
{
int ret;
@@ -388,7 +405,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
if (ret < 0) {
return ret;
}
- if (*len != ret) { /* FIXME we should allow receiving less */
+ if (*len != (size_t)ret) { /* FIXME we should allow receiving less */
return -EINVAL;
}
*len = ret;
@@ -398,7 +415,7 @@ recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
int
recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
- int *max_fds)
+ int *max_fds, size_t *pgsize)
{
int ret;
struct vfio_user_header hdr;
@@ -424,23 +441,23 @@ recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
/* FIXME use proper parsing */
ret = sscanf(data,
- "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d}}",
- major, minor, max_fds);
- if (ret != 3) {
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}",
+ major, minor, max_fds, pgsize);
+ if (ret != 4) {
return -EINVAL;
}
return 0;
}
int
-send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
- void *send_data, int send_len,
- int *send_fds, int fd_count,
- struct vfio_user_header *hdr,
- void *recv_data, int recv_len)
-{
- int ret = send_vfio_user_msg(sock, msg_id, false, cmd, send_data, send_len,
- send_fds, fd_count);
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs,
+ send_fds, fd_count);
if (ret < 0) {
return ret;
}
@@ -450,6 +467,24 @@ send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len);
}
+/*
+ * Convenience wrapper around _send_recv_vfio_user_msg() for a request with a
+ * single payload buffer: sends send_data (send_len bytes) together with the
+ * optional file descriptors and then receives the reply into hdr/recv_data.
+ * Slot 0 of the I/O vector is left zeroed for the message header; slot 1
+ * carries the caller's payload.
+ */
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+                        void *send_data, size_t send_len,
+                        int *send_fds, size_t fd_count,
+                        struct vfio_user_header *hdr,
+                        void *recv_data, size_t recv_len)
+{
+    struct iovec iovecs[2] = {0};
+
+    iovecs[1].iov_base = send_data;
+    iovecs[1].iov_len = send_len;
+
+    return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs,
+                                    ARRAY_SIZE(iovecs), send_fds, fd_count,
+                                    hdr, recv_data, recv_len);
+}
+
static int
set_version(lm_ctx_t *lm_ctx, int sock)
{
@@ -458,7 +493,8 @@ set_version(lm_ctx_t *lm_ctx, int sock)
uint16_t msg_id = 0;
char *server_caps;
- ret = asprintf(&server_caps, "{max_fds: %d}", MAX_FDS);
+ ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}",
+ MAX_FDS, sysconf(_SC_PAGESIZE));
if (ret == -1) {
return -ENOMEM;
}
@@ -471,7 +507,7 @@ set_version(lm_ctx_t *lm_ctx, int sock)
}
ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true,
- &lm_ctx->client_max_fds);
+ &lm_ctx->client_max_fds, &lm_ctx->migration_pgsize);
if (ret < 0) {
lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret));
goto out;
@@ -482,7 +518,18 @@ set_version(lm_ctx_t *lm_ctx, int sock)
LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN,
client_mj, client_mn);
ret = -EINVAL;
+ goto out;
+ }
+ if (lm_ctx->migration_pgsize == 0) {
+ lm_log(lm_ctx, LM_ERR, "bad migration page size");
+ ret = -EINVAL;
+ goto out;
}
+
+ /* FIXME need to check max_fds */
+
+ lm_ctx->migration_pgsize = MIN(lm_ctx->migration_pgsize,
+ sysconf(_SC_PAGESIZE));
out:
free(server_caps);
return ret;
@@ -932,23 +979,33 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
* points accordingly.
*/
static int
-dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg,
+dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
struct vfio_region_info **vfio_reg, bool is_kernel)
{
+ struct vfio_info_cap_header *header;
+ struct vfio_region_info_cap_type *type = NULL;
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct lm_sparse_mmap_areas *mmap_areas;
int nr_mmap_areas, i;
- size_t sparse_size;
+ size_t type_size = 0;
+ size_t sparse_size = 0;
+ size_t cap_size;
ssize_t ret;
void *cap_ptr;
- if (lm_reg->mmap_areas == NULL) {
- lm_log(lm_ctx, LM_DBG, "bad mmap_areas\n");
- return -EINVAL;
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type_size = sizeof(struct vfio_region_info_cap_type);
+ }
+
+ if (lm_reg->mmap_areas != NULL) {
+ nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
+ sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
}
- nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
- sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
+ cap_size = type_size + sparse_size;
+ if (cap_size == 0) {
+ return 0;
+ }
/*
* If vfio_reg does not have enough space to accommodate sparse info then
@@ -956,54 +1013,79 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg,
* is only for kernel/muser.ko, where the request comes from kernel/vfio.
*/
- if ((*vfio_reg)->argsz < sparse_size + sizeof(**vfio_reg) && is_kernel) {
+ if ((*vfio_reg)->argsz < cap_size + sizeof(**vfio_reg) && is_kernel) {
lm_log(lm_ctx, LM_DBG, "vfio_reg too small=%d\n", (*vfio_reg)->argsz);
- (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg);
+ (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
(*vfio_reg)->cap_offset = 0;
return 0;
}
- sparse = calloc(1, sparse_size);
- if (sparse == NULL)
+ /* TODO doesn't need to be calloc, we overwrite it entirely */
+ header = calloc(1, cap_size);
+ if (header == NULL) {
return -ENOMEM;
- sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->header.version = 1;
- sparse->header.next = 0;
- sparse->nr_areas = nr_mmap_areas;
+ }
- lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__,
- sparse_size, nr_mmap_areas);
- mmap_areas = lm_reg->mmap_areas;
- for (i = 0; i < nr_mmap_areas; i++) {
- sparse->areas[i].offset = mmap_areas->areas[i].start;
- sparse->areas[i].size = mmap_areas->areas[i].size;
- lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__,
- i, sparse->areas[i].offset, sparse->areas[i].size);
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type = (struct vfio_region_info_cap_type*)header;
+ type->header.id = VFIO_REGION_INFO_CAP_TYPE;
+ type->header.version = 1;
+ type->header.next = 0;
+ type->type = VFIO_REGION_TYPE_MIGRATION;
+ type->subtype = VFIO_REGION_SUBTYPE_MIGRATION;
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
}
+ if (lm_reg->mmap_areas != NULL) {
+ if (type != NULL) {
+ type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1);
+ } else {
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)header;
+ }
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->header.next = 0;
+ sparse->nr_areas = nr_mmap_areas;
+
+ lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__,
+ sparse_size, nr_mmap_areas);
+ mmap_areas = lm_reg->mmap_areas;
+ for (i = 0; i < nr_mmap_areas; i++) {
+ sparse->areas[i].offset = mmap_areas->areas[i].start;
+ sparse->areas[i].size = mmap_areas->areas[i].size;
+ lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__,
+ i, sparse->areas[i].offset, sparse->areas[i].size);
+ }
+ }
+
+ /*
+ * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is
+ * memory-mappable in general, not only if it supports sparse mmap.
+ */
(*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
- (*vfio_reg)->cap_offset = sizeof(**vfio_reg);
if (is_kernel) {
/* write the sparse mmap cap info to vfio-client user pages */
- ret = write(lm_ctx->conn_fd, sparse, sparse_size);
- if (ret != (ssize_t)sparse_size) {
- free(sparse);
+ ret = write(lm_ctx->conn_fd, header, cap_size);
+ if (ret != (ssize_t)cap_size) {
+ free(header);
return -EIO;
}
} else {
- (*vfio_reg)->argsz = sparse_size + sizeof(**vfio_reg);
+ (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
*vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz);
if (*vfio_reg == NULL) {
- free(sparse);
+ free(header);
return -ENOMEM;
}
cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset;
- memcpy(cap_ptr, sparse, sparse_size);
+ memcpy(cap_ptr, header, cap_size);
}
- free(sparse);
+ free(header);
return 0;
}
@@ -1073,11 +1155,10 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg,
(*vfio_reg)->flags = lm_reg->flags;
(*vfio_reg)->size = lm_reg->size;
- if (lm_reg->mmap_areas != NULL) {
- err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg, is_kernel);
- if (err) {
- return err;
- }
+ err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg,
+ is_kernel);
+ if (err) {
+ return err;
}
lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu "
@@ -1089,8 +1170,9 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg,
}
static long
-dev_get_info(struct vfio_device_info *dev_info)
+dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info)
{
+ assert(lm_ctx != NULL);
assert(dev_info != NULL);
// Ensure provided argsz is sufficiently big.
@@ -1114,7 +1196,7 @@ do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
assert(lm_ctx != NULL);
switch (cmd_ioctl->vfio_cmd) {
case VFIO_DEVICE_GET_INFO:
- err = dev_get_info(&cmd_ioctl->data.dev_info);
+ err = dev_get_info(lm_ctx, &cmd_ioctl->data.dev_info);
break;
case VFIO_DEVICE_GET_REGION_INFO:
reg_info = &cmd_ioctl->data.reg_info;
@@ -1737,7 +1819,7 @@ static int handle_device_get_info(lm_ctx_t *lm_ctx,
return -errno;
}
- ret = dev_get_info(dev_info);
+ ret = dev_get_info(lm_ctx, dev_info);
if (ret < 0) {
return ret;
}
@@ -1919,7 +2001,7 @@ handle_device_reset(lm_ctx_t *lm_ctx)
static int
handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
- void **data, int *len)
+ void **data, size_t *len)
{
struct vfio_user_region_access region_access;
struct muser_cmd muser_cmd = {};
@@ -1973,6 +2055,105 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
return 0;
}
+/*
+ * Handles the "get bitmap" flavour of VFIO_USER_DIRTY_PAGES.
+ *
+ * Receives from the connection the array of
+ * struct vfio_iommu_type1_dirty_bitmap_get ranges that follows the
+ * struct vfio_iommu_type1_dirty_bitmap in the request and builds the reply
+ * I/O vector: element 0 is left zeroed for the message header and element
+ * i (i >= 1) points at the dirty bitmap of range i - 1.
+ *
+ * On success returns 0, stores a heap-allocated array in *iovecs (to be freed
+ * by the caller) and its length in *nr_iovecs. The iov_base pointers
+ * reference bitmaps owned by the DMA controller and must not be freed. On
+ * failure returns a negative errno value, sets *iovecs to NULL and
+ * *nr_iovecs to 0.
+ */
+static int
+handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+                       struct iovec **iovecs, size_t *nr_iovecs)
+{
+    const size_t fixed_size = sizeof(*hdr) +
+                              sizeof(struct vfio_iommu_type1_dirty_bitmap);
+    struct vfio_iommu_type1_dirty_bitmap_get *ranges;
+    struct iovec *iov = NULL;
+    size_t size, nr_ranges, i;
+    ssize_t nr_recv;
+    int ret;
+
+    assert(lm_ctx != NULL);
+    assert(hdr != NULL);
+    assert(iovecs != NULL);
+    assert(nr_iovecs != NULL);
+
+    *iovecs = NULL;
+    *nr_iovecs = 0;
+
+    /* Guard against the unsigned subtraction below wrapping around. */
+    if (hdr->msg_size < fixed_size) {
+        return -EINVAL;
+    }
+    size = hdr->msg_size - fixed_size;
+    if (size == 0 || size % sizeof(*ranges) != 0) {
+        return -EINVAL;
+    }
+    nr_ranges = size / sizeof(*ranges);
+
+    ranges = malloc(size);
+    if (ranges == NULL) {
+        return -errno;
+    }
+    nr_recv = recv(lm_ctx->conn_fd, ranges, size, 0);
+    if (nr_recv == -1) {
+        ret = -errno;
+        goto out;
+    }
+    if ((size_t)nr_recv != size) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /*
+     * One extra element up front for the header. The array must be sized in
+     * elements, not bytes; calloc also checks the multiplication for overflow.
+     */
+    iov = calloc(nr_ranges + 1, sizeof(*iov));
+    if (iov == NULL) {
+        ret = -errno;
+        goto out;
+    }
+
+    for (i = 0; i < nr_ranges; i++) {
+        struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[i];
+
+        ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size,
+                                            r->bitmap.pgsize, r->bitmap.size,
+                                            (char **)&iov[i + 1].iov_base);
+        if (ret != 0) {
+            goto out;
+        }
+        iov[i + 1].iov_len = r->bitmap.size;
+    }
+
+    /* Success: hand the vector over to the caller. */
+    *iovecs = iov;
+    *nr_iovecs = nr_ranges + 1;
+    iov = NULL;
+    ret = 0;
+out:
+    free(iov);
+    free(ranges);
+    return ret;
+}
+
+/*
+ * Handles a VFIO_USER_DIRTY_PAGES request.
+ *
+ * Receives the struct vfio_iommu_type1_dirty_bitmap that follows the message
+ * header and dispatches on its flags, tested in this order: START begins
+ * dirty page logging at the negotiated migration page size, STOP ends it, and
+ * GET_BITMAP fills *iovecs / *nr_iovecs with the requested bitmaps (see
+ * handle_dirty_pages_get). Any other flags value yields -EINVAL.
+ *
+ * Returns 0 on success and a negative errno value on failure.
+ */
+static int
+handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+                   struct iovec **iovecs, size_t *nr_iovecs)
+{
+    struct vfio_iommu_type1_dirty_bitmap dirty_bitmap;
+    int ret;
+
+    assert(lm_ctx != NULL);
+    assert(hdr != NULL);
+    assert(iovecs != NULL);
+    assert(nr_iovecs != NULL);
+
+    /*
+     * Compare against the sum rather than subtracting: with unsigned
+     * arithmetic "msg_size - sizeof *hdr" wraps around for a message shorter
+     * than the header and would let it pass the check.
+     */
+    if (hdr->msg_size < sizeof *hdr + sizeof dirty_bitmap) {
+        lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size);
+        return -EINVAL;
+    }
+
+    /* FIXME must also check argsz */
+
+    ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0);
+    if (ret == -1) {
+        return -errno;
+    }
+    if ((size_t)ret < sizeof dirty_bitmap) {
+        return -EINVAL;
+    }
+
+    if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+        ret = dma_controller_dirty_page_logging_start(lm_ctx->dma,
+                                                      lm_ctx->migration_pgsize);
+    } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+        ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma);
+    } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+        ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs);
+    } else {
+        ret = -EINVAL;
+    }
+
+    return ret;
+}
+
/*
* FIXME return value is messed up, sometimes we return -1 and set errno while
* other times we return -errno. Fix.
@@ -1988,9 +2169,9 @@ process_request(lm_ctx_t *lm_ctx)
struct vfio_irq_info irq_info;
struct vfio_device_info dev_info;
struct vfio_region_info *dev_reg_info = NULL;
- void *data = NULL;
- bool free_data = false;
- int len = 0;
+ struct iovec _iovecs[2] = {0}, *iovecs = NULL;
+ size_t nr_iovecs = 0;
+ bool free_iovec_data = true;
assert(lm_ctx != NULL);
@@ -2034,6 +2215,12 @@ process_request(lm_ctx_t *lm_ctx)
return -EINVAL;
}
+ /* FIXME in most of the following functions we check that hdr.count is >=
+ * the size of the command-specific struct and there is an additional recv(2) for
+ * that data. We should eliminate duplicating this common code and move it
+ * here.
+ */
+
switch (hdr.cmd) {
case VFIO_USER_DMA_MAP:
case VFIO_USER_DMA_UNMAP:
@@ -2044,23 +2231,28 @@ process_request(lm_ctx_t *lm_ctx)
case VFIO_USER_DEVICE_GET_INFO:
ret = handle_device_get_info(lm_ctx, &hdr, &dev_info);
if (ret == 0) {
- data = &dev_info;
- len = dev_info.argsz;
+ _iovecs[1].iov_base = &dev_info;
+ _iovecs[1].iov_len = dev_info.argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
}
break;
case VFIO_USER_DEVICE_GET_REGION_INFO:
ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info);
if (ret == 0) {
- data = dev_reg_info;
- len = dev_reg_info->argsz;
- free_data = true;
+ _iovecs[1].iov_base = dev_reg_info;
+ _iovecs[1].iov_len = dev_reg_info->argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
}
break;
case VFIO_USER_DEVICE_GET_IRQ_INFO:
ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info);
if (ret == 0) {
- data = &irq_info;
- len = sizeof irq_info;
+ _iovecs[1].iov_base = &irq_info;
+ _iovecs[1].iov_len = sizeof irq_info;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
}
break;
case VFIO_USER_DEVICE_SET_IRQS:
@@ -2068,12 +2260,20 @@ process_request(lm_ctx_t *lm_ctx)
break;
case VFIO_USER_REGION_READ:
case VFIO_USER_REGION_WRITE:
- ret = handle_region_access(lm_ctx, &hdr, &data, &len);
- free_data = true;
+ iovecs = _iovecs;
+ ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base,
+ &iovecs[1].iov_len);
+ nr_iovecs = 2;
break;
case VFIO_USER_DEVICE_RESET:
ret = handle_device_reset(lm_ctx);
break;
+ case VFIO_USER_DIRTY_PAGES:
+ ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs);
+ if (ret >= 0) {
+ free_iovec_data = false;
+ }
+ break;
default:
lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd);
return -EINVAL;
@@ -2083,14 +2283,23 @@ process_request(lm_ctx_t *lm_ctx)
* TODO: In case of error during command handling set errno respectively
* in the reply message.
*/
- ret = send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true,
- 0, data, len, NULL, 0);
+ if (ret < 0) {
+ assert(false); /* FIXME */
+ }
+ ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true,
+ 0, iovecs, nr_iovecs, NULL, 0);
if (unlikely(ret < 0)) {
lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n",
strerror(-ret));
}
- if (free_data) {
- free(data);
+ if (iovecs != NULL && iovecs != _iovecs) {
+ if (free_iovec_data) {
+ size_t i;
+ for (i = 0; i < nr_iovecs; i++) {
+ free(iovecs[i].iov_base);
+ }
+ }
+ free(iovecs);
}
return ret;
@@ -2190,8 +2399,9 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex)
irq_info.subindex = subindex;
ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id,
- VFIO_USER_VM_INTERRUPT, &irq_info,
- sizeof(irq_info), NULL, 0, NULL, NULL, 0);
+ VFIO_USER_VM_INTERRUPT,
+ &irq_info, sizeof irq_info,
+ NULL, 0, NULL, NULL, 0);
if (ret < 0) {
errno = -ret;
return -1;
@@ -2291,6 +2501,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
{
lm_reg_info_t *cfg_reg;
const lm_reg_info_t zero_reg = { 0 };
+ lm_reg_info_t *migr_reg;
int i;
assert(lm_ctx != NULL);
@@ -2347,6 +2558,16 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF;
}
+ /*
+ * Check the migration region.
+ */
+ migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX];
+ if (migr_reg->size > 0) {
+ if (migr_reg->size < sizeof(struct vfio_device_migration_info)) {
+ return -EINVAL;
+ }
+ }
+
return 0;
err:
@@ -2529,13 +2750,15 @@ lm_get_region_info(lm_ctx_t *lm_ctx)
inline int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr,
- uint32_t len, dma_sg_t *sg, int max_sg)
+ uint32_t len, dma_sg_t *sg, int max_sg, int prot)
{
+ assert(lm_ctx != NULL);
+
if (unlikely(lm_ctx->unmap_dma == NULL)) {
errno = EINVAL;
return -1;
}
- return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg);
+ return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot);
}
inline int
@@ -2581,62 +2804,54 @@ lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id)
}
int
-lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data)
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
{
struct vfio_user_dma_region_access *dma_recv;
- struct vfio_user_dma_region_access dma_send = {
- .addr = addr,
- .count = count
- };
- int recv_size = sizeof(*dma_recv) + count;
+ struct vfio_user_dma_region_access dma_send;
+ int recv_size;
int msg_id = 1, ret;
- if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) {
- lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu doest not "
- "exists", addr, count);
- return -ENOENT;
- }
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
+
+ recv_size = sizeof(*dma_recv) + sg->length;
dma_recv = calloc(recv_size, 1);
if (dma_recv == NULL) {
return -ENOMEM;
}
- dma_recv->addr = addr;
- dma_recv->count = count;
+ dma_send.addr = sg->dma_addr;
+ dma_send.count = sg->length;
ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ,
- &dma_send, sizeof(dma_send), NULL, 0, NULL,
+ &dma_send, sizeof dma_send, NULL, 0, NULL,
dma_recv, recv_size);
- memcpy(data, dma_recv->data, count);
+ memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */
free(dma_recv);
return ret;
}
int
-lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data)
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
{
struct vfio_user_dma_region_access *dma_send, dma_recv;
- int send_size = sizeof(*dma_send) + count;
+ int send_size = sizeof(*dma_send) + sg->length;
int msg_id = 1, ret;
- if (!dma_controller_region_valid(lm_ctx->dma, addr, count)) {
- lm_log(lm_ctx, LM_ERR, "DMA region addr %#lx count %llu does not "
- "exists", addr, count);
- return -ENOENT;
- }
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
dma_send = calloc(send_size, 1);
if (dma_send == NULL) {
return -ENOMEM;
}
- dma_send->addr = addr;
- dma_send->count = count;
- memcpy(dma_send->data, data, count);
-
+ dma_send->addr = sg->dma_addr;
+ dma_send->count = sg->length;
+ memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! */
ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE,
- dma_send, send_size, NULL, 0, NULL, &dma_recv,
- sizeof(dma_recv));
+ dma_send, send_size,
+ NULL, 0, NULL, &dma_recv, sizeof(dma_recv));
free(dma_send);
return ret;
diff --git a/lib/muser.h b/lib/muser.h
index 3c24a7a..375be0e 100644
--- a/lib/muser.h
+++ b/lib/muser.h
@@ -159,6 +159,7 @@ enum {
LM_DEV_NUM_IRQS
};
+/* FIXME these are PCI regions */
enum {
LM_DEV_BAR0_REG_IDX,
LM_DEV_BAR1_REG_IDX,
@@ -169,7 +170,15 @@ enum {
LM_DEV_ROM_REG_IDX,
LM_DEV_CFG_REG_IDX,
LM_DEV_VGA_REG_IDX,
- LM_DEV_NUM_REGS = 9
+ /*
+ * FIXME this doesn't really belong here, but it simplifies the implementation for now. A
+ * migration region can exist for non-PCI devices (can its index be
+ * anything?). In any case, we should allow the user to define custom regions
+ * at will, by fixing the migration region in that position we don't allow
+ * this.
+ */
+ LM_DEV_MIGRATION_REG_IDX,
+ LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */
};
typedef struct {
@@ -426,6 +435,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex);
* @len: size of memory to be mapped
* @sg: array that receives the scatter/gather entries to be mapped
* @max_sg: maximum number of elements in above array
+ * @prot: protection as defined in <sys/mman.h>
*
* @returns the number of scatter/gather entries created on success, and on
* failure:
@@ -435,7 +445,7 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex);
*/
int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
/**
* Maps a list scatter/gather entries from the guest's physical address space
@@ -488,23 +498,21 @@ lm_get_region(loff_t pos, size_t count, loff_t *off);
* Read from the dma region exposed by the client.
*
* @lm_ctx: the libmuser context
- * @addr: dma address exposed by the client
- * @count: size of the data to read
+ * @sg: a DMA segment obtained from lm_addr_to_sg
* @data: data buffer to read into
*/
int
-lm_dma_read(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data);
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
/**
* Write to the dma region exposed by the client.
*
* @lm_ctx: the libmuser context
- * @addr: dma address exposed by the client
- * @count: size of the data to write
+ * @sg: a DMA segment obtained from lm_addr_to_sg
* @data: data buffer to write
*/
int
-lm_dma_write(lm_ctx_t *lm_ctx, dma_addr_t addr, size_t count, void *data);
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
/*
* Advanced stuff.
diff --git a/lib/muser_priv.h b/lib/muser_priv.h
index 8d07b2c..c45a8f3 100644
--- a/lib/muser_priv.h
+++ b/lib/muser_priv.h
@@ -48,13 +48,21 @@ uint64_t
region_to_offset(uint32_t region);
int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count);
+
+int
send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
- enum vfio_user_command cmd, void *data, int len, int *fds,
- int count);
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count);
+
int
recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
- uint16_t *msg_id, void *data, int *len);
+ uint16_t *msg_id, void *data, size_t *len);
int
send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
@@ -62,14 +70,21 @@ send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
int
recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
- int *max_fds);
+ int *max_fds, size_t *pgsize);
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len);
int
send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
- void *send_data, int send_len,
- int *send_fds, int fd_count,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
struct vfio_user_header *hdr,
- void *recv_data, int recv_len);
+ void *recv_data, size_t recv_len);
#endif /* MUSER_PRIV_H */
diff --git a/lib/vfio_user.h b/lib/vfio_user.h
index 890fc43..19f751a 100644
--- a/lib/vfio_user.h
+++ b/lib/vfio_user.h
@@ -36,6 +36,7 @@
#include <inttypes.h>
#include <linux/vfio.h>
+#include <linux/version.h>
enum vfio_user_command {
VFIO_USER_VERSION = 1,
@@ -51,6 +52,7 @@ enum vfio_user_command {
VFIO_USER_DMA_WRITE = 11,
VFIO_USER_VM_INTERRUPT = 12,
VFIO_USER_DEVICE_RESET = 13,
+ VFIO_USER_DIRTY_PAGES = 14,
VFIO_USER_MAX,
};
@@ -102,6 +104,64 @@ struct vfio_user_irq_info {
uint32_t subindex;
} __attribute__((packed));
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)
+
+/* copied from <linux/vfio.h> */
+
+#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+
+/*
+ * Device migration info, mirrored from <linux/vfio.h> so that the library
+ * still builds against kernel headers older than 5.8. See that header for the
+ * authoritative description of each field.
+ */
+struct vfio_device_migration_info {
+    __u32 device_state;         /* VFIO device state */
+#define VFIO_DEVICE_STATE_STOP      (0)
+#define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
+#define VFIO_DEVICE_STATE_SAVING    (1 << 1)
+#define VFIO_DEVICE_STATE_RESUMING  (1 << 2)
+#define VFIO_DEVICE_STATE_MASK      (VFIO_DEVICE_STATE_RUNNING | \
+                                     VFIO_DEVICE_STATE_SAVING |  \
+                                     VFIO_DEVICE_STATE_RESUMING)
+
+/* RESUMING may not be combined with any other state bit */
+#define VFIO_DEVICE_STATE_VALID(state) \
+    ((state) & VFIO_DEVICE_STATE_RESUMING ? \
+    ((state) & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+/* SAVING | RESUMING is the combination used to flag an error */
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+    (((state) & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+                                            VFIO_DEVICE_STATE_RESUMING))
+
+/* was spelled VFIO_DEVICE_SATE_SAVING, which is not defined anywhere */
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+    (((state) & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
+                                           VFIO_DEVICE_STATE_RESUMING)
+
+    __u32 reserved;
+    __u64 pending_bytes;
+    __u64 data_offset;
+    __u64 data_size;
+};
+
+struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+ __u64 *data; /* one bit per page */
+};
+
+struct vfio_iommu_type1_dirty_bitmap {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+ __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+ __u64 iova; /* IO virtual address */
+ __u64 size; /* Size of iova range */
+ struct vfio_bitmap bitmap;
+};
+
+#endif
+
#endif
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/samples/client.c b/samples/client.c
index 925335d..5ff79cd 100644
--- a/samples/client.c
+++ b/samples/client.c
@@ -39,6 +39,7 @@
#include <time.h>
#include <err.h>
#include <assert.h>
+//#include <sys/uio.h>
#include "../lib/muser.h"
#include "../lib/muser_priv.h"
@@ -66,13 +67,16 @@ init_sock(const char *path)
}
static int
-set_version(int sock, int client_max_fds, int *server_max_fds)
+set_version(int sock, int client_max_fds, int *server_max_fds, size_t *pgsize)
{
int ret, mj, mn;
uint16_t msg_id;
char *client_caps = NULL;
- ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds);
+ assert(server_max_fds != NULL);
+ assert(pgsize != NULL);
+
+ ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds, pgsize);
if (ret < 0) {
fprintf(stderr, "failed to receive version from server: %s\n",
strerror(-ret));
@@ -85,7 +89,8 @@ set_version(int sock, int client_max_fds, int *server_max_fds)
goto out;
}
- ret = asprintf(&client_caps, "{max_fds: %d}", client_max_fds);
+ ret = asprintf(&client_caps, "{max_fds: %d, migration: {pgsize: %lu}}",
+ client_max_fds, sysconf(_SC_PAGESIZE));
if (ret == -1) {
client_caps = NULL;
ret = -ENOMEM; /* FIXME */
@@ -115,14 +120,64 @@ send_device_reset(int sock)
}
+/*
+ * Receives and prints the VFIO capability chain that the server sends right
+ * after a struct vfio_region_info reply.
+ *
+ * @sock: connected socket to read the capability bytes from
+ * @cap_sz: number of capability bytes following the region info
+ *
+ * Returns 0 on success and -ENOMEM if the receive buffer cannot be allocated.
+ * Every other failure (receive error, short read, malformed or unknown
+ * capability) terminates the client.
+ */
static int
+get_region_vfio_caps(int sock, size_t cap_sz)
+{
+    struct vfio_info_cap_header *header;
+    struct vfio_region_info_cap_type *type;
+    struct vfio_region_info_cap_sparse_mmap *sparse;
+    char *caps;
+    size_t off = 0;
+    ssize_t nread;
+    uint32_t i;
+
+    caps = calloc(cap_sz, 1);
+    if (caps == NULL) {
+        return -ENOMEM;
+    }
+
+    nread = recv(sock, caps, cap_sz, 0);
+    if (nread < 0) {
+        err(EXIT_FAILURE, "failed to receive VFIO cap info");
+    }
+    /* explicit check instead of assert() so it survives NDEBUG builds */
+    if ((size_t)nread != cap_sz) {
+        errx(EXIT_FAILURE, "short read of VFIO cap info: %zd/%zu", nread,
+             cap_sz);
+    }
+
+    while (true) {
+        /* never look at a header that is not fully inside the buffer */
+        if (cap_sz < sizeof(*header) || off > cap_sz - sizeof(*header)) {
+            errx(EXIT_FAILURE, "VFIO cap at offset %zu exceeds %zu bytes",
+                 off, cap_sz);
+        }
+        header = (struct vfio_info_cap_header *)(caps + off);
+
+        switch (header->id) {
+        case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
+            sparse = (struct vfio_region_info_cap_sparse_mmap *)header;
+            fprintf(stdout, "%s: Sparse cap nr_mmap_areas %u\n", __func__,
+                    sparse->nr_areas);
+            for (i = 0; i < sparse->nr_areas; i++) {
+                fprintf(stdout, "%s: area %u offset %#llx size %llu\n",
+                        __func__, i,
+                        (unsigned long long)sparse->areas[i].offset,
+                        (unsigned long long)sparse->areas[i].size);
+            }
+            break;
+        case VFIO_REGION_INFO_CAP_TYPE:
+            type = (struct vfio_region_info_cap_type *)header;
+            if (type->type != VFIO_REGION_TYPE_MIGRATION ||
+                type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) {
+                fprintf(stderr, "bad region type %u/%u\n", type->type,
+                        type->subtype);
+                exit(EXIT_FAILURE);
+            }
+            printf("migration region\n");
+            break;
+        default:
+            fprintf(stderr, "bad VFIO cap ID %#x\n", header->id);
+            exit(EXIT_FAILURE);
+        }
+
+        if (header->next == 0) {
+            break;
+        }
+        /*
+         * next is an offset from the start of struct vfio_region_info, but the
+         * buffer only holds the bytes that follow it. Rebase onto the start of
+         * the buffer, not onto the current header.
+         */
+        if (header->next < sizeof(struct vfio_region_info)) {
+            errx(EXIT_FAILURE, "bad VFIO cap next offset %u", header->next);
+        }
+        off = header->next - sizeof(struct vfio_region_info);
+    }
+
+    free(caps);
+    return 0;
+}
+
+static int
get_device_region_info(int sock, struct vfio_device_info *client_dev_info)
{
struct vfio_region_info region_info;
- struct vfio_region_info_cap_sparse_mmap *sparse;
struct vfio_user_header hdr;
uint16_t msg_id = 0;
size_t cap_sz;
- int regsz = sizeof(region_info);
int i, ret;
msg_id = 1;
@@ -133,8 +188,9 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info)
msg_id++;
ret = send_recv_vfio_user_msg(sock, msg_id,
VFIO_USER_DEVICE_GET_REGION_INFO,
- &region_info, regsz, NULL, 0, NULL,
- &region_info, regsz);
+ &region_info, sizeof region_info,
+ NULL, 0, NULL,
+ &region_info, sizeof(region_info));
if (ret < 0) {
fprintf(stderr, "failed to get device region info: %s\n",
strerror(-ret));
@@ -146,44 +202,26 @@ get_device_region_info(int sock, struct vfio_device_info *client_dev_info)
"cap_sz %d\n", __func__, i, region_info.offset,
region_info.flags, region_info.size, cap_sz);
if (cap_sz) {
- int j;
-
- sparse = calloc(cap_sz, 1);
- if (sparse == NULL) {
- return -ENOMEM;
- }
-
- ret = recv(sock, sparse, cap_sz, 0);
- if (ret < 0) {
- ret = -errno;
- fprintf(stderr, "%s: failed to receive sparse cap info: %s\n",
- __func__, strerror(-ret));
- free(sparse);
+ ret = get_region_vfio_caps(sock, cap_sz);
+ if (ret != 0) {
return ret;
}
- fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__,
- sparse->nr_areas);
- for (j = 0; j < sparse->nr_areas; j++) {
- fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__,
- j, sparse->areas[j].offset, sparse->areas[j].size);
- }
- free(sparse);
}
}
+ return 0;
}
static int get_device_info(int sock, struct vfio_device_info *dev_info)
{
struct vfio_user_header hdr;
- int dev_info_sz = sizeof(*dev_info);
uint16_t msg_id;
int ret;
- dev_info->argsz = dev_info_sz;
+ dev_info->argsz = sizeof(*dev_info);
msg_id = 1;
ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_INFO,
- dev_info, dev_info_sz, NULL, 0, NULL,
- dev_info, dev_info_sz);
+ dev_info, sizeof(*dev_info), NULL, 0, NULL,
+ dev_info, sizeof(*dev_info));
if (ret < 0) {
fprintf(stderr, "failed to get device info: %s\n", strerror(-ret));
return ret;
@@ -197,30 +235,35 @@ static int get_device_info(int sock, struct vfio_device_info *dev_info)
static int
configure_irqs(int sock)
{
- int i, size;
- int ret;
+ int i, ret;
+ size_t size;
struct vfio_irq_set irq_set;
- struct vfio_user_irq_info irq_info;
+ struct vfio_user_irq_info vfio_user_irq_info;
struct vfio_user_header hdr;
uint16_t msg_id = 1;
int irq_fd;
uint64_t val;
+ struct iovec iovecs[2];
- for (i = 0; i < LM_DEV_NUM_IRQS; i++) {
- struct vfio_irq_info irq_info = {.argsz = sizeof irq_info, .index = i};
+ for (i = 0; i < LM_DEV_NUM_IRQS; i++) { /* TODO move body of loop into function */
int size;
+ struct vfio_irq_info vfio_irq_info = {
+ .argsz = sizeof vfio_irq_info,
+ .index = i
+ };
ret = send_recv_vfio_user_msg(sock, msg_id,
VFIO_USER_DEVICE_GET_IRQ_INFO,
- &irq_info, sizeof irq_info, NULL, 0, NULL,
- &irq_info, sizeof irq_info);
+ &vfio_irq_info, sizeof vfio_irq_info,
+ NULL, 0, NULL,
+ &vfio_irq_info, sizeof vfio_irq_info);
if (ret < 0) {
fprintf(stderr, "failed to get %s info: %s\n", irq_to_str[i],
strerror(-ret));
return ret;
}
- if (irq_info.count > 0) {
+ if (vfio_irq_info.count > 0) {
printf("IRQ %s: count=%d flags=%#x\n",
- irq_to_str[i], irq_info.count, irq_info.flags);
+ irq_to_str[i], vfio_irq_info.count, vfio_irq_info.flags);
}
}
@@ -258,14 +301,16 @@ configure_irqs(int sock)
printf("INTx triggered!\n");
msg_id++;
- size = sizeof(irq_info);
- ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &irq_info, &size);
+
+ size = sizeof(vfio_user_irq_info);
+ ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &vfio_user_irq_info,
+ &size);
if (ret < 0) {
- fprintf(stderr, "failed to recieve IRQ message: %s\n", strerror(-ret));
+ fprintf(stderr, "failed to receive IRQ message: %s\n", strerror(-ret));
return ret;
}
- if (irq_info.subindex >= irq_set.count) {
- fprintf(stderr, "bad IRQ %d, max=%d\n", irq_info.subindex,
+ if (vfio_user_irq_info.subindex >= irq_set.count) {
+ fprintf(stderr, "bad IRQ %d, max=%d\n", vfio_user_irq_info.subindex,
irq_set.count);
return -ENOENT;
}
@@ -305,7 +350,10 @@ access_bar0(int sock)
fprintf(stderr, "failed to write to BAR0: %s\n", strerror(-ret));
return ret;
}
- assert(region_access.count == sizeof data.t);
+ if (region_access.count != sizeof data.t) {
+ fprintf(stderr, "bad written data length %d\n", region_access.count);
+ return -EINVAL;
+ }
printf("wrote to BAR0: %ld\n", data.t);
@@ -334,7 +382,8 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions,
{
struct vfio_user_dma_region_access dma_access;
struct vfio_user_header hdr;
- int ret, size = sizeof(dma_access), i;
+ int ret, i;
+ size_t size = sizeof(dma_access);
uint16_t msg_id;
void *data;
@@ -371,9 +420,10 @@ static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions,
dma_access.count = 0;
ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_DMA_WRITE,
- &dma_access, sizeof(dma_access), NULL, 0);
+ &dma_access, sizeof dma_access, NULL, 0);
if (ret < 0) {
- fprintf(stderr, "failed to send reply of DMA write: %m\n");
+ fprintf(stderr, "failed to send reply of DMA write: %s\n",
+ strerror(-ret));
}
out:
@@ -386,7 +436,8 @@ static int handle_dma_read(int sock, struct vfio_user_dma_region *dma_regions,
{
struct vfio_user_dma_region_access dma_access, *response;
struct vfio_user_header hdr;
- int ret, size = sizeof(dma_access), i, response_sz;
+ int ret, i, response_sz;
+ size_t size = sizeof(dma_access);
uint16_t msg_id;
void *data;
@@ -449,6 +500,56 @@ static int handle_dma_io(int sock, struct vfio_user_dma_region *dma_regions,
return 0;
}
+/*
+ * Requests the dirty page bitmaps of the first ARRAY_SIZE(bitmaps) DMA
+ * regions in a single VFIO_USER_DIRTY_PAGES message and prints one bitmap
+ * byte per region.
+ *
+ * @sock: connected socket to the server
+ * @dma_regions: DMA regions to query; at least two entries are required
+ * @nr_dma_regions: number of entries in @dma_regions
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by
+ * _send_recv_vfio_user_msg.
+ */
+static int
+get_dirty_bitmaps(int sock, struct vfio_user_dma_region *dma_regions,
+                  int nr_dma_regions)
+{
+    struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
+    struct vfio_iommu_type1_dirty_bitmap_get bitmaps[2];
+    int ret;
+    size_t i;
+    /*
+     * iovecs[0] is reserved for the vfio_user_header, iovecs[1] carries the
+     * vfio_iommu_type1_dirty_bitmap and each remaining entry carries one
+     * bitmap request.
+     */
+    struct iovec iovecs[4] = {
+        [1] = {
+            .iov_base = &dirty_bitmap,
+            .iov_len = sizeof dirty_bitmap
+        }
+    };
+    struct vfio_user_header hdr = {0};
+    char data[ARRAY_SIZE(bitmaps)] = {0};
+
+    assert(dma_regions != NULL);
+    assert(nr_dma_regions >= (int)ARRAY_SIZE(bitmaps));
+
+    /* the requests go out on the wire: don't leak uninitialized stack bytes */
+    memset(bitmaps, 0, sizeof bitmaps);
+
+    for (i = 0; i < ARRAY_SIZE(bitmaps); i++) {
+        bitmaps[i].iova = dma_regions[i].addr;
+        bitmaps[i].size = dma_regions[i].size;
+        /* FIXME calculate based on page and IOVA size, don't hardcode */
+        bitmaps[i].bitmap.size = 1;
+        bitmaps[i].bitmap.pgsize = sysconf(_SC_PAGESIZE);
+        /* FIXME the +2 skips the two leading iovecs described above */
+        iovecs[i + 2].iov_base = &bitmaps[i];
+        iovecs[i + 2].iov_len = sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
+    }
+
+    /*
+     * FIXME there should be at least two IOVAs. Send single message for two
+     * IOVAs and ensure only one bit is set in first IOVA.
+     */
+    dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+    ret = _send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+                                   iovecs, ARRAY_SIZE(iovecs),
+                                   NULL, 0,
+                                   &hdr, data, ARRAY_SIZE(data));
+    if (ret != 0) {
+        fprintf(stderr, "failed to get dirty bitmaps: %s\n", strerror(-ret));
+        return ret;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(bitmaps); i++) {
+        /* iova and size are 64-bit: cast so the specifier matches the type */
+        printf("%#llx-%#llx\t%hhu\n",
+               (unsigned long long)bitmaps[i].iova,
+               (unsigned long long)(bitmaps[i].iova + bitmaps[i].size - 1),
+               (unsigned char)data[i]);
+    }
+    return 0;
+}
+
int main(int argc, char *argv[])
{
int ret, sock;
@@ -461,7 +562,9 @@ int main(int argc, char *argv[])
int fd;
const int client_max_fds = 32;
int server_max_fds;
+ size_t pgsize;
int nr_dma_regions;
+ struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
if (argc != 2) {
fprintf(stderr, "usage: %s /path/to/socket\n", argv[0]);
@@ -478,7 +581,7 @@ int main(int argc, char *argv[])
* The server proposes version upon connection, we need to send back the
* version the version we support.
*/
- if ((ret = set_version(sock, client_max_fds, &server_max_fds)) < 0) {
+ if ((ret = set_version(sock, client_max_fds, &server_max_fds, &pgsize)) < 0) {
return ret;
}
@@ -503,7 +606,7 @@ int main(int argc, char *argv[])
/*
* XXX VFIO_USER_DMA_MAP
*
- * Tell the server we have some DMA regions it can access. Each DMA regions
+ * Tell the server we have some DMA regions it can access. Each DMA region
* is accompanied by a file descriptor, so let's create more (2x) DMA
* regions that can fit in a message that can be handled by the server.
*/
@@ -531,10 +634,10 @@ int main(int argc, char *argv[])
for (i = 0; i < nr_dma_regions / server_max_fds; i++, msg_id++) {
ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_MAP,
- dma_regions + (i * server_max_fds),
- sizeof *dma_regions * server_max_fds,
- dma_region_fds + (i * server_max_fds),
- server_max_fds, NULL, NULL, 0);
+ dma_regions + (i * server_max_fds),
+ sizeof(*dma_regions) * server_max_fds,
+ dma_region_fds + (i * server_max_fds),
+ server_max_fds, NULL, NULL, 0);
if (ret < 0) {
fprintf(stderr, "failed to map DMA regions: %s\n", strerror(-ret));
return ret;
@@ -553,6 +656,17 @@ int main(int argc, char *argv[])
exit(EXIT_FAILURE);
}
+
+ dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+ ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+ &dirty_bitmap, sizeof dirty_bitmap,
+ NULL, 0, NULL, NULL, 0);
+ if (ret != 0) {
+ fprintf(stderr, "failed to start dirty page logging: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
/*
* XXX VFIO_USER_DEVICE_GET_IRQ_INFO and VFIO_IRQ_SET_ACTION_TRIGGER
* Query interrupts, configure an eventfd to be associated with INTx, and
@@ -570,6 +684,23 @@ int main(int argc, char *argv[])
exit(EXIT_FAILURE);
}
+ ret = get_dirty_bitmaps(sock, dma_regions, nr_dma_regions);
+ if (ret < 0) {
+ fprintf(stderr, "failed to receive dirty bitmaps: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+ dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+ ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+ &dirty_bitmap, sizeof dirty_bitmap,
+ NULL, 0, NULL, NULL, 0);
+ if (ret != 0) {
+ fprintf(stderr, "failed to stop dirty page logging: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
/*
* FIXME now that region read/write works, change the server implementation
* to trigger an interrupt after N seconds, where N is the value written to
@@ -577,7 +708,11 @@ int main(int argc, char *argv[])
*/
/* BAR1 can be memory mapped and read directly */
- /* TODO implement the following: write a value in BAR1, a server timer will increase it every second (SIGALARM) */
+
+ /*
+ * TODO implement the following: write a value in BAR1, a server timer will
+ * increase it every second (SIGALRM)
+ */
/*
* XXX VFIO_USER_DMA_UNMAP
@@ -585,8 +720,7 @@ int main(int argc, char *argv[])
* unmap the first group of the DMA regions
*/
ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_UNMAP,
- dma_regions,
- sizeof *dma_regions * server_max_fds,
+ dma_regions, sizeof *dma_regions * server_max_fds,
NULL, 0, NULL, NULL, 0);
if (ret < 0) {
fprintf(stderr, "failed to unmap DMA regions: %s\n", strerror(-ret));
diff --git a/samples/server.c b/samples/server.c
index 782ccdc..9f4d3b1 100644
--- a/samples/server.c
+++ b/samples/server.c
@@ -39,6 +39,7 @@
#include <time.h>
#include <assert.h>
#include <openssl/md5.h>
+#include <sys/mman.h>
#include "../lib/muser.h"
@@ -146,17 +147,30 @@ void get_md5sum(char *buf, int len, char *md5sum)
return;
}
+/*
+ * FIXME this function does DMA write/read using messages. This should be done
+ * on a region that is not memory mappable or an area of a region that is not
+ * sparsely memory mappable. We should also have a test where the server does
+ * DMA directly on the client memory.
+ */
static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data)
{
int count = 4096;
char buf[count], md5sum1[MD5_DIGEST_LENGTH], md5sum2[MD5_DIGEST_LENGTH];
int i, ret;
+ dma_sg_t sg;
+
+ assert(lm_ctx != NULL);
+
+ ret = lm_addr_to_sg(lm_ctx, server_data->regions[0].addr, count, &sg,
+ 1, PROT_WRITE);
+ assert(ret == 1); /* FIXME */
memset(buf, 'A', count);
get_md5sum(buf, count, md5sum1);
printf("%s: WRITE addr %#lx count %llu\n", __func__,
server_data->regions[0].addr, count);
- ret = lm_dma_write(lm_ctx, server_data->regions[0].addr, count, buf);
+ ret = lm_dma_write(lm_ctx, &sg, buf);
if (ret < 0) {
fprintf(stderr, "lm_dma_write failed: %s\n", strerror(-ret));
return ret;
@@ -165,7 +179,7 @@ static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data)
memset(buf, 0, count);
printf("%s: READ addr %#lx count %llu\n", __func__,
server_data->regions[0].addr, count);
- ret = lm_dma_read(lm_ctx, server_data->regions[0].addr, count, buf);
+ ret = lm_dma_read(lm_ctx, &sg, buf);
if (ret < 0) {
fprintf(stderr, "lm_dma_read failed: %s\n", strerror(-ret));
return ret;
@@ -253,6 +267,11 @@ int main(int argc, char *argv[])
.mmap_areas = sparse_areas,
.map = map_area
},
+ .reg_info[LM_DEV_MIGRATION_REG_IDX] = { /* migration region */
+ .flags = LM_REG_FLAG_RW,
+ .size = sysconf(_SC_PAGESIZE),
+ .mmap_areas = sparse_areas,
+ },
.irq_count[LM_DEV_INTX_IRQ_IDX] = 1,
},
.uuid = argv[optind],