From 36beb63be45ad1412562a98d9373a4c0bd91ab3d Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 12:16:08 +0100 Subject: support for shadow ioeventfd (#698) When an ioeventfd is written to, KVM discards the value since it has no memory to write it to, and simply kicks the eventfd. This is a problem for devices such as NVMe controllers that need the value (e.g. doorbells on BAR0). This patch allows the vfio-user server to pass a file descriptor that can be mmap'ed and KVM can write the ioeventfd value to this _shadow_ memory instead of discarding it. This shadow memory is not exposed to the guest. Signed-off-by: Thanos Makatos Reviewed-by: John Levon Change-Id: Iad849c94076ffa5988e034c8bf7ec312d01f095f --- lib/libvfio-user.c | 24 +++++++++++++++++++++--- lib/private.h | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index ac04d3b..5ce5767 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -467,13 +467,19 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) EXPORT int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch) + uint64_t datamatch, int shadow_fd) { vfu_reg_info_t *vfu_reg; assert(vfu_ctx != NULL); assert(fd >= 0); +#ifndef SHADOW_IOEVENTFD + if (shadow_fd != -1) { + return ERROR_INT(EINVAL); + } +#endif + if (region_idx >= VFU_PCI_DEV_NUM_REGIONS) { return ERROR_INT(EINVAL); } @@ -494,6 +500,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, elem->size = size; elem->flags = flags; elem->datamatch = datamatch; + elem->shadow_fd = shadow_fd; LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry); return 0; @@ -555,6 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioeventfd_t *sub_reg = NULL; size_t nr_sub_reg = 0; size_t i = 0; + size_t nr_shadow_reg = 0; assert(vfu_ctx != NULL); assert(msg != NULL); @@ -585,6 
+593,9 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) { nr_sub_reg++; + if (sub_reg->shadow_fd != -1) { + nr_shadow_reg++; + } } if (req->argsz < sizeof(vfio_user_region_io_fds_reply_t) || @@ -614,7 +625,8 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) msg->out.nr_fds = 0; if (req->argsz >= reply->argsz) { - msg->out.fds = calloc(sizeof(int), max_sent_sub_regions); + msg->out.fds = calloc(sizeof(int), + max_sent_sub_regions + nr_shadow_reg); if (msg->out.fds == NULL) { return -1; } @@ -627,7 +639,13 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioefd->size = sub_reg->size; ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->fd); - ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + if (sub_reg->shadow_fd == -1) { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + } else { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW; + int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->shadow_fd); + assert(ret == 1); + } ioefd->flags = sub_reg->flags; ioefd->datamatch = sub_reg->datamatch; diff --git a/lib/private.h b/lib/private.h index 7ffd6be..b875138 100644 --- a/lib/private.h +++ b/lib/private.h @@ -186,6 +186,7 @@ typedef struct ioeventfd { int32_t fd; uint32_t flags; uint64_t datamatch; + int32_t shadow_fd; LIST_ENTRY(ioeventfd) entry; } ioeventfd_t; -- cgit v1.1