aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThanos Makatos <thanos.makatos@nutanix.com>2022-07-04 12:16:08 +0100
committerGitHub <noreply@github.com>2022-07-04 12:16:08 +0100
commit36beb63be45ad1412562a98d9373a4c0bd91ab3d (patch)
tree61bb44f0befc0055292b120909251c0fc2d27b0e
parent1c274027bb4f9d68eee846036e8d50dcde2fd7e9 (diff)
downloadlibvfio-user-36beb63be45ad1412562a98d9373a4c0bd91ab3d.zip
libvfio-user-36beb63be45ad1412562a98d9373a4c0bd91ab3d.tar.gz
libvfio-user-36beb63be45ad1412562a98d9373a4c0bd91ab3d.tar.bz2
support for shadow ioeventfd (#698)
When an ioeventfd is written to, KVM discards the value since it has no memory to write it to, and simply kicks the eventfd. This a problem for devices such a NVMe controllers that need the value (e.g. doorbells on BAR0). This patch allows the vfio-user server to pass a file descriptor that can be mmap'ed and KVM can write the ioeventfd value to this _shadow_ memory instead of discarding it. This shadow memory is not exposed to the guest. Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com> Reviewed-by: John Levon <john.levon@nutanix.com> Change-Id: Iad849c94076ffa5988e034c8bf7ec312d01f095f
-rw-r--r--docs/ioregionfd.md25
-rw-r--r--include/libvfio-user.h10
-rw-r--r--include/vfio-user.h1
-rw-r--r--lib/libvfio-user.c24
-rw-r--r--lib/private.h1
-rw-r--r--meson.build5
-rw-r--r--meson_options.txt2
-rw-r--r--test/py/libvfio_user.py15
-rw-r--r--test/py/meson.build4
-rw-r--r--test/py/test_device_get_region_io_fds.py1
-rw-r--r--test/py/test_pci_caps.py4
-rw-r--r--test/py/test_shadow_ioeventfd.py105
12 files changed, 184 insertions, 13 deletions
diff --git a/docs/ioregionfd.md b/docs/ioregionfd.md
new file mode 100644
index 0000000..c09b077
--- /dev/null
+++ b/docs/ioregionfd.md
@@ -0,0 +1,25 @@
+# ioregionfd
+
+ioregionfd is a mechanism that speeds up ioeventfds:
+https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com/. In the
+author's original words: "ioregionfd is a KVM dispatch mechanism which can be
+used for handling MMIO/PIO accesses over file descriptors without returning
+from ioctl(KVM_RUN).".
+
+libvfio-user currently supports an experimental variant of this mechanism
+called shadow ioeventfd. A shadow ioeventfd is a normal ioeventfd where the
+vfio-user server passes another piece of memory (called the _shadow_ memory)
+via an additional file descriptor when configuring the ioregionfd, which then
+QEMU memory maps and passes this address to KVM. This shadow memory is never
+exposed to the guest. When the guest writes to the trapped memory, KVM writes
+the value to the shadow memory instread of discarding it, and then proceeds
+kicking the eventfd as normal.
+
+To use shadow ioeventfd, the kernel and QEMU need to be patched. The QEMU patch
+is designed specifically for SPDK's doorbells (one ioregionfd of 4K in BAR0);
+it should be trivial to extend.
+
+The list of patches:
+* kernel: https://gist.github.com/tmakatos/532afd092a8df2175120d3dbfcd719ef
+* QEMU: https://gist.github.com/tmakatos/57755d2a37a6d53c9ff392e7c34470f6
+* SPDK: https://gist.github.com/tmakatos/f6c10fdaff59c9d629f94bd8e44a53bc
diff --git a/include/libvfio-user.h b/include/libvfio-user.h
index 84eb2d8..d12d89f 100644
--- a/include/libvfio-user.h
+++ b/include/libvfio-user.h
@@ -1069,12 +1069,18 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg);
* @size: size of the ioeventfd
* @flags: Any flags to set up the ioeventfd
* @datamatch: sets the datamatch value
+ * @shadow_fd: File descriptor that can be mmap'ed, KVM will write there the
+ * otherwise discarded value when the ioeventfd is written to. If set to -1
+ * then a normal ioeventfd is set up instead of a shadow one. The vfio-user
+ * client is free to ignore this, even if it supports shadow ioeventfds.
+ * Requires a kernel with shadow ioeventfd support.
+ * Experimental, must be compiled with SHADOW_IOEVENTFD defined, otherwise
+ * must be -1.
*/
int
vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
size_t offset, uint32_t size, uint32_t flags,
- uint64_t datamatch);
-
+ uint64_t datamatch, int shadow_fd);
#ifdef __cplusplus
}
#endif
diff --git a/include/vfio-user.h b/include/vfio-user.h
index 7439484..dc6fafa 100644
--- a/include/vfio-user.h
+++ b/include/vfio-user.h
@@ -168,6 +168,7 @@ typedef struct vfio_user_region_io_fds_request {
#define VFIO_USER_IO_FD_TYPE_IOEVENTFD 0
#define VFIO_USER_IO_FD_TYPE_IOREGIONFD 1
+#define VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW 2
typedef struct vfio_user_sub_region_ioeventfd {
uint64_t offset;
diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c
index ac04d3b..5ce5767 100644
--- a/lib/libvfio-user.c
+++ b/lib/libvfio-user.c
@@ -467,13 +467,19 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
EXPORT int
vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
size_t offset, uint32_t size, uint32_t flags,
- uint64_t datamatch)
+ uint64_t datamatch, int shadow_fd)
{
vfu_reg_info_t *vfu_reg;
assert(vfu_ctx != NULL);
assert(fd >= 0);
+#ifndef SHADOW_IOEVENTFD
+ if (shadow_fd != -1) {
+ return ERROR_INT(EINVAL);
+ }
+#endif
+
if (region_idx >= VFU_PCI_DEV_NUM_REGIONS) {
return ERROR_INT(EINVAL);
}
@@ -494,6 +500,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
elem->size = size;
elem->flags = flags;
elem->datamatch = datamatch;
+ elem->shadow_fd = shadow_fd;
LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry);
return 0;
@@ -555,6 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ioeventfd_t *sub_reg = NULL;
size_t nr_sub_reg = 0;
size_t i = 0;
+ size_t nr_shadow_reg = 0;
assert(vfu_ctx != NULL);
assert(msg != NULL);
@@ -585,6 +593,9 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) {
nr_sub_reg++;
+ if (sub_reg->shadow_fd != -1) {
+ nr_shadow_reg++;
+ }
}
if (req->argsz < sizeof(vfio_user_region_io_fds_reply_t) ||
@@ -614,7 +625,8 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
msg->out.nr_fds = 0;
if (req->argsz >= reply->argsz) {
- msg->out.fds = calloc(sizeof(int), max_sent_sub_regions);
+ msg->out.fds = calloc(sizeof(int),
+ max_sent_sub_regions + nr_shadow_reg);
if (msg->out.fds == NULL) {
return -1;
}
@@ -627,7 +639,13 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ioefd->size = sub_reg->size;
ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds,
sub_reg->fd);
- ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD;
+ if (sub_reg->shadow_fd == -1) {
+ ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD;
+ } else {
+ ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW;
+ int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->shadow_fd);
+ assert(ret == 1);
+ }
ioefd->flags = sub_reg->flags;
ioefd->datamatch = sub_reg->datamatch;
diff --git a/lib/private.h b/lib/private.h
index 7ffd6be..b875138 100644
--- a/lib/private.h
+++ b/lib/private.h
@@ -186,6 +186,7 @@ typedef struct ioeventfd {
int32_t fd;
uint32_t flags;
uint64_t datamatch;
+ int32_t shadow_fd;
LIST_ENTRY(ioeventfd) entry;
} ioeventfd_t;
diff --git a/meson.build b/meson.build
index 03e9267..dba2ba6 100644
--- a/meson.build
+++ b/meson.build
@@ -18,6 +18,7 @@ opt_tran_pipe = get_option('tran-pipe')
opt_debug_logs = get_option('debug-logs')
opt_sanitizers = get_option('b_sanitize')
opt_debug = get_option('debug')
+opt_shadow_ioeventfd = get_option('shadow-ioeventfd')
cc = meson.get_compiler('c')
@@ -57,6 +58,10 @@ if opt_debug_logs.enabled() or (not opt_debug_logs.disabled() and opt_debug)
common_cflags += ['-DDEBUG']
endif
+if opt_shadow_ioeventfd
+ common_cflags += ['-DSHADOW_IOEVENTFD']
+endif
+
if get_option('warning_level') == '2'
# -Wall is set for 'warning_level>=1'
# -Wextra is set for 'warning_level>=2'
diff --git a/meson_options.txt b/meson_options.txt
index 49e4d9a..5dd2efd 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,3 +4,5 @@ option('tran-pipe', type: 'boolean', value: false,
description: 'enable pipe transport for testing')
option('debug-logs', type: 'feature', value: 'auto',
description: 'enable extra debugging code (default for debug builds)')
+option('shadow-ioeventfd', type: 'boolean', value : false,
+ description: 'enable shadow ioeventfd (experimental)')
diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py
index 0ebf773..c852542 100644
--- a/test/py/libvfio_user.py
+++ b/test/py/libvfio_user.py
@@ -194,6 +194,7 @@ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP = (1 << 2)
VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0
VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1
+VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2
# enum vfu_dev_irq_type
@@ -620,7 +621,7 @@ lib.vfu_sgl_put.argtypes = (c.c_void_p, c.POINTER(dma_sg_t),
lib.vfu_create_ioeventfd.argtypes = (c.c_void_p, c.c_uint32, c.c_int,
c.c_size_t, c.c_uint32, c.c_uint32,
- c.c_uint64)
+ c.c_uint64, c.c_int32)
lib.vfu_device_quiesced.argtypes = (c.c_void_p, c.c_int)
@@ -635,6 +636,10 @@ def to_byte(val):
return val.to_bytes(1, 'little')
+def to_bytes_le(n, length=1):
+ return n.to_bytes(length, 'little')
+
+
def skip(fmt, buf):
"""Return the data remaining after skipping the given elements."""
return buf[struct.calcsize(fmt):]
@@ -645,6 +650,9 @@ def parse_json(json_str):
return json.loads(json_str, object_hook=lambda d: SimpleNamespace(**d))
+IOEVENT_SIZE = 8
+
+
def eventfd(initval=0, flags=0):
libc.eventfd.argtypes = (c.c_uint, c.c_int)
return libc.eventfd(initval, flags)
@@ -1184,11 +1192,12 @@ def vfu_sgl_put(ctx, sg, iovec, cnt=1):
return lib.vfu_sgl_put(ctx, sg, iovec, cnt)
-def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch):
+def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch,
+ shadow_fd=-1):
assert ctx is not None
return lib.vfu_create_ioeventfd(ctx, region_idx, fd, offset, size,
- flags, datamatch)
+ flags, datamatch, shadow_fd)
def vfu_device_quiesced(ctx, err):
diff --git a/test/py/meson.build b/test/py/meson.build
index d9c97b3..0ea9f08 100644
--- a/test/py/meson.build
+++ b/test/py/meson.build
@@ -49,6 +49,10 @@ python_tests = [
'test_vfu_realize_ctx.py',
]
+if get_option('shadow-ioeventfd')
+ python_tests += 'test_shadow_ioeventfd.py'
+endif
+
python_files = python_tests_common + python_tests
if pytest.found() and opt_sanitizers == 'none'
diff --git a/test/py/test_device_get_region_io_fds.py b/test/py/test_device_get_region_io_fds.py
index cb1b732..924f462 100644
--- a/test/py/test_device_get_region_io_fds.py
+++ b/test/py/test_device_get_region_io_fds.py
@@ -36,7 +36,6 @@ import struct
ctx = None
sock = None
fds = []
-IOEVENT_SIZE = 8
def test_device_get_region_io_fds_setup():
diff --git a/test/py/test_pci_caps.py b/test/py/test_pci_caps.py
index b83c06c..01f23a2 100644
--- a/test/py/test_pci_caps.py
+++ b/test/py/test_pci_caps.py
@@ -349,10 +349,6 @@ def test_pci_cap_write_px(mock_quiesce, mock_reset):
expect=errno.EINVAL)
-def to_bytes_le(n, length=1):
- return n.to_bytes(length, 'little')
-
-
def test_pci_cap_write_msix():
setup_pci_dev(realize=True)
sock = connect_client(ctx)
diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py
new file mode 100644
index 0000000..642ad0e
--- /dev/null
+++ b/test/py/test_shadow_ioeventfd.py
@@ -0,0 +1,105 @@
+#
+# Copyright (c) 2022 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos.makatos@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+# DAMAGE.
+#
+
+from libvfio_user import *
+import tempfile
+import mmap
+import errno
+
+
+def test_shadow_ioeventfd():
+ """Configure a shadow ioeventfd, have the client trigger it, confirm that
+ the server receives the notification and can see the value."""
+
+ # server setup
+ ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB)
+ assert ctx is not None
+ ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_BAR0_REGION_IDX, size=0x1000,
+ flags=VFU_REGION_FLAG_RW)
+ assert ret == 0
+ fo = tempfile.TemporaryFile(dir="/dev/shm")
+ fo.truncate(0x1000)
+
+ # FIXME
+ # Use pip install eventfd?
+ # $ grep EFD_NONBLOCK -wr /usr/include/
+ # /usr/include/bits/eventfd.h: EFD_NONBLOCK = 00004000
+ EFD_NONBLOCK = 0o00004000
+
+ efd = eventfd(flags=EFD_NONBLOCK)
+ ret = vfu_create_ioeventfd(ctx, VFU_PCI_DEV_BAR0_REGION_IDX, efd, 0x8,
+ 0x16, 0, 0, shadow_fd=fo.fileno())
+ assert ret == 0
+ ret = vfu_realize_ctx(ctx)
+ assert ret == 0
+
+ # client queries I/O region FDs
+ sock = connect_client(ctx)
+ payload = vfio_user_region_io_fds_request(
+ argsz=len(vfio_user_region_io_fds_reply()) +
+ len(vfio_user_sub_region_ioeventfd()), flags=0,
+ index=VFU_PCI_DEV_BAR0_REGION_IDX, count=0)
+ newfds, ret = msg_fds(ctx, sock, VFIO_USER_DEVICE_GET_REGION_IO_FDS,
+ payload, expect=0)
+ reply, ret = vfio_user_region_io_fds_reply.pop_from_buffer(ret)
+ assert reply.count == 1 # 1 eventfd
+ ioevent, _ = vfio_user_sub_region_ioeventfd.pop_from_buffer(ret)
+ assert ioevent.offset == 0x8
+ assert ioevent.size == 0x16
+ assert ioevent.fd_index == 0
+ assert ioevent.type == VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW
+ assert ioevent.flags == 0
+ assert ioevent.datamatch == 0
+
+ assert len(newfds) == 2 # 2 FDs: eventfd plus shadow FD
+ cefd = newfds[0]
+ csfd = newfds[1]
+ cmem = mmap.mmap(csfd, 0x1000)
+
+ # vfio-user app reads the eventfd, there should be nothing there
+ try:
+ os.read(efd, IOEVENT_SIZE)
+ except BlockingIOError as e:
+ if e.errno != errno.EAGAIN:
+ assert False
+ else:
+ assert False
+
+ # Client writes to the I/O region. The write to the eventfd would be done
+ # by KVM and the value would be the same in both cases.
+ cmem.seek(0x8)
+ cmem.write(c.c_ulonglong(0xdeadbeef))
+ os.write(cefd, c.c_ulonglong(0xcafebabe))
+
+ # vfio-user app reads eventfd
+ assert os.read(efd, IOEVENT_SIZE) == to_bytes_le(0xcafebabe, 8)
+ fo.seek(0x8)
+ assert fo.read(0x8) == to_bytes_le(0xdeadbeef, 8)
+
+ vfu_destroy_ctx(ctx)