-rw-r--r--  .gitmodules                    0
-rw-r--r--  .travis.yml                    1
-rw-r--r--  CMakeLists.txt                 6
-rw-r--r--  Makefile                       2
-rw-r--r--  README.md                    156
-rw-r--r--  kmod/CMakeLists.txt           52
-rw-r--r--  kmod/muser.h                  73
-rw-r--r--  lib/CMakeLists.txt             4
-rw-r--r--  lib/cap.c                    444
-rw-r--r--  lib/cap.h                      9
-rw-r--r--  lib/caps/common.h             46
-rw-r--r--  lib/caps/msi.h                 9
-rw-r--r--  lib/caps/msix.h                9
-rw-r--r--  lib/caps/pm.h                 15
-rw-r--r--  lib/caps/px.h                  9
-rw-r--r--  lib/common.h                   8
-rw-r--r--  lib/dma.c                    248
-rw-r--r--  lib/dma.h                    137
-rw-r--r--  lib/muser.h                  232
-rw-r--r--  lib/muser_ctx.c             2242
-rw-r--r--  lib/muser_pci.c               75
-rw-r--r--  lib/muser_priv.h             113
-rw-r--r--  lib/vfio_user.h              167
-rw-r--r--  samples/CMakeLists.txt         7
-rw-r--r--  samples/client.c             901
-rw-r--r--  samples/gpio-pci-idio-16.c    63
-rw-r--r--  samples/null.c               108
-rw-r--r--  samples/server.c             398
28 files changed, 4579 insertions(+), 955 deletions(-)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/.gitmodules
diff --git a/.travis.yml b/.travis.yml
index 0afe572..b0a3dae 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,6 @@ before_install:
- git remote add origin git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
- git fetch --depth 1 origin v5.8
- git checkout FETCH_HEAD
- - patch -p1 < "${muser_dir}/patches/vfio.diff"
- make olddefconfig
- make prepare
- cd "${muser_dir}"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7975983..af97768 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,11 +32,9 @@ cmake_minimum_required (VERSION 2.6)
project(muser)
include(GNUInstallDirs)
-# shared library
+# shared libraries
add_subdirectory(lib)
-# kernel module
-add_subdirectory(kmod)
-
# samples
add_subdirectory(samples)
+
diff --git a/Makefile b/Makefile
index 5a4482e..9d0676a 100644
--- a/Makefile
+++ b/Makefile
@@ -46,6 +46,7 @@ BUILD_DIR_BASE = $(CURDIR)/build
BUILD_DIR = $(BUILD_DIR_BASE)/$(BUILD_TYPE)
KDIR ?= "/lib/modules/$(shell uname -r)/build"
+INSTALL_PREFIX ?= /usr/local
PHONY_TARGETS := all realclean buildclean force_cmake export install-export tags
@@ -68,6 +69,7 @@ $(BUILD_DIR)/Makefile:
-D "CMAKE_C_FLAGS:STRING=$(CFLAGS)" \
-D "CMAKE_BUILD_TYPE:STRING=$(CMAKE_BUILD_TYPE)" \
-D "KDIR=$(KDIR)" \
+ -D "CMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX)" \
$(CURDIR)
tags:
diff --git a/README.md b/README.md
index cd1daae..887c51a 100644
--- a/README.md
+++ b/README.md
@@ -4,41 +4,22 @@ Mediated Userspace Device
Overview
--------
-MUSER is a framework that allows PCI devices to be implemented in userspace. It
-leverages the Linux kernel VFIO/MDEV infrastructure, allowing such devices to
-be easily accessed via standard VFIO interfaces and subsequently virtual
-machines. These can be completely virtual and not backed by any real hardware.
-This provides interesting benefits, including:
-
-* Simplification of the initial development of kernel drivers for new devices
-* Easy plumbing to hypervisors that support VFIO device pass-through
-* Performance benefits as a single process can poll multiple drivers
-
-MUSER is implemented by two components: a loadable kernel module (muser.ko) and
-a userspace library (libmuser). The LKM registers itself with MDEV and relay
-VFIO requests to libmuser via a custom ioctl-based interface. The library, in
-turn, abstracts most of the complexity around representing the device.
+MUSER is a framework that allows implementing PCI devices under the [vfio-user
+protocol](https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg02458.html).
+MUSER implements the _backend_ part of the vfio-user protocol; the frontend
+part is implemented by Oracle in https://github.com/oracle/qemu/tree/vfio-user-v0.1.
+
+The library abstracts most of the complexity around representing the device.
Applications using libmuser provide a description of the device (e.g. region and
IRQ information) and a set of callbacks that are invoked by libmuser when
those regions are accessed. See src/samples for examples of how to build such an
application.
-Currently there is a one, single-threaded application instance per device,
+Currently there is one single-threaded application instance per device,
however the application can employ any form of concurrency needed. In the
future we plan to make libmuser multi-threaded. The application can be
implemented in whatever way is convenient, e.g. as a Python script using
-bindings, on the cloud, etc.
-
-There is also an ongoing effort to define a protocol based on VFIO that will be
-officially supported by QEMU so the kernel module won't be necessary. This
-protocol (tentatively named VFIO-over-socket and soon to be renamed to
-vfio-user) has been discussed as an RFC in qemu-devel:
-https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07900.html,
-and is now in the process of being reviewed:
-https://www.mail-archive.com/qemu-devel@nongnu.org/msg723773.html.
-In the RFC email thread it is explained how to run the GPIO sample without the
-MUSER kernel module, where to get sources etc. Please refer to the RFC email
-thread for more information.
+bindings, on the cloud, etc. There's also experimental support for polling.
Memory Mapping the Device
@@ -57,139 +38,24 @@ page is written to.
Interrupts
----------
-Interrupts are implemented by installing the event file descriptor in libmuser
+Interrupts are implemented by passing the event file descriptor to libmuser
and then notifying libmuser about it. libmuser can then trigger interrupts simply by
writing to that file descriptor. This can be much more expensive than triggering
interrupts from the kernel; however, this performance penalty is perfectly acceptable when
prototyping the functional aspect of a device driver.
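
As a rough sketch (assuming the application has already been handed the eventfd
for a vector via the device's IRQ setup path), triggering an interrupt is a
single 8-byte write:

```c
#include <stdint.h>
#include <unistd.h>

/* Sketch: fire one interrupt on an eventfd previously registered for this
 * vector. How `efd` reaches the device is outside this snippet. */
static int trigger_irq(int efd)
{
    uint64_t val = 1;   /* eventfds carry a 64-bit counter */

    if (write(efd, &val, sizeof(val)) != sizeof(val)) {
        return -1;
    }
    return 0;
}
```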
-System Architecture
--------------------
-
-muser.ko and libmuser communicate via ioctl on a control device. This control
-device is create when the mediated device is created and appears as
-/dev/muser/<UUID>. libmuser opens this device and then executes a "wait
-command" ioctl. Whenever a callback of muser.ko is executed, it fills a struct
-with the command details and then completes the ioctl, unblocking libmuser. It
-then waits to receive another ioctl from libmuser with the result. Currently
-there can be only one command pending, we plan to allow multiple commands to be
-executed in parallel.
-
-
Building muser
==============
-vfio/mdev needs to be patched:
-
- patch -p1 < muser/patches/vfio.diff
-
-Apply the patch and rebuild the vfio/mdev modules:
-
- make SUBDIRS=drivers/vfio/ modules
-
-Reload the relevant kernel modules:
-
- drivers/vfio/vfio_iommu_type1.ko
- drivers/vfio/vfio.ko
- drivers/vfio/mdev/mdev.ko
- drivers/vfio/mdev/vfio_mdev.ko
-
-To build and install the library run:
+Just do:
make && make install
+The kernel headers are necessary because VFIO structs and defines are reused.
To specify an alternative kernel directory set the KDIR environment variable
accordingly.
To enable Python bindings set the PYTHON_BINDINGS environment variable to a
non-empty string.
Finally build your program and link it to libmuser.so.
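
As a minimal sketch of such a program (hedged: only `lm_dev_info_t`,
`LM_TRANS_SOCK`, and `lm_ctx_run()` are taken from `lib/muser.h` in this
change; everything elided is device-specific):

```c
#include <err.h>

#include "muser.h"

int main(void)
{
    lm_dev_info_t dev_info = {
        .trans = LM_TRANS_SOCK,   /* speak vfio-user over a UNIX socket */
        /* PCI, region, IRQ, and capability setup elided; see samples/ */
    };

    /* Creates the context and drives it until the client disconnects. */
    if (lm_ctx_run(&dev_info) != 0) {
        err(1, "lm_ctx_run failed");
    }
    return 0;
}
```

Something like `cc -o mydev mydev.c -lmuser` should then produce the device
executable.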
-
-Running QEMU
-============
-
-To pass the device to QEMU add the following options:
-
- -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/<UUID>
- -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=mem,share=yes,size=1073741824 -numa node,nodeid=0,cpus=0,memdev=ram-node0
-
-Guest RAM must be shared (share=yes) otherwise libmuser won't be able to do DMA
-transfers from/to it. If you're not using QEMU then any memory that must be
-accessed by libmuser must be allocate MAP_SHARED. Registering memory for DMA
-that has not been allocated with MAP_SHARED is ignored and any attempts to
-access that memory will result in an error.
-
-Example
-=======
-
-samples/gpio-pci-idio-16.c implements a tiny part of the PCI-IDIO-16 GPIO
-(https://www.accesio.com/?p=/pci/pci_idio_16.html). In this sample it's a simple
-device that toggles the input every 3 times it's read.
-
-Running gpio-pci-idio-16
-------------------------
-
-1. First, follow the instructions to build and load muser.
-2. Then, start the gpio-pci-idio-16 device emulation:
-```
-# echo 00000000-0000-0000-0000-000000000000 > /sys/class/muser/muser/mdev_supported_types/muser-1/create
-# build/dbg/samples/gpio-pci-idio-16 00000000-0000-0000-0000-000000000000
-```
-3. Finally, start the VM adding the command line explained earlier and then
-execute:
-```
-# insmod gpio-pci-idio-16.ko
-# cat /sys/class/gpio/gpiochip480/base > /sys/class/gpio/export
-# for ((i=0;i<12;i++)); do cat /sys/class/gpio/OUT0/value; done
-0
-0
-0
-1
-1
-1
-0
-0
-0
-1
-1
-1
-```
-
-Future Work
-===========
-
-Making libmuser Restartable
-----------------------------
-
-muser can be made restartable so that (a) it can recover from failures, and
-(b) upgrades are less disrupting. This is something we plan to implement in the
-future. To make it restarable muser needs to reconfigure eventfds and DMA
-region mmaps first thing when the device is re-opened by libmuser. After muser
-has finished reconfiguring it will send a "ready" command, after which normal
-operation will be resumed. This "ready" command will always be sent when the
-device is opened, even if this is the first time, as this way we don't need to
-differentiate between normal operation and restarted operation. libmuser will
-store the PCI BAR on /dev/shm (named after e.g. the device UUID) so that it can
-easily find them on restart.
-
-
-Making libmuser Multi-threaded
--------------------------------
-
-libmuser can be made multi-threaded in order to improve performance. To
-implement this we'll have to maintain a private context in struct file.
-
-Troubleshooting
----------------
-
-If you get the following error when starting QEMU:
-
- qemu-system-x86_64: -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/00000000-0000-0000-0000-000000000000: vfio 00000000-0000-0000-0000-000000000000: failed to read device config space: Bad address
-
-it might mean that you haven't properly patched your kernel.
-
-To debug accesses to your PCI device from QEMU add the following to the QEMU
-command line:
-
- -trace enable=vfio*,file=qemu-vfio.trace
diff --git a/kmod/CMakeLists.txt b/kmod/CMakeLists.txt
deleted file mode 100644
index 07e90e7..0000000
--- a/kmod/CMakeLists.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-#
-# Copyright (c) 2019 Nutanix Inc. All rights reserved.
-#
-# Authors: Thanos Makatos <thanos@nutanix.com>
-# Swapnil Ingle <swapnil.ingle@nutanix.com>
-# Felipe Franciosi <felipe@nutanix.com>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Nutanix nor the names of its contributors may be
-# used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-
-# Copy sources to build directory (avoid polluting source directory).
-# TODO can we copy all source files with a wildcard?
-configure_file(muser.c ${CMAKE_CURRENT_BINARY_DIR}/muser.c COPYONLY)
-configure_file(muser.h ${CMAKE_CURRENT_BINARY_DIR}/muser.h COPYONLY)
-# FIXME need to pass "CFLAGS_muser.o := -DDEBUG" for debug builds
-set(KMOD_MAKEFILE_CONTENT "obj-m := muser.o")
-IF(CMAKE_BUILD_TYPE MATCHES Debug)
- set(KMOD_MAKEFILE_CONTENT "CFLAGS_muser.o := -DDEBUG\n${KMOD_MAKEFILE_CONTENT}")
-ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/Kbuild ${KMOD_MAKEFILE_CONTENT})
-
-# Build module using kernel's Makefile.
-set(KBUILD_CMD ${CMAKE_MAKE_PROGRAM} -C ${KDIR} M=${CMAKE_CURRENT_BINARY_DIR} modules)
-ADD_CUSTOM_COMMAND(OUTPUT DRIVER_BIN_FILE
- COMMAND ${KBUILD_CMD}
- DEPENDS ${MODULE_SOURCE_FILES} VERBATIM
-)
-ADD_CUSTOM_TARGET(driver ALL DEPENDS DRIVER_BIN_FILE)
-execute_process(COMMAND uname -r OUTPUT_VARIABLE kver OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.ko DESTINATION /lib/modules/${kver}/extra/)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/linux)
diff --git a/kmod/muser.h b/kmod/muser.h
deleted file mode 100644
index 65841a4..0000000
--- a/kmod/muser.h
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-/*
- * Copyright (c) 2019, Nutanix Inc. All rights reserved.
- *
- * Author: Thanos Makatos <thanos@nutanix.com>
- * Swapnil Ingle <swapnil.ingle@nutanix.com>
- * Felipe Franciosi <felipe@nutanix.com>
- *
- */
-
-#ifndef _UAPI_LINUX_MUSER_H
-#define _UAPI_LINUX_MUSER_H
-
-#ifndef __KERNEL__
-#include <sys/types.h>
-#endif
-
-#include <linux/ioctl.h>
-#include <linux/vfio.h>
-
-#define MUSER_DEVNODE "muser"
-
-enum muser_cmd_type {
- MUSER_IOCTL = 1,
- MUSER_READ,
- MUSER_WRITE,
- MUSER_MMAP,
- MUSER_DMA_MMAP,
- MUSER_DMA_MUNMAP,
-};
-
-struct muser_cmd_rw {
- size_t count;
- loff_t pos;
-};
-
-struct muser_cmd_ioctl {
- int vfio_cmd;
- union {
- struct vfio_device_info dev_info;
- struct vfio_region_info reg_info;
- struct vfio_irq_info irq_info;
- struct vfio_irq_set irq_set;
- } data;
-};
-
-union muser_cmd_mmap {
- struct {
- unsigned long addr; /* iova for DMA_MAP, offset for MMAP */
- unsigned long len;
- unsigned long offset;
- unsigned long flags;
- struct file *file;
- int fd;
- } request;
- unsigned long response;
-};
-
-struct muser_cmd {
- enum muser_cmd_type type;
- union {
- struct muser_cmd_rw rw;
- struct muser_cmd_ioctl ioctl;
- union muser_cmd_mmap mmap;
- };
- int err;
-};
-
-/* ioctl cmds valid for /dev/muser/<uuid> */
-#define MUSER_DEV_CMD_WAIT _IOR('M', 1, struct muser_cmd)
-#define MUSER_DEV_CMD_DONE _IOW('M', 2, struct muser_cmd)
-
-#endif /* _UAPI_LINUX_MUSER_H */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index e2084fe..bc9e4b8 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -32,14 +32,14 @@ set(CMAKE_C_FLAGS "-Wall -Wextra -Werror -fPIC")
set(CMAKE_C_FLAGS_DEBUG "-O0 -ggdb")
add_library(muser SHARED
- ../kmod/muser.h
+ vfio_user.h
muser.h
muser_priv.h
common.h)
target_link_libraries(muser muser_ctx muser_pci dma cap)
set_target_properties(muser PROPERTIES LINKER_LANGUAGE C)
-set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h")
+set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;vfio_user.h")
set(UT_CFLAGS "-O0 -ggdb --coverage")
set(UT_LFLAGS "--coverage")
diff --git a/lib/cap.c b/lib/cap.c
index ca2235a..451c85a 100644
--- a/lib/cap.c
+++ b/lib/cap.c
@@ -34,56 +34,60 @@
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
#include "muser.h"
#include "cap.h"
struct cap {
- uint8_t start;
- uint8_t end;
- uint8_t id;
- lm_cap_access_t *fn;
+ uint8_t start;
+ uint8_t end;
};
struct caps {
- struct cap caps[LM_MAX_CAPS];
- int nr_caps;
+ struct cap caps[LM_MAX_CAPS]; /* FIXME only needs to be as big as nr_caps */
+ unsigned int nr_caps;
};
/*
* Tells whether a capability is being accessed.
*/
static bool
-cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset)
+cap_is_accessed(struct cap *caps, int nr_caps, size_t count, loff_t offset)
{
- /*
- * Ignore if it's at the standard PCI header. The first capability starts
- * right after that.
- */
- if (offset < PCI_STD_HEADER_SIZEOF) {
- return false;
- }
-
- /* ignore if there are no capabilities */
- if (!nr_caps) {
+ if (nr_caps == 0) {
return false;
}
- assert(caps);
+ assert(caps != NULL);
- /*
- * Ignore if it's before the first capability. This check is probably
- * redundant since we assume that the first capability starts right after
- * the standard PCI header.
- * TODO should we check that it doesn't cross into the first capability?
- */
if (offset < caps[0].start) {
+ /* write starts before first capability */
+
+ if (offset + count <= caps[0].start) {
+ /* write ends before first capability */
+ return false;
+ }
+
+ /*
+ * FIXME write starts before capabilities but extends into them. I don't
+ * think that the while loop in lm_access will allow this in the first
+ * place.
+ */
+ assert(false);
+ } else if (offset > caps[nr_caps - 1].end) {
+ /* write starts after last capability */
return false;
}
- /* ignore if it's past the last capability */
- if (offset > caps[nr_caps - 1].end) {
- return false;
+ if (offset + count > (size_t)(caps[nr_caps - 1].end + 1)) {
+ /*
+ * FIXME write starts within capabilities but extends past them. I think
+ * that this _is_ possible: e.g. MSI-X is 12 bytes (PCI_CAP_MSIX_SIZEOF)
+ * and the host writes the first 8 bytes and then writes 8 more.
+ */
+ assert(false);
}
return true;
}
@@ -92,151 +96,369 @@ cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset)
* Returns the PCI capability that is contained within the specified region
* (offset + count).
*/
-static struct cap *
-cap_find(struct cap *caps, int nr_caps, loff_t offset, size_t count)
+static uint8_t *
+cap_find(lm_pci_config_space_t *config_space, struct caps *caps, loff_t offset,
+ size_t count)
{
struct cap *cap;
- cap = caps;
- while (cap < caps + nr_caps) {
+ assert(config_space != NULL);
+ assert(caps != NULL);
+
+ cap = caps->caps;
+ while (cap < caps->caps + caps->nr_caps) {
/*
- * TODO this assumes that at most one capability is read. It might be
- * legitimate to read an arbitrary number of bytes, which we could
- * support. For now lets explicitly fail such cases.
+ * FIXME ensure that at most one capability is written to. It might be
+ * legitimate to write to two capabilities at the same time.
*/
- if (offset >= cap->start && offset + count - 1 <= cap->end) {
- return cap;
+ if (offset >= cap->start && offset <= cap->end) {
+ if (offset + count - 1 > cap->end) {
+ assert(false);
+ }
+ return config_space->raw + cap->start;
}
cap++;
}
- /* this means that the access spans more than a capability */
return NULL;
}
-/*
- * Tells whether the header of a PCI capability is accessed.
- */
static bool
-cap_header_is_accessed(struct cap *cap, loff_t offset)
+cap_is_valid(uint8_t id)
{
- assert(cap);
- return offset - cap->start <= 1;
+ /* TODO 0 is a valid capability ID (Null Capability); see
+ * https://pcisig.com/sites/default/files/files/PCI_Code-ID_r_1_11__v24_Jan_2019.pdf
+ */
+ return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX;
}
-/*
- * Reads the header of a PCI capability.
- */
-static int
-cap_header_access(struct caps *caps, struct cap *cap, char *buf,
- loff_t offset, size_t count, bool is_write)
+uint8_t *
+cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id)
{
- int n;
+ uint8_t *pos;
+ lm_pci_config_space_t *config_space;
- /*
- * We don't allow ID and next to be written. TODO not sure what the PCI
- * spec says about this, need to check.
- */
- if (is_write) {
- return -EINVAL;
+ if (!cap_is_valid(id)) {
+ errno = EINVAL;
+ return NULL;
}
- assert(caps);
- assert(cap);
- n = 0;
- /*
- * We handle reads to ID and next, the rest is handled by the callback.
- */
- if (offset == cap->start && count > 0) { /* ID */
- buf[n++] = cap->id;
- offset++;
- count--;
+ config_space = lm_get_pci_config_space(lm_ctx);
+
+ if (config_space->hdr.cap == 0) {
+ errno = ENOENT;
+ return NULL;
}
- if (offset == cap->start + 1 && count > 0) { /* next */
- if ((cap - caps->caps) / sizeof *cap == (size_t)(caps->nr_caps - 1)) {
- buf[n++] = 0;
- } else {
- buf[n++] = (cap + 1)->start;
+ pos = config_space->raw + config_space->hdr.cap;
+ while (true) {
+ if (*(pos + PCI_CAP_LIST_ID) == id) {
+ return pos;
}
-
- offset++;
- count--;
+ if (*(pos + PCI_CAP_LIST_NEXT) == 0) {
+ break;
+ }
+ pos = config_space->raw + *(pos + PCI_CAP_LIST_NEXT);
}
- return n;
+ errno = ENOENT;
+ return NULL;
}
+/*
+ * Tells whether the header of a PCI capability is accessed.
+ */
+static bool
+cap_header_is_accessed(uint8_t cap_offset, loff_t offset)
+{
+ return offset - cap_offset <= 1;
+}
+
+typedef ssize_t (cap_access) (lm_ctx_t *lm_ctx, uint8_t *cap, char *buf,
+ size_t count, loff_t offset);
+
+static ssize_t
+handle_pmcs_write(lm_ctx_t *lm_ctx, struct pmcap *pm,
+ const struct pmcs *const pmcs)
+{
+
+ if (pm->pmcs.ps != pmcs->ps) {
+ lm_log(lm_ctx, LM_DBG, "power state set to %#x\n", pmcs->ps);
+ }
+ if (pm->pmcs.pmee != pmcs->pmee) {
+ lm_log(lm_ctx, LM_DBG, "PME enable set to %#x\n", pmcs->pmee);
+ }
+ if (pm->pmcs.dse != pmcs->dse) {
+ lm_log(lm_ctx, LM_DBG, "data select set to %#x\n", pmcs->dse);
+ }
+ if (pm->pmcs.pmes != pmcs->pmes) {
+ lm_log(lm_ctx, LM_DBG, "PME status set to %#x\n", pmcs->pmes);
+ }
+ pm->pmcs = *pmcs;
+ return 0;
+}
+
+static ssize_t
+handle_pm_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ const size_t count, const loff_t offset)
+{
+ struct pmcap *pm = (struct pmcap *)cap;
+
+ switch (offset) {
+ case offsetof(struct pmcap, pc):
+ if (count != sizeof(struct pc)) {
+ return -EINVAL;
+ }
+ assert(false); /* FIXME implement */
+ case offsetof(struct pmcap, pmcs):
+ if (count != sizeof(struct pmcs)) {
+ return -EINVAL;
+ }
+ return handle_pmcs_write(lm_ctx, pm, (struct pmcs *)buf);
+ }
+ return -EINVAL;
+}
+
+static ssize_t
+handle_mxc_write(lm_ctx_t *lm_ctx, struct msixcap *msix,
+ const struct mxc *const mxc)
+{
+ assert(msix != NULL);
+ assert(mxc != NULL);
+
+ if (mxc->mxe != msix->mxc.mxe) {
+ lm_log(lm_ctx, LM_DBG, "%s MSI-X\n", mxc->mxe ? "enable" : "disable");
+ msix->mxc.mxe = mxc->mxe;
+ }
+
+ if (mxc->fm != msix->mxc.fm) {
+ if (mxc->fm) {
+ lm_log(lm_ctx, LM_DBG, "all MSI-X vectors masked\n");
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "vector's mask bit determines whether vector is masked\n");
+ }
+ msix->mxc.fm = mxc->fm;
+ }
+
+ return sizeof(struct mxc);
+}
+
+static ssize_t
+handle_msix_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ const size_t count, const loff_t offset)
+{
+ struct msixcap *msix = (struct msixcap *)cap;
+
+ if (count == sizeof(struct mxc)) {
+ switch (offset) {
+ case offsetof(struct msixcap, mxc):
+ return handle_mxc_write(lm_ctx, msix, (struct mxc *)buf);
+ default:
+ lm_log(lm_ctx, LM_ERR, "invalid MSI-X write offset %ld\n", offset);
+ return -EINVAL;
+ }
+ }
+ lm_log(lm_ctx, LM_ERR, "invalid MSI-X write size %lu\n", count);
+ return -EINVAL;
+}
+
+static int
+handle_px_pxdc_write(lm_ctx_t *lm_ctx, struct pxcap *px, const union pxdc *const p)
+{
+ assert(px != NULL);
+ assert(p != NULL);
+
+ if (p->cere != px->pxdc.cere) {
+ px->pxdc.cere = p->cere;
+ lm_log(lm_ctx, LM_DBG, "CERE %s\n", p->cere ? "enable" : "disable");
+ }
+
+ if (p->nfere != px->pxdc.nfere) {
+ px->pxdc.nfere = p->nfere;
+ lm_log(lm_ctx, LM_DBG, "NFERE %s\n", p->nfere ? "enable" : "disable");
+ }
+
+ if (p->fere != px->pxdc.fere) {
+ px->pxdc.fere = p->fere;
+ lm_log(lm_ctx, LM_DBG, "FERE %s\n", p->fere ? "enable" : "disable");
+ }
+
+ if (p->urre != px->pxdc.urre) {
+ px->pxdc.urre = p->urre;
+ lm_log(lm_ctx, LM_DBG, "URRE %s\n", p->urre ? "enable" : "disable");
+ }
+
+ if (p->ero != px->pxdc.ero) {
+ px->pxdc.ero = p->ero;
+ lm_log(lm_ctx, LM_DBG, "ERO %s\n", p->ero ? "enable" : "disable");
+ }
+
+ if (p->mps != px->pxdc.mps) {
+ px->pxdc.mps = p->mps;
+ lm_log(lm_ctx, LM_DBG, "MPS set to %d\n", p->mps);
+ }
+
+ if (p->ete != px->pxdc.ete) {
+ px->pxdc.ete = p->ete;
+ lm_log(lm_ctx, LM_DBG, "ETE %s\n", p->ete ? "enable" : "disable");
+ }
+
+ if (p->pfe != px->pxdc.pfe) {
+ px->pxdc.pfe = p->pfe;
+ lm_log(lm_ctx, LM_DBG, "PFE %s\n", p->pfe ? "enable" : "disable");
+ }
+
+ if (p->appme != px->pxdc.appme) {
+ px->pxdc.appme = p->appme;
+ lm_log(lm_ctx, LM_DBG, "APPME %s\n", p->appme ? "enable" : "disable");
+ }
+
+ if (p->ens != px->pxdc.ens) {
+ px->pxdc.ens = p->ens;
+ lm_log(lm_ctx, LM_DBG, "ENS %s\n", p->ens ? "enable" : "disable");
+ }
+
+ if (p->mrrs != px->pxdc.mrrs) {
+ px->pxdc.mrrs = p->mrrs;
+ lm_log(lm_ctx, LM_DBG, "MRRS set to %d\n", p->mrrs);
+ }
+
+ if (p->iflr) {
+ lm_log(lm_ctx, LM_DBG,
+ "initiate function level reset\n");
+ }
+
+ return 0;
+}
+
+static int
+handle_px_write_2_bytes(lm_ctx_t *lm_ctx, struct pxcap *px, char *const buf,
+ loff_t off)
+{
+ switch (off) {
+ case offsetof(struct pxcap, pxdc):
+ return handle_px_pxdc_write(lm_ctx, px, (union pxdc *)buf);
+ }
+ return -EINVAL;
+}
+
+static ssize_t
+handle_px_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ size_t count, loff_t offset)
+{
+ struct pxcap *px = (struct pxcap *)cap;
+
+ int err = -EINVAL;
+ switch (count) {
+ case 2:
+ err = handle_px_write_2_bytes(lm_ctx, px, buf, offset);
+ break;
+ }
+ if (err != 0) {
+ return err;
+ }
+ return count;
+}
+
+static const struct cap_handler {
+ char *name;
+ size_t size;
+ cap_access *fn;
+} cap_handlers[PCI_CAP_ID_MAX + 1] = {
+ [PCI_CAP_ID_PM] = {"PM", PCI_PM_SIZEOF, handle_pm_write},
+ [PCI_CAP_ID_EXP] = {"PCI Express", PCI_CAP_EXP_ENDPOINT_SIZEOF_V2,
+ handle_px_write},
+ [PCI_CAP_ID_MSIX] = {"MSI-X", PCI_CAP_MSIX_SIZEOF, handle_msix_write},
+};
+
ssize_t
-cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count,
- loff_t offset, bool is_write)
+cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count,
+ loff_t offset)
{
- struct cap *cap;
+ lm_pci_config_space_t *config_space;
+ uint8_t *cap;
- if (!caps) {
+ if (caps == NULL) {
return 0;
}
- if (!count) {
+ if (count == 0) {
return 0;
}
- if (!cap_is_accessed(caps->caps, caps->nr_caps, offset)) {
+ if (!cap_is_accessed(caps->caps, caps->nr_caps, count, offset)) {
return 0;
}
/* we're now guaranteed that the access is within some capability */
- cap = cap_find(caps->caps, caps->nr_caps, offset, count);
+ config_space = lm_get_pci_config_space(lm_ctx);
+ cap = cap_find(config_space, caps, offset, count);
+ assert(cap != NULL); /* FIXME */
- if (!cap) {
- return 0;
- }
-
- if (cap_header_is_accessed(cap, offset)) {
- return cap_header_access(caps, cap, buf, offset, count, is_write);
- }
- if (count > 0) {
- return cap->fn(pvt, cap->id, buf, count, offset - cap->start, is_write);
+ if (cap_header_is_accessed(cap - config_space->raw, offset)) {
+ /* FIXME how to deal with writes to capability header? */
+ assert(false);
}
- return 0;
-}
-
-static bool
-cap_is_valid(uint8_t id)
-{
- return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX;
+ return cap_handlers[cap[PCI_CAP_LIST_ID]].fn(lm_ctx, cap, buf, count,
+ offset - (loff_t)(cap - config_space->raw));
}
struct caps *
-caps_create(const lm_cap_t *lm_caps, int nr_caps)
+caps_create(lm_ctx_t *lm_ctx, lm_cap_t **lm_caps, int nr_caps)
{
- uint8_t prev_end;
int i, err = 0;
- struct caps *caps = NULL;
+ uint8_t *prev;
+ uint8_t next;
+ lm_pci_config_space_t *config_space;
+ struct caps *caps;
if (nr_caps <= 0 || nr_caps >= LM_MAX_CAPS) {
err = EINVAL;
goto out;
}
- assert(lm_caps);
+ assert(lm_caps != NULL);
caps = calloc(1, sizeof *caps);
- if (!caps) {
- err = errno;
+ if (caps == NULL) {
goto out;
}
- prev_end = PCI_STD_HEADER_SIZEOF - 1;
+ config_space = lm_get_pci_config_space(lm_ctx);
+ /* points to the next field of the previous capability */
+ prev = &config_space->hdr.cap;
+
+ /* relative offset that points where the next capability should be placed */
+ next = PCI_STD_HEADER_SIZEOF;
+
for (i = 0; i < nr_caps; i++) {
- if (!cap_is_valid(lm_caps[i].id) || !lm_caps[i].fn || !lm_caps[i].size) {
+ uint8_t *cap = (uint8_t*)lm_caps[i];
+ uint8_t id = cap[PCI_CAP_LIST_ID];
+ size_t size;
+
+ if (!cap_is_valid(id)) {
+ err = EINVAL;
+ goto out;
+ }
+
+ size = cap_handlers[id].size;
+ if (size == 0) {
err = EINVAL;
goto out;
}
- caps->caps[i].id = lm_caps[i].id;
- caps->caps[i].fn = lm_caps[i].fn;
- /* FIXME PCI capabilities must be dword aligned. */
- caps->caps[i].start = prev_end + 1;
- caps->caps[i].end = prev_end = caps->caps[i].start + lm_caps[i].size - 1;
+ caps->caps[i].start = next;
+ caps->caps[i].end = next + size - 1;
+
+ memcpy(&config_space->hdr.raw[next], cap, size);
+ *prev = next;
+ prev = &config_space->hdr.raw[next + PCI_CAP_LIST_NEXT];
+ *prev = 0;
+ next += size;
+ assert(next % 4 == 0); /* FIXME */
+
+ lm_log(lm_ctx, LM_DBG, "initialized capability %s %#x-%#x\n",
+ cap_handlers[id].name, caps->caps[i].start, caps->caps[i].end);
}
caps->nr_caps = nr_caps;
diff --git a/lib/cap.h b/lib/cap.h
index e814d6c..1f72247 100644
--- a/lib/cap.h
+++ b/lib/cap.h
@@ -44,7 +44,7 @@ struct caps;
* capabilities have been added.
*/
struct caps *
-caps_create(const lm_cap_t *caps, int nr_caps);
+caps_create(lm_ctx_t *lm_ctx, lm_cap_t **caps, int nr_caps);
/*
* Conditionally accesses the PCI capabilities. Returns:
@@ -54,8 +54,11 @@ caps_create(const lm_cap_t *caps, int nr_caps);
* <0: negative error code on error.
*/
ssize_t
-cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count,
- loff_t offset, bool is_write);
+cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count,
+ loff_t offset);
+
+uint8_t *
+cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id);
#endif /* __CAP_H__ */
diff --git a/lib/caps/common.h b/lib/caps/common.h
new file mode 100644
index 0000000..2181a3b
--- /dev/null
+++ b/lib/caps/common.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LM_PCI_CAP_COMMON_H
+#define LM_PCI_CAP_COMMON_H
+
+#include <stddef.h>
+
+struct cap_hdr {
+ uint8_t id;
+ uint8_t next;
+} __attribute__((packed));
+_Static_assert(sizeof(struct cap_hdr) == 0x2, "bad PCI capability header size");
+_Static_assert(offsetof(struct cap_hdr, id) == PCI_CAP_LIST_ID, "bad offset");
+_Static_assert(offsetof(struct cap_hdr, next) == PCI_CAP_LIST_NEXT, "bad offset");
+
+#endif /* LM_PCI_CAP_COMMON_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/caps/msi.h b/lib/caps/msi.h
index b310ae9..5933006 100644
--- a/lib/caps/msi.h
+++ b/lib/caps/msi.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_MSI_H
#define LM_PCI_CAP_MSI_H
-struct mid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__ ((packed));
-_Static_assert(sizeof(struct mid) == 0x2, "bad MID size");
+#include "common.h"
struct mc {
unsigned int msie:1;
@@ -56,7 +52,7 @@ struct ma {
_Static_assert(sizeof(struct ma) == 0x4, "bad MA size");
struct msicap {
- struct mid mid;
+ struct cap_hdr hdr;
struct mc mc;
struct ma ma;
uint32_t mua;
@@ -66,6 +62,7 @@ struct msicap {
uint32_t mpend;
} __attribute__ ((packed));
_Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size");
+_Static_assert(offsetof(struct msicap, hdr) == 0, "bad offset");
#endif /* LM_CAP_MSI_H */
diff --git a/lib/caps/msix.h b/lib/caps/msix.h
index b13c1c8..b0bc1a5 100644
--- a/lib/caps/msix.h
+++ b/lib/caps/msix.h
@@ -35,12 +35,6 @@
#include <linux/pci_regs.h>
-struct mxid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__ ((packed));
-_Static_assert(sizeof(struct mxid) == 0x2, "bad MXID size");
-
struct mxc {
unsigned int ts:11;
unsigned int reserved:3;
@@ -63,12 +57,13 @@ _Static_assert(sizeof(struct mtab) == PCI_MSIX_PBA - PCI_MSIX_TABLE,
"bad MPBA size");
struct msixcap {
- struct mxid mxid;
+ struct cap_hdr hdr;
struct mxc mxc;
struct mtab mtab;
struct mpba mpba;
} __attribute__ ((packed)) __attribute__ ((aligned(4)));
_Static_assert(sizeof(struct msixcap) == PCI_CAP_MSIX_SIZEOF, "bad MSI-X size");
+_Static_assert(offsetof(struct msixcap, hdr) == 0, "bad offset");
#endif /* LM_CAP_MSIX_H */
diff --git a/lib/caps/pm.h b/lib/caps/pm.h
index ddae2e6..e976d95 100644
--- a/lib/caps/pm.h
+++ b/lib/caps/pm.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_PM_H
#define LM_PCI_CAP_PM_H
-struct pid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__((packed));
-_Static_assert(sizeof(struct pid) == 0x2, "bad PID size");
+#include "common.h"
struct pc {
unsigned int vs:3;
@@ -60,15 +56,16 @@ struct pmcs {
unsigned int dse:4;
unsigned int dsc:2;
unsigned int pmes:1;
-};
-_Static_assert(sizeof(struct pc) == 0x2, "bad PC size");
+} __attribute__((packed));
+_Static_assert(sizeof(struct pmcs) == 0x2, "bad PMCS size");
struct pmcap {
- struct pid pid;
+ struct cap_hdr hdr;
struct pc pc;
struct pmcs pmcs;
-} __attribute__((packed)) __attribute__ ((aligned(8)));
+} __attribute__((packed)) __attribute__ ((aligned(8))); /* FIXME why does it need to be aligned? */
_Static_assert(sizeof(struct pmcap) == PCI_PM_SIZEOF, "bad PC size");
+_Static_assert(offsetof(struct pmcap, hdr) == 0, "bad offset");
#endif /* LM_CAP_PM_H */
diff --git a/lib/caps/px.h b/lib/caps/px.h
index ce17cfe..28a04d5 100644
--- a/lib/caps/px.h
+++ b/lib/caps/px.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_PX_H
#define LM_PCI_CAP_PX_H
-struct pxid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__((packed));
-_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size");
+#include "common.h"
struct pxcaps {
unsigned int ver:4;
@@ -133,7 +129,7 @@ _Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size");
* the whole struct.
*/
struct pxcap {
- struct pxid pxid;
+ struct cap_hdr hdr;
struct pxcaps pxcaps;
struct pxdcap pxdcap;
union pxdc pxdc;
@@ -147,6 +143,7 @@ struct pxcap {
} __attribute__((packed));
_Static_assert(sizeof(struct pxcap) == 0x2a,
"bad PCI Express Capability size");
+_Static_assert(offsetof(struct pxcap, hdr) == 0, "bad offset");
#endif /* LM_PCI_CAP_PX_H */
diff --git a/lib/common.h b/lib/common.h
index 27d6735..f5de4d8 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -45,18 +45,18 @@
#define likely(e) __builtin_expect(!!(e), 1)
#define unlikely(e) __builtin_expect(e, 0)
+/* XXX NB 2nd argument must be power of two */
#define ROUND_DOWN(x, a) ((x) & ~((a)-1))
#define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a)
void
lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...);
-#ifdef DEBUG
+#ifdef LM_VERBOSE_LOGGING
void
-dump_buffer(lm_ctx_t *lm_ctx, const char *prefix,
- const char *buf, uint32_t count);
+dump_buffer(const char *prefix, const char *buf, uint32_t count);
#else
-#define dump_buffer(lm_ctx, prefix, buf, count)
+#define dump_buffer(prefix, buf, count)
#endif
#endif /* __COMMON_H__ */
diff --git a/lib/dma.c b/lib/dma.c
index eb4b9d4..b6d365e 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -66,7 +66,7 @@ fds_are_same_file(int fd1, int fd2)
}
dma_controller_t *
-dma_controller_create(int max_regions)
+dma_controller_create(lm_ctx_t *lm_ctx, int max_regions)
{
dma_controller_t *dma;
@@ -77,37 +77,89 @@ dma_controller_create(int max_regions)
return dma;
}
+ dma->lm_ctx = lm_ctx;
dma->max_regions = max_regions;
dma->nregions = 0;
memset(dma->regions, 0, max_regions * sizeof(dma->regions[0]));
+ dma->dirty_pgsize = 0;
return dma;
}
static void
-_dma_controller_do_remove_region(dma_memory_region_t *region)
+_dma_controller_do_remove_region(dma_controller_t *dma,
+ dma_memory_region_t *region)
{
- assert(region);
- dma_unmap_region(region, region->virt_addr, region->size);
- (void)close(region->fd);
+ int err;
+
+ assert(dma != NULL);
+ assert(region != NULL);
+
+ err = dma_unmap_region(region, region->virt_addr, region->size);
+ if (err != 0) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to unmap fd=%d vaddr=%#lx-%#lx\n",
+ region->fd, region->virt_addr, region->size);
+ }
+ if (region->fd != -1) {
+ if (close(region->fd) == -1) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", region->fd);
+ }
+ }
+}
+
+/*
+ * FIXME no longer used. Also, it doesn't work for addresses that span two
+ * DMA regions.
+ */
+bool
+dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
+ size_t size)
+{
+ dma_memory_region_t *region;
+ int i;
+
+ for (i = 0; i < dma->nregions; i++) {
+ region = &dma->regions[i];
+ if (dma_addr == region->dma_addr && size <= region->size) {
+ return true;
+ }
+ }
+
+ return false;
}
/* FIXME not thread safe */
int
-dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
- size_t size, int fd)
+dma_controller_remove_region(dma_controller_t *dma,
+ dma_addr_t dma_addr, size_t size,
+ int (*unmap_dma) (void*, uint64_t), void *data)
{
int idx;
dma_memory_region_t *region;
+ int err;
- assert(dma);
+ assert(dma != NULL);
for (idx = 0; idx < dma->nregions; idx++) {
region = &dma->regions[idx];
- if (region->dma_addr == dma_addr && region->size == size &&
- fds_are_same_file(region->fd, fd)) {
- _dma_controller_do_remove_region(region);
+ if (region->dma_addr == dma_addr && region->size == size) {
+ if (region->refcnt > 0) {
+ err = unmap_dma(data, region->dma_addr);
+ if (err != 0) {
+ lm_log(dma->lm_ctx, LM_ERR,
+ "failed to notify of removal of DMA region %#lx-%#lx: %s\n",
+ region->dma_addr, region->dma_addr + region->size,
+ strerror(-err));
+ return err;
+ }
+ assert(region->refcnt == 0);
+ }
+ _dma_controller_do_remove_region(dma, region);
if (dma->nregions > 1)
+ /*
+ * FIXME valgrind complains with 'Source and destination overlap in memcpy',
+ * check whether memmove eliminates this warning.
+ */
memcpy(region, &dma->regions[dma->nregions - 1],
sizeof *region);
dma->nregions--;
@@ -118,7 +170,7 @@ dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
}
static inline void
-dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma)
+dma_controller_remove_regions(dma_controller_t *dma)
{
int i;
@@ -127,26 +179,26 @@ dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma)
for (i = 0; i < dma->nregions; i++) {
dma_memory_region_t *region = &dma->regions[i];
- lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n",
+ lm_log(dma->lm_ctx, LM_INF, "unmap vaddr=%#lx IOVA=%#lx",
region->virt_addr, region->dma_addr);
- _dma_controller_do_remove_region(region);
+ _dma_controller_do_remove_region(dma, region);
}
}
void
-dma_controller_destroy(lm_ctx_t *lm_ctx, dma_controller_t *dma)
+dma_controller_destroy(dma_controller_t *dma)
{
if (dma == NULL) {
return;
}
- dma_controller_remove_regions(lm_ctx, dma);
+ dma_controller_remove_regions(dma);
free(dma);
}
int
-dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
+dma_controller_add_region(dma_controller_t *dma,
dma_addr_t dma_addr, size_t size,
int fd, off_t offset)
{
@@ -160,8 +212,8 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
/* First check if this is the same exact region. */
if (region->dma_addr == dma_addr && region->size == size) {
if (offset != region->offset) {
- lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, "
- "want=%d, existing=%d\n",
+ lm_log(dma->lm_ctx, LM_ERR,
+ "bad offset for new DMA region %#lx+%#lx, want=%d, existing=%d\n",
dma_addr, size, offset, region->offset);
goto err;
}
@@ -172,8 +224,9 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
* the same file, however in the majority of cases we'll be
* using a single fd.
*/
- lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, "
- "existing fd=%d\n", fd, region->fd);
+ lm_log(dma->lm_ctx, LM_ERR,
+ "bad fd=%d for new DMA region %#lx-%#lx, existing fd=%d\n",
+ fd, dma_addr, dma_addr + size, region->fd);
goto err;
}
return idx;
@@ -184,16 +237,17 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
dma_addr < region->dma_addr + region->size) ||
(region->dma_addr >= dma_addr &&
region->dma_addr < dma_addr + size)) {
- lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA "
- "region %lx-%lx\n", dma_addr, size, region->dma_addr,
- region->size);
+ lm_log(dma->lm_ctx, LM_INF,
+ "new DMA region %#lx+%#lx overlaps with DMA region %#lx-%#lx\n",
+ dma_addr, size, region->dma_addr, region->size);
goto err;
}
}
if (dma->nregions == dma->max_regions) {
idx = dma->max_regions;
- lm_log(lm_ctx, LM_ERR, "reached maxed regions, recompile with higher number of DMA regions\n");
+ lm_log(dma->lm_ctx, LM_ERR,
+ "reached maxed regions, recompile with higher number of DMA regions\n");
goto err;
}
@@ -202,7 +256,7 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
page_size = fd_get_blocksize(fd);
if (page_size < 0) {
- lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size);
+ lm_log(dma->lm_ctx, LM_ERR, "bad page size %d\n", page_size);
goto err;
}
page_size = MAX(page_size, getpagesize());
@@ -211,20 +265,21 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
region->size = size;
region->page_size = page_size;
region->offset = offset;
-
- region->fd = dup(fd); // dup the fd to get our own private copy
- if (region->fd < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n",
- strerror(errno));
- goto err;
- }
+ region->fd = fd;
+ region->refcnt = 0;
region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE,
0, region->size);
if (region->virt_addr == MAP_FAILED) {
- lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n",
+ lm_log(dma->lm_ctx, LM_ERR,
+ "failed to memory map DMA region %#lx-%#lx: %s\n",
dma_addr, dma_addr + size, strerror(errno));
- close(region->fd);
+ if (region->fd != -1) {
+ if (close(region->fd) == -1) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n",
+ region->fd);
+ }
+ }
goto err;
}
dma->nregions++;
@@ -269,17 +324,17 @@ dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len)
return mmap_base + (offset - mmap_offset);
}
-void
+int
dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len)
{
mmap_round((size_t *)&virt_addr, &len, region->page_size);
- munmap(virt_addr, len);
+ return munmap(virt_addr, len);
}
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
int idx;
int cnt = 0;
@@ -295,9 +350,13 @@ _dma_addr_sg_split(const dma_controller_t *dma,
size_t region_len = MIN(region_end - dma_addr, len);
if (cnt < max_sg) {
+ sg[cnt].dma_addr = region->dma_addr;
sg[cnt].region = idx;
sg[cnt].offset = dma_addr - region->dma_addr;
sg[cnt].length = region_len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
}
cnt++;
@@ -326,4 +385,117 @@ out:
return cnt;
}
+ssize_t _get_bitmap_size(size_t region_size, size_t pgsize)
+{
+ if (pgsize == 0) {
+ return -EINVAL;
+ }
+ if (region_size < pgsize) {
+ return -EINVAL;
+ }
+ size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
+ return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0);
+}
+
+int dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize)
+{
+ int i;
+
+ assert(dma != NULL);
+
+ if (pgsize == 0) {
+ return -EINVAL;
+ }
+
+ if (dma->dirty_pgsize > 0) {
+ if (dma->dirty_pgsize != pgsize) {
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ for (i = 0; i < dma->nregions; i++) {
+ dma_memory_region_t *region = &dma->regions[i];
+ ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+ region->dirty_bitmap = calloc(bitmap_size, sizeof(char));
+ if (region->dirty_bitmap == NULL) {
+ int j, ret = -errno;
+ for (j = 0; j < i; j++) {
+ free(dma->regions[j].dirty_bitmap);
+ dma->regions[j].dirty_bitmap = NULL;
+ }
+ return ret;
+ }
+ }
+ dma->dirty_pgsize = pgsize;
+ return 0;
+}
+
+int dma_controller_dirty_page_logging_stop(dma_controller_t *dma)
+{
+ int i;
+
+ assert(dma != NULL);
+
+ if (dma->dirty_pgsize == 0) {
+ return 0;
+ }
+
+ for (i = 0; i < dma->nregions; i++) {
+ free(dma->regions[i].dirty_bitmap);
+ dma->regions[i].dirty_bitmap = NULL;
+ }
+ dma->dirty_pgsize = 0;
+ return 0;
+}
+
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+ size_t pgsize, size_t size, char **data)
+{
+ int ret;
+ ssize_t bitmap_size;
+ dma_sg_t sg;
+ dma_memory_region_t *region;
+
+ assert(dma != NULL);
+ assert(data != NULL);
+
+ /*
+ * FIXME for now we support IOVAs that match exactly the DMA region. This
+ * is purely for simplifying the implementation. We MUST allow arbitrary
+ * IOVAs.
+ */
+ ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE);
+ if (ret != 1 || sg.dma_addr != addr || sg.length != len) {
+ return -ENOTSUP;
+ }
+
+ if (pgsize != dma->dirty_pgsize) {
+ return -EINVAL;
+ }
+
+ bitmap_size = _get_bitmap_size(len, pgsize);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+
+ /*
+ * FIXME they must be equal because this is how much data the client
+ * expects to receive.
+ */
+ if (size != (size_t)bitmap_size) {
+ return -EINVAL;
+ }
+
+ region = &dma->regions[sg.region];
+
+ *data = region->dirty_bitmap;
+
+ return 0;
+}
+
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
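
A note on the arithmetic above: `_get_bitmap_size()` rounds up twice, first to
whole pages and then to whole bitmap bytes. Below is a rough usage sketch of
the new dirty-page API (values are illustrative; `dma` is assumed to already
have a 1 MiB region registered at IOVA `addr`, and `_get_bitmap_size()` is
assumed visible to the caller):

```c
#include <assert.h>

#include "dma.h"

static int dirty_page_roundtrip(dma_controller_t *dma, dma_addr_t addr)
{
    const size_t pgsize = 1 << 12;   /* 4 KiB tracking granularity */
    char *bitmap;
    int ret;

    /* 1 MiB / 4 KiB = 256 pages -> 256 / 8 = 32 bitmap bytes */
    assert(_get_bitmap_size(1 << 20, pgsize) == 32);

    ret = dma_controller_dirty_page_logging_start(dma, pgsize);
    if (ret != 0) {
        return ret;
    }

    /* ... mappings taken with PROT_WRITE now mark pages dirty ... */

    ret = dma_controller_dirty_page_get(dma, addr, 1 << 20, pgsize, 32,
                                        &bitmap);
    if (ret != 0) {
        return ret;
    }

    /* consume the bitmap, then stop logging */
    return dma_controller_dirty_page_logging_stop(dma);
}
```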
diff --git a/lib/dma.h b/lib/dma.h
index 1c41dce..7715b89 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -32,6 +32,11 @@
#define DMA_DMA_H
/*
+ * FIXME check whether DMA regions must be page aligned. If so then the
+ * implementation can be greatly simplified.
+ */
+
+/*
* This library emulates a DMA controller for a device emulation application to
* perform DMA operations on a foreign memory space.
*
@@ -72,6 +77,8 @@
#include "muser.h"
#include "common.h"
+struct lm_ctx;
+
typedef struct {
dma_addr_t dma_addr; // DMA address of this region
size_t size; // Size of this region
@@ -79,19 +86,23 @@ typedef struct {
int page_size; // Page size of this fd
off_t offset; // File offset
void *virt_addr; // Virtual address of this region
+ int refcnt; // Number of users of this region
+ char *dirty_bitmap; // Dirty page bitmap
} dma_memory_region_t;
typedef struct {
int max_regions;
int nregions;
+ struct lm_ctx *lm_ctx;
+ size_t dirty_pgsize; // Dirty page granularity
dma_memory_region_t regions[0];
} dma_controller_t;
dma_controller_t *
-dma_controller_create(int max_regions);
+dma_controller_create(lm_ctx_t *lm_ctx, int max_regions);
void
-dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma);
+dma_controller_destroy(dma_controller_t *dma);
/* Registers a new memory region.
* Returns:
@@ -101,19 +112,72 @@ dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma);
* (e.g. due to conflict with existing region).
*/
int
-dma_controller_add_region(lm_ctx_t *ctx, dma_controller_t *dma,
+dma_controller_add_region(dma_controller_t *dma,
dma_addr_t dma_addr, size_t size,
int fd, off_t offset);
int
-dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
- size_t size, int fd);
+dma_controller_remove_region(dma_controller_t *dma,
+ dma_addr_t dma_addr, size_t size,
+ int (*unmap_dma) (void*, uint64_t), void *data);
// Helper for dma_addr_to_sg() slow path.
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
+
+static bool
+_dma_should_mark_dirty(const dma_controller_t *dma, int prot)
+{
+ assert(dma != NULL);
+
+ return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0;
+}
+
+static size_t
+_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset)
+{
+ return (offset - base_addr) / pgsize;
+}
+
+static size_t
+_get_pgend(size_t pgsize, uint64_t len, size_t start)
+{
+ return start + (len / pgsize) + (len % pgsize != 0) - 1;
+}
+
+static void
+_dma_bitmap_get_pgrange(const dma_controller_t *dma,
+ const dma_memory_region_t *region,
+ const dma_sg_t *sg, size_t *start, size_t *end)
+{
+ assert(dma != NULL);
+ assert(region != NULL);
+ assert(sg != NULL);
+ assert(start != NULL);
+ assert(end != NULL);
+
+ *start = _get_pgstart(dma->dirty_pgsize, region->dma_addr, sg->offset);
+ *end = _get_pgend(dma->dirty_pgsize, sg->length, *start);
+}
+
+static void
+_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
+ dma_sg_t *sg)
+{
+ size_t i, start, end;
+
+ assert(dma != NULL);
+ assert(region != NULL);
+ assert(sg != NULL);
+ assert(region->dirty_bitmap != NULL);
+
+ _dma_bitmap_get_pgrange(dma, region, sg, &start, &end);
+ for (i = start; i <= end; i++) {
+ region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
+ }
+}
/* Takes a linear dma address span and returns a sg list suitable for DMA.
* A single linear dma address span may need to be split into multiple
@@ -129,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma,
static inline int
dma_addr_to_sg(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
static __thread int region_hint;
int cnt;
@@ -139,14 +203,19 @@ dma_addr_to_sg(const dma_controller_t *dma,
// Fast path: single region.
if (likely(max_sg > 0 && len > 0 &&
- dma_addr >= region->dma_addr && dma_addr + len <= region_end)) {
+ dma_addr >= region->dma_addr && dma_addr + len <= region_end &&
+ region_hint < dma->nregions)) {
+ sg->dma_addr = region->dma_addr;
sg->region = region_hint;
sg->offset = dma_addr - region->dma_addr;
sg->length = len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
return 1;
}
// Slow path: search through regions.
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg);
+ cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot);
if (likely(cnt > 0)) {
region_hint = sg->region;
}
@@ -157,7 +226,7 @@ void *
dma_map_region(dma_memory_region_t *region, int prot,
size_t offset, size_t len);
-void
+int
dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len);
static inline int
@@ -168,31 +237,53 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov,
int i;
for (i = 0; i < cnt; i++) {
+ lm_log(dma->lm_ctx, LM_DBG, "map %#lx-%#lx\n",
+ sg[i].dma_addr + sg[i].offset, sg[i].dma_addr + sg[i].offset + sg[i].length);
region = &dma->regions[sg[i].region];
iov[i].iov_base = region->virt_addr + sg[i].offset;
iov[i].iov_len = sg[i].length;
+ region->refcnt++;
}
return 0;
}
+/* FIXME useless define */
#define UNUSED __attribute__((unused))
static inline void
-dma_unmap_sg(UNUSED dma_controller_t *dma, UNUSED const dma_sg_t *sg,
- UNUSED struct iovec *iov, UNUSED int cnt)
+dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg,
+ UNUSED struct iovec *iov, int cnt)
{
- /* just a placeholder for now */
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dma_memory_region_t *r;
+ /*
+ * FIXME this double loop will be removed if we replace the array with
+ * tfind(3)
+ */
+ for (r = dma->regions;
+ r < dma->regions + dma->nregions && r->dma_addr != sg[i].dma_addr;
+ r++);
+ if (r == dma->regions + dma->nregions) {
+ /* bad region */
+ continue;
+ }
+ lm_log(dma->lm_ctx, LM_DBG, "unmap %#lx-%#lx\n",
+ sg[i].dma_addr + sg[i].offset, sg[i].dma_addr + sg[i].offset + sg[i].length);
+ r->refcnt--;
+ }
return;
}
static inline void *
-dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len)
+dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot)
{
dma_sg_t sg;
struct iovec iov;
- if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 &&
+ if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 &&
dma_map_sg(dma, &sg, &iov, 1) == 0) {
return iov.iov_base;
}
@@ -211,12 +302,26 @@ dma_unmap_addr(dma_controller_t *dma,
};
int r;
- r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1);
+ r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE);
assert(r == 1);
dma_unmap_sg(dma, &sg, &iov, 1);
}
+int
+dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize);
+
+int
+dma_controller_dirty_page_logging_stop(dma_controller_t *dma);
+
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+ size_t pgsize, size_t size, char **data);
+
+bool
+dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
+ size_t size);
+
#endif /* DMA_DMA_H */
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
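
To illustrate the new `prot` plumbing end to end, here is a sketch (hedged:
the exact `dma_unmap_addr()` signature is only partially visible in the hunk
above and is assumed to be `(dma, addr, len, vaddr)`):

```c
#include <string.h>
#include <sys/mman.h>

#include "dma.h"

/* Sketch: copy a buffer into guest memory at IOVA `iova`. PROT_WRITE both
 * requests a writable view and, when dirty-page logging is enabled, marks
 * the touched pages dirty; the unmap drops the refcount that dma_map_sg()
 * took on the region. */
static int dma_write(dma_controller_t *dma, dma_addr_t iova,
                     const void *buf, uint32_t len)
{
    void *vaddr = dma_map_addr(dma, iova, len, PROT_WRITE);

    if (vaddr == NULL) {
        return -1;
    }
    memcpy(vaddr, buf, len);
    dma_unmap_addr(dma, iova, len, vaddr);
    return 0;
}
```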
diff --git a/lib/muser.h b/lib/muser.h
index f3330fe..a39d477 100644
--- a/lib/muser.h
+++ b/lib/muser.h
@@ -37,22 +37,27 @@
#include <sys/uio.h>
#include <unistd.h>
+#include "vfio_user.h"
#include "pci.h"
+#include "caps/pm.h"
+#include "caps/px.h"
+#include "caps/msi.h"
+#include "caps/msix.h"
-/*
- * Influential enviroment variables:
- *
- * LM_TERSE_LOGGING: define to make libmuser log only erroneous PCI accesses.
- * (this should really be done with a more fine grained debug
- * level)
- */
-#ifndef LM_TERSE_LOGGING
-#define LM_TERSE_LOGGING 0
-#endif
+#define LIB_MUSER_VFIO_USER_VERS_MJ 0
+#define LIB_MUSER_VFIO_USER_VERS_MN 1
+
+#define VFIO_NAME "vfio"
+#define VFIO_DIR "/dev/" VFIO_NAME "/"
+#define VFIO_CONTAINER VFIO_DIR VFIO_NAME
+
+#define MUSER_DIR "/var/run/muser/"
+#define MUSER_SOCK "cntrl"
typedef uint64_t dma_addr_t;
typedef struct {
+ dma_addr_t dma_addr;
int region;
int length;
uint64_t offset;
@@ -134,6 +139,8 @@ typedef struct {
/*
* Callback function that is called when the region is read or written.
+ * Note that the memory of the region is owned by the user, except for the
+ * standard header (first 64 bytes) of the PCI configuration space.
*/
lm_region_access_t *fn;
@@ -149,9 +156,12 @@ enum {
LM_DEV_INTX_IRQ_IDX,
LM_DEV_MSI_IRQ_IDX,
LM_DEV_MSIX_IRQ_IDX,
- LM_DEV_NUM_IRQS = 3
+ LM_DEV_ERR_IRQ_INDEX,
+ LM_DEV_REQ_IRQ_INDEX,
+ LM_DEV_NUM_IRQS
};
+/* FIXME these are PCI regions */
enum {
LM_DEV_BAR0_REG_IDX,
LM_DEV_BAR1_REG_IDX,
@@ -162,7 +172,15 @@ enum {
LM_DEV_ROM_REG_IDX,
LM_DEV_CFG_REG_IDX,
LM_DEV_VGA_REG_IDX,
- LM_DEV_NUM_REGS = 9
+ /*
+ * FIXME this doesn't really belong here, but it simplifies the implementation
+ * for now. A migration region can exist for non-PCI devices (can its index be
+ * anything?). In any case, we should allow the user to define custom regions
+ * at will; by fixing the migration region at that position we don't allow
+ * this.
+ */
+ LM_DEV_MIGRATION_REG_IDX,
+ LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */
};
typedef struct {
@@ -191,7 +209,7 @@ typedef struct {
} lm_pci_info_t;
/*
- * Returns a pointer to the non-standard part of the PCI configuration space.
+ * Returns a pointer to the standard part of the PCI configuration space.
*/
lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t *lm_ctx);
@@ -208,7 +226,7 @@ typedef enum {
*
* @lm_log_fn_t: typedef for log function.
*/
-typedef void (lm_log_fn_t) (void *pvt, const char *msg);
+typedef void (lm_log_fn_t) (void *pvt, lm_log_lvl_t lvl, const char *msg);
/**
* Callback function that gets called when a capability is accessed. The
@@ -228,26 +246,77 @@ typedef ssize_t (lm_cap_access_t) (void *pvt, uint8_t id,
char *buf, size_t count,
loff_t offset, bool is_write);
+/* FIXME does it have to be packed as well? */
+typedef union {
+ struct msicap msi;
+ struct msixcap msix;
+ struct pmcap pm;
+ struct pxcap px;
+} lm_cap_t;
+
+typedef enum {
+ LM_TRANS_KERNEL,
+ LM_TRANS_SOCK,
+ LM_TRANS_MAX
+} lm_trans_t;
+
+#define LM_MAX_CAPS ((PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF)
+
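
With the new scheme the user hands libmuser fully formed capability structures
and the library copies them into config space (see `caps_create()` in
`lib/cap.c` above, which also fixes up the `next` pointers). A hedged sketch,
with illustrative field values only:

```c
#include <linux/pci_regs.h>

/* Sketch: one PM capability to be copied verbatim into config space.
 * Only the ID matters to the library; `next` is filled in by caps_create(). */
static struct pmcap pm = {
    .hdr = { .id = PCI_CAP_ID_PM },
    .pc  = { .vs = 3 },              /* illustrative PM spec version */
};

static lm_cap_t *my_caps[] = { (lm_cap_t *)&pm };
/* then: dev_info.caps = my_caps; dev_info.nr_caps = 1; */
```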
+/*
+ * FIXME the names of migration callback functions are probably far too long,
+ * but for now they help with the implementation.
+ */
+typedef int (lm_migration_callback_t)(void *pvt);
+
+typedef enum {
+ LM_MIGR_STATE_STOP,
+ LM_MIGR_STATE_START,
+ LM_MIGR_STATE_STOP_AND_COPY,
+ LM_MIGR_STATE_PRE_COPY,
+ LM_MIGR_STATE_RESUME
+} lm_migr_state_t;
+
typedef struct {
+ /* migration state transition callback */
+ /* TODO rename to lm_migration_state_transition_callback */
+ /* FIXME maybe we should create a single callback and pass the state? */
+ int (*transition)(void *pvt, lm_migr_state_t state);
+
+ /* Callbacks for saving device state */
+
/*
- * Capability ID, as defined by the PCI specification. Also defined as
- * PCI_CAP_ID_XXX in <linux/pci_regs.h>.
+ * Function that is called to retrieve pending migration data. If migration
+ * data was previously made available (prepare_data has been called), then
+ * calling this function signifies that that data has been consumed (i.e. it
+ * can be discarded). If the function returns 0 then migration has finished
+ * and this function won't be called again.
*/
- uint8_t id;
+ __u64 (*get_pending_bytes)(void *pvt);
/*
- * Size of the capability.
+ * Function that is called to instruct the device to prepare migration data.
+ * The function must return only after migration data are available at the
+ * specified offset.
*/
- size_t size;
+ int (*prepare_data)(void *pvt, __u64 *offset, __u64 *size);
/*
- * Function to call back when the capability gets read or written.
+ * Function that is called to read migration data. offset and size can
+ * be any subrange of the offset and size previously returned by
+ * prepare_data. The function must return the amount of data read. This
+ * function can be called even if the migration data can be memory mapped.
+ *
+ * FIXME does this mean that reading data_offset/data_size updates the values?
*/
- lm_cap_access_t *fn;
-} lm_cap_t;
+ size_t (*read_data)(void *pvt, void *buf, __u64 count, __u64 offset);
-#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF
+ /* Callback for restoring device state */
+
+ /* Function that is called to write previously stored device state. */
+ size_t (*write_data)(void *pvt, void *data, __u64 size);
+
+} lm_migration_callbacks_t;
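
To make the calling sequence concrete, here is a minimal sketch (not part of the patch) of a device whose entire migratable state is a single flat buffer; struct my_dev and all my_* names are hypothetical, and <string.h>, <stdbool.h> and muser.h are assumed to be included:

    /* Hypothetical device: all migratable state is one flat buffer. */
    struct my_dev {
        uint8_t state[4096];
        bool    dirty;            /* state not yet transferred */
    };

    static int my_transition(void *pvt, lm_migr_state_t state)
    {
        struct my_dev *d = pvt;
        if (state == LM_MIGR_STATE_STOP_AND_COPY) {
            d->dirty = true;      /* everything must now be sent */
        }
        return 0;
    }

    static __u64 my_get_pending_bytes(void *pvt)
    {
        struct my_dev *d = pvt;
        return d->dirty ? sizeof(d->state) : 0;  /* 0 ends the iteration */
    }

    static int my_prepare_data(void *pvt, __u64 *offset, __u64 *size)
    {
        *offset = 0;
        *size = sizeof(((struct my_dev *)pvt)->state);
        return 0;
    }

    static size_t my_read_data(void *pvt, void *buf, __u64 count, __u64 offset)
    {
        struct my_dev *d = pvt;
        memcpy(buf, d->state + offset, count);
        d->dirty = false;         /* data handed out, nothing pending */
        return count;
    }

    static size_t my_write_data(void *pvt, void *data, __u64 size)
    {
        struct my_dev *d = pvt;
        memcpy(d->state, data, size);  /* restore on the destination */
        return size;
    }

These functions would be assigned to the corresponding lm_migration_callbacks_t fields in lm_dev_info_t before creating the context.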
/**
* Device information structure, used to create the lm_ctx.
@@ -287,16 +356,36 @@ typedef struct {
int (*reset) (void *pvt);
/*
- * PCI capabilities. The user needs to only define the ID and size of each
- * capability. The actual capability is not maintained by libmuser. When a
- * capability is accessed the appropriate callback function is called.
+ * Function that is called when the guest maps a DMA region. Optional.
+ */
+ void (*map_dma) (void *pvt, uint64_t iova, uint64_t len);
+
+ /*
+ * Function that is called when the guest unmaps a DMA region. The device
+ * must release all references to that region before the callback returns.
+ * This is required if you want to be able to access guest memory.
*/
- lm_cap_t caps[LM_MAX_CAPS];
+ int (*unmap_dma) (void *pvt, uint64_t iova);
+
+ lm_trans_t trans;
/*
- * Number of capabilities in above array.
+ * Attaching to the transport is non-blocking. The library will not attempt
+ * to attach during context creation time. The caller must then manually
+ * call lm_ctx_try_attach(), which is non-blocking, as many times as
+ * necessary.
+ */
+#define LM_FLAG_ATTACH_NB (1 << 0)
+ uint64_t flags;
+
+ /*
+ * PCI capabilities.
*/
int nr_caps;
+ lm_cap_t **caps;
+
+ lm_migration_callbacks_t migration_callbacks;
+
} lm_dev_info_t;
/**
@@ -339,18 +428,49 @@ int
lm_ctx_run(lm_dev_info_t *dev_info);
/**
+ * Polls, without blocking, an lm_ctx. This is an alternative to using
+ * a thread and making a blocking call to lm_ctx_drive(). Instead, the
+ * application can periodically poll the context directly from one of
+ * its own threads.
+ *
+ * This is only allowed when LM_FLAG_ATTACH_NB is specified during creation.
+ *
+ * @lm_ctx: The libmuser context to poll
+ *
+ * @returns 0 on success, -errno on failure.
+ */
+int
+lm_ctx_poll(lm_ctx_t *lm_ctx);
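
A rough sketch of such a loop (illustrative only; do_other_work() stands in for the application's own processing):

    /* the context must have been created with LM_FLAG_ATTACH_NB set */
    while (lm_ctx_try_attach(lm_ctx) == -1) {
        if (errno != EAGAIN && errno != EWOULDBLOCK) {
            return -1;            /* real error */
        }
        do_other_work();          /* client not there yet, retry later */
    }
    for (;;) {
        int ret = lm_ctx_poll(lm_ctx);
        if (ret < 0) {
            return ret;           /* -errno */
        }
        do_other_work();
    }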
+
+/**
* Triggers an interrupt.
*
+ * libmuser takes care of using the correct IRQ type (IRQ index: INTx or MSI/X),
+ * the caller only needs to specify the sub-index.
+ *
+ * @lm_ctx: the libmuser context to trigger interrupt
+ * @subindex: vector subindex to trigger interrupt on
+ *
+ * @returns 0 on success, or -1 on failure. Sets errno.
+ */
+int
+lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
+
+/**
+ * Sends message to client to trigger an interrupt.
+ *
* libmuser takes care of using the IRQ type (INTx, MSI/X), the caller only
* needs to specify the sub-index.
+ * Unlike lm_irq_trigger(), this API triggers the interrupt by sending a
+ * message to the client.
*
* @lm_ctx: the libmuser context to trigger interrupt
* @subindex: vector subindex to trigger interrupt on
*
* @returns 0 on success, or -1 on failure. Sets errno.
*/
+
int
-lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
+lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex);
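
For example, a device that has just completed a request on MSI-X vector 3 (vector number illustrative) would signal the guest with:

    if (lm_irq_trigger(lm_ctx, 3) == -1) {
        lm_log(lm_ctx, LM_ERR, "failed to trigger MSI-X vector 3: %m");
    }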
/* Helper functions */
@@ -366,12 +486,15 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
* than can be individually mapped in the program's virtual memory. A single
* linear guest physical address span may need to be split into multiple
* scatter/gather regions due to limitations of how memory can be mapped.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @dma_addr: the guest physical address
* @len: size of memory to be mapped
* @sg: array that receives the scatter/gather entries to be mapped
* @max_sg: maximum number of elements in above array
+ * @prot: protection as defined in <sys/mman.h>
*
* @returns the number of scatter/gather entries created on success, and on
* failure:
@@ -381,12 +504,14 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
*/
int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
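
A usage sketch (the iovec-based signatures of lm_map_sg()/lm_unmap_sg() are assumed from the declarations below; buffer size and helper name are hypothetical, and <sys/mman.h> provides PROT_READ):

    static int read_guest_buffer(lm_ctx_t *lm_ctx, dma_addr_t gpa)
    {
        dma_sg_t sg[2];
        struct iovec iov[2];
        int nr = lm_addr_to_sg(lm_ctx, gpa, 512, sg, 2, PROT_READ);

        if (nr <= 0 || lm_map_sg(lm_ctx, sg, iov, nr) != 0) {
            return -1;   /* not mapped, or more than 2 segments needed */
        }
        /* iov[0..nr-1] now point into this process's address space */
        lm_unmap_sg(lm_ctx, sg, iov, nr);
        return 0;
    }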
/**
* Maps a list of scatter/gather entries from the guest's physical address space
* to the program's virtual memory. It is the caller's responsibility to remove
* the mappings by calling lm_unmap_sg.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @sg: array of scatter/gather entries returned by lm_addr_to_sg
@@ -403,6 +528,8 @@ lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
/**
* Unmaps a list of scatter/gather entries (previously mapped by lm_map_sg) from
* the program's virtual memory.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @sg: array of scatter/gather entries to unmap
@@ -426,16 +553,59 @@ lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
int
lm_get_region(loff_t pos, size_t count, loff_t *off);
+/**
+ * Read from the DMA region exposed by the client.
+ *
+ * @lm_ctx: the libmuser context
+ * @sg: a DMA segment obtained from lm_addr_to_sg
+ * @data: data buffer to read into
+ */
+int
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
+
+/**
+ * Write to the DMA region exposed by the client.
+ *
+ * @lm_ctx: the libmuser context
+ * @sg: a DMA segment obtained from lm_addr_to_sg
+ * @data: data buffer to write
+ */
+int
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
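
For transports where guest memory cannot be mmap'd, a fragment sketching how a small guest structure could be fetched (sizes illustrative; a 0-on-success return from lm_dma_read() is assumed):

    dma_sg_t sg;
    char desc[64];

    if (lm_addr_to_sg(lm_ctx, gpa, sizeof(desc), &sg, 1, PROT_READ) == 1 &&
        lm_dma_read(lm_ctx, &sg, desc) == 0) {
        /* desc now holds a copy of the guest buffer */
    }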
+
/*
* Advanced stuff.
*/
/**
- * Returns the non-standard part of the PCI configuragion space.
+ * Returns the non-standard part of the PCI configuration space.
*/
uint8_t *
lm_get_pci_non_std_config_space(lm_ctx_t *lm_ctx);
+/*
+ * Attempts to attach to the transport. LM_FLAG_ATTACH_NB must be set when
+ * creating the context. Returns 0 on success and -1 on error. If errno is set
+ * to EAGAIN or EWOULDBLOCK then the transport is not ready to attach to and the
+ * operation must be retried.
+ */
+int
+lm_ctx_try_attach(lm_ctx_t *lm_ctx);
+
+/*
+ * FIXME need to make sure that there can be at most one capability with a given
+ * ID, otherwise this function will return the first one with this ID.
+ */
+uint8_t *
+lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id);
+
+void
+lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...);
+
+/* FIXME */
+int muser_send_fds(int sock, int *fds, size_t count);
+ssize_t muser_recv_fds(int sock, int *fds, size_t count);
+
#endif /* LIB_MUSER_H */
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/muser_ctx.c b/lib/muser_ctx.c
index 0de3ac0..92155d7 100644
--- a/lib/muser_ctx.c
+++ b/lib/muser_ctx.c
@@ -47,13 +47,22 @@
#include <stdarg.h>
#include <linux/vfio.h>
#include <sys/param.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/select.h>
-#include "../kmod/muser.h"
#include "muser.h"
#include "muser_priv.h"
#include "dma.h"
#include "cap.h"
+#define MAX_FDS 8
+
+#define IOMMU_GRP_NAME "iommu_group"
+
typedef enum {
IRQ_NONE = 0,
IRQ_INTX,
@@ -61,6 +70,14 @@ typedef enum {
IRQ_MSIX,
} irq_type_t;
+char *irq_to_str[] = {
+ [LM_DEV_INTX_IRQ_IDX] = "INTx",
+ [LM_DEV_MSI_IRQ_IDX] = "MSI",
+ [LM_DEV_MSIX_IRQ_IDX] = "MSI-X",
+ [LM_DEV_ERR_IRQ_INDEX] = "ERR",
+ [LM_DEV_REQ_IRQ_INDEX] = "REQ"
+};
+
typedef struct {
irq_type_t type; /* irq type this device is using */
int err_efd; /* eventfd for irq err */
@@ -69,27 +86,517 @@ typedef struct {
int efds[0]; /* XXX must be last */
} lm_irqs_t;
-/*
- * Macro that ensures that a particular struct member is last. Doesn't work for
- * flexible array members.
- */
-#define MUST_BE_LAST(s, m, t) \
- _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \
- #t " " #m " must be last member in " #s)
+enum migration_iteration_state {
+ VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL,
+ VFIO_USER_MIGRATION_ITERATION_STATE_STARTED,
+ VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED,
+ VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED
+};
struct lm_ctx {
void *pvt;
dma_controller_t *dma;
int fd;
+ int conn_fd;
int (*reset) (void *pvt);
lm_log_lvl_t log_lvl;
lm_log_fn_t *log;
lm_pci_info_t pci_info;
lm_pci_config_space_t *pci_config_space;
+ lm_trans_t trans;
struct caps *caps;
+ uint64_t flags;
+ char *uuid;
+ void (*map_dma) (void *pvt, uint64_t iova, uint64_t len);
+ int (*unmap_dma) (void *pvt, uint64_t iova);
+
+ /* TODO there should be a void * variable to store transport-specific stuff */
+ /* LM_TRANS_SOCK */
+ char *iommu_dir;
+ int iommu_dir_fd;
+ int sock_flags;
+
+ int client_max_fds;
+
+ struct {
+ struct vfio_device_migration_info info;
+ size_t pgsize;
+ lm_migration_callbacks_t callbacks;
+ struct {
+ enum migration_iteration_state state;
+ __u64 offset;
+ __u64 size;
+ } iter;
+ } migration;
+
lm_irqs_t irqs; /* XXX must be last */
};
-MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t);
+
+
+/* function prototypes */
+static void
+free_sparse_mmap_areas(lm_reg_info_t*);
+
+static inline int recv_blocking(int sock, void *buf, size_t len, int flags)
+{
+ int f = fcntl(sock, F_GETFL, 0);
+ int ret, fret;
+
+ fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK);
+ assert(fret != -1);
+
+ ret = recv(sock, buf, len, flags);
+
+ fret = fcntl(sock, F_SETFL, f);
+ assert(fret != -1);
+
+ return ret;
+}
+
+static int
+init_sock(lm_ctx_t *lm_ctx)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ int ret, unix_sock;
+ mode_t mode;
+
+ assert(lm_ctx != NULL);
+
+ lm_ctx->iommu_dir = strdup(lm_ctx->uuid);
+ if (!lm_ctx->iommu_dir) {
+ return -ENOMEM;
+ }
+
+ /* FIXME SPDK can't easily run as non-root */
+ mode = umask(0000);
+
+ if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ ret = errno;
+ goto out;
+ }
+
+ if (lm_ctx->flags & LM_FLAG_ATTACH_NB) {
+ ret = fcntl(unix_sock, F_SETFL,
+ fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK);
+ if (ret < 0) {
+ ret = errno;
+ goto close_unix_sock;
+ }
+ lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL;
+ } else {
+ lm_ctx->sock_flags = 0;
+ }
+
+ lm_ctx->iommu_dir_fd = open(lm_ctx->iommu_dir, O_DIRECTORY);
+ if (lm_ctx->iommu_dir_fd < 0) {
+ ret = errno;
+ goto close_unix_sock;
+ }
+
+ ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s/" MUSER_SOCK,
+ lm_ctx->iommu_dir);
+ if (ret >= (int)sizeof addr.sun_path) {
+ ret = ENAMETOOLONG;
+ goto close_iommu_dir_fd;
+ }
+ if (ret < 0) {
+ ret = EINVAL; /* keep the positive-errno convention used at out: */
+ goto close_iommu_dir_fd;
+ }
+
+ /* bind and start listening */
+ ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr));
+ if (ret < 0) {
+ ret = errno;
+ goto close_iommu_dir_fd;
+ }
+
+ ret = listen(unix_sock, 0);
+ if (ret < 0) {
+ ret = errno;
+ goto close_iommu_dir_fd;
+ }
+
+ umask(mode);
+ return unix_sock;
+
+close_iommu_dir_fd:
+ close(lm_ctx->iommu_dir_fd);
+close_unix_sock:
+ close(unix_sock);
+out:
+ return -ret;
+}
+
+static void
+__free_s(char **p)
+{
+ free(*p);
+}
+
+int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count)
+{
+ int ret;
+ struct vfio_user_header hdr = {.msg_id = msg_id};
+ struct msghdr msg;
+ size_t i;
+
+ if (nr_iovecs == 0) {
+ iovecs = alloca(sizeof(*iovecs));
+ nr_iovecs = 1;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+
+ if (is_reply) {
+ hdr.flags.type = VFIO_USER_F_TYPE_REPLY;
+ } else {
+ hdr.cmd = cmd;
+ hdr.flags.type = VFIO_USER_F_TYPE_COMMAND;
+ }
+
+ iovecs[0].iov_base = &hdr;
+ iovecs[0].iov_len = sizeof(hdr);
+
+ for (i = 0; i < nr_iovecs; i++) {
+ hdr.msg_size += iovecs[i].iov_len;
+ }
+
+ msg.msg_iovlen = nr_iovecs;
+ msg.msg_iov = iovecs;
+
+ if (fds != NULL) {
+ size_t size = count * sizeof *fds;
+ char *buf = alloca(CMSG_SPACE(size));
+
+ msg.msg_control = buf;
+ msg.msg_controllen = CMSG_SPACE(size);
+
+ struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), fds, size);
+ }
+
+ ret = sendmsg(sock, &msg, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count) {
+
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = data,
+ .iov_len = data_len
+ }
+ };
+ return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs,
+ ARRAY_SIZE(iovecs), fds, count);
+}
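
As a usage sketch, a handler acknowledging a command it has just processed, with no payload and no file descriptors, would reply along these lines (names taken from the surrounding code; nothing here is part of the patch):

    ret = send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true /* is_reply */,
                             hdr.cmd, NULL, 0, NULL, 0);

Note that for replies the cmd argument is currently ignored, since _send_vfio_user_msg() only sets hdr.cmd for commands.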
+
+int
+send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
+ char *caps)
+{
+ int ret;
+ char *data;
+
+ ret = asprintf(&data,
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}",
+ major, minor, caps != NULL ? caps : "{}");
+ if (ret == -1) {
+ return -1;
+ }
+ ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data,
+ ret, NULL, 0);
+ free(data);
+ return ret;
+}
+
+int
+recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void *data, size_t *len)
+{
+ int ret;
+
+ ret = recv_blocking(sock, hdr, sizeof(*hdr), 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if (ret < (int)sizeof(*hdr)) {
+ return -EINVAL;
+ }
+
+ if (is_reply) {
+ if (hdr->msg_id != *msg_id) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.error == 1U) {
+ if (hdr->error_no <= 0) {
+ hdr->error_no = EINVAL;
+ }
+ return -hdr->error_no;
+ }
+ } else {
+ if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) {
+ return -EINVAL;
+ }
+ *msg_id = hdr->msg_id;
+ }
+
+ if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) {
+ ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len),
+ 0);
+ if (ret < 0) {
+ return ret;
+ }
+ if (*len != (size_t)ret) { /* FIXME we should allow receiving less */
+ return -EINVAL;
+ }
+ *len = ret;
+ }
+ return 0;
+}
+
+int
+recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
+ int *max_fds, size_t *pgsize)
+{
+ int ret;
+ struct vfio_user_header hdr;
+ char *data __attribute__((__cleanup__(__free_s))) = NULL;
+
+ ret = recv_vfio_user_msg(sock, &hdr, is_reply, msg_id, NULL, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (hdr.msg_size < sizeof(hdr)) {
+ return -EINVAL;
+ }
+ hdr.msg_size -= sizeof(hdr);
+ data = malloc(hdr.msg_size);
+ if (data == NULL) {
+ return -errno;
+ }
+ ret = recv_blocking(sock, data, hdr.msg_size, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if (ret < (int)hdr.msg_size) {
+ return -EINVAL;
+ }
+
+ /* FIXME use proper parsing */
+ ret = sscanf(data,
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}",
+ major, minor, max_fds, pgsize);
+ if (ret != 4) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs,
+ send_fds, fd_count);
+ if (ret < 0) {
+ return ret;
+ }
+ if (hdr == NULL) {
+ hdr = alloca(sizeof *hdr);
+ }
+ return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len);
+}
+
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = send_data,
+ .iov_len = send_len
+ }
+ };
+ return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs,
+ ARRAY_SIZE(iovecs), send_fds, fd_count,
+ hdr, recv_data, recv_len);
+}
+
+static int
+set_version(lm_ctx_t *lm_ctx, int sock)
+{
+ int ret;
+ int client_mj, client_mn;
+ uint16_t msg_id = 0;
+ char *server_caps;
+
+ ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}",
+ MAX_FDS, sysconf(_SC_PAGESIZE));
+ if (ret == -1) {
+ return -ENOMEM;
+ }
+
+ ret = send_version(sock, LIB_MUSER_VFIO_USER_VERS_MJ,
+ LIB_MUSER_VFIO_USER_VERS_MN, msg_id, false, server_caps);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_DBG, "failed to send version: %s", strerror(-ret));
+ goto out;
+ }
+
+ ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true,
+ &lm_ctx->client_max_fds, &lm_ctx->migration.pgsize);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret));
+ goto out;
+ }
+ if (client_mj != LIB_MUSER_VFIO_USER_VERS_MJ ||
+ client_mn != LIB_MUSER_VFIO_USER_VERS_MN) {
+ lm_log(lm_ctx, LM_DBG, "version mismatch, server=%d.%d, client=%d.%d",
+ LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN,
+ client_mj, client_mn);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (lm_ctx->migration.pgsize == 0) {
+ lm_log(lm_ctx, LM_ERR, "bad migration page size");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* FIXME need to check max_fds */
+
+ lm_ctx->migration.pgsize = MIN(lm_ctx->migration.pgsize,
+ sysconf(_SC_PAGESIZE));
+out:
+ free(server_caps);
+ return ret;
+}
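
To make the negotiation concrete: with the defaults above, on a system with 4 KiB pages the server's VFIO_USER_VERSION payload is the string

    {version: {"major": 0, "minor": 1}, capabilities: {max_fds: 8, migration: {pgsize: 4096}}}

and recv_version() parses the client's reply with the matching sscanf() format (hence the FIXME about using a proper parser).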
+
+/**
+ * Accepts a connection on the UNIX domain socket created by init_sock() and
+ * performs version negotiation with the client.
+ *
+ * lm_ctx: libmuser context
+ */
+static int
+open_sock(lm_ctx_t *lm_ctx)
+{
+ int ret;
+ int conn_fd;
+
+ assert(lm_ctx != NULL);
+
+ conn_fd = accept(lm_ctx->fd, NULL, NULL);
+ if (conn_fd == -1) {
+ return conn_fd;
+ }
+
+ /* send version and caps */
+ ret = set_version(lm_ctx, conn_fd);
+ if (ret < 0) {
+ close(conn_fd);
+ return ret;
+ }
+
+ lm_ctx->conn_fd = conn_fd;
+ return conn_fd;
+}
+
+static int
+close_sock(lm_ctx_t *lm_ctx)
+{
+ return close(lm_ctx->conn_fd);
+}
+
+static int
+get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ int *fds, int *nr_fds)
+{
+ int ret;
+ struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr};
+ struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1};
+ struct cmsghdr *cmsg;
+
+ msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds);
+ msg.msg_control = alloca(msg.msg_controllen);
+
+ /*
+ * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is
+ * faster (?). I tried that and got short reads, so we would need to store
+ * the partially received buffer somewhere and retry.
+ */
+ ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+ if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
+ continue;
+ }
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+ return -EINVAL;
+ }
+ int size = cmsg->cmsg_len - CMSG_LEN(0);
+ if (size % sizeof(int) != 0) {
+ return -EINVAL;
+ }
+ *nr_fds = (int)(size / sizeof(int));
+ memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int));
+ break;
+ }
+
+ return ret;
+}
+
+static ssize_t
+recv_fds_sock(lm_ctx_t *lm_ctx, void *buf, size_t size)
+{
+ ssize_t ret = muser_recv_fds(lm_ctx->conn_fd, buf, size / sizeof(int));
+ if (ret < 0) {
+ return ret;
+ }
+ return ret * sizeof(int);
+}
+
+static struct transport_ops {
+ int (*init)(lm_ctx_t*);
+ int (*attach)(lm_ctx_t*);
+ int (*detach)(lm_ctx_t*);
+ int (*get_request)(lm_ctx_t*, struct vfio_user_header*, int *fds, int *nr_fds);
+ ssize_t (*recv_fds)(lm_ctx_t*, void *buf, size_t size);
+} transports_ops[] = {
+ [LM_TRANS_SOCK] = {
+ .init = init_sock,
+ .attach = open_sock,
+ .detach = close_sock,
+ .recv_fds = recv_fds_sock,
+ .get_request = get_request_sock,
+ }
+};
#define LM2VFIO_IRQT(type) (type - 1)
@@ -98,6 +605,7 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...)
{
va_list ap;
char buf[BUFSIZ];
+ int _errno = errno;
assert(lm_ctx != NULL);
@@ -108,7 +616,8 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...)
va_start(ap, fmt);
vsnprintf(buf, sizeof buf, fmt, ap);
va_end(ap);
- lm_ctx->log(lm_ctx->pvt, buf);
+ lm_ctx->log(lm_ctx->pvt, lvl, buf);
+ errno = _errno;
}
static const char *
@@ -137,11 +646,14 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index)
case VFIO_PCI_INTX_IRQ_INDEX:
case VFIO_PCI_MSI_IRQ_INDEX:
case VFIO_PCI_MSIX_IRQ_INDEX:
- lm_log(lm_ctx, LM_DBG, "disabling IRQ %s\n", vfio_irq_idx_to_str(index));
+ lm_log(lm_ctx, LM_DBG, "disabling IRQ %s", vfio_irq_idx_to_str(index));
lm_ctx->irqs.type = IRQ_NONE;
for (i = 0; i < lm_ctx->irqs.max_ivs; i++) {
if (lm_ctx->irqs.efds[i] >= 0) {
- (void)close(lm_ctx->irqs.efds[i]);
+ if (close(lm_ctx->irqs.efds[i]) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m",
+ lm_ctx->irqs.efds[i]);
+ }
lm_ctx->irqs.efds[i] = -1;
}
}
@@ -155,12 +667,17 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index)
}
if (irq_efd != NULL) {
- (void)close(*irq_efd);
- *irq_efd = -1;
+ if (*irq_efd != -1) {
+ if (close(*irq_efd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m",
+ *irq_efd);
+ }
+ *irq_efd = -1;
+ }
return 0;
}
- lm_log(lm_ctx, LM_DBG, "failed to disable IRQs\n");
+ lm_log(lm_ctx, LM_DBG, "failed to disable IRQs");
return -EINVAL;
}
@@ -178,9 +695,8 @@ irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
val = 1;
ret = eventfd_write(efd, val);
if (ret == -1) {
- ret = -errno;
- lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m\n");
- return ret;
+ lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m");
+ return -errno;
}
}
}
@@ -206,9 +722,8 @@ irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
val = 1;
ret = eventfd_write(efd, val);
if (ret == -1) {
- ret = -errno;
- lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m\n");
- return ret;
+ lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m");
+ return -errno;
}
}
}
@@ -228,13 +743,16 @@ irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data
i++, d32++) {
efd = lm_ctx->irqs.efds[i];
if (efd >= 0) {
- (void) close(efd);
+ if (close(efd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", efd);
+ }
+
lm_ctx->irqs.efds[i] = -1;
}
if (*d32 >= 0) {
lm_ctx->irqs.efds[i] = *d32;
}
- lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d\n", i, lm_ctx->irqs.efds[i]);
+ lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d", i, lm_ctx->irqs.efds[i]);
}
return 0;
@@ -252,7 +770,7 @@ irqs_trigger(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
return irqs_disable(lm_ctx, irq_set->index);
}
- lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=0x%x\n",
+ lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=%#lx",
vfio_irq_idx_to_str(irq_set->index), irq_set->flags);
switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
@@ -334,6 +852,17 @@ dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
return 0;
}
+static int
+device_reset(lm_ctx_t *lm_ctx)
+{
+ lm_log(lm_ctx, LM_DBG, "Device reset called by client");
+ if (lm_ctx->reset != NULL) {
+ return lm_ctx->reset(lm_ctx->pvt);
+ }
+
+ return 0;
+}
+
static long
dev_set_irqs(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
{
@@ -368,7 +897,8 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
// Ensure provided argsz is sufficiently big and index is within bounds.
if ((irq_info->argsz < sizeof(struct vfio_irq_info)) ||
(irq_info->index >= LM_DEV_NUM_IRQS)) {
- lm_log(lm_ctx, LM_DBG, "bad irq_info\n");
+ lm_log(lm_ctx, LM_DBG, "bad irq_info (size=%d index=%d)\n",
+ irq_info->argsz, irq_info->index);
return -EINVAL;
}
@@ -380,66 +910,94 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
/*
* Populate the sparse mmap capability information to vfio-client.
- * kernel/muser constructs the response for VFIO_DEVICE_GET_REGION_INFO
- * accommodating sparse mmap information.
* Sparse mmap information follows struct vfio_region_info and cap_offset
* points to it.
*/
static int
-dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg,
- struct vfio_region_info *vfio_reg)
+dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
+ struct vfio_region_info **vfio_reg)
{
+ struct vfio_info_cap_header *header;
+ struct vfio_region_info_cap_type *type = NULL;
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct lm_sparse_mmap_areas *mmap_areas;
int nr_mmap_areas, i;
- size_t size;
- ssize_t ret;
-
- if (lm_reg->mmap_areas == NULL)
- return -EINVAL;
+ size_t type_size = 0;
+ size_t sparse_size = 0;
+ size_t cap_size;
+ void *cap_ptr;
+ struct vfio_region_info *tmp;
- nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
- size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type_size = sizeof(struct vfio_region_info_cap_type);
+ }
- /*
- * If vfio_reg does not have enough space to accommodate sparse info then
- * set the argsz with the expected size and return. Vfio client will call
- * back after reallocating the vfio_reg
- */
+ if (lm_reg->mmap_areas != NULL) {
+ nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
+ sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
+ }
- if (vfio_reg->argsz < size + sizeof(*vfio_reg)) {
- vfio_reg->argsz = size + sizeof(*vfio_reg);
- vfio_reg->cap_offset = 0;
+ cap_size = type_size + sparse_size;
+ if (cap_size == 0) {
return 0;
}
- lm_log(lm_ctx, LM_DBG, "%s: size %llu, nr_mmap_areas %u\n", __func__, size,
- nr_mmap_areas);
- sparse = calloc(1, size);
- if (sparse == NULL)
+ /* TODO doesn't need to be calloc, we overwrite it entirely */
+ header = calloc(1, cap_size);
+ if (header == NULL) {
return -ENOMEM;
- sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->header.version = 1;
- sparse->header.next = 0;
- sparse->nr_areas = nr_mmap_areas;
+ }
+
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type = (struct vfio_region_info_cap_type*)header;
+ type->header.id = VFIO_REGION_INFO_CAP_TYPE;
+ type->header.version = 1;
+ type->header.next = 0;
+ type->type = VFIO_REGION_TYPE_MIGRATION;
+ type->subtype = VFIO_REGION_SUBTYPE_MIGRATION;
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
+ }
- mmap_areas = lm_reg->mmap_areas;
- for (i = 0; i < nr_mmap_areas; i++) {
- sparse->areas[i].offset = mmap_areas->areas[i].start;
- sparse->areas[i].size = mmap_areas->areas[i].size;
+ if (lm_reg->mmap_areas != NULL) {
+ if (type != NULL) {
+ type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1);
+ } else {
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)header;
+ }
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->header.next = 0;
+ sparse->nr_areas = nr_mmap_areas;
+
+ lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__,
+ sparse_size, nr_mmap_areas);
+ mmap_areas = lm_reg->mmap_areas;
+ for (i = 0; i < nr_mmap_areas; i++) {
+ sparse->areas[i].offset = mmap_areas->areas[i].start;
+ sparse->areas[i].size = mmap_areas->areas[i].size;
+ lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__,
+ i, sparse->areas[i].offset, sparse->areas[i].size);
+ }
}
- /* write the sparse mmap cap info to vfio-client user pages */
- ret = write(lm_ctx->fd, sparse, size);
- if (ret != (ssize_t)size) {
- free(sparse);
- return -EIO;
+ /*
+ * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is
+ * memory-mappable in general, not only if it supports sparse mmap.
+ */
+ (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
+
+ (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
+ tmp = realloc(*vfio_reg, (*vfio_reg)->argsz);
+ if (tmp == NULL) {
+ free(header);
+ return -ENOMEM;
}
- vfio_reg->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
- vfio_reg->cap_offset = sizeof(*vfio_reg);
+ *vfio_reg = tmp;
+ cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset;
+ memcpy(cap_ptr, header, cap_size);
- free(sparse);
+ free(header);
return 0;
}
@@ -458,42 +1016,73 @@ offset_to_region(uint64_t offset)
return (offset >> LM_REGION_SHIFT) & LM_REGION_MASK;
}
+#ifdef LM_VERBOSE_LOGGING
+void
+dump_buffer(const char *prefix, const char *buf, uint32_t count)
+{
+ int i;
+ const size_t bytes_per_line = 0x8;
+
+ if (strcmp(prefix, "")) {
+ fprintf(stderr, "%s\n", prefix);
+ }
+ for (i = 0; i < (int)count; i++) {
+ if (i % bytes_per_line != 0) {
+ fprintf(stderr, " ");
+ }
+ /* TODO valgrind emits a warning if count is 1 */
+ fprintf(stderr,"0x%02x", *(buf + i));
+ if ((i + 1) % bytes_per_line == 0) {
+ fprintf(stderr, "\n");
+ }
+ }
+ if (i % bytes_per_line != 0) {
+ fprintf(stderr, "\n");
+ }
+}
+#else
+#define dump_buffer(prefix, buf, count)
+#endif
+
static long
-dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info *vfio_reg)
+dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg)
{
lm_reg_info_t *lm_reg;
int err;
assert(lm_ctx != NULL);
- assert(vfio_reg != NULL);
- lm_reg = &lm_ctx->pci_info.reg_info[vfio_reg->index];
+ assert(*vfio_reg != NULL);
+ lm_reg = &lm_ctx->pci_info.reg_info[(*vfio_reg)->index];
// Ensure provided argsz is sufficiently big and index is within bounds.
- if ((vfio_reg->argsz < sizeof(struct vfio_region_info)) ||
- (vfio_reg->index >= LM_DEV_NUM_REGS)) {
+ if (((*vfio_reg)->argsz < sizeof(struct vfio_region_info)) ||
+ ((*vfio_reg)->index >= LM_DEV_NUM_REGS)) {
+ lm_log(lm_ctx, LM_DBG, "bad args argsz=%d index=%d",
+ (*vfio_reg)->argsz, (*vfio_reg)->index);
return -EINVAL;
}
- vfio_reg->offset = region_to_offset(vfio_reg->index);
- vfio_reg->flags = lm_reg->flags;
- vfio_reg->size = lm_reg->size;
+ (*vfio_reg)->offset = region_to_offset((*vfio_reg)->index);
+ (*vfio_reg)->flags = lm_reg->flags;
+ (*vfio_reg)->size = lm_reg->size;
- if (lm_reg->mmap_areas != NULL) {
- err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg);
- if (err) {
- return err;
- }
+ err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg);
+ if (err) {
+ return err;
}
- lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", vfio_reg->index);
- dump_buffer(lm_ctx, "", (char*)vfio_reg, sizeof *vfio_reg);
+ lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu "
+ "argsz %llu",
+ (*vfio_reg)->index, (*vfio_reg)->offset, (*vfio_reg)->flags,
+ (*vfio_reg)->size, (*vfio_reg)->argsz);
return 0;
}
static long
-dev_get_info(struct vfio_device_info *dev_info)
+dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info)
{
+ assert(lm_ctx != NULL);
assert(dev_info != NULL);
// Ensure provided argsz is sufficiently big.
@@ -508,173 +1097,81 @@ dev_get_info(struct vfio_device_info *dev_info)
return 0;
}
-static long
-do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
-{
- int err = -ENOTSUP;
-
- assert(lm_ctx != NULL);
- switch (cmd_ioctl->vfio_cmd) {
- case VFIO_DEVICE_GET_INFO:
- err = dev_get_info(&cmd_ioctl->data.dev_info);
- break;
- case VFIO_DEVICE_GET_REGION_INFO:
- err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info);
- break;
- case VFIO_DEVICE_GET_IRQ_INFO:
- err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info);
- break;
- case VFIO_DEVICE_SET_IRQS:
- err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data);
- break;
- case VFIO_DEVICE_RESET:
- if (lm_ctx->reset != NULL) {
- return lm_ctx->reset(lm_ctx->pvt);
- }
- lm_log(lm_ctx, LM_DBG, "reset called but not reset function present\n");
- break;
- }
-
- return err;
-}
-
-static void
-get_path_from_fd(lm_ctx_t *lm_ctx, int fd, char *buf)
-{
- int err;
- ssize_t ret;
- char pathname[PATH_MAX];
-
- err = snprintf(pathname, PATH_MAX, "/proc/self/fd/%d", fd);
- if (err >= PATH_MAX || err == -1) {
- buf[0] = '\0';
- }
- ret = readlink(pathname, buf, PATH_MAX);
- if (ret == -1) {
- lm_log(lm_ctx, LM_DBG, "failed to readlink %s: %m\n", pathname);
- ret = 0;
- } else if (ret == PATH_MAX) {
- lm_log(lm_ctx, LM_DBG, "failed to readlink %s, output truncated\n",
- pathname);
- ret -= 1;
- }
- buf[ret] = '\0';
-}
-
-static int
-muser_dma_unmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
- char buf[PATH_MAX];
-
- get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf);
-
- lm_log(lm_ctx, LM_INF, "removing DMA region fd=%d path=%s %#lx-%#lx\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len);
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_remove_region(lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- cmd->mmap.request.fd);
- if (err != 0) {
- lm_log(lm_ctx, LM_ERR, "failed to remove DMA region fd=%d path=%s %#lx-%#lx: %s\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- strerror(err));
- }
-
- return err;
-}
-
-static int
-muser_dma_map(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
- char buf[PATH_MAX];
-
- get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf);
-
- lm_log(lm_ctx, LM_INF, "adding DMA region fd=%d path=%s iova=%#lx-%#lx offset=%#lx\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- cmd->mmap.request.offset);
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_add_region(lm_ctx, lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- cmd->mmap.request.fd,
- cmd->mmap.request.offset);
- if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to add DMA region fd=%d path=%s %#lx-%#lx: %d\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len, err);
- }
-
- return 0;
+int
+muser_send_fds(int sock, int *fds, size_t count) {
+ struct msghdr msg = { 0 };
+ size_t size = count * sizeof *fds;
+ char buf[CMSG_SPACE(size)];
+ memset(buf, '\0', sizeof(buf));
+
+ /* XXX requires at least one byte */
+ struct iovec io = { .iov_base = "\0", .iov_len = 1 };
+
+ msg.msg_iov = &io;
+ msg.msg_iovlen = 1;
+ msg.msg_control = buf;
+ msg.msg_controllen = sizeof(buf);
+
+ struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), fds, size);
+ msg.msg_controllen = CMSG_SPACE(size);
+ return sendmsg(sock, &msg, 0);
}
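
A sketch of how this helper pairs with muser_recv_fds() across the socket (eventfd(2) creation and socket setup elided; all values illustrative):

    /* sender: pass two eventfds to the peer */
    int efds[2] = { eventfd(0, 0), eventfd(0, 0) };
    muser_send_fds(sock, efds, 2);

    /* receiver: fds[] is filled in; n is the number of fds received */
    int fds[MAX_FDS];
    ssize_t n = muser_recv_fds(sock, fds, MAX_FDS);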
-/*
- * Callback that is executed when device memory is to be mmap'd.
- */
-static int
-muser_mmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
+ssize_t
+muser_recv_fds(int sock, int *fds, size_t count)
{
- int region, err = 0;
- unsigned long addr;
- unsigned long len = cmd->mmap.request.len;
- loff_t offset = cmd->mmap.request.addr;
+ int ret;
+ struct cmsghdr *cmsg;
+ size_t fds_size;
+ char msg_buf[sysconf(_SC_PAGESIZE)];
+ struct iovec io = {.iov_base = msg_buf, .iov_len = sizeof(msg_buf)};
+ char cmsg_buf[sysconf(_SC_PAGESIZE)];
+ struct msghdr msg = {
+ .msg_iov = &io,
+ .msg_iovlen = 1,
+ .msg_control = cmsg_buf,
+ .msg_controllen = sizeof(cmsg_buf)
+ };
- region = lm_get_region(offset, len, &offset);
- if (region < 0) {
- lm_log(lm_ctx, LM_ERR, "bad region %d\n", region);
- err = EINVAL;
- goto out;
+ if (fds == NULL || count <= 0) {
+ errno = EINVAL;
+ return -1;
}
- if (lm_ctx->pci_info.reg_info[region].map == NULL) {
- lm_log(lm_ctx, LM_ERR, "region not mmapable\n");
- err = ENOTSUP;
- goto out;
+ ret = recvmsg(sock, &msg, 0);
+ if (ret == -1) {
+ return ret;
}
- addr = lm_ctx->pci_info.reg_info[region].map(lm_ctx->pvt, offset, len);
- if ((void *)addr == MAP_FAILED) {
- err = errno;
- lm_log(lm_ctx, LM_ERR, "failed to mmap: %m\n");
- goto out;
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (cmsg == NULL) {
+ errno = EINVAL;
+ return -1;
}
- cmd->mmap.response = addr;
-
-out:
- if (err != 0) {
- lm_log(lm_ctx, LM_ERR, "failed to mmap device memory %#x-%#lx: %s\n",
- offset, offset + len, strerror(err));
+ fds_size = cmsg->cmsg_len - CMSG_LEN(0);
+ if ((fds_size % sizeof(int)) != 0 || fds_size / sizeof (int) > count) {
+ errno = EINVAL;
+ return -1;
}
+ memcpy(fds, CMSG_DATA(cmsg), fds_size);
- return -err;
+ return fds_size / sizeof(int);
}
/*
- * Returns the number of bytes communicated to the kernel (may be less than
- * ret), or a negative number on error.
+ * Returns the number of bytes sent (may be less than ret), or a negative
+ * number on error.
*/
static int
post_read(lm_ctx_t *lm_ctx, char *rwbuf, ssize_t count)
{
ssize_t ret;
- ret = write(lm_ctx->fd, rwbuf, count);
+ ret = write(lm_ctx->conn_fd, rwbuf, count);
if (ret != count) {
lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %lu/%lu, %s\n",
__func__, ret, count, strerror(errno));
@@ -719,17 +1216,274 @@ handle_pci_config_space_access(lm_ctx_t *lm_ctx, char *buf, size_t count,
int ret;
count = MIN(pci_config_space_size(lm_ctx), count);
- ret = cap_maybe_access(lm_ctx->caps, lm_ctx->pvt, buf, count, pos, is_write);
+ if (is_write) {
+ ret = cap_maybe_access(lm_ctx, lm_ctx->caps, buf, count, pos);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count,
+ pos);
+ return ret;
+ }
+ } else {
+ memcpy(buf, lm_ctx->pci_config_space->raw + pos, count);
+ }
+ return count;
+}
+
+/* valid migration state transitions */
+__u32 migration_states[VFIO_DEVICE_STATE_MASK] = {
+ [VFIO_DEVICE_STATE_STOP] = 1 << VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = /* running */
+ (1 << VFIO_DEVICE_STATE_STOP) |
+ (1 << VFIO_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_DEVICE_STATE_SAVING) |
+ (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) |
+ (1 << VFIO_DEVICE_STATE_RESUMING),
+ [VFIO_DEVICE_STATE_SAVING] = /* stop-and-copy */
+ (1 << VFIO_DEVICE_STATE_STOP) |
+ (1 << VFIO_DEVICE_STATE_SAVING),
+ [VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING] = /* pre-copy */
+ (1 << VFIO_DEVICE_STATE_SAVING) |
+ (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)),
+ [VFIO_DEVICE_STATE_RESUMING] = /* resuming */
+ (1 << VFIO_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_DEVICE_STATE_RESUMING)
+};
+
+static bool
+_migration_state_transition_is_valid(__u32 from, __u32 to)
+{
+ return migration_states[from] & (1 << to);
+}
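
For example, per this table a device in pre-copy (RUNNING | SAVING) may move to stop-and-copy (SAVING) or remain in pre-copy, but may not go back to plain RUNNING (illustrative asserts, not part of the patch):

    assert(_migration_state_transition_is_valid(
               VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING,
               VFIO_DEVICE_STATE_SAVING));
    assert(!_migration_state_transition_is_valid(
               VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING,
               VFIO_DEVICE_STATE_RUNNING));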
+
+static ssize_t
+handle_migration_device_state(lm_ctx_t *lm_ctx, __u32 *device_state,
+ bool is_write) {
+
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(device_state != NULL);
+
+ if (!is_write) {
+ *device_state = lm_ctx->migration.info.device_state;
+ return 0;
+ }
+
+ if (*device_state & ~VFIO_DEVICE_STATE_MASK) {
+ return -EINVAL;
+ }
+
+ if (!_migration_state_transition_is_valid(lm_ctx->migration.info.device_state,
+ *device_state)) {
+ return -EINVAL;
+ }
+
+ switch (*device_state) {
+ case VFIO_DEVICE_STATE_STOP:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_STOP);
+ break;
+ case VFIO_DEVICE_STATE_RUNNING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_START);
+ break;
+ case VFIO_DEVICE_STATE_SAVING:
+ /*
+ * FIXME How should the device operate during the stop-and-copy
+ * phase? Should we only allow the migration data to be read from
+ * the migration region? E.g. Access to any other region should be
+ * failed? This might be a good question to send to LKML.
+ */
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_STOP_AND_COPY);
+ break;
+ case VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_PRE_COPY);
+ break;
+ case VFIO_DEVICE_STATE_RESUMING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_RESUME);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret == 0) {
+ lm_ctx->migration.info.device_state = *device_state;
+ }
+
+ return ret;
+}
+
+static ssize_t
+handle_migration_pending_bytes(lm_ctx_t *lm_ctx, __u64 *pending_bytes,
+ bool is_write)
+{
+ assert(lm_ctx != NULL);
+ assert(pending_bytes != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ if (lm_ctx->migration.iter.state == VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED) {
+ *pending_bytes = 0;
+ return 0;
+ }
+
+ *pending_bytes = lm_ctx->migration.callbacks.get_pending_bytes(lm_ctx->pvt);
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL:
+ case VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED:
+ /*
+ * FIXME what happens if data haven't been consumed in the previous
+ * iteration? Ask on LKML.
+ */
+ if (*pending_bytes == 0) {
+ lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED;
+ } else {
+ lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_STARTED;
+ }
+ break;
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ /*
+ * Repeated reads of pending_bytes should not have any side effects.
+ * FIXME does it have to be the same as the previous value? Can it
+ * increase or even decrease? I suppose it can't be lower than
+ * data_size? Ask on LKML.
+ */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t
+handle_migration_data_offset(lm_ctx_t *lm_ctx, __u64 *offset, bool is_write)
+{
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(offset != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ break;
+ default:
+ /*
+ * FIXME it's not clear whether these registers can be accessed in
+ * other parts of the iteration, need clarification on the
+ * following:
+ *
+ * Read on data_offset and data_size should return the offset and
+ * size of the current buffer if the user application reads
+ * data_offset and data_size more than once here.
+ */
+ return -EINVAL;
+ }
+
+ ret = lm_ctx->migration.callbacks.prepare_data(lm_ctx->pvt,
+ &lm_ctx->migration.iter.offset,
+ &lm_ctx->migration.iter.size);
if (ret < 0) {
- lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count,
- pos);
return ret;
}
- return count;
+
+ *offset = lm_ctx->migration.iter.offset + sizeof(struct vfio_device_migration_info);
+
+ return ret;
+}
+
+static ssize_t
+handle_migration_data_size(lm_ctx_t *lm_ctx, __u64 *size, bool is_write)
+{
+ assert(lm_ctx != NULL);
+ assert(size != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ break;
+ default:
+ /* FIXME see comment in handle_migration_data_offset */
+ return -EINVAL;
+ }
+
+ *size = lm_ctx->migration.iter.size;
+
+ return 0;
}
static ssize_t
-do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
+handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count,
+ loff_t pos, bool is_write)
+{
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(buf != NULL);
+
+ if (pos + count > lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size) {
+ lm_log(lm_ctx, LM_ERR, "read %#x-%#x past end of migration region",
+ pos, pos + count - 1);
+ return -EINVAL;
+ }
+ switch (pos) {
+ case offsetof(struct vfio_device_migration_info, device_state):
+ if (count != sizeof(lm_ctx->migration.info.device_state)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_device_state(lm_ctx, (__u32*)buf,
+ is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, pending_bytes):
+ if (count != sizeof(lm_ctx->migration.info.pending_bytes)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_pending_bytes(lm_ctx, (__u64*)buf, is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, data_offset):
+ if (count != sizeof(lm_ctx->migration.info.data_offset)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_data_offset(lm_ctx, (__u64*)buf, is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, data_size):
+ if (count != sizeof(lm_ctx->migration.info.data_size)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_data_size(lm_ctx, (__u64*)buf, is_write);
+ break;
+ default:
+ if (is_write) {
+ /* FIXME how do we handle the offset? */
+ ret = lm_ctx->migration.callbacks.write_data(lm_ctx->pvt,
+ buf, count);
+ } else {
+ ret = lm_ctx->migration.callbacks.read_data(lm_ctx->pvt,
+ buf, count,
+ pos - sizeof(struct vfio_device_migration_info));
+ }
+ }
+
+ if (ret == 0) {
+ ret = count;
+ }
+ return ret;
+}
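
Putting the register handlers above together, the save-side sequence a client is expected to drive looks roughly like this (client-side sketch; read_mig() is a hypothetical helper issuing VFIO_USER_REGION_READ at the given offset within the migration region):

    __u64 pending, data_off, data_size;
    char buf[65536];                    /* illustrative; bounds checks elided */

    for (;;) {
        read_mig(offsetof(struct vfio_device_migration_info, pending_bytes),
                 sizeof(pending), &pending);
        if (pending == 0) {
            break;                      /* iteration finished */
        }
        /* reading data_offset makes the server call prepare_data() */
        read_mig(offsetof(struct vfio_device_migration_info, data_offset),
                 sizeof(data_off), &data_off);
        read_mig(offsetof(struct vfio_device_migration_info, data_size),
                 sizeof(data_size), &data_size);
        read_mig(data_off, data_size, buf);   /* served by read_data() */
    }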
+
+static ssize_t
+do_access(lm_ctx_t *lm_ctx, char *buf, uint8_t count, uint64_t pos, bool is_write)
{
int idx;
loff_t offset;
@@ -737,7 +1491,7 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
assert(lm_ctx != NULL);
assert(buf != NULL);
- assert(count > 0);
+ assert(count == 1 || count == 2 || count == 4 || count == 8);
pci_info = &lm_ctx->pci_info;
idx = lm_get_region(pos, count, &offset);
@@ -756,6 +1510,11 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
is_write);
}
+ if (idx == LM_DEV_MIGRATION_REG_IDX) {
+ return handle_migration_region_access(lm_ctx, buf, count, offset,
+ is_write);
+ }
+
/*
* Checking whether a callback exists might sound expensive however this
* code is not performance critical. This works well when we don't expect a
@@ -777,12 +1536,15 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
* error.
*
* TODO function name same as lm_access_t, fix
+ * FIXME we must be able to return values up to UINT32_MAX, or negative on
+ * error. Better to make the return value an int and return the number of
+ * bytes processed via an argument.
*/
ssize_t
-lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
+lm_access(lm_ctx_t *lm_ctx, char *buf, uint32_t count, uint64_t *ppos,
bool is_write)
{
- unsigned int done = 0;
+ uint32_t done = 0;
int ret;
assert(lm_ctx != NULL);
@@ -792,7 +1554,10 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
size_t size;
/*
* Limit accesses to qword and enforce alignment. Figure out whether
- * the PCI spec requires this.
+ * the PCI spec requires this.
+ * FIXME while this makes sense for registers, we might be able to relax
+ * this requirement and make some transfers more efficient. Maybe make
+ * this a per-region option that can be set by the user?
*/
if (count >= 8 && !(*ppos % 8)) {
size = 8;
@@ -805,15 +1570,16 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
}
ret = do_access(lm_ctx, buf, size, *ppos, is_write);
if (ret <= 0) {
- lm_log(lm_ctx, LM_ERR, "failed to %s %llx@%lx: %s\n",
- is_write ? "write" : "read", size, *ppos, strerror(-ret));
+ lm_log(lm_ctx, LM_ERR, "failed to %s %#lx-%#lx: %s",
+ is_write ? "write to" : "read from", *ppos, *ppos + size - 1,
+ strerror(-ret));
/*
* TODO if ret < 0 then it might contain a legitimate error code, why replace it with EFAULT?
*/
return -EFAULT;
}
if (ret != (int)size) {
- lm_log(lm_ctx, LM_DBG, "bad read %d != %d\n", ret, size);
+ lm_log(lm_ctx, LM_DBG, "bad read %d != %d", ret, size);
}
count -= size;
done += size;
@@ -824,50 +1590,54 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
}
static inline int
-muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write)
+muser_access(lm_ctx_t *lm_ctx, bool is_write, void **data, uint32_t count,
+ uint64_t *pos)
{
+ struct vfio_user_region_access *region_access;
char *rwbuf;
int err;
- size_t count = 0, _count;
- ssize_t ret;
+ uint32_t processed = 0, _count;
+ int ret;
+
+ assert(pos != NULL);
/* TODO how big do we expect count to be? Can we use alloca(3) instead? */
- rwbuf = calloc(1, cmd->rw.count);
- if (rwbuf == NULL) {
+ region_access = calloc(1, sizeof(*region_access) + count);
+ if (region_access == NULL) {
lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n");
return -1;
}
+ rwbuf = (char*)(region_access + 1);
-#ifndef LM_TERSE_LOGGING
- lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count,
- cmd->rw.pos);
-#endif
+ lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? "W" : "R", *pos,
+ *pos + count - 1);
- /* copy data to be written from kernel to user space */
+ /* receive data to be written */
if (is_write) {
- err = read(lm_ctx->fd, rwbuf, cmd->rw.count);
+ err = read(lm_ctx->conn_fd, rwbuf, count);
/*
* FIXME this is wrong, we should be checking for
- * err != cmd->rw.count
+ * err != count
*/
if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n",
+ lm_log(lm_ctx, LM_ERR, "failed to receive write payload: %s",
strerror(errno));
goto out;
}
err = 0;
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "buffer write", rwbuf, cmd->rw.count);
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("buffer write", rwbuf, count);
#endif
}
- count = _count = cmd->rw.count;
- cmd->err = muser_pci_hdr_access(lm_ctx, &_count, &cmd->rw.pos,
- is_write, rwbuf);
- if (cmd->err) {
- lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err);
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "buffer write", rwbuf, _count);
+ _count = count;
+ ret = muser_pci_hdr_access(lm_ctx, &_count, pos, is_write, rwbuf);
+ if (ret != 0) {
+ /* FIXME shouldn't we fail here? */
+ lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %s",
+ strerror(-ret));
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("buffer write", rwbuf, _count);
#endif
}
@@ -875,150 +1645,618 @@ muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write)
* count is how much has been processed by muser_pci_hdr_access,
* _count is how much there's left to be processed by lm_access
*/
- count -= _count;
- ret = lm_access(lm_ctx, rwbuf + count, _count, &cmd->rw.pos,
- is_write);
- if (!is_write && ret >= 0) {
- ret += count;
- err = post_read(lm_ctx, rwbuf, ret);
- if (!LM_TERSE_LOGGING && err == ret) {
- dump_buffer(lm_ctx, "buffer read", rwbuf, ret);
+ processed = count - _count;
+ ret = lm_access(lm_ctx, rwbuf + processed, _count, pos, is_write);
+ if (ret >= 0) {
+ ret += processed;
+ if (data != NULL) {
+ /*
+ * FIXME the spec doesn't specify whether the rest of the
+ * region_access struct needs to be populated.
+ */
+ region_access->count = ret;
+ *data = region_access;
+ return ret;
+ } else if (!is_write) {
+ err = post_read(lm_ctx, rwbuf, ret);
+#ifdef LM_VERBOSE_LOGGING
+ if (err == ret) {
+ dump_buffer("buffer read", rwbuf, ret);
+ }
+#endif
}
}
out:
- free(rwbuf);
+ free(region_access);
- return err;
+ return ret;
+}
+
+static int handle_device_get_region_info(lm_ctx_t *lm_ctx,
+ struct vfio_user_header *hdr,
+ struct vfio_region_info **dev_reg_info)
+{
+ struct vfio_region_info *reg_info;
+ int ret;
+
+ reg_info = calloc(sizeof(*reg_info), 1);
+ if (reg_info == NULL) {
+ return -ENOMEM;
+ }
+
+ if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*reg_info)) {
+ free(reg_info);
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, reg_info, sizeof(*reg_info), 0);
+ if (ret < 0) {
+ free(reg_info);
+ return -errno;
+ }
+
+ ret = dev_get_reginfo(lm_ctx, &reg_info);
+ if (ret < 0) {
+ free(reg_info);
+ return ret;
+ }
+ *dev_reg_info = reg_info;
+
+ return 0;
+}
+
+static int handle_device_get_info(lm_ctx_t *lm_ctx,
+ struct vfio_user_header *hdr,
+ struct vfio_device_info *dev_info)
+{
+ int ret;
+
+ if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*dev_info)) {
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, dev_info, sizeof(*dev_info), 0);
+ if (ret < 0) {
+ return -errno;
+ }
+
+ ret = dev_get_info(lm_ctx, dev_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ lm_log(lm_ctx, LM_DBG, "sent devinfo flags %#x, num_regions %d, num_irqs"
+ " %d", dev_info->flags, dev_info->num_regions, dev_info->num_irqs);
+ return ret;
}
static int
-muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
+handle_device_get_irq_info(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct vfio_irq_info *irq_info)
{
- void *data = NULL;
- size_t size = 0;
int ret;
- /* TODO make this a function that returns the size */
- if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) {
- uint32_t flags = cmd->ioctl.data.irq_set.flags;
- switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+ assert(lm_ctx != NULL);
+ assert(irq_info != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size != sizeof *irq_info) {
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, irq_info, hdr->msg_size, 0);
+ if (ret < 0) {
+ return -errno;
+ }
+ if (ret != (int)hdr->msg_size) {
+ assert(false); /* FIXME */
+ }
+
+ return dev_get_irqinfo(lm_ctx, irq_info);
+}
+
+static int
+handle_device_set_irqs(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ int *fds, int nr_fds)
+{
+ int ret;
+ struct vfio_irq_set *irq_set;
+ void *data;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size < sizeof *irq_set) {
+ return -EINVAL;
+ }
+
+ irq_set = alloca(hdr->msg_size); /* FIXME */
+
+ ret = recv(lm_ctx->conn_fd, irq_set, hdr->msg_size, 0);
+ if (ret < 0) {
+ return -errno;
+ }
+ if (ret != (int)hdr->msg_size) {
+ assert(false); /* FIXME */
+ }
+ if (ret != (int)irq_set->argsz) {
+ assert(false); /* FIXME */
+ }
+ switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
case VFIO_IRQ_SET_DATA_EVENTFD:
- size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count;
+ data = fds;
+ if (nr_fds != (int)irq_set->count) {
+ return -EINVAL;
+ }
break;
case VFIO_IRQ_SET_DATA_BOOL:
- size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count;
+ data = irq_set + 1;
break;
+ }
+
+ return dev_set_irqs(lm_ctx, irq_set, data);
+}
+
+static int
+handle_dma_map_or_unmap(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, bool map,
+ int *fds, int nr_fds)
+{
+ int ret, i;
+ int nr_dma_regions;
+ struct vfio_user_dma_region *dma_regions;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size % sizeof(struct vfio_user_dma_region) != 0) {
+ lm_log(lm_ctx, LM_ERR, "bad size of DMA regions %d", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ nr_dma_regions = (int)(hdr->msg_size / sizeof(struct vfio_user_dma_region));
+ if (map && nr_dma_regions != nr_fds) {
+ lm_log(lm_ctx, LM_ERR, "expected %d fds but got %d instead",
+ nr_dma_regions, nr_fds);
+ return -EINVAL;
+ }
+
+ dma_regions = alloca(nr_dma_regions * sizeof(*dma_regions));
+
+ ret = recv(lm_ctx->conn_fd, dma_regions, hdr->msg_size, 0);
+ if (ret == -1) {
+ lm_log(lm_ctx, LM_ERR, "failed to receive DMA region entries: %m");
+ return -errno;
+ }
+
+ if (lm_ctx->dma == NULL) {
+ return 0;
+ }
+
+ for (i = 0; i < nr_dma_regions; i++) {
+ if (map) {
+ if (dma_regions[i].flags != VFIO_USER_F_DMA_REGION_MAPPABLE) {
+ /*
+ * FIXME implement non-mappable DMA regions. This requires changing
+ * dma.c to not take a file descriptor.
+ */
+ assert(false);
+ }
+
+ ret = dma_controller_add_region(lm_ctx->dma,
+ dma_regions[i].addr,
+ dma_regions[i].size,
+ fds[i],
+ dma_regions[i].offset);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_INF,
+ "failed to add DMA region %#lx-%#lx offset=%#lx fd=%d: %s",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ dma_regions[i].offset, fds[i],
+ strerror(-ret));
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "added DMA region %#lx-%#lx offset=%#lx fd=%d",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ dma_regions[i].offset, fds[i]);
+ }
+ } else {
+ ret = dma_controller_remove_region(lm_ctx->dma,
+ dma_regions[i].addr,
+ dma_regions[i].size,
+ lm_ctx->unmap_dma, lm_ctx->pvt);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_INF,
+ "failed to remove DMA region %#lx-%#lx: %s",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ strerror(-ret));
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "removed DMA region %#lx-%#lx",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1);
+ }
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ if (map && lm_ctx->map_dma != NULL) {
+ lm_ctx->map_dma(lm_ctx->pvt, dma_regions[i].addr, dma_regions[i].size);
}
}
+ return 0;
+}
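
The client side of this handler would announce a mappable region roughly as follows (sketch; the VFIO_USER_DMA_MAP command constant is assumed from lib/vfio_user.h, and the addresses, memfd backing and msg_id variable are illustrative):

    struct vfio_user_dma_region r = {
        .addr   = 0x40000000,               /* guest-physical address */
        .size   = 0x200000,
        .offset = 0,                        /* offset of the region within fd */
        .flags  = VFIO_USER_F_DMA_REGION_MAPPABLE
    };
    int fd = memfd_create("guest-ram", 0);  /* backing memory */

    ret = send_recv_vfio_user_msg(sock, msg_id++, VFIO_USER_DMA_MAP,
                                  &r, sizeof(r), &fd, 1, NULL, NULL, 0);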
- if (size != 0) {
- data = calloc(1, size);
- if (data == NULL) {
-#ifdef DEBUG
- perror("calloc");
-#endif
- return -1;
+static int
+handle_device_reset(lm_ctx_t *lm_ctx)
+{
+ return device_reset(lm_ctx);
+}
+
+static int
+handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ void **data, size_t *len)
+{
+ struct vfio_user_region_access region_access;
+ uint64_t count, offset;
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(data != NULL);
+
+ /*
+ * TODO Since muser_access doesn't have to handle the kernel case any more,
+ * we can avoid having to do an additional read/recv inside muser_access
+ * (one recv for struct region_access and another for the write data) by
+ * doing a single recvmsg here with an iovec where the first element of the
+ * array will be struct vfio_user_region_access and the second a buffer if
+ * it's a write. The size of the write buffer is: hdr->msg_size - sizeof
+ * *hdr - sizeof region_access, and should be equal to region_access.count.
+ */
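+    /*
+     * A possible shape for that single-recvmsg variant (a sketch only;
+     * muser_access would need to accept a pre-read buffer, and buf below is
+     * a hypothetical write-data buffer):
+     *
+     *     struct iovec iov[2] = {
+     *         { .iov_base = &region_access, .iov_len = sizeof region_access },
+     *         { .iov_base = buf,
+     *           .iov_len = hdr->msg_size - sizeof *hdr - sizeof region_access }
+     *     };
+     *     struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };
+     *     recvmsg(lm_ctx->conn_fd, &msg, 0);
+     */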
+
+ hdr->msg_size -= sizeof *hdr;
+ if (hdr->msg_size < sizeof region_access) {
+ lm_log(lm_ctx, LM_ERR, "message size too small (%d)", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, &region_access, sizeof region_access, 0);
+ if (ret == -1) {
+ lm_log(lm_ctx, LM_ERR, "failed to recv: %m");
+ return -errno;
+ }
+ if (ret != sizeof region_access) {
+ lm_log(lm_ctx, LM_ERR, "bad region_access size %d", ret);
+ return -EINVAL;
+ }
+    if (region_access.region >= LM_DEV_NUM_REGS || region_access.count == 0) {
+ lm_log(lm_ctx, LM_ERR, "bad region %d and/or count %d",
+ region_access.region, region_access.count);
+ return -EINVAL;
+ }
+ count = region_access.count;
+ offset = region_to_offset(region_access.region) + region_access.offset;
+
+ ret = muser_access(lm_ctx, hdr->cmd == VFIO_USER_REGION_WRITE,
+ data, count, &offset);
+ if (ret != (int)region_access.count) {
+ lm_log(lm_ctx, LM_ERR, "bad region access acount, expected=%d, actual=%d",
+ region_access.count, ret);
+ /* FIXME we should return whatever has been accessed, not an error */
+ if (ret >= 0) {
+ ret = -EINVAL;
}
+ return ret;
+ }
- ret = read(lm_ctx->fd, data, size);
- if (ret < 0) {
-#ifdef DEBUG
- perror("read failed");
-#endif
+ *len = sizeof(region_access);
+ if (hdr->cmd == VFIO_USER_REGION_READ) {
+ *len += region_access.count;
+ }
+
+ return 0;
+}
+
+static int
+handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct iovec **iovecs, size_t *nr_iovecs)
+{
+ int size, ret;
+ size_t i;
+ struct vfio_iommu_type1_dirty_bitmap_get *ranges;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(iovecs != NULL);
+ assert(nr_iovecs != NULL);
+
+ size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap);
+ if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) {
+ return -EINVAL;
+ }
+ ranges = malloc(size);
+ if (ranges == NULL) {
+ return -errno;
+ }
+ ret = recv(lm_ctx->conn_fd, ranges, size, 0);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+ if (ret != size) {
+ ret = -EINVAL;
+ goto out;
+ }
+ *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
+ *iovecs = malloc(*nr_iovecs * sizeof(struct iovec));
+ if (*iovecs == NULL) {
+ ret = -errno;
+ goto out;
+ }
+
+ for (i = 1; i < *nr_iovecs; i++) {
+ struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */
+ ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size,
+ r->bitmap.pgsize, r->bitmap.size,
+ (char**)&((*iovecs)[i].iov_base));
+ if (ret != 0) {
goto out;
}
+ (*iovecs)[i].iov_len = r->bitmap.size;
}
+out:
+ if (ret != 0) {
+ if (*iovecs != NULL) {
+ free(*iovecs);
+ *iovecs = NULL;
+ }
+ }
+ free(ranges);
+ return ret;
+}
- ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data);
+static int
+handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct iovec **iovecs, size_t *nr_iovecs)
+{
+ struct vfio_iommu_type1_dirty_bitmap dirty_bitmap;
+ int ret;
-out:
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(iovecs != NULL);
+ assert(nr_iovecs != NULL);
+
+ if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) {
+ lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ /* FIXME must also check argsz */
+
+ ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if ((size_t)ret < sizeof dirty_bitmap) {
+ return -EINVAL;
+ }
+
+ if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+ ret = dma_controller_dirty_page_logging_start(lm_ctx->dma,
+ lm_ctx->migration.pgsize);
+ } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+ ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma);
+ } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+ ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs);
+ } else {
+ ret = -EINVAL;
+ }
- free(data);
return ret;
}
+/*
+ * FIXME return value is messed up, sometimes we return -1 and set errno while
+ * other times we return -errno. Fix.
+ */
+
static int
-drive_loop(lm_ctx_t *lm_ctx)
+process_request(lm_ctx_t *lm_ctx)
{
- struct muser_cmd cmd = { 0 };
- int err;
+ struct vfio_user_header hdr = { 0, };
+ int ret;
+ int *fds = NULL;
+ int nr_fds;
+ struct vfio_irq_info irq_info;
+ struct vfio_device_info dev_info;
+ struct vfio_region_info *dev_reg_info = NULL;
+ struct iovec _iovecs[2] = { { 0, } };
+ struct iovec *iovecs = NULL;
+ size_t nr_iovecs = 0;
+ bool free_iovec_data = true;
- do {
- err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd);
- if (err < 0) {
- return err;
+ assert(lm_ctx != NULL);
+
+ if (lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size > 0 &&
+ lm_ctx->migration.info.device_state == VFIO_DEVICE_STATE_STOP) {
+ return -ESHUTDOWN;
+ }
+
+ nr_fds = lm_ctx->client_max_fds;
+ fds = alloca(nr_fds * sizeof(int));
+
+ /* FIXME get request shouldn't set errno, it should return it as -errno */
+ ret = transports_ops[lm_ctx->trans].get_request(lm_ctx, &hdr, fds, &nr_fds);
+ if (unlikely(ret < 0)) {
+ if (ret == -EAGAIN || ret == -EWOULDBLOCK) {
+ return 0;
+ }
+ if (ret != -EINTR) {
+ lm_log(lm_ctx, LM_ERR, "failed to receive request: %s", strerror(-ret));
}
+ return ret;
+ }
+ if (unlikely(ret == 0)) {
+ if (errno == EINTR) {
+ return -EINTR;
+ }
+ if (errno == 0) {
+ lm_log(lm_ctx, LM_INF, "VFIO client closed connection");
+ } else {
+ lm_log(lm_ctx, LM_ERR, "end of file: %m");
+ }
+ return -ENOTCONN;
+ }
+
+ if (ret < (int)sizeof hdr) {
+ lm_log(lm_ctx, LM_ERR, "short header read %d", ret);
+ return -EINVAL;
+ }
- switch (cmd.type) {
- case MUSER_IOCTL:
- err = muser_ioctl(lm_ctx, &cmd);
+ if (hdr.flags.type != VFIO_USER_F_TYPE_COMMAND) {
+ lm_log(lm_ctx, LM_ERR, "header not a request");
+ return -EINVAL;
+ }
+
+ if (hdr.msg_size < sizeof hdr) {
+ lm_log(lm_ctx, LM_ERR, "bad size in header %d", hdr.msg_size);
+ return -EINVAL;
+ }
+
+    /* FIXME most of the following handlers check that hdr.msg_size is at
+     * least the size of the command-specific struct, then do an additional
+     * recv(2) for that data. We should stop duplicating this common code
+     * and move it here.
+ */
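+    /*
+     * A sketch of that hoisted read (hypothetical, not wired up; the
+     * handlers below would then parse cmd_data instead of calling recv(2)
+     * themselves):
+     *
+     *     size_t body_size = hdr.msg_size - sizeof hdr;
+     *     void *cmd_data = alloca(body_size);
+     *     ret = recv(lm_ctx->conn_fd, cmd_data, body_size, 0);
+     */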
+
+ switch (hdr.cmd) {
+ case VFIO_USER_DMA_MAP:
+ case VFIO_USER_DMA_UNMAP:
+ ret = handle_dma_map_or_unmap(lm_ctx, &hdr,
+ hdr.cmd == VFIO_USER_DMA_MAP,
+ fds, nr_fds);
break;
- case MUSER_READ:
- case MUSER_WRITE:
- err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE);
+ case VFIO_USER_DEVICE_GET_INFO:
+ ret = handle_device_get_info(lm_ctx, &hdr, &dev_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = &dev_info;
+ _iovecs[1].iov_len = dev_info.argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
break;
- case MUSER_MMAP:
- err = muser_mmap(lm_ctx, &cmd);
+ case VFIO_USER_DEVICE_GET_REGION_INFO:
+ ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = dev_reg_info;
+ _iovecs[1].iov_len = dev_reg_info->argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
+ break;
+ case VFIO_USER_DEVICE_GET_IRQ_INFO:
+ ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = &irq_info;
+ _iovecs[1].iov_len = sizeof irq_info;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
break;
- case MUSER_DMA_MMAP:
- err = muser_dma_map(lm_ctx, &cmd);
+ case VFIO_USER_DEVICE_SET_IRQS:
+ ret = handle_device_set_irqs(lm_ctx, &hdr, fds, nr_fds);
break;
- case MUSER_DMA_MUNMAP:
- err = muser_dma_unmap(lm_ctx, &cmd);
+ case VFIO_USER_REGION_READ:
+ case VFIO_USER_REGION_WRITE:
+ iovecs = _iovecs;
+ ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base,
+ &iovecs[1].iov_len);
+ nr_iovecs = 2;
+ break;
+ case VFIO_USER_DEVICE_RESET:
+ ret = handle_device_reset(lm_ctx);
+ break;
+ case VFIO_USER_DIRTY_PAGES:
+ ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs);
+ if (ret >= 0) {
+ free_iovec_data = false;
+ }
break;
default:
- lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type);
- continue;
- }
- cmd.err = err;
- err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd);
- if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n",
- strerror(errno));
+ lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd);
+ return -EINVAL;
+ }
+
+ /*
+ * TODO: In case of error during command handling set errno respectively
+ * in the reply message.
+ */
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to handle command %d: %s", hdr.cmd,
+ strerror(-ret));
+ assert(false); /* FIXME */
+ }
+ ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true,
+ 0, iovecs, nr_iovecs, NULL, 0);
+ if (unlikely(ret < 0)) {
+ lm_log(lm_ctx, LM_ERR, "failed to complete command: %s",
+ strerror(-ret));
+ }
+ if (iovecs != NULL && iovecs != _iovecs) {
+ if (free_iovec_data) {
+ size_t i;
+ for (i = 0; i < nr_iovecs; i++) {
+ free(iovecs[i].iov_base);
+ }
}
- // TODO: Figure out a clean way to get out of the loop.
- } while (1);
+ free(iovecs);
+ }
- return err;
+ return ret;
}
int
lm_ctx_drive(lm_ctx_t *lm_ctx)
{
+ int err;
+
if (lm_ctx == NULL) {
errno = EINVAL;
return -1;
}
- return drive_loop(lm_ctx);
-}
+ do {
+ err = process_request(lm_ctx);
+ } while (err >= 0);
-static int
-dev_detach(int dev_fd)
-{
- return close(dev_fd);
+ return err;
}
-static int
-dev_attach(const char *uuid)
+int
+lm_ctx_poll(lm_ctx_t *lm_ctx)
{
- char *path;
- int dev_fd;
int err;
- err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid);
- if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) {
- return -1;
+ if (unlikely((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0)) {
+ return -ENOTSUP;
}
- dev_fd = open(path, O_RDWR);
-
- free(path);
+ err = process_request(lm_ctx);
- return dev_fd;
+ return err >= 0 ? 0 : err;
}
+/* FIXME this is not enough anymore, check muser_mmap */
void *
lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length)
{
@@ -1035,38 +2273,64 @@ lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length)
lm_ctx->fd, offset);
}
-int
-lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t vector)
+static int validate_irq_subindex(lm_ctx_t *lm_ctx, uint32_t subindex)
{
- eventfd_t val = 1;
- if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) {
- lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", vector,
+ if ((lm_ctx == NULL) || (subindex >= lm_ctx->irqs.max_ivs)) {
+ lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", subindex,
lm_ctx->irqs.max_ivs);
+ /* FIXME should return -errno */
errno = EINVAL;
return -1;
}
- if (lm_ctx->irqs.efds[vector] == -1) {
- lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", vector);
+ return 0;
+}
+
+int
+lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex)
+{
+ int ret;
+ eventfd_t val = 1;
+
+ ret = validate_irq_subindex(lm_ctx, subindex);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (lm_ctx->irqs.efds[subindex] == -1) {
+ lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", subindex);
+ /* FIXME should return -errno */
errno = ENOENT;
return -1;
}
- if (vector == LM_DEV_INTX_IRQ_IDX && !lm_ctx->pci_config_space->hdr.cmd.id) {
- lm_log(lm_ctx, LM_ERR, "failed to trigger INTx IRQ, INTx disabled\n");
- errno = EINVAL;
+ return eventfd_write(lm_ctx->irqs.efds[subindex], val);
+}
+
+int
+lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex)
+{
+ int ret, msg_id = 1;
+ struct vfio_user_irq_info irq_info;
+
+ ret = validate_irq_subindex(lm_ctx, subindex);
+ if (ret < 0) {
return -1;
- } else if (vector == LM_DEV_MSIX_IRQ_IDX) {
- /*
- * FIXME must check that MSI-X capability exists during creation time
- * FIXME need to check that MSI-X is enabled and that it's not masked.
- * Currently that's not possible because libmuser doesn't care about
- * the internals of a capability.
- */
}
- return eventfd_write(lm_ctx->irqs.efds[vector], val);
+ irq_info.subindex = subindex;
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id,
+ VFIO_USER_VM_INTERRUPT,
+ &irq_info, sizeof irq_info,
+ NULL, 0, NULL, NULL, 0);
+ if (ret < 0) {
+ /* FIXME should return -errno */
+ errno = -ret;
+ return -1;
+ }
+
+ return 0;
}
static void
@@ -1081,16 +2345,50 @@ free_sparse_mmap_areas(lm_reg_info_t *reg_info)
void
lm_ctx_destroy(lm_ctx_t *lm_ctx)
{
+ int ret;
+
if (lm_ctx == NULL) {
return;
}
+ free(lm_ctx->uuid);
+
+ /*
+ * FIXME The following cleanup can be dangerous depending on how lm_ctx_destroy
+ * is called since it might delete files it did not create. Improve by
+ * acquiring a lock on the directory.
+ */
+
+ if (lm_ctx->iommu_dir_fd != -1) {
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1
+ && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": "
+ "%m\n");
+ }
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 &&
+ errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n");
+ }
+ if (close(lm_ctx->iommu_dir_fd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n",
+ lm_ctx->iommu_dir_fd);
+ }
+ }
+ if (lm_ctx->iommu_dir != NULL) {
+ if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n",
+ lm_ctx->iommu_dir);
+ }
+ free(lm_ctx->iommu_dir);
+ }
+
free(lm_ctx->pci_config_space);
- dev_detach(lm_ctx->fd);
+ transports_ops[lm_ctx->trans].detach(lm_ctx);
if (lm_ctx->dma != NULL) {
- dma_controller_destroy(lm_ctx, lm_ctx->dma);
+ dma_controller_destroy(lm_ctx->dma);
}
free_sparse_mmap_areas(lm_ctx->pci_info.reg_info);
+ free(lm_ctx->caps);
free(lm_ctx);
// FIXME: Maybe close any open irq efds? Unmap stuff?
}
@@ -1125,6 +2423,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
{
lm_reg_info_t *cfg_reg;
const lm_reg_info_t zero_reg = { 0 };
+ lm_reg_info_t *migr_reg;
int i;
assert(lm_ctx != NULL);
@@ -1171,7 +2470,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
// Initialise capabilities.
if (dev_info->nr_caps > 0) {
- lm_ctx->caps = caps_create(dev_info->caps, dev_info->nr_caps);
+ lm_ctx->caps = caps_create(lm_ctx, dev_info->caps, dev_info->nr_caps);
if (lm_ctx->caps == NULL) {
lm_log(lm_ctx, LM_ERR, "failed to create PCI capabilities: %m\n");
goto err;
@@ -1181,6 +2480,28 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF;
}
+ /*
+ * Check the migration region.
+ */
+ migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX];
+ if (migr_reg->size > 0) {
+ if (migr_reg->size < sizeof(struct vfio_device_migration_info)) {
+ return -EINVAL;
+ }
+
+ /* FIXME this should be done in lm_ctx_run or poll */
+ lm_ctx->migration.info.device_state = VFIO_DEVICE_STATE_RUNNING;
+
+ lm_ctx->migration.callbacks = dev_info->migration_callbacks;
+ if (lm_ctx->migration.callbacks.transition == NULL ||
+ lm_ctx->migration.callbacks.get_pending_bytes == NULL ||
+ lm_ctx->migration.callbacks.prepare_data == NULL ||
+ lm_ctx->migration.callbacks.read_data == NULL ||
+ lm_ctx->migration.callbacks.write_data == NULL) {
+ return -EINVAL;
+ }
+ }
+
return 0;
err:
@@ -1212,6 +2533,18 @@ pci_info_bounce(lm_pci_info_t *dst, const lm_pci_info_t *src)
dst->cc = src->cc;
}
+int
+lm_ctx_try_attach(lm_ctx_t *lm_ctx)
+{
+ assert(lm_ctx != NULL);
+
+ if ((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0) {
+ errno = EINVAL;
+ return -1;
+ }
+ return transports_ops[lm_ctx->trans].attach(lm_ctx);
+}
+
lm_ctx_t *
lm_ctx_create(const lm_dev_info_t *dev_info)
{
@@ -1226,6 +2559,11 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
return NULL;
}
+ if (dev_info->trans != LM_TRANS_SOCK) {
+ errno = EINVAL;
+ return NULL;
+ }
+
/*
* FIXME need to check that the number of MSI and MSI-X IRQs are valid
* (1, 2, 4, 8, 16 or 32 for MSI and up to 2048 for MSI-X).
@@ -1244,6 +2582,9 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
if (lm_ctx == NULL) {
return NULL;
}
+ lm_ctx->trans = dev_info->trans;
+
+ lm_ctx->iommu_dir_fd = -1;
// Set context irq information.
for (i = 0; i < max_ivs; i++) {
@@ -1259,10 +2600,26 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
lm_ctx->log = dev_info->log;
lm_ctx->log_lvl = dev_info->log_lvl;
lm_ctx->reset = dev_info->reset;
+ lm_ctx->flags = dev_info->flags;
+
+ lm_ctx->uuid = strdup(dev_info->uuid);
+ if (lm_ctx->uuid == NULL) {
+        err = -errno;
+ goto out;
+ }
// Bounce the provided pci_info into the context.
pci_info_bounce(&lm_ctx->pci_info, &dev_info->pci_info);
+ /*
+ * FIXME above memcpy also copies reg_info->mmap_areas. If pci_config_setup
+ * fails then we try to free reg_info->mmap_areas, which is wrong because
+ * this is a user pointer.
+ */
+ for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_info.reg_info); i++) {
+ lm_ctx->pci_info.reg_info[i].mmap_areas = NULL;
+ }
+
// Setup the PCI config space for this context.
err = pci_config_setup(lm_ctx, dev_info);
if (err != 0) {
@@ -1276,65 +2633,53 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
goto out;
}
- // Attach to the muser control device.
- lm_ctx->fd = dev_attach(dev_info->uuid);
- if (lm_ctx->fd == -1) {
- err = errno;
- goto out;
+ if (transports_ops[dev_info->trans].init != NULL) {
+ err = transports_ops[dev_info->trans].init(lm_ctx);
+ if (err < 0) {
+ goto out;
+ }
+ lm_ctx->fd = err;
+ }
+ err = 0;
+
+ // Attach to the muser control device. With LM_FLAG_ATTACH_NB caller is
+ // always expected to call lm_ctx_try_attach().
+ if ((dev_info->flags & LM_FLAG_ATTACH_NB) == 0) {
+ lm_ctx->conn_fd = transports_ops[dev_info->trans].attach(lm_ctx);
+ if (lm_ctx->conn_fd < 0) {
+ err = lm_ctx->conn_fd;
+            if (err != -EINTR) {
+ lm_log(lm_ctx, LM_ERR, "failed to attach: %s",
+ strerror(-err));
+ }
+ goto out;
+ }
}
+ lm_ctx->map_dma = dev_info->map_dma;
+ lm_ctx->unmap_dma = dev_info->unmap_dma;
+
// Create the internal DMA controller.
- lm_ctx->dma = dma_controller_create(LM_DMA_REGIONS);
- if (lm_ctx->dma == NULL) {
- err = errno;
- goto out;
+ if (lm_ctx->unmap_dma != NULL) {
+ lm_ctx->dma = dma_controller_create(lm_ctx, LM_DMA_REGIONS);
+ if (lm_ctx->dma == NULL) {
+            err = -errno;
+ goto out;
+ }
}
out:
- if (err) {
- if (lm_ctx) {
- dma_controller_destroy(lm_ctx, lm_ctx->dma);
- dev_detach(lm_ctx->fd);
- free_sparse_mmap_areas(lm_ctx->pci_info.reg_info);
- free(lm_ctx->pci_config_space);
- free(lm_ctx);
+ if (err != 0) {
+ if (lm_ctx != NULL) {
+ lm_ctx_destroy(lm_ctx);
lm_ctx = NULL;
}
- errno = err;
+ errno = -err;
}
return lm_ctx;
}
-#ifdef DEBUG
-static void
-dump_buffer(lm_ctx_t *lm_ctx, const char *prefix,
- const char *buf, uint32_t count)
-{
- int i;
- const size_t bytes_per_line = 0x8;
-
- if (strcmp(prefix, "")) {
- lm_log(lm_ctx, LM_DBG, "%s\n", prefix);
- }
- for (i = 0; i < (int)count; i++) {
- if (i % bytes_per_line != 0) {
- lm_log(lm_ctx, LM_DBG, " ");
- }
- /* TODO valgrind emits a warning if count is 1 */
- lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i));
- if ((i + 1) % bytes_per_line == 0) {
- lm_log(lm_ctx, LM_DBG, "\n");
- }
- }
- if (i % bytes_per_line != 0) {
- lm_log(lm_ctx, LM_DBG, "\n");
- }
-}
-#else
-#define dump_buffer(lm_ctx, prefix, buf, count)
-#endif
-
/*
* Returns a pointer to the standard part of the PCI configuration space.
*/
@@ -1364,21 +2709,34 @@ lm_get_region_info(lm_ctx_t *lm_ctx)
inline int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr,
- uint32_t len, dma_sg_t *sg, int max_sg)
+ uint32_t len, dma_sg_t *sg, int max_sg, int prot)
{
- return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg);
+ assert(lm_ctx != NULL);
+
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+ return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot);
}
inline int
lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
struct iovec *iov, int cnt)
{
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
return dma_map_sg(lm_ctx->dma, sg, iov, cnt);
}
inline void
lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt)
{
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ return;
+ }
return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt);
}
@@ -1396,4 +2754,66 @@ lm_ctx_run(lm_dev_info_t *dev_info)
return ret;
}
+uint8_t *
+lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id)
+{
+ assert(lm_ctx != NULL);
+
+ return cap_find_by_id(lm_ctx, id);
+}
+
+int
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
+{
+ struct vfio_user_dma_region_access *dma_recv;
+ struct vfio_user_dma_region_access dma_send;
+ int recv_size;
+ int msg_id = 1, ret;
+
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
+
+ recv_size = sizeof(*dma_recv) + sg->length;
+
+ dma_recv = calloc(recv_size, 1);
+ if (dma_recv == NULL) {
+ return -ENOMEM;
+ }
+
+ dma_send.addr = sg->dma_addr;
+ dma_send.count = sg->length;
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ,
+ &dma_send, sizeof dma_send, NULL, 0, NULL,
+ dma_recv, recv_size);
+ memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */
+ free(dma_recv);
+
+ return ret;
+}
+
+int
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
+{
+ struct vfio_user_dma_region_access *dma_send, dma_recv;
+ int send_size = sizeof(*dma_send) + sg->length;
+ int msg_id = 1, ret;
+
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
+
+ dma_send = calloc(send_size, 1);
+ if (dma_send == NULL) {
+ return -ENOMEM;
+ }
+ dma_send->addr = sg->dma_addr;
+ dma_send->count = sg->length;
+ memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! */
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE,
+ dma_send, send_size,
+ NULL, 0, NULL, &dma_recv, sizeof(dma_recv));
+ free(dma_send);
+
+ return ret;
+}
+
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/muser_pci.c b/lib/muser_pci.c
index 36692ab..2846301 100644
--- a/lib/muser_pci.c
+++ b/lib/muser_pci.c
@@ -52,7 +52,7 @@ muser_pci_hdr_write_bar(lm_ctx_t *lm_ctx, uint16_t bar_index, const char *buf)
lm_reg_info_t *reg_info = lm_get_region_info(lm_ctx);
lm_pci_hdr_t *hdr;
- assert(lm_ctx);
+ assert(lm_ctx != NULL);
if (reg_info[bar_index].size == 0) {
return;
@@ -86,15 +86,15 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
{
uint16_t v;
- assert(ctx);
+ assert(ctx != NULL);
if (count != 2) {
lm_log(ctx, LM_ERR, "bad write command size %d\n", count);
return -EINVAL;
}
- assert(pci);
- assert(buf);
+ assert(pci != NULL);
+ assert(buf != NULL);
v = *(uint16_t*)buf;
@@ -153,17 +153,35 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) {
if (!pci->hdr.cmd.id) {
pci->hdr.cmd.id = 0x1;
- lm_log(ctx, LM_INF, "INTx emulation enabled\n");
+ lm_log(ctx, LM_INF, "INTx emulation disabled\n");
}
v &= ~PCI_COMMAND_INTX_DISABLE;
} else {
if (pci->hdr.cmd.id) {
pci->hdr.cmd.id = 0x0;
- lm_log(ctx, LM_INF, "INTx emulation disabled\n");
+ lm_log(ctx, LM_INF, "INTx emulation enabled\n");
}
}
- if (v) {
+ if ((v & PCI_COMMAND_INVALIDATE) == PCI_COMMAND_INVALIDATE) {
+ if (!pci->hdr.cmd.mwie) {
+ pci->hdr.cmd.mwie = 1U;
+ lm_log(ctx, LM_INF, "memory write and invalidate enabled\n");
+ }
+ v &= ~PCI_COMMAND_INVALIDATE;
+ } else {
+ if (pci->hdr.cmd.mwie) {
+ pci->hdr.cmd.mwie = 0;
+ lm_log(ctx, LM_INF, "memory write and invalidate disabled");
+ }
+ }
+
+ if ((v & PCI_COMMAND_VGA_PALETTE) == PCI_COMMAND_VGA_PALETTE) {
+ lm_log(ctx, LM_INF, "enabling VGA palette snooping ignored\n");
+ v &= ~PCI_COMMAND_VGA_PALETTE;
+ }
+
+ if (v != 0) {
lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v);
return -EINVAL;
}
@@ -177,8 +195,8 @@ handle_erom_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
{
uint32_t v;
- assert(ctx);
- assert(pci);
+ assert(ctx != NULL);
+ assert(pci != NULL);
if (count != 0x4) {
lm_log(ctx, LM_ERR, "bad EROM count %d\n", count);
@@ -207,8 +225,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
lm_pci_config_space_t *pci;
int ret = 0;
- assert(lm_ctx);
- assert(buf);
+ assert(lm_ctx != NULL);
+ assert(buf != NULL);
pci = lm_get_pci_config_space(lm_ctx);
@@ -248,8 +266,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
ret = -EINVAL;
}
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff);
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("PCI header", (char*)pci->hdr.raw, 0xff);
#endif
return ret;
@@ -263,18 +281,18 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
* @count: output parameter that receives the number of bytes read/written
*/
static inline int
-muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
- size_t _count;
+ uint32_t _count;
loff_t _pos;
int err = 0;
- assert(lm_ctx);
- assert(count);
- assert(pos);
- assert(buf);
+ assert(lm_ctx != NULL);
+ assert(count != NULL);
+ assert(pos != NULL);
+ assert(buf != NULL);
_pos = *pos - region_to_offset(LM_DEV_CFG_REG_IDX);
_count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos);
@@ -290,20 +308,21 @@ muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
}
static inline bool
-muser_is_pci_hdr_access(loff_t pos)
+muser_is_pci_hdr_access(uint64_t pos)
{
- const off_t off = (loff_t) region_to_offset(LM_DEV_CFG_REG_IDX);
- return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF;
+ const uint64_t off = region_to_offset(LM_DEV_CFG_REG_IDX);
+ return pos >= off && pos - off < PCI_STD_HEADER_SIZEOF;
}
+/* FIXME this function is misleading, remove it */
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
- assert(lm_ctx);
- assert(count);
- assert(pos);
+ assert(lm_ctx != NULL);
+ assert(count != NULL);
+ assert(pos != NULL);
if (!muser_is_pci_hdr_access(*pos)) {
return 0;
diff --git a/lib/muser_priv.h b/lib/muser_priv.h
index aa29f5a..097874a 100644
--- a/lib/muser_priv.h
+++ b/lib/muser_priv.h
@@ -35,9 +35,11 @@
#include "muser.h"
+extern char *irq_to_str[];
+
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool write, char *buf);
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool write, char *buf);
lm_reg_info_t *
lm_get_region_info(lm_ctx_t *lm_ctx);
@@ -45,4 +47,111 @@ lm_get_region_info(lm_ctx_t *lm_ctx);
uint64_t
region_to_offset(uint32_t region);
+int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count);
+
+int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count);
+
+
+int
+recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void *data, size_t *len);
+
+int
+send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
+ char *caps);
+
+int
+recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
+ int *max_fds, size_t *pgsize);
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len);
+
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len);
+
+/* FIXME copied from include/linux/stddef.h, is this OK license-wise? */
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
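+
+/*
+ * For example, struct vfio_device_info is four __u32 fields (argsz, flags,
+ * num_regions, num_irqs), so offsetofend(struct vfio_device_info, num_irqs)
+ * evaluates to 12 + 4 = 16, i.e. the minimum argsz a caller must supply for
+ * VFIO_DEVICE_GET_INFO.
+ */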
+
+static inline ssize_t get_minsz(unsigned int cmd)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ return offsetofend(struct vfio_device_info, num_irqs);
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return offsetofend(struct vfio_region_info, offset);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return offsetofend(struct vfio_irq_info, count);
+ case VFIO_DEVICE_SET_IRQS:
+ return offsetofend(struct vfio_irq_set, count);
+ case VFIO_GROUP_GET_STATUS:
+ return offsetofend(struct vfio_group_status, flags);
+ case VFIO_GET_API_VERSION:
+ return 0;
+ case VFIO_CHECK_EXTENSION:
+ case VFIO_GROUP_SET_CONTAINER:
+ case VFIO_GROUP_UNSET_CONTAINER:
+ case VFIO_SET_IOMMU:
+ return sizeof(int);
+ case VFIO_IOMMU_GET_INFO:
+ return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ case VFIO_IOMMU_MAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_map, size);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ case VFIO_GROUP_GET_DEVICE_FD:
+ case VFIO_DEVICE_RESET:
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static inline const char* vfio_cmd_to_str(int cmd) {
+ switch (cmd) {
+ case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION";
+ case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION";
+ case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU";
+ case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS";
+ case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER";
+ case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER";
+ case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD";
+ case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO";
+ case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO";
+ case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO";
+ case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS";
+ case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET";
+ case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO";
+ case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET";
+ case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA";
+ case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE";
+ case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE";
+ case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP";
+ case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE";
+ case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE";
+ }
+ return NULL;
+}
+
#endif /* MUSER_PRIV_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/vfio_user.h b/lib/vfio_user.h
new file mode 100644
index 0000000..19f751a
--- /dev/null
+++ b/lib/vfio_user.h
@@ -0,0 +1,167 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VFIO_USER_H
+#define _VFIO_USER_H
+
+#include <inttypes.h>
+#include <linux/vfio.h>
+#include <linux/version.h>
+
+enum vfio_user_command {
+ VFIO_USER_VERSION = 1,
+ VFIO_USER_DMA_MAP = 2,
+ VFIO_USER_DMA_UNMAP = 3,
+ VFIO_USER_DEVICE_GET_INFO = 4,
+ VFIO_USER_DEVICE_GET_REGION_INFO = 5,
+ VFIO_USER_DEVICE_GET_IRQ_INFO = 6,
+ VFIO_USER_DEVICE_SET_IRQS = 7,
+ VFIO_USER_REGION_READ = 8,
+ VFIO_USER_REGION_WRITE = 9,
+ VFIO_USER_DMA_READ = 10,
+ VFIO_USER_DMA_WRITE = 11,
+ VFIO_USER_VM_INTERRUPT = 12,
+ VFIO_USER_DEVICE_RESET = 13,
+ VFIO_USER_DIRTY_PAGES = 14,
+ VFIO_USER_MAX,
+};
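+
+/*
+ * Most commands are initiated by the client. VFIO_USER_DMA_READ,
+ * VFIO_USER_DMA_WRITE and VFIO_USER_VM_INTERRUPT go the other way: the
+ * server sends them to access client-owned memory or to raise an interrupt
+ * (see lm_dma_read, lm_dma_write and lm_irq_message).
+ */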
+
+enum vfio_user_message_type {
+ VFIO_USER_MESSAGE_COMMAND = 0,
+ VFIO_USER_MESSAGE_REPLY = 1,
+};
+
+#define VFIO_USER_FLAGS_NO_REPLY (0x1)
+
+struct vfio_user_header {
+ uint16_t msg_id;
+ uint16_t cmd;
+ uint32_t msg_size;
+ struct {
+ uint32_t type : 4;
+#define VFIO_USER_F_TYPE_COMMAND 0
+#define VFIO_USER_F_TYPE_REPLY 1
+ uint32_t no_reply : 1;
+ uint32_t error : 1;
+ uint32_t resvd : 26;
+ } flags;
+ uint32_t error_no;
+} __attribute__((packed));
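+
+/*
+ * The header is exactly 16 bytes on the wire (2 + 2 + 4 + 4 + 4). Every
+ * message, request or reply, starts with one, and msg_size counts the
+ * header itself plus any command-specific payload that follows it.
+ */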
+
+struct vfio_user_dma_region {
+ uint64_t addr;
+ uint64_t size;
+ uint64_t offset;
+ uint32_t prot;
+ uint32_t flags;
+#define VFIO_USER_F_DMA_REGION_MAPPABLE (0x0)
+} __attribute__((packed));
+
+struct vfio_user_region_access {
+ uint64_t offset;
+ uint32_t region;
+ uint32_t count;
+ uint8_t data[];
+} __attribute__((packed));
+
+struct vfio_user_dma_region_access {
+ uint64_t addr;
+ uint32_t count;
+ uint8_t data[];
+} __attribute__((packed));
+
+struct vfio_user_irq_info {
+ uint32_t subindex;
+} __attribute__((packed));
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)
+
+/* copied from <linux/vfio.h> */
+
+#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+
+struct vfio_device_migration_info {
+ __u32 device_state; /* VFIO device state */
+#define VFIO_DEVICE_STATE_STOP (0)
+#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
+#define VFIO_DEVICE_STATE_SAVING (1 << 1)
+#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
+ VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+#define VFIO_DEVICE_STATE_VALID(state) \
+ (state & VFIO_DEVICE_STATE_RESUMING ? \
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+ __u32 reserved;
+ __u64 pending_bytes;
+ __u64 data_offset;
+ __u64 data_size;
+};
+
+struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+ __u64 *data; /* one bit per page */
+};
+
+struct vfio_iommu_type1_dirty_bitmap {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+ __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+ __u64 iova; /* IO virtual address */
+ __u64 size; /* Size of iova range */
+ struct vfio_bitmap bitmap;
+};
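+
+/*
+ * bitmap.size is expected to hold one bit per page of the range, rounded
+ * up to whole bytes: (size / pgsize + 7) / 8. For example, a 1 MiB range
+ * at 4 KiB page size needs 256 bits, i.e. a 32-byte bitmap.
+ */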
+
+#endif
+
+#endif
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 9197a12..49a9201 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -32,5 +32,12 @@ add_executable(test_read test_read.c)
add_executable(test_mmap test_mmap.c)
add_executable(test_dma_map test_dma_map.c)
+add_executable(server server.c)
+target_link_libraries(server muser ssl crypto)
+add_executable(client client.c ../lib/muser_ctx.c ../lib/muser_pci.c ../lib/dma.c ../lib/cap.c)
+
+add_executable(null null.c)
+target_link_libraries(null muser pthread)
+
add_executable(gpio-pci-idio-16 gpio-pci-idio-16.c)
target_link_libraries(gpio-pci-idio-16 muser)
diff --git a/samples/client.c b/samples/client.c
new file mode 100644
index 0000000..0b42267
--- /dev/null
+++ b/samples/client.c
@@ -0,0 +1,901 @@
+/*
+ * Copyright (c) 2020 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/eventfd.h>
+#include <time.h>
+#include <err.h>
+#include <assert.h>
+//#include <sys/uio.h>
+
+#include "../lib/muser.h"
+#include "../lib/muser_priv.h"
+#include "../lib/common.h"
+
+static int
+init_sock(const char *path)
+{
+ int ret, sock;
+ struct sockaddr_un addr = {.sun_family = AF_UNIX};
+
+ /* TODO path should be defined elsewhere */
+    ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s", path);
+
+ if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ perror("failed to open socket");
+ return sock;
+ }
+
+ if ((ret = connect(sock, (struct sockaddr*)&addr, sizeof(addr))) == -1) {
+ perror("failed to connect server");
+ return ret;
+ }
+ return sock;
+}
+
+static int
+set_version(int sock, int client_max_fds, int *server_max_fds, size_t *pgsize)
+{
+ int ret, mj, mn;
+ uint16_t msg_id;
+ char *client_caps = NULL;
+
+ assert(server_max_fds != NULL);
+ assert(pgsize != NULL);
+
+ ret = recv_version(sock, &mj, &mn, &msg_id, false, server_max_fds, pgsize);
+ if (ret < 0) {
+ fprintf(stderr, "failed to receive version from server: %s\n",
+ strerror(-ret));
+ goto out;
+ }
+
+ if (mj != LIB_MUSER_VFIO_USER_VERS_MJ || mn != LIB_MUSER_VFIO_USER_VERS_MN) {
+ fprintf(stderr, "bad server version %d.%d\n", mj, mn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = asprintf(&client_caps, "{max_fds: %d, migration: {pgsize: %lu}}",
+ client_max_fds, sysconf(_SC_PAGESIZE));
+ if (ret == -1) {
+ client_caps = NULL;
+ ret = -ENOMEM; /* FIXME */
+ goto out;
+ }
+
+ ret = send_version(sock, mj, mn, msg_id, true, client_caps);
+ if (ret < 0) {
+ fprintf(stderr, "failed to send version to server: %s\n",
+ strerror(-ret));
+ goto out;
+ }
+ ret = 0;
+
+out:
+ free(client_caps);
+ return ret;
+}
+
+static int
+send_device_reset(int sock)
+{
+    int msg_id = 1;
+
+ return send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_RESET,
+ NULL, 0, NULL, 0, NULL, NULL, 0);
+}
+
+static int
+get_region_vfio_caps(int sock, size_t cap_sz)
+{
+ struct vfio_info_cap_header *header, *_header;
+ struct vfio_region_info_cap_type *type;
+ struct vfio_region_info_cap_sparse_mmap *sparse;
+ int i, ret;
+
+ header = _header = calloc(cap_sz, 1);
+ if (header == NULL) {
+ return -ENOMEM;
+ }
+
+ ret = recv(sock, header, cap_sz, 0);
+ if (ret < 0) {
+ err(EXIT_FAILURE, "failed to receive VFIO cap info");
+ }
+    assert(ret == (int)cap_sz);
+
+ while (true) {
+ switch (header->id) {
+ case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)header;
+ fprintf(stdout, "%s: Sparse cap nr_mmap_areas %d\n", __func__,
+ sparse->nr_areas);
+ for (i = 0; i < sparse->nr_areas; i++) {
+ fprintf(stdout, "%s: area %d offset %#lx size %llu\n", __func__,
+ i, sparse->areas[i].offset, sparse->areas[i].size);
+ }
+ break;
+ case VFIO_REGION_INFO_CAP_TYPE:
+ type = (struct vfio_region_info_cap_type*)header;
+ if (type->type != VFIO_REGION_TYPE_MIGRATION ||
+ type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) {
+ fprintf(stderr, "bad region type %d/%d\n", type->type,
+ type->subtype);
+ exit(EXIT_FAILURE);
+ }
+ printf("migration region\n");
+ break;
+ default:
+ fprintf(stderr, "bad VFIO cap ID %#x\n", header->id);
+ exit(EXIT_FAILURE);
+ }
+ if (header->next == 0) {
+ break;
+ }
+ header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info));
+ }
+    free(_header);
+    return 0;
+}
+
+static int
+get_device_region_info(int sock, struct vfio_device_info *client_dev_info)
+{
+ struct vfio_region_info region_info;
+ uint16_t msg_id = 0;
+ size_t cap_sz;
+ int i, ret;
+
+ msg_id = 1;
+ for (i = 0; i < client_dev_info->num_regions; i++) {
+ memset(&region_info, 0, sizeof(region_info));
+ region_info.argsz = sizeof(region_info);
+ region_info.index = i;
+ msg_id++;
+ ret = send_recv_vfio_user_msg(sock, msg_id,
+ VFIO_USER_DEVICE_GET_REGION_INFO,
+ &region_info, sizeof region_info,
+ NULL, 0, NULL,
+ &region_info, sizeof(region_info));
+ if (ret < 0) {
+ fprintf(stderr, "failed to get device region info: %s\n",
+ strerror(-ret));
+ return ret;
+ }
+
+ cap_sz = region_info.argsz - sizeof(struct vfio_region_info);
+ fprintf(stdout, "%s: region_info[%d] offset %#lx flags %#x size %llu "
+ "cap_sz %d\n", __func__, i, region_info.offset,
+ region_info.flags, region_info.size, cap_sz);
+ if (cap_sz) {
+ ret = get_region_vfio_caps(sock, cap_sz);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+static int get_device_info(int sock, struct vfio_device_info *dev_info)
+{
+ uint16_t msg_id;
+ int ret;
+
+ dev_info->argsz = sizeof(*dev_info);
+ msg_id = 1;
+ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_GET_INFO,
+ dev_info, sizeof(*dev_info), NULL, 0, NULL,
+ dev_info, sizeof(*dev_info));
+ if (ret < 0) {
+ fprintf(stderr, "failed to get device info: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ printf("devinfo: flags %#x, num_regions %d, num_irqs %d\n",
+ dev_info->flags, dev_info->num_regions, dev_info->num_irqs);
+ return 0;
+}
+
+static int
+configure_irqs(int sock)
+{
+ int i, ret;
+ size_t size;
+ struct vfio_irq_set irq_set;
+ struct vfio_user_irq_info vfio_user_irq_info;
+ struct vfio_user_header hdr;
+ uint16_t msg_id = 1;
+ int irq_fd;
+ uint64_t val;
+ struct iovec iovecs[2];
+
+ for (i = 0; i < LM_DEV_NUM_IRQS; i++) { /* TODO move body of loop into function */
+ struct vfio_irq_info vfio_irq_info = {
+ .argsz = sizeof vfio_irq_info,
+ .index = i
+ };
+ ret = send_recv_vfio_user_msg(sock, msg_id,
+ VFIO_USER_DEVICE_GET_IRQ_INFO,
+ &vfio_irq_info, sizeof vfio_irq_info,
+ NULL, 0, NULL,
+ &vfio_irq_info, sizeof vfio_irq_info);
+ if (ret < 0) {
+ fprintf(stderr, "failed to get %s info: %s\n", irq_to_str[i],
+ strerror(-ret));
+ return ret;
+ }
+ if (vfio_irq_info.count > 0) {
+ printf("IRQ %s: count=%d flags=%#x\n",
+ irq_to_str[i], vfio_irq_info.count, vfio_irq_info.flags);
+ }
+ }
+
+ msg_id++;
+
+ irq_set.argsz = sizeof irq_set;
+ irq_set.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set.index = 0;
+ irq_set.start = 0;
+ irq_set.count = 1;
+ irq_fd = eventfd(0, 0);
+ if (irq_fd == -1) {
+ perror("failed to create eventfd");
+ return -1;
+ }
+ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_SET_IRQS,
+ &irq_set, sizeof irq_set, &irq_fd, 1, NULL,
+ NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to send configure IRQs message: %s\n",
+ strerror(-ret));
+ return ret;
+ }
+
+ printf("client waiting for server to trigger INTx\n");
+ printf("(send SIGUSR1 to the server trigger INTx)\n");
+
+ ret = read(irq_fd, &val, sizeof val);
+ if (ret == -1) {
+ ret = -errno;
+ perror("server failed to trigger IRQ");
+ return ret;
+ }
+
+ printf("INTx triggered!\n");
+
+ msg_id++;
+
+ size = sizeof(vfio_user_irq_info);
+ ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &vfio_user_irq_info,
+ &size);
+ if (ret < 0) {
+ fprintf(stderr, "failed to receive IRQ message: %s\n", strerror(-ret));
+ return ret;
+ }
+ if (vfio_user_irq_info.subindex >= irq_set.count) {
+ fprintf(stderr, "bad IRQ %d, max=%d\n", vfio_user_irq_info.subindex,
+ irq_set.count);
+ return -ENOENT;
+ }
+
+ ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_VM_INTERRUPT,
+ NULL, 0, NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to send reply for VFIO_USER_VM_INTERRUPT: "
+ "%s\n", strerror(-ret));
+ return ret;
+ }
+ printf("INTx messaged triggered!\n");
+
+ return 0;
+}
+
+static int
+access_region(int sock, int region, bool is_write, uint64_t offset,
+ void *data, size_t data_len)
+{
+ struct vfio_user_region_access send_region_access = {
+ .offset = offset,
+ .region = region,
+ .count = data_len
+ };
+ struct iovec send_iovecs[3] = {
+ [1] = {
+ .iov_base = &send_region_access,
+ .iov_len = sizeof send_region_access
+ },
+ [2] = {
+ .iov_base = data,
+ .iov_len = data_len
+ }
+ };
+ struct {
+ struct vfio_user_region_access region_access;
+ char data[data_len];
+ } __attribute__((packed)) recv_data;
+ int op, ret;
+ size_t nr_send_iovecs, recv_data_len;
+
+ if (is_write) {
+ op = VFIO_USER_REGION_WRITE;
+ nr_send_iovecs = 3;
+ recv_data_len = sizeof(recv_data.region_access);
+ } else {
+ op = VFIO_USER_REGION_READ;
+ nr_send_iovecs = 2;
+ recv_data_len = sizeof(recv_data);
+ }
+
+ ret = _send_recv_vfio_user_msg(sock, 0, op,
+ send_iovecs, nr_send_iovecs,
+ NULL, 0, NULL,
+ &recv_data, recv_data_len);
+ if (ret != 0) {
+ fprintf(stderr, "failed to %s region %d %#lx-%#lx: %s\n",
+ is_write ? "write to" : "read from", region, offset,
+ offset + data_len - 1, strerror(-ret));
+ return ret;
+ }
+ if (recv_data.region_access.count != data_len) {
+ fprintf(stderr, "bad %s data count, expected=%d, actual=%d\n",
+ is_write ? "write" : "read", data_len,
+ recv_data.region_access.count);
+ return -EINVAL;
+ }
+
+ /*
+     * TODO we could avoid the memcpy if _send_recv_vfio_user_msg received the
+ * response into an iovec, but it's some work to implement it.
+ */
+ if (!is_write) {
+ memcpy(data, recv_data.data, data_len);
+ }
+ return 0;
+}
+
+static int
+access_bar0(int sock)
+{
+ time_t t = time(NULL);
+ const int sleep_time = 1;
+ int ret = access_region(sock, LM_DEV_BAR0_REG_IDX, true, 0, &t, sizeof t);
+
+ if (ret < 0) {
+ fprintf(stderr, "failed to write to BAR0: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ printf("wrote to BAR0: %ld\n", t);
+
+ sleep(sleep_time);
+
+ ret = access_region(sock, LM_DEV_BAR0_REG_IDX, false, 0, &t, sizeof t);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read from BAR0: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ printf("read from BAR0: %ld\n", t);
+
+ assert(t >= sleep_time);
+
+ return 0;
+}
+
+static int handle_dma_write(int sock, struct vfio_user_dma_region *dma_regions,
+ int nr_dma_regions, int *dma_region_fds)
+{
+ struct vfio_user_dma_region_access dma_access;
+ struct vfio_user_header hdr;
+ int ret, i;
+ size_t size = sizeof(dma_access);
+ uint16_t msg_id;
+ void *data;
+
+ msg_id = 1;
+ ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &dma_access, &size);
+ if (ret < 0) {
+ fprintf(stderr, "failed to recieve DMA read: %m\n");
+ return ret;
+ }
+
+ data = calloc(dma_access.count, 1);
+ if (data == NULL) {
+ return -ENOMEM;
+ }
+
+ ret = recv(sock, data, dma_access.count, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to recieve DMA read data: %m\n");
+ goto out;
+ }
+
+ for (i = 0; i < nr_dma_regions; i++) {
+ if (dma_regions[i].addr == dma_access.addr) {
+ ret = pwrite(dma_region_fds[i], data, dma_access.count,
+ dma_regions[i].offset);
+ if (ret < 0) {
+ fprintf(stderr, "failed to write data at %#lu: %m\n",
+ dma_regions[i].offset);
+ goto out;
+ }
+ break;
+ }
+ }
+
+ dma_access.count = 0;
+ ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_DMA_WRITE,
+ &dma_access, sizeof dma_access, NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to send reply of DMA write: %s\n",
+ strerror(-ret));
+ }
+
+out:
+ free(data);
+ return ret;
+}
+
+static int handle_dma_read(int sock, struct vfio_user_dma_region *dma_regions,
+ int nr_dma_regions, int *dma_region_fds)
+{
+ struct vfio_user_dma_region_access dma_access, *response;
+ struct vfio_user_header hdr;
+ int ret, i, response_sz;
+ size_t size = sizeof(dma_access);
+ uint16_t msg_id;
+ void *data;
+
+ msg_id = 1;
+ ret = recv_vfio_user_msg(sock, &hdr, false, &msg_id, &dma_access, &size);
+ if (ret < 0) {
+ fprintf(stderr, "failed to recieve DMA read: %m\n");
+ return ret;
+ }
+
+ response_sz = sizeof(dma_access) + dma_access.count;
+ response = calloc(response_sz, 1);
+ if (response == NULL) {
+ return -ENOMEM;
+ }
+ response->count = dma_access.count;
+ data = (char *)response->data;
+
+ for (i = 0; i < nr_dma_regions; i++) {
+ if (dma_regions[i].addr == dma_access.addr) {
+ ret = pread(dma_region_fds[i], data, dma_access.count,
+ dma_regions[i].offset);
+ if (ret < 0) {
+ fprintf(stderr, "failed to write data at %#lu: %m\n",
+ dma_regions[i].offset);
+ goto out;
+ }
+ break;
+ }
+ }
+
+ ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_DMA_READ,
+ response, response_sz, NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to send reply of DMA write: %m\n");
+ }
+
+out:
+ free(response);
+ return ret;
+}
+
+static int handle_dma_io(int sock, struct vfio_user_dma_region *dma_regions,
+ int nr_dma_regions, int *dma_region_fds)
+{
+ int ret;
+
+ ret = handle_dma_write(sock, dma_regions, nr_dma_regions, dma_region_fds);
+ if (ret < 0) {
+ fprintf(stderr, "failed to handle DMA write data: %m\n");
+ return ret;
+ }
+
+ ret = handle_dma_read(sock, dma_regions, nr_dma_regions, dma_region_fds);
+ if (ret < 0) {
+ fprintf(stderr, "failed to handle DMA read data: %m\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int
+get_dirty_bitmaps(int sock, struct vfio_user_dma_region *dma_regions,
+ int nr_dma_regions)
+{
+ struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
+ struct vfio_iommu_type1_dirty_bitmap_get bitmaps[2];
+ int ret, i;
+ struct iovec iovecs[4] = {
+ [1] = {
+ .iov_base = &dirty_bitmap,
+ .iov_len = sizeof dirty_bitmap
+ }
+ };
+ struct vfio_user_header hdr = {0};
+ char data[ARRAY_SIZE(bitmaps)];
+
+ assert(dma_regions != NULL);
+ assert(nr_dma_regions >= ARRAY_SIZE(bitmaps));
+
+ for (i = 0; i < ARRAY_SIZE(bitmaps); i++) {
+ bitmaps[i].iova = dma_regions[i].addr;
+ bitmaps[i].size = dma_regions[i].size;
+ bitmaps[i].bitmap.size = 1; /* FIXME calculate based on page and IOVA size, don't hardcode */
+ bitmaps[i].bitmap.pgsize = sysconf(_SC_PAGESIZE);
+ iovecs[(i + 2)].iov_base = &bitmaps[i]; /* FIXME the +2 is because iovecs[0] is the vfio_user_header and iovecs[1] is vfio_iommu_type1_dirty_bitmap */
+ iovecs[(i + 2)].iov_len = sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
+ }
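+
+    /*
+     * A hypothetical replacement for the hardcoded bitmap size above, one
+     * bit per page rounded up to whole bytes:
+     *
+     *     bitmaps[i].bitmap.size =
+     *         (bitmaps[i].size / bitmaps[i].bitmap.pgsize + 7) / 8;
+     *
+     * The regions mapped by this client are a single page each, so that
+     * also comes out to 1 byte, which is why the hardcoded value works.
+     */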
+
+ /*
+ * FIXME there should be at least two IOVAs. Send single message for two
+ * IOVAs and ensure only one bit is set in first IOVA.
+ */
+ dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+ ret = _send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+ iovecs, ARRAY_SIZE(iovecs),
+ NULL, 0,
+ &hdr, data, ARRAY_SIZE(data));
+ if (ret != 0) {
+ fprintf(stderr, "failed to start dirty page logging: %s\n",
+ strerror(-ret));
+ return ret;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(bitmaps); i++) {
+ printf("%#x-%#x\t%hhu\n", bitmaps[i].iova,
+ bitmaps[i].iova + bitmaps[i].size - 1, data[i]);
+ }
+ return 0;
+}
+
+enum migration {
+ NO_MIGRATION,
+ MIGRATION_SOURCE,
+ MIGRATION_DESTINATION,
+};
+
+static void
+usage(char *path) {
+ fprintf(stderr, "Usage: %s [-h] [-m src|dst] /path/to/socket\n",
+ basename(path));
+}
+
+static int
+migrate_from(int sock)
+{
+ __u32 device_state = VFIO_DEVICE_STATE_SAVING;
+ __u64 pending_bytes, data_offset, data_size;
+ void *data;
+
+ /* XXX set device state to stop-and-copy */
+ int ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ fprintf(stderr, "failed to write to device state: %s\n",
+ strerror(-ret));
+ return ret;
+ }
+
+ /* XXX read pending_bytes */
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, false,
+ offsetof(struct vfio_device_migration_info, pending_bytes),
+ &pending_bytes, sizeof pending_bytes);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read pending_bytes: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ while (pending_bytes > 0) {
+
+ /* XXX read data_offset and data_size */
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, false,
+ offsetof(struct vfio_device_migration_info, data_offset),
+ &data_offset, sizeof data_offset);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read data_offset: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, false,
+ offsetof(struct vfio_device_migration_info, data_size),
+ &data_size, sizeof data_size);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read data_size: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ /* XXX read migration data */
+ data = malloc(data_size);
+ if (data == NULL) {
+ return -errno;
+ }
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, false, data_offset,
+ data, data_size);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read migration data: %s\n",
+ strerror(-ret));
+ }
+
+ /* FIXME send migration data to the destination client process */
+ printf("XXX migration: %#x bytes worth of data\n", data_size);
+
+ /*
+         * XXX read pending_bytes again to indicate to the server that the
+ * migration data have been consumed.
+ */
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, false,
+ offsetof(struct vfio_device_migration_info, pending_bytes),
+ &pending_bytes, sizeof pending_bytes);
+ if (ret < 0) {
+ fprintf(stderr, "failed to read pending_bytes: %s\n", strerror(-ret));
+ return ret;
+ }
+ }
+
+    /* XXX set device state to STOP, migration must have finished by now */
+ device_state = VFIO_DEVICE_STATE_STOP;
+ ret = access_region(sock, LM_DEV_MIGRATION_REG_IDX, true,
+ offsetof(struct vfio_device_migration_info, device_state),
+ &device_state, sizeof(device_state));
+ if (ret < 0) {
+ fprintf(stderr, "failed to write to device state: %s\n",
+ strerror(-ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret, sock;
+ struct vfio_user_dma_region *dma_regions;
+ struct vfio_device_info client_dev_info = {0};
+ int *dma_region_fds;
+ uint16_t msg_id = 1;
+ int i;
+ FILE *fp;
+ int fd;
+ const int client_max_fds = 32;
+ int server_max_fds;
+ size_t pgsize;
+ int nr_dma_regions;
+ struct vfio_iommu_type1_dirty_bitmap dirty_bitmap = {0};
+ int opt;
+ enum migration migration = NO_MIGRATION;
+
+ while ((opt = getopt(argc, argv, "hm:")) != -1) {
+ switch (opt) {
+ case 'h':
+ usage(argv[0]);
+ exit(EXIT_SUCCESS);
+ case 'm':
+ if (strcmp(optarg, "src") == 0) {
+ migration = MIGRATION_SOURCE;
+ } else if (strcmp(optarg, "dst") == 0) {
+ migration = MIGRATION_DESTINATION;
+ } else {
+ fprintf(stderr, "invalid migration argument %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ default:
+ usage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (argc != optind + 1) {
+ usage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if ((sock = init_sock(argv[optind])) < 0) {
+ return sock;
+ }
+
+ /*
+ * XXX VFIO_USER_VERSION
+ *
+     * The server proposes a version upon connection; we send back the
+     * version we support.
+ */
+ if ((ret = set_version(sock, client_max_fds, &server_max_fds, &pgsize)) < 0) {
+ return ret;
+ }
+
+ /* XXX VFIO_USER_DEVICE_GET_INFO */
+ ret = get_device_info(sock, &client_dev_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* XXX VFIO_USER_DEVICE_GET_REGION_INFO */
+ ret = get_device_region_info(sock, &client_dev_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* XXX VFIO_USER_DEVICE_RESET */
+ ret = send_device_reset(sock);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * XXX VFIO_USER_DMA_MAP
+ *
+     * Tell the server we have some DMA regions it can access. Each DMA region
+     * is accompanied by a file descriptor, so create twice as many DMA
+     * regions as fit in a single message the server can handle.
+ */
+ nr_dma_regions = server_max_fds << 1;
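+    /*
+     * For example, if the server advertised max_fds = 8, this creates 16
+     * single-page DMA regions and sends them below in 16 / 8 = 2
+     * VFIO_USER_DMA_MAP messages of 8 regions (and 8 file descriptors) each.
+     */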
+
+ if ((fp = tmpfile()) == NULL) {
+ err(EXIT_FAILURE, "failed to create DMA file");
+ }
+
+    ret = ftruncate(fileno(fp), nr_dma_regions * sysconf(_SC_PAGESIZE));
+    if (ret == -1) {
+        err(EXIT_FAILURE, "failed to truncate file");
+    }
+
+ dma_regions = alloca(sizeof *dma_regions * nr_dma_regions);
+ dma_region_fds = alloca(sizeof *dma_region_fds * nr_dma_regions);
+
+ for (i = 0; i < nr_dma_regions; i++) {
+ dma_regions[i].addr = i * sysconf(_SC_PAGESIZE);
+ dma_regions[i].size = sysconf(_SC_PAGESIZE);
+ dma_regions[i].offset = dma_regions[i].addr;
+ dma_regions[i].prot = PROT_READ | PROT_WRITE;
+ dma_regions[i].flags = VFIO_USER_F_DMA_REGION_MAPPABLE;
+ dma_region_fds[i] = fileno(fp);
+ }
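+    /*
+     * Region i covers the IOVA range [i * PAGE_SIZE, (i + 1) * PAGE_SIZE)
+     * and is backed by the same offset in the temporary file, whose
+     * descriptor the server can mmap.
+     */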
+
+ for (i = 0; i < nr_dma_regions / server_max_fds; i++, msg_id++) {
+ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_MAP,
+ dma_regions + (i * server_max_fds),
+ sizeof(*dma_regions) * server_max_fds,
+ dma_region_fds + (i * server_max_fds),
+ server_max_fds, NULL, NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to map DMA regions: %s\n", strerror(-ret));
+ return ret;
+ }
+ }
+
+ /*
+ * XXX VFIO_USER_REGION_READ and VFIO_USER_REGION_WRITE
+ *
+ * BAR0 in the server does not support memory mapping so it must be accessed
+ * via explicit messages.
+ */
+ ret = access_bar0(sock);
+ if (ret < 0) {
+ fprintf(stderr, "failed to access BAR0: %s\n", strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+    /*
+     * XXX VFIO_USER_DIRTY_PAGES
+     *
+     * Start dirty page logging so that the server tracks writes to the DMA
+     * regions.
+     */
+    dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+ ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+ &dirty_bitmap, sizeof dirty_bitmap,
+ NULL, 0, NULL, NULL, 0);
+ if (ret != 0) {
+ fprintf(stderr, "failed to start dirty page logging: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * XXX VFIO_USER_DEVICE_GET_IRQ_INFO and VFIO_IRQ_SET_ACTION_TRIGGER
+ * Query interrupts, configure an eventfd to be associated with INTx, and
+ * finally wait for the server to fire the interrupt.
+ */
+ ret = configure_irqs(sock);
+ if (ret < 0) {
+ fprintf(stderr, "failed to configure IRQs: %s\n", strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
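+    /*
+     * The INTx eventfd registered above is a plain eventfd: when the server
+     * triggers the interrupt, an 8-byte read() of the eventfd returns the
+     * number of times it has fired.
+     */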
+
+ ret = handle_dma_io(sock, dma_regions, nr_dma_regions, dma_region_fds);
+ if (ret < 0) {
+ fprintf(stderr, "DMA IO failed: %s\n", strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+ ret = get_dirty_bitmaps(sock, dma_regions, nr_dma_regions);
+ if (ret < 0) {
+ fprintf(stderr, "failed to receive dirty bitmaps: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+ dirty_bitmap.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+ ret = send_recv_vfio_user_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
+ &dirty_bitmap, sizeof dirty_bitmap,
+ NULL, 0, NULL, NULL, 0);
+ if (ret != 0) {
+ fprintf(stderr, "failed to stop dirty page logging: %s\n",
+ strerror(-ret));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * FIXME now that region read/write works, change the server implementation
+ * to trigger an interrupt after N seconds, where N is the value written to
+ * BAR0 by the client.
+ */
+
+ /* BAR1 can be memory mapped and read directly */
+
+ /*
+     * TODO implement the following: write a value to BAR1, a server timer will
+     * increase it every second (SIGALRM)
+ */
+
+ /*
+ * XXX VFIO_USER_DMA_UNMAP
+ *
+     * Unmap the first batch of DMA regions.
+ */
+ ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DMA_UNMAP,
+ dma_regions, sizeof *dma_regions * server_max_fds,
+ NULL, 0, NULL, NULL, 0);
+ if (ret < 0) {
+ fprintf(stderr, "failed to unmap DMA regions: %s\n", strerror(-ret));
+ return ret;
+ }
+
+    if (migration == MIGRATION_SOURCE) {
+        ret = migrate_from(sock);
+        if (ret < 0) {
+            fprintf(stderr, "migration failed: %s\n", strerror(-ret));
+            return ret;
+        }
+    }
+
+ return 0;
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/samples/gpio-pci-idio-16.c b/samples/gpio-pci-idio-16.c
index 285b600..c549017 100644
--- a/samples/gpio-pci-idio-16.c
+++ b/samples/gpio-pci-idio-16.c
@@ -36,9 +36,19 @@
#include <stdio.h>
#include <err.h>
#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
#include "../lib/muser.h"
+static void
+_log(void *pvt, lm_log_lvl_t lvl __attribute__((unused)), char const *msg)
+{
+ fprintf(stderr, "gpio: %s\n", msg);
+}
+
ssize_t
bar2_access(void *pvt, char * const buf, size_t count, loff_t offset,
const bool is_write)
@@ -51,15 +61,40 @@ bar2_access(void *pvt, char * const buf, size_t count, loff_t offset,
return count;
}
-int main(int argc, char **argv)
+static void _sa_handler(int signum __attribute__((unused)))
+{
+}
+
+int main(int argc, char *argv[])
{
int ret;
+ bool trans_sock = false, verbose = false;
+    int opt;
+ struct sigaction act = {.sa_handler = _sa_handler};
+ lm_ctx_t *lm_ctx;
- if (argc != 2) {
+ while ((opt = getopt(argc, argv, "sv")) != -1) {
+ switch (opt) {
+ case 's':
+ trans_sock = true;
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ default: /* '?' */
+            fprintf(stderr, "Usage: %s [-s] [-v] <MUSER device UUID>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (optind >= argc) {
err(EXIT_FAILURE, "missing MUSER device UUID");
}
lm_dev_info_t dev_info = {
+ .trans = trans_sock ? LM_TRANS_SOCK : LM_TRANS_KERNEL,
+ .log = verbose ? _log : NULL,
+ .log_lvl = LM_DBG,
.pci_info = {
.id = {.vid = 0x494F, .did = 0x0DC8 },
.reg_info[LM_DEV_BAR2_REG_IDX] = {
@@ -67,15 +102,31 @@ int main(int argc, char **argv)
.size = 0x100,
.fn = &bar2_access
},
- .irq_count[LM_DEV_INTX_IRQ_IDX] = 1,
},
- .uuid = argv[1],
+ .uuid = argv[optind],
};
- ret = lm_ctx_run(&dev_info);
+ sigemptyset(&act.sa_mask);
+ if (sigaction(SIGINT, &act, NULL) == -1) {
+ fprintf(stderr, "warning: failed to register signal handler: %m\n");
+ }
+
+ lm_ctx = lm_ctx_create(&dev_info);
+ if (lm_ctx == NULL) {
+ if (errno == EINTR) {
+ goto out;
+ }
+ fprintf(stderr, "failed to initialize device emulation: %m\n");
+ return -1;
+ }
+ ret = lm_ctx_drive(lm_ctx);
if (ret != 0) {
- fprintf(stderr, "failed to realize device emulation: %m\n");
+ if (ret != -ENOTCONN && ret != -EINTR) {
+ fprintf(stderr, "failed to realize device emulation: %m\n");
+ }
}
+out:
+ lm_ctx_destroy(lm_ctx);
return ret;
}
diff --git a/samples/null.c b/samples/null.c
new file mode 100644
index 0000000..c00c984
--- /dev/null
+++ b/samples/null.c
@@ -0,0 +1,108 @@
+/*
+ * Userspace mediated device sample application
+ *
+ * Copyright (c) 2019, Nutanix Inc. All rights reserved.
+ * Author: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+/* null PCI device, doesn't do anything */
+
+#include <stdio.h>
+#include <err.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <errno.h>
+#include <string.h>
+
+#include "../lib/muser.h"
+
+static void
+null_log(void *pvt, lm_log_lvl_t lvl __attribute__((unused)), char const *msg)
+{
+    fprintf(stderr, "muser: %s\n", msg);
+}
+
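+/*
+ * lm_ctx_drive() blocks while servicing requests, so it runs in a separate
+ * thread; asynchronous cancellation (enabled below) is what allows main() to
+ * stop it even while it is blocked.
+ */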
+static void* null_drive(void *arg)
+{
+ lm_ctx_t *lm_ctx = (lm_ctx_t*)arg;
+ int ret = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+ if (ret != 0) {
+ fprintf(stderr, "failed to enable cancel state: %s\n", strerror(ret));
+ return NULL;
+ }
+ ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+ if (ret != 0) {
+ fprintf(stderr, "failed to enable cancel type: %s\n", strerror(ret));
+ return NULL;
+ }
+ printf("starting device emulation\n");
+ lm_ctx_drive(lm_ctx);
+ return NULL;
+}
+
+int main(int argc, char **argv)
+{
+ int ret;
+ pthread_t thread;
+
+ if (argc != 2) {
+ err(EXIT_FAILURE, "missing MUSER device UUID");
+ }
+
+ lm_dev_info_t dev_info = {.uuid = argv[1], .log = null_log, .log_lvl = LM_DBG };
+
+ lm_ctx_t *lm_ctx = lm_ctx_create(&dev_info);
+ if (lm_ctx == NULL) {
+ err(EXIT_FAILURE, "failed to create libmuser context");
+ }
+
+ ret = pthread_create(&thread, NULL, null_drive, lm_ctx);
+ if (ret != 0) {
+ errno = ret;
+ err(EXIT_FAILURE, "failed to create pthread");
+ }
+
+ printf("press enter to stop device emulation and clean up\n");
+ getchar();
+
+ ret = pthread_cancel(thread);
+ if (ret != 0) {
+ errno = ret;
+        err(EXIT_FAILURE, "failed to cancel pthread");
+ }
+ lm_ctx_destroy(lm_ctx);
+
+ printf("device emulation stopped and cleaned up, press enter to exit\n");
+ getchar();
+
+ return ret;
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/samples/server.c b/samples/server.c
new file mode 100644
index 0000000..4611fb0
--- /dev/null
+++ b/samples/server.c
@@ -0,0 +1,398 @@
+/*
+ * Sample server to be tested with samples/client.c
+ *
+ * Copyright (c) 2020, Nutanix Inc. All rights reserved.
+ * Author: Thanos Makatos <thanos@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <err.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include <time.h>
+#include <assert.h>
+#include <openssl/md5.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+
+#include "../lib/muser.h"
+
+struct dma_regions {
+ uint64_t addr;
+ uint64_t len;
+};
+
+#define NR_DMA_REGIONS 96
+
+struct server_data {
+ time_t bar0;
+ uint8_t *bar1;
+ struct dma_regions regions[NR_DMA_REGIONS];
+ struct {
+ int fake_internal_state;
+ __u64 pending_bytes;
+ __u64 data_size;
+ } migration;
+};
+
+static void
+_log(void *pvt, lm_log_lvl_t lvl __attribute__((unused)), char const *msg)
+{
+ fprintf(stderr, "server: %s\n", msg);
+}
+
+/*
+ * On read, returns the time in seconds elapsed since the value last written
+ * to BAR0 (seconds since the Epoch if nothing has been written yet).
+ */
+ssize_t
+bar0_access(void *pvt, char * const buf, size_t count, loff_t offset,
+ const bool is_write)
+{
+ struct server_data *server_data = pvt;
+
+ if (count != sizeof(time_t) || offset != 0) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (is_write) {
+ memcpy(&server_data->bar0, buf, count);
+ } else {
+ time_t delta = time(NULL) - server_data->bar0;
+ memcpy(buf, &delta, count);
+ }
+
+ return count;
+}
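+
+/*
+ * For example, assuming the client writes first:
+ *
+ *     time_t t = time(NULL);    // client writes t to BAR0
+ *     sleep(2);
+ *     // a subsequent BAR0 read now returns roughly 2
+ */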
+
+ssize_t
+bar1_access(void *pvt, char * const buf, size_t count, loff_t offset,
+            const bool is_write)
+{
+    /* not expected to be called in this sample: BAR1 is accessed via mmap */
+    assert(false);
+    return -ENOTSUP;
+}
+
+static volatile sig_atomic_t irq_triggered = false;
+static void _sa_handler(int signum)
+{
+ int _errno = errno;
+ if (signum == SIGUSR1) {
+ irq_triggered = true;
+ }
+ errno = _errno;
+}
+
+static void map_dma(void *pvt, uint64_t iova, uint64_t len)
+{
+ struct server_data *server_data = pvt;
+ int idx;
+
+ for (idx = 0; idx < NR_DMA_REGIONS; idx++) {
+ if (server_data->regions[idx].addr == 0 &&
+ server_data->regions[idx].len == 0)
+ break;
+ }
+ if (idx >= NR_DMA_REGIONS) {
+ fprintf(stderr, "Failed to add dma region, slots full\n");
+ return;
+ }
+
+ server_data->regions[idx].addr = iova;
+ server_data->regions[idx].len = len;
+}
+
+static int unmap_dma(void *pvt, uint64_t iova)
+{
+ struct server_data *server_data = pvt;
+ int idx;
+
+ for (idx = 0; idx < NR_DMA_REGIONS; idx++) {
+ if (server_data->regions[idx].addr == iova) {
+ server_data->regions[idx].addr = 0;
+ server_data->regions[idx].len = 0;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+void get_md5sum(const char *buf, int len, unsigned char *md5sum)
+{
+    MD5_CTX ctx;
+
+    MD5_Init(&ctx);
+    MD5_Update(&ctx, buf, len);
+    MD5_Final(md5sum, &ctx);
+}
+
+/*
+ * FIXME this function does DMA write/read using messages. This should be done
+ * on a region that is not memory mappable or an area of a region that is not
+ * sparsely memory mappable. We should also have a test where the server does
+ * DMA directly on the client memory.
+ */
+static int do_dma_io(lm_ctx_t *lm_ctx, struct server_data *server_data)
+{
+ int count = 4096;
+    char buf[count];
+    unsigned char md5sum1[MD5_DIGEST_LENGTH], md5sum2[MD5_DIGEST_LENGTH];
+ int i, ret;
+ dma_sg_t sg;
+
+ assert(lm_ctx != NULL);
+
+ ret = lm_addr_to_sg(lm_ctx, server_data->regions[0].addr, count, &sg,
+ 1, PROT_WRITE);
+ assert(ret == 1); /* FIXME */
+
+ memset(buf, 'A', count);
+ get_md5sum(buf, count, md5sum1);
+    printf("%s: WRITE addr %#lx count %d\n", __func__,
+           server_data->regions[0].addr, count);
+ ret = lm_dma_write(lm_ctx, &sg, buf);
+ if (ret < 0) {
+ fprintf(stderr, "lm_dma_write failed: %s\n", strerror(-ret));
+ return ret;
+ }
+
+ memset(buf, 0, count);
+    printf("%s: READ addr %#lx count %d\n", __func__,
+           server_data->regions[0].addr, count);
+ ret = lm_dma_read(lm_ctx, &sg, buf);
+ if (ret < 0) {
+ fprintf(stderr, "lm_dma_read failed: %s\n", strerror(-ret));
+ return ret;
+ }
+ get_md5sum(buf, count, md5sum2);
+    for (i = 0; i < MD5_DIGEST_LENGTH; i++) {
+ if (md5sum2[i] != md5sum1[i]) {
+ fprintf(stderr, "DMA write and DMA read mismatch\n");
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+unsigned long map_area(void *pvt, unsigned long off, unsigned long len)
+{
+    /* not expected to be called in this sample */
+    assert(false);
+    return 0;
+}
+
+static int device_reset(void *pvt)
+{
+    printf("device reset callback\n");
+    return 0;
+}
+
+static int
+migration_device_state_transition(void *pvt, lm_migr_state_t state)
+{
+ struct server_data *server_data = pvt;
+
+ printf("migration: transition to device state %d\n", state);
+
+ switch (state) {
+ case LM_MIGR_STATE_STOP_AND_COPY:
+ /* TODO must be less than size of data region in migration region */
+ server_data->migration.pending_bytes = sysconf(_SC_PAGESIZE);
+ break;
+ case LM_MIGR_STATE_STOP:
+ assert(server_data->migration.pending_bytes == 0);
+ break;
+ default:
+ assert(false); /* FIXME */
+ }
+ return 0;
+}
+
+static __u64
+migration_get_pending_bytes(void *pvt)
+{
+ struct server_data *server_data = pvt;
+ if (server_data->migration.data_size > 0) {
+ assert(server_data->migration.data_size <= server_data->migration.pending_bytes);
+ server_data->migration.pending_bytes -= server_data->migration.data_size;
+ }
+ return server_data->migration.pending_bytes;
+}
+
+static int
+migration_prepare_data(void *pvt, __u64 *offset, __u64 *size)
+{
+ struct server_data *server_data = pvt;
+
+ *offset = 0;
+    *size = server_data->migration.data_size =
+        MIN(server_data->migration.pending_bytes, sysconf(_SC_PAGESIZE) / 4);
+ return 0;
+}
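+
+/*
+ * Putting the callbacks above together with a 4 KiB page size: entering
+ * stop-and-copy arms pending_bytes = 4096, each prepare_data call offers a
+ * 4096 / 4 = 1024 byte chunk, and the next get_pending_bytes call retires
+ * the chunk just consumed, so the client's pending_bytes loop finishes
+ * after four iterations.
+ */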
+
+static size_t
+migration_read_data(void *pvt, void *buf, __u64 size, __u64 offset)
+{
+    struct server_data *server_data = pvt;
+
+    assert(server_data->migration.data_size >= size);
+
+    /* this sample device has no real state to save, so export zeroes */
+    memset(buf, 0, size);
+
+    return size;
+}
+
+static size_t
+migration_write_data(void *pvt, void *data, __u64 size)
+{
+    /* the destination side of the migration is not implemented here */
+    assert(false);
+    return 0;
+}
+
+int main(int argc, char *argv[])
+{
+    int ret = 0;
+    bool verbose = false;
+    int opt;
+ struct sigaction act = {.sa_handler = _sa_handler};
+ struct server_data server_data = {0};
+ int nr_sparse_areas = 2, size = 1024, i;
+ struct lm_sparse_mmap_areas *sparse_areas;
+
+ lm_ctx_t *lm_ctx;
+
+ while ((opt = getopt(argc, argv, "v")) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose = true;
+ break;
+ default: /* '?' */
+            fprintf(stderr, "Usage: %s [-v] <MUSER device UUID>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (optind >= argc) {
+ fprintf(stderr, "missing MUSER device UUID\n");
+ exit(EXIT_FAILURE);
+ }
+
+ server_data.bar1 = malloc(sysconf(_SC_PAGESIZE));
+ if (server_data.bar1 == NULL) {
+ err(EXIT_FAILURE, "BAR1");
+ }
+
+ sparse_areas = calloc(1, sizeof(*sparse_areas) +
+ (nr_sparse_areas * sizeof(struct lm_mmap_area)));
+    if (sparse_areas == NULL) {
+        err(EXIT_FAILURE, "MMAP sparse areas ENOMEM");
+    }
+ sparse_areas->nr_mmap_areas = nr_sparse_areas;
+ for (i = 0; i < nr_sparse_areas; i++) {
+        /* back-to-back areas starting at `size`, leaving the region head unmapped */
+        sparse_areas->areas[i].start = (i + 1) * size;
+ sparse_areas->areas[i].size = size;
+ }
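+
+    /*
+     * The same sparse area list is reused for BAR1 and for the migration
+     * region below; client accesses that fall within these areas can be
+     * memory mapped instead of going through region read/write messages.
+     */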
+
+ lm_dev_info_t dev_info = {
+ .trans = LM_TRANS_SOCK,
+ .log = verbose ? _log : NULL,
+ .log_lvl = LM_DBG,
+ .pci_info = {
+ .reg_info[LM_DEV_BAR0_REG_IDX] = {
+ .flags = LM_REG_FLAG_RW,
+ .size = sizeof(time_t),
+ .fn = &bar0_access
+ },
+ .reg_info[LM_DEV_BAR1_REG_IDX] = {
+ .flags = LM_REG_FLAG_RW,
+ .size = sysconf(_SC_PAGESIZE),
+ .fn = &bar1_access,
+ .mmap_areas = sparse_areas,
+ .map = map_area
+ },
+ .reg_info[LM_DEV_MIGRATION_REG_IDX] = { /* migration region */
+ .flags = LM_REG_FLAG_RW,
+ .size = sizeof(struct vfio_device_migration_info) + sysconf(_SC_PAGESIZE),
+ .mmap_areas = sparse_areas,
+ },
+ .irq_count[LM_DEV_INTX_IRQ_IDX] = 1,
+ },
+ .uuid = argv[optind],
+ .reset = device_reset,
+ .map_dma = map_dma,
+ .unmap_dma = unmap_dma,
+ .pvt = &server_data,
+ .migration_callbacks = {
+ .transition = &migration_device_state_transition,
+ .get_pending_bytes = &migration_get_pending_bytes,
+ .prepare_data = &migration_prepare_data,
+ .read_data = &migration_read_data,
+ .write_data = &migration_write_data
+ }
+ };
+
+ sigemptyset(&act.sa_mask);
+ if (sigaction(SIGUSR1, &act, NULL) == -1) {
+ err(EXIT_FAILURE, "failed to register signal handler");
+ }
+
+ lm_ctx = lm_ctx_create(&dev_info);
+ if (lm_ctx == NULL) {
+ if (errno == EINTR) {
+ goto out;
+ }
+ err(EXIT_FAILURE, "failed to initialize device emulation");
+ }
+
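+    /*
+     * Drive the device; when the SIGUSR1 handler interrupts lm_ctx_drive()
+     * with EINTR, trigger INTx both via the eventfd and via an explicit
+     * message, run the DMA read/write test, and resume driving.
+     */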
+ do {
+ ret = lm_ctx_drive(lm_ctx);
+ if (ret == -EINTR) {
+ if (irq_triggered) {
+ irq_triggered = false;
+ lm_irq_trigger(lm_ctx, 0);
+
+ ret = lm_irq_message(lm_ctx, 0);
+ if (ret < 0) {
+ fprintf(stderr, "lm_irq_message() failed: %m\n");
+ }
+
+ ret = do_dma_io(lm_ctx, &server_data);
+ if (ret < 0) {
+ fprintf(stderr, "DMA read/write failed: %m\n");
+ }
+ ret = 0;
+ }
+ }
+ } while (ret == 0);
+ if (ret != -ENOTCONN && ret != -EINTR) {
+ fprintf(stderr, "failed to realize device emulation: %s\n",
+ strerror(-ret));
+ }
+out:
+ lm_ctx_destroy(lm_ctx);
+ free(server_data.bar1);
+ free(sparse_areas);
+ return ret;
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */