author     Felipe Franciosi <felipe@nutanix.com>  2019-07-02 14:06:42 +0100
committer  Felipe Franciosi <felipe@nutanix.com>  2019-09-05 16:45:35 +0100
commit     f8ef2771ca6c05dadd3188099eb678e6135e12e2 (patch)
tree       1629283ee553622ce99477c63da4994d4c87bc0f
Initial commit
-rw-r--r--  CMakeLists.txt            42
-rw-r--r--  LICENSE                    7
-rw-r--r--  Makefile                  74
-rw-r--r--  README.md                134
-rw-r--r--  kmod/CMakeLists.txt       47
-rw-r--r--  kmod/muser.c            1807
-rw-r--r--  kmod/muser.h              74
-rw-r--r--  lib/.indent.pro            4
-rw-r--r--  lib/CMakeLists.txt        46
-rw-r--r--  lib/common.h              60
-rw-r--r--  lib/dma.c                331
-rw-r--r--  lib/dma.h                241
-rw-r--r--  lib/libmuser.c          1063
-rw-r--r--  lib/libmuser_pci.c       311
-rw-r--r--  lib/msicap.h              67
-rw-r--r--  lib/muser.h              185
-rw-r--r--  lib/pci.h                276
-rw-r--r--  lib/pmcap.h               70
-rw-r--r--  lib/pxcap.h              144
-rw-r--r--  patches/vfio.diff        192
-rw-r--r--  samples/CMakeLists.txt    32
-rw-r--r--  samples/test_mmap.c      199
-rw-r--r--  samples/test_read.c      233
23 files changed, 5639 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..47a8e6f
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,42 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+cmake_minimum_required (VERSION 2.6)
+project(muser)
+include(GNUInstallDirs)
+
+# shared library
+add_subdirectory(lib)
+
+# kernel module
+add_subdirectory(kmod)
+
+# samples
+add_subdirectory(samples)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..162a7af
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+This project is released under dual license.
+
+The kernel driver (kmod/muser.[ch]) is released as GPL-2.0 or BSD-3-CLAUSE.
+
+The remaining source code is released as BSD-3-CLAUSE.
+
+Each source file in the repository reflects the above.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..11bd3fe
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,74 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+BUILD_TYPE ?= dbg
+
+ifeq ($(BUILD_TYPE), dbg)
+ CMAKE_BUILD_TYPE = Debug
+ CFLAGS += -DDEBUG
+else
+ CMAKE_BUILD_TYPE = Release
+ CFLAGS += -DNDEBUG
+endif
+
+ifeq ($(VERBOSE),)
+ MAKEFLAGS += -s
+endif
+
+BUILD_DIR_BASE = $(CURDIR)/build
+BUILD_DIR = $(BUILD_DIR_BASE)/$(BUILD_TYPE)
+
+KDIR ?= "/lib/modules/$(shell uname -r)/build"
+
+PHONY_TARGETS := all realclean buildclean force_cmake export install-export tags
+
+.PHONY: $(PHONY_TARGETS)
+
+all $(filter-out $(PHONY_TARGETS), $(MAKECMDGOALS)): $(BUILD_DIR)/Makefile
+ +$(MAKE) -C $(BUILD_DIR) $@
+
+realclean:
+ rm -rf $(BUILD_DIR_BASE)
+
+buildclean:
+ rm -rf $(BUILD_DIR)
+
+force_cmake: $(BUILD_DIR)/Makefile
+
+$(BUILD_DIR)/Makefile:
+ mkdir -p $(BUILD_DIR)
+ cd $(BUILD_DIR); cmake \
+ -D "CMAKE_C_FLAGS:STRING=$(CFLAGS)" \
+ -D "CMAKE_BUILD_TYPE:STRING=$(CMAKE_BUILD_TYPE)" \
+ -D "KDIR=$(KDIR)" \
+ $(CURDIR)
+
+tags:
+ ctags -R --exclude=$(BUILD_DIR)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b744c26
--- /dev/null
+++ b/README.md
@@ -0,0 +1,134 @@
+Mediated User space device
+==========================
+
+Overview
+--------
+
+muser is a framework that allows mediated device drivers to be implemented in
+user space. The device driver can be a completely virtual one that does not
+drive an actual device of that type. This can greatly simplify the initial
+development and prototyping of kernel drivers, as no kernel code needs to be
+written and, in the worst case, failures simply crash the user space process.
+The mediated device can be passed to a virtual machine for proper testing.
+Device drivers are typically implemented entirely in kernel space for various
+reasons; however, in early development stages it is acceptable to implement
+them in user space.
+
+muser is implemented by a small kernel module, muser.ko, that registers itself
+with mdev. Every request is forwarded to a user space application via a small,
+custom ioctl interface on a control device. The application must be externally
+provided and needs to contain the actual device implementation, using the API
+of libmuser. See samples/ on how to build such an application. Currently there
+is one single-threaded application instance per device, however the
+application can employ any form of concurrency needed. In the future we plan
+to make libmuser multi-threaded. The application can be implemented in
+whatever way is convenient, e.g. as a Python script using bindings, on the
+cloud, etc.
+
+
+Memory Mapping the Device
+-------------------------
+
+The device driver can allow parts of the virtual device to be memory mapped by
+the virtual machine (e.g. the PCI BARs). The business logic needs to implement
+the mmap callback and reply to the request passing the memory address whose
+backing pages are then used to satisfy the original mmap call; a sketch
+follows below. Currently, reads and writes to the memory mapped region by the
+client go undetected by libmuser, so the business logic needs to poll. In the
+future we plan to implement a mechanism that notifies libmuser whenever a page
+is written to.
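+
+For illustration, a minimal sketch of satisfying such a request at the
+muser_cmd protocol level (struct muser_cmd is defined in kmod/muser.h; a real
+application would go through the libmuser callback API instead):
+
+    /* cmd->mmap.request.{start,end} describe the range being mapped */
+    size_t len = cmd->mmap.request.end - cmd->mmap.request.start;
+    void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+    if (addr == MAP_FAILED)
+        cmd->err = -errno;
+    else
+        cmd->mmap.response.addr = (unsigned long)addr;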
+
+
+Interrupts
+----------
+
+Interrupts are implemented by installing the event file descriptor in libmuser
+and notifying it of that descriptor. libmuser can then trigger interrupts
+simply by writing to it, as shown below. This can be much more expensive than
+triggering interrupts from the kernel, however the performance penalty is
+perfectly acceptable when prototyping the functional aspects of a device
+driver.
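+
+For example (a sketch; efd here stands for the eventfd installed via
+VFIO_DEVICE_SET_IRQS):
+
+    uint64_t val = 1;
+
+    /* an 8-byte write to the eventfd signals the interrupt */
+    write(efd, &val, sizeof(val));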
+
+
+System Architecture
+-------------------
+
+muser.ko and libmuser communicate via ioctl on a control device. This control
+device is created when the mediated device is created and appears as
+/dev/muser/<UUID>. libmuser opens this device and then executes a "wait
+command" ioctl. Whenever a callback of muser.ko is executed, it fills a struct
+with the command details and then completes the ioctl, unblocking libmuser. It
+then waits to receive another ioctl from libmuser with the result. Currently
+only one command can be pending at a time; we plan to allow multiple commands
+to be executed in parallel.
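+
+For illustration, the libmuser side of this protocol reduces to a loop of
+roughly the following shape (error handling omitted; handle_cmd() is a
+hypothetical stand-in for the device emulation):
+
+    struct muser_cmd cmd;
+    int fd = open("/dev/muser/<UUID>", O_RDWR);
+
+    for (;;) {
+        /* block until muser.ko forwards the next request */
+        ioctl(fd, MUSER_DEV_CMD_WAIT, &cmd);
+        handle_cmd(&cmd); /* emulate the device, set cmd.err */
+        /* post the result and unblock the VFIO client */
+        ioctl(fd, MUSER_DEV_CMD_DONE, &cmd);
+    }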
+
+
+Building muser
+==============
+
+vfio/mdev needs to be patched. To generate the patch run:
+
+ git diff 869e3305f23dfeacdaa234717c92ccb237815d90 --diff-filter=M > vfio.patch
+
+Apply the patch and rebuild the vfio/mdev modules:
+
+ make SUBDIRS=drivers/vfio/ modules
+
+Reload the relevant kernel modules:
+
+ drivers/vfio/vfio_iommu_type1.ko
+ drivers/vfio/vfio.ko
+ drivers/vfio/mdev/mdev.ko
+ drivers/vfio/mdev/vfio_mdev.ko
+
+Build the kernel module:
+
+ cd kmod
+ make
+
+Build the library:
+
+ mkdir build
+ cd build
+ cmake ..
+ make
+ make install
+
+Finally build your program and link it to libmuser.so.
+
+Running QEMU
+============
+
+To pass the device to QEMU add the following options:
+
+ -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/00000000-0000-0000-0000-000000000000
+ -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=mem,share=yes,size=1073741824 -numa node,nodeid=0,cpus=0,memdev=ram-node0
+
+Guest RAM must be shared (share=yes), otherwise libmuser won't be able to do
+DMA transfers from/to it. If you're not using QEMU, then any memory that must
+be accessed by libmuser has to be allocated MAP_SHARED. Registering memory for
+DMA that has not been allocated with MAP_SHARED is ignored, and any attempt to
+access that memory will result in an error.
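+
+For instance, a buffer that libmuser must be able to access would be allocated
+along these lines (a sketch, not a complete program):
+
+    void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);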
+
+
+Future Work
+===========
+
+Making libmuser Restartable
+----------------------------
+
+muser can be made restartable so that (a) it can recover from failures, and
+(b) upgrades are less disruptive. This is something we plan to implement in
+the future. To make it restartable, muser needs to reconfigure eventfds and
+DMA region mmaps as the first thing done when the device is re-opened by
+libmuser. After muser has finished reconfiguring, it will send a "ready"
+command, after which normal operation resumes. This "ready" command will
+always be sent when the device is opened, even the first time, so that we
+don't need to differentiate between normal and restarted operation. libmuser
+will store the PCI BARs on /dev/shm (named after e.g. the device UUID) so that
+it can easily find them on restart.
+
+
+Making libmuser Multi-threaded
+-------------------------------
+
+libmuser can be made multi-threaded in order to improve performance. To
+implement this we'll have to maintain a private context in struct file.
diff --git a/kmod/CMakeLists.txt b/kmod/CMakeLists.txt
new file mode 100644
index 0000000..9065611
--- /dev/null
+++ b/kmod/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# Copy sources to build directory (avoid polluting source directory).
+# TODO can we copy all source files with a wildcard?
+configure_file(muser.c ${CMAKE_CURRENT_BINARY_DIR}/muser.c COPYONLY)
+configure_file(muser.h ${CMAKE_CURRENT_BINARY_DIR}/muser.h COPYONLY)
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/Kbuild "obj-m := muser.o")
+
+# Build module using kernel's Makefile.
+set(KBUILD_CMD ${CMAKE_MAKE_PROGRAM} -C ${KDIR} M=${CMAKE_CURRENT_BINARY_DIR} modules)
+ADD_CUSTOM_COMMAND(OUTPUT DRIVER_BIN_FILE
+ COMMAND ${KBUILD_CMD}
+ DEPENDS ${MODULE_SOURCE_FILES} VERBATIM
+)
+ADD_CUSTOM_TARGET(driver ALL DEPENDS DRIVER_BIN_FILE)
+execute_process(COMMAND uname -r OUTPUT_VARIABLE kver OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.ko DESTINATION /lib/modules/${kver}/extra/)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/linux)
diff --git a/kmod/muser.c b/kmod/muser.c
new file mode 100644
index 0000000..8a4ceb0
--- /dev/null
+++ b/kmod/muser.c
@@ -0,0 +1,1807 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2019, Nutanix Inc. All rights reserved.
+ *
+ * Author: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/idr.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+#include <linux/pagemap.h>
+#include <asm-generic/mman-common.h>
+#include <linux/device.h>
+#include <linux/uaccess.h>
+
+#include "muser.h"
+
+#define DRIVER_NAME "muser"
+
+#define NR_PAGES(x) (((x) + (PAGE_SIZE - 1)) >> PAGE_SHIFT)
+#define MIN(a, b) ((a) < (b) ? (a):(b))
+
+static struct muser {
+ struct class *class;
+ struct list_head dev_list;
+ struct idr dev_idr;
+ struct cdev muser_cdev;
+ dev_t muser_devt;
+ struct device dev;
+ struct mutex muser_lock;
+} muser;
+
+#define muser_log(func, fmt, ...) \
+ func(&muser.dev, "%s: " fmt "\n", __func__, ## __VA_ARGS__)
+
+#define muser_dbg(fmt, ...) muser_log(dev_dbg, fmt, ## __VA_ARGS__)
+#define muser_info(fmt, ...) muser_log(dev_info, fmt, ## __VA_ARGS__)
+#define muser_warn(fmt, ...) muser_log(dev_warn, fmt, ## __VA_ARGS__)
+#define muser_err(fmt, ...) muser_log(dev_err, fmt, ## __VA_ARGS__)
+#define muser_alert(fmt, ...) muser_log(dev_alert, fmt, ## __VA_ARGS__)
+
+/* TODO come up with a better name? */
+/*
+ * FIXME len and nr_pages are confusing; we use either one or the other, yet
+ * they seem to serve the same purpose. Fix.
+ */
+struct page_map {
+ struct page **pages;
+ int nr_pages;
+ size_t len;
+ int offset;
+};
+
+struct vfio_dma_mapping {
+ unsigned long iova;
+ unsigned long length;
+ struct page **pages;
+ struct list_head entry;
+};
+
+/*
+ * TODO do we use all members at the same time? Does it make sense to put some
+ * of them in a union?
+ */
+struct mudev_cmd {
+ enum muser_cmd_type type; /* copy of muser_cmd.type */
+ struct muser_cmd muser_cmd;
+ struct page_map pg_map;
+ struct file **fds;
+ int *data_fds;
+ /*
+ * When libmuser completes an mmap call, we need to know the length
+ * in order to pass it to do_pin_pages.
+ */
+ unsigned long mmap_len;
+ struct list_head entry;
+};
+
+// FIXME: Reorganise the members of this struct.
+struct muser_dev {
+ guid_t uuid;
+ int minor;
+ struct device *dev;
+ struct list_head dlist_entry;
+ struct list_head cmd_list;
+ // FIXME: mucmd_pending should be per filep context.
+ struct mudev_cmd *mucmd_pending;
+ // FIXME: muser_dev should have a list of filep contexts instead of
+ // srv_opened
+ atomic_t srv_opened;
+ atomic_t mdev_opened;
+ struct mutex dev_lock;
+ struct mdev_device *mdev;
+ wait_queue_head_t user_wait_q;
+ struct semaphore sem;
+ struct notifier_block iommu_notifier;
+
+ struct vfio_dma_mapping *dma_map; /* Current DMA operation */
+ struct list_head dma_list; /* list of dma mappings */
+
+ struct radix_tree_root devmem_tree; /* Device memory */
+};
+
+/* function prototypes */
+static int dma_unmap_all(struct muser_dev *const mudev, const bool skip_user);
+
+static inline int muser_copyout(void __user *param, const void *address,
+ unsigned long size)
+{
+ int err = copy_to_user(param, address, size) ? -EFAULT : 0;
+
+ if (unlikely(err))
+ muser_dbg("failed to copy to user: %d", err);
+
+ return err;
+}
+
+static inline int muser_copyin(void *address, void __user *param,
+ unsigned long size)
+{
+ int err = copy_from_user(address, param, size) ? -EFAULT : 0;
+
+ if (unlikely(err))
+ muser_dbg("failed to copy from user: %d", err);
+
+ return err;
+}
+
+/* called with muser.muser_lock held */
+static struct muser_dev *__muser_search_dev(const guid_t *uuid)
+{
+ struct muser_dev *mudev;
+
+ list_for_each_entry(mudev, &muser.dev_list, dlist_entry) {
+ const uuid_le *u = &mudev->uuid;
+
+ if (uuid_le_cmp(*u, *uuid) == 0)
+ return mudev;
+ }
+
+ return NULL;
+}
+
+static int muser_create_dev(const guid_t *uuid, struct mdev_device *mdev)
+{
+ struct muser_dev *mudev;
+ char uuid_str[UUID_STRING_LEN + 1];
+ int minor;
+ int err = 0;
+
+ mutex_lock(&muser.muser_lock);
+ mudev = __muser_search_dev(uuid);
+ if (mudev) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ mudev = kzalloc(sizeof(*mudev), GFP_KERNEL);
+ if (!mudev) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ minor = idr_alloc(&muser.dev_idr, mudev, 0, MINORMASK + 1, GFP_KERNEL);
+ if (minor < 0) {
+ err = minor;
+ kfree(mudev);
+ goto out;
+ }
+
+ sprintf(uuid_str, "%pUl", uuid);
+ mudev->dev = device_create(muser.class, NULL,
+ MKDEV(MAJOR(muser.muser_devt), minor),
+ mudev, "%s", uuid_str);
+ if (IS_ERR(mudev->dev)) {
+ err = PTR_ERR(mudev->dev);
+ idr_remove(&muser.dev_idr, minor);
+ kfree(mudev);
+ goto out;
+ }
+
+ memcpy(&mudev->uuid, uuid, sizeof(mudev->uuid));
+ mudev->minor = minor;
+ mudev->mdev = mdev;
+ mutex_init(&mudev->dev_lock);
+ sema_init(&mudev->sem, 0);
+ init_waitqueue_head(&mudev->user_wait_q);
+ INIT_LIST_HEAD(&mudev->cmd_list);
+ INIT_LIST_HEAD(&mudev->dma_list);
+ INIT_RADIX_TREE(&mudev->devmem_tree, GFP_KERNEL);
+ list_add(&mudev->dlist_entry, &muser.dev_list);
+ mdev_set_drvdata(mdev, mudev);
+
+ muser_info("new device %s", uuid_str);
+
+out:
+ mutex_unlock(&muser.muser_lock);
+ return err;
+}
+
+/* called with muser.muser_lock held */
+static void __muser_deinit_dev(struct muser_dev *mudev)
+{
+ device_destroy(muser.class,
+ MKDEV(MAJOR(muser.muser_devt), mudev->minor));
+ list_del(&mudev->dlist_entry);
+ idr_remove(&muser.dev_idr, mudev->minor);
+}
+
+/* called with mudev.dev_lock held */
+static void __mudev_page_free(struct muser_dev *mudev, unsigned long pgnr)
+{
+ struct page *pg;
+
+ pg = radix_tree_delete(&mudev->devmem_tree, pgnr);
+ if (WARN_ON(!pg))
+ return;
+
+ __free_page(pg);
+}
+
+#define NR_INDICES 16
+
+/* called with mudev.dev_lock held */
+static void __mudev_free_devmem(struct muser_dev *mudev)
+{
+ struct radix_tree_iter iter;
+ struct radix_tree_root *root = &mudev->devmem_tree;
+ unsigned long indices[NR_INDICES], index = 0;
+ void __rcu **slot;
+ int i, nr;
+
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == NR_INDICES)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ __mudev_page_free(mudev, index);
+ }
+ } while (nr > 0);
+}
+
+static int muser_remove_dev(const uuid_le *uuid)
+{
+ struct muser_dev *mudev;
+ char uuid_str[UUID_STRING_LEN + 1];
+ int err = 0;
+
+ mutex_lock(&muser.muser_lock);
+
+ mudev = __muser_search_dev(uuid);
+ if (!mudev) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (atomic_read(&mudev->mdev_opened) > 0 ||
+ atomic_read(&mudev->srv_opened) > 0) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ mutex_lock(&mudev->dev_lock);
+
+ WARN_ON(!list_empty(&mudev->cmd_list));
+ __mudev_free_devmem(mudev);
+ __muser_deinit_dev(mudev);
+
+ mutex_unlock(&mudev->dev_lock);
+ kfree(mudev);
+
+ sprintf(uuid_str, "%pUl", uuid);
+ muser_info("removed muser device %s", uuid_str);
+
+out:
+ mutex_unlock(&muser.muser_lock);
+ return err;
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "muser\n");
+}
+
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_device_api.attr,
+ NULL,
+};
+
+static struct attribute_group mdev_type_group = {
+ .name = "1",
+ .attrs = mdev_types_attrs,
+};
+
+struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group,
+ NULL,
+};
+
+static int muser_process_cmd(struct muser_dev *mudev, struct mudev_cmd *mucmd)
+{
+ int err;
+
+ mucmd->type = mucmd->muser_cmd.type;
+
+ /* Add command to mudev list of commands. */
+ mutex_lock(&mudev->dev_lock);
+ list_add_tail(&mucmd->entry, &mudev->cmd_list);
+ mutex_unlock(&mudev->dev_lock);
+
+ /* Wake up any sleepers */
+ wake_up(&mudev->user_wait_q);
+
+ /*
+ * TODO: decide what to do with timeouts
+ * Timeouts can happen if:
+ * 1. No server has attached to mudev
+ * 2. Processing of cmd takes more time than timeout
+ */
+ /*
+ * TODO: Maybe use a while loop instead of goto
+ */
+retry:
+ err = down_timeout(&mudev->sem, msecs_to_jiffies(5000));
+ if (err) {
+ struct mudev_cmd *pos, *tmp;
+ bool found = false;
+
+ mutex_lock(&mudev->dev_lock);
+ list_for_each_entry_safe(pos, tmp, &mudev->cmd_list, entry) {
+ if (pos == mucmd) {
+ list_del(&mucmd->entry);
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&mudev->dev_lock);
+ if (found) {
+ muser_err("giving up, no response for cmd %d",
+ mucmd->type);
+ } else {
+ muser_warn("server taking too long for cmd %d, retry",
+ mucmd->type);
+ goto retry;
+ }
+ }
+
+ return err;
+}
+
+int muser_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ const guid_t *uuid = mdev_uuid(mdev);
+
+ return muser_create_dev(uuid, mdev);
+}
+
+int muser_remove(struct mdev_device *mdev)
+{
+ const guid_t *uuid = mdev_uuid(mdev);
+
+ return muser_remove_dev(uuid);
+}
+
+static int do_pin_pages(char __user *buf, const size_t count,
+ int const writeable, struct page_map *const pg_map)
+{
+ unsigned long start;
+ unsigned long __user lbuf = (unsigned long __user)buf;
+ int i;
+ int err;
+
+ BUG_ON(!buf);
+ BUG_ON(!pg_map);
+
+ start = round_down(lbuf, PAGE_SIZE);
+ pg_map->nr_pages = (round_up(lbuf + count, PAGE_SIZE) - start) /
+ PAGE_SIZE;
+ pg_map->offset = lbuf - start;
+ pg_map->pages = kcalloc(pg_map->nr_pages, sizeof *(pg_map->pages),
+ GFP_KERNEL);
+ if (unlikely(!pg_map->pages)) {
+ muser_dbg("failed to allocate %d pages", pg_map->nr_pages);
+ return -ENOMEM;
+ }
+ err = get_user_pages_fast(start, pg_map->nr_pages, writeable,
+ pg_map->pages);
+ if (unlikely(err != pg_map->nr_pages)) {
+ for (i = 0; i < err; i++)
+ put_page(pg_map->pages[i]);
+ kfree(pg_map->pages);
+ muser_dbg("failed to get user pages: %d", err);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void unpin_pages(struct page_map *const pg_map)
+{
+ int i;
+
+ if (!pg_map)
+ return;
+
+ for (i = 0; i < pg_map->nr_pages; i++)
+ put_page(pg_map->pages[i]);
+ kfree(pg_map->pages);
+ pg_map->pages = NULL;
+}
+
+
+static int vm_insert_pages(struct vm_area_struct *const vma,
+ struct page *const pages[], const int nr_pages)
+{
+ int err = 0, i;
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(!pages[i]);
+ err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+ pages[i]);
+ if (unlikely(err)) {
+ muser_dbg("count=%d, anon=%d, slab=%d, type=%d",
+ page_count(pages[i]), PageAnon(pages[i]),
+ PageSlab(pages[i]), page_has_type(pages[i]));
+ muser_dbg("failed to insert page at %lx: %d",
+ vma->vm_start + i * PAGE_SIZE, err);
+ unmap_kernel_range((unsigned long)vma->vm_start,
+ PAGE_SIZE);
+ break;
+ }
+ }
+ return err;
+}
+
+static struct page *mudev_page_alloc(struct muser_dev *mudev,
+ unsigned long pgnr)
+{
+ struct page *pg;
+ int ret;
+
+ pg = alloc_page(GFP_KERNEL);
+ if (unlikely(!pg))
+ return NULL;
+
+ ret = radix_tree_insert(&mudev->devmem_tree, pgnr, pg);
+ if (ret) {
+ __free_page(pg);
+ return NULL;
+ }
+
+ return pg;
+}
+
+static int libmuser_mmap_dev(struct file *fp, struct vm_area_struct *vma)
+{
+ struct muser_dev *mudev = fp->private_data;
+ struct page *pg;
+ unsigned int nr_pages;
+ unsigned long cur_pgidx, end_pgidx;
+ unsigned long addr, *new_pgs;
+ int ret, i;
+
+ WARN_ON(mudev == NULL);
+ nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+ /* array to track newly allocated pages, to be freed
+ * in case of failure */
+ new_pgs = kmalloc(nr_pages * sizeof(*new_pgs), GFP_KERNEL);
+ if (!new_pgs)
+ return -ENOMEM;
+
+ cur_pgidx = vma->vm_pgoff & ~(BIT(63 - PAGE_SHIFT));
+ end_pgidx = cur_pgidx + nr_pages;
+
+ muser_info("mmap_dev: end 0x%lX - start 0x%lX (%lX), off = 0x%lX",
+ vma->vm_end, vma->vm_start, vma->vm_end - vma->vm_start,
+ cur_pgidx);
+
+ mutex_lock(&mudev->dev_lock);
+ for (i = 0; cur_pgidx < end_pgidx; cur_pgidx++) {
+ pg = radix_tree_lookup(&mudev->devmem_tree, cur_pgidx);
+ if (!pg) {
+ pg = mudev_page_alloc(mudev, cur_pgidx);
+ if (!pg) {
+ ret = -ENOMEM;
+ goto free_pg;
+ }
+ /* record the new page so it can be freed if a later step fails */
+ new_pgs[i++] = cur_pgidx;
+ }
+
+ addr = vma->vm_start + (cur_pgidx << PAGE_SHIFT);
+ ret = vm_insert_page(vma, addr, pg);
+ if (unlikely(ret))
+ goto free_pg;
+ }
+
+ mutex_unlock(&mudev->dev_lock);
+ kfree(new_pgs);
+ return 0;
+
+free_pg:
+ for (i--; i >= 0; i--)
+ __mudev_page_free(mudev, new_pgs[i]);
+ mutex_unlock(&mudev->dev_lock);
+ kfree(new_pgs);
+ return ret;
+}
+
+static int libmuser_mmap_dma(struct file *f, struct vm_area_struct *vma)
+{
+ int err;
+ unsigned long length;
+ struct vfio_dma_mapping *dma_map;
+ struct muser_dev *mudev = f->private_data;
+
+ BUG_ON(!mudev);
+
+ muser_info("mmap_dma: end 0x%lX - start 0x%lX (%lX), off = 0x%lX",
+ vma->vm_end, vma->vm_start, vma->vm_end - vma->vm_start,
+ vma->vm_pgoff);
+
+ if (unlikely(!mudev->dma_map)) {
+ muser_dbg("no pending DMA map operation");
+ return -EINVAL;
+ }
+
+ dma_map = mudev->dma_map;
+ length = round_up(dma_map->length, PAGE_SIZE);
+ if (unlikely(vma->vm_end - vma->vm_start != length)) {
+ muser_dbg("expected mmap of %lx bytes, got %lx instead",
+ vma->vm_end - vma->vm_start, length);
+ return -EINVAL;
+ }
+
+ err = vm_insert_pages(vma, dma_map->pages, NR_PAGES(dma_map->length));
+ if (unlikely(err)) {
+ muser_dbg("DMA region insert failed (%lu pages: %lx-%lx): %d",
+ NR_PAGES(dma_map->length), vma->vm_start,
+ vma->vm_end, err);
+ return err;
+ }
+
+ return 0;
+}
+
+static int libmuser_mmap(struct file *f, struct vm_area_struct *vma)
+{
+ if (vma->vm_pgoff & BIT(63 - PAGE_SHIFT)) {
+ muser_info("offset: 0x%lX (top bit set)", vma->vm_pgoff);
+ return libmuser_mmap_dev(f, vma);
+ }
+
+ muser_dbg("offset: 0x%lX", vma->vm_pgoff);
+ return libmuser_mmap_dma(f, vma);
+}
+
+static int muser_process_dma_request(struct muser_dev *mudev,
+ struct vfio_dma_mapping *dma_map,
+ int flags, int type)
+{
+ int err;
+ struct mudev_cmd mucmd = {
+ .type = type,
+ .muser_cmd = {
+ .type = type,
+ .mmap = {
+ .request = {
+ .start = dma_map->iova,
+ .end = dma_map->iova + dma_map->length,
+ .flags = flags}
+ }
+ }
+ };
+
+ err = muser_process_cmd(mudev, &mucmd);
+ if (unlikely(err))
+ return err;
+
+ return mucmd.muser_cmd.mmap.response.addr;
+}
+
+static int muser_process_dma_map(struct muser_dev *mudev, int flags)
+{
+ return muser_process_dma_request(mudev, mudev->dma_map, flags,
+ MUSER_DMA_MMAP);
+}
+
+static int muser_process_dma_unmap(struct muser_dev *mudev,
+ struct vfio_dma_mapping *dma_map)
+{
+ return muser_process_dma_request(mudev, dma_map, 0, MUSER_DMA_MUNMAP);
+}
+
+static int put_dma_map(struct muser_dev *mudev,
+ struct vfio_dma_mapping *dma_map, int nr_pages)
+{
+ unsigned long off, iova_pfn;
+ int i, ret;
+
+ for (i = 0, off = 0; i < nr_pages; i++, off += PAGE_SIZE) {
+ iova_pfn = (dma_map->iova + off) >> PAGE_SHIFT;
+ ret = vfio_unpin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1);
+ if (WARN_ON(ret != 1))
+ return -EINVAL;
+
+ put_page(dma_map->pages[i]);
+ }
+
+ kfree(dma_map->pages);
+ return 0;
+}
+
+static int
+get_dma_map(struct muser_dev *mudev, struct vfio_dma_mapping *dma_map,
+ struct vfio_iommu_type1_dma_map *map)
+{
+ unsigned long iova, vaddr;
+ unsigned long iova_pfn, phys_pfn;
+ unsigned long length, off;
+ int pgflag, ret, nr_pages = 0;
+ struct page **pages;
+
+ length = map->size;
+ pages = kmalloc_array(NR_PAGES(length), sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ pgflag = map->flags & VFIO_DMA_MAP_FLAG_WRITE ? FOLL_WRITE : 0;
+ dma_map->pages = pages;
+ dma_map->iova = map->iova;
+ dma_map->length = map->size;
+
+ iova = map->iova;
+ vaddr = map->vaddr;
+
+ /*
+ * XXX: for now the for loop is for each page, vfio_pin_pages() has
+ * limit of 512 pages.
+ */
+ for (off = 0; off < length; off += PAGE_SIZE, vaddr += PAGE_SIZE) {
+ iova_pfn = (iova + off) >> PAGE_SHIFT;
+ ret = vfio_pin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1,
+ map->flags, &phys_pfn);
+ if (ret != 1)
+ goto err;
+
+ ret = get_user_pages_fast(vaddr, 1, pgflag, pages + nr_pages);
+ if (ret != 1) {
+ vfio_unpin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1);
+ goto err;
+ }
+
+ nr_pages++;
+ }
+
+ return 0;
+
+err:
+ put_dma_map(mudev, dma_map, nr_pages);
+ return ret;
+}
+
+static int has_anonymous_pages(struct vfio_dma_mapping *dma_map)
+{
+ int i, nr_pages = NR_PAGES(dma_map->length);
+
+ for (i = 0; i < nr_pages; i++) {
+ if (PageAnon(dma_map->pages[i])) {
+ muser_dbg("ignore IOVA=%lx, page(s) not shared",
+ dma_map->iova);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int muser_iommu_dma_map(struct muser_dev *mudev,
+ struct vfio_iommu_type1_dma_map *map)
+{
+ struct vfio_dma_mapping *dma_map;
+ int ret;
+
+ /* TODO: support multiple DMA map operations in parallel */
+ mutex_lock(&mudev->dev_lock);
+ if (mudev->dma_map) {
+ mutex_unlock(&mudev->dev_lock);
+ muser_dbg("another DMA map operation is ongoing");
+ return -EBUSY;
+ }
+
+ dma_map = kmalloc(sizeof(struct vfio_dma_mapping), GFP_KERNEL);
+ if (!dma_map) {
+ mutex_unlock(&mudev->dev_lock);
+ return -ENOMEM;
+ }
+ mudev->dma_map = dma_map;
+ mutex_unlock(&mudev->dev_lock);
+
+ /* get vfio client pages to be used for DMA map */
+ ret = get_dma_map(mudev, dma_map, map);
+ if (ret)
+ goto out;
+
+ /* skip anonymous pages */
+ if (has_anonymous_pages(mudev->dma_map))
+ goto put_pages;
+
+ ret = muser_process_dma_map(mudev, map->flags);
+ if (ret)
+ goto put_pages;
+
+ /* add to the dma_list */
+ mutex_lock(&mudev->dev_lock);
+ list_add_tail(&dma_map->entry, &mudev->dma_list);
+ mudev->dma_map = NULL;
+ mutex_unlock(&mudev->dev_lock);
+ return 0;
+
+put_pages:
+ put_dma_map(mudev, dma_map, NR_PAGES(dma_map->length));
+
+out:
+ kfree(dma_map);
+ mutex_lock(&mudev->dev_lock);
+ mudev->dma_map = NULL;
+ mutex_unlock(&mudev->dev_lock);
+ return ret;
+}
+
+/* called with mudev.dev_lock held */
+static struct vfio_dma_mapping *__find_dma_map(struct muser_dev *mudev,
+ unsigned long iova)
+{
+ struct vfio_dma_mapping *dma_map;
+
+ list_for_each_entry(dma_map, &mudev->dma_list, entry) {
+ if (dma_map->iova == iova)
+ return dma_map;
+ }
+ return NULL;
+}
+
+static int muser_iommu_dma_unmap(struct muser_dev *const mudev,
+ struct vfio_iommu_type1_dma_unmap *const unmap)
+{
+ int err;
+ int len;
+ struct vfio_dma_mapping *dma_map;
+
+ mutex_lock(&mudev->dev_lock);
+ dma_map = __find_dma_map(mudev, unmap->iova);
+ if (!dma_map) {
+ mutex_unlock(&mudev->dev_lock);
+ muser_dbg("failed to find dma map for iova:%llu\n", unmap->iova);
+ return -EINVAL;
+ }
+ list_del(&dma_map->entry);
+ mutex_unlock(&mudev->dev_lock);
+
+ len = dma_map->length;
+ err = muser_process_dma_unmap(mudev, dma_map);
+ if (unlikely(err))
+ muser_dbg("failed to request PCI server to munmap: %d", err);
+
+ err = put_dma_map(mudev, dma_map, NR_PAGES(len));
+ if (unlikely(err)) {
+ muser_dbg("failed to tear down DMA map: %d", err);
+ goto out;
+ }
+
+ /* XXX: Do we need this? */
+ unmap->size = len;
+out:
+ return err;
+}
+
+/*
+ * FIXME There can be multiple DMA map calls per device. If each of these calls
+ * are serialised (this can be enforced by muser), then we tell PCI server to
+ * mmap the control device. Do we need to distinguish between the different
+ * DMA map calls at this stage if we can enforce only one outstanding DMA map
+ * call? What happens when the DMA map happens too early, before GET_DEVICE_FD
+ * is called?
+ */
+static int muser_iommu_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct muser_dev *mudev;
+ int err;
+
+ BUG_ON(!nb);
+ BUG_ON(!data);
+
+ mudev = container_of(nb, struct muser_dev, iommu_notifier);
+ switch (action) {
+ case VFIO_IOMMU_NOTIFY_DMA_MAP:
+ err = muser_iommu_dma_map(mudev,
+ (struct vfio_iommu_type1_dma_map *)
+ data);
+ break;
+ case VFIO_IOMMU_NOTIFY_DMA_UNMAP:
+ err = muser_iommu_dma_unmap(mudev,
+ (struct vfio_iommu_type1_dma_unmap
+ *)data);
+ break;
+ default:
+ muser_dbg("bad action=%lx", action);
+ err = -EINVAL;
+ }
+
+ if (unlikely(err))
+ return NOTIFY_BAD;
+ return NOTIFY_OK;
+}
+
+static int register_notifier(struct mdev_device *const mdev)
+{
+ unsigned long events =
+ VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+ struct muser_dev *const mudev = mdev_get_drvdata(mdev);
+
+ memset(&mudev->iommu_notifier, 0, sizeof(mudev->iommu_notifier));
+ mudev->iommu_notifier.notifier_call = muser_iommu_notifier;
+ return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+ &events, &mudev->iommu_notifier);
+}
+
+int muser_open(struct mdev_device *mdev)
+{
+ int err;
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+
+ WARN_ON(mudev == NULL);
+
+ if (atomic_cmpxchg(&mudev->mdev_opened, 0, 1) != 0) {
+ muser_dbg("device already open");
+ return -EBUSY;
+ }
+
+ err = register_notifier(mdev);
+ if (unlikely(err)) {
+ int err2;
+ /*
+ * TODO we might have triggered some notifiers which will have
+ * caused PCI server to mmap. If open fails then PCI server dies
+ * therefore things get automatically cleaned up (e.g.
+ * vfio_unpin etc.)?
+ */
+ atomic_dec(&mudev->mdev_opened);
+ muser_dbg("failed to register notifier: %d", err);
+ err2 = dma_unmap_all(mudev, false);
+ if (unlikely(err2))
+ muser_dbg("failed to DMA unmap all regions: %d",
+ err2);
+ err2 =
+ vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+ &mudev->iommu_notifier);
+ if (unlikely(err2))
+ muser_info("failed to unregister notifier: %d", err);
+
+ }
+
+ return err;
+}
+
+static int dma_unmap_all(struct muser_dev *mudev, bool skip_user)
+{
+ struct vfio_dma_mapping *dma_map;
+ unsigned long length;
+ LIST_HEAD(head);
+
+ mutex_lock(&mudev->dev_lock);
+ while (!list_empty(&mudev->dma_list)) {
+ dma_map = list_first_entry(&mudev->dma_list,
+ struct vfio_dma_mapping, entry);
+ list_move(&dma_map->entry, &head);
+ }
+ mutex_unlock(&mudev->dev_lock);
+
+ while (!list_empty(&head)) {
+ int err;
+
+ dma_map = list_first_entry(&head, struct vfio_dma_mapping,
+ entry);
+ list_del(&dma_map->entry);
+ if (!skip_user) {
+ err = muser_process_dma_unmap(mudev, dma_map);
+ if (unlikely(err)) {
+ muser_alert("unmap request failed IOVA=%lx: %d",
+ dma_map->iova, err);
+ continue;
+ }
+ }
+
+ length = dma_map->length;
+ err = put_dma_map(mudev, dma_map, NR_PAGES(length));
+ if (unlikely(err))
+ muser_alert("failed to unmap DMA IOVA=%lx: %d",
+ dma_map->iova, err);
+ }
+ return 0;
+}
+
+void muser_close(struct mdev_device *mdev)
+{
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+ int err;
+
+ err = dma_unmap_all(mudev, false);
+ if (unlikely(err))
+ muser_alert("failed to remove one or more DMA maps");
+
+ err = vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+ &mudev->iommu_notifier);
+ if (unlikely(err))
+ muser_info("failed to unregister notifier: %d", err);
+
+ WARN_ON(atomic_read(&mudev->mdev_opened) == 0);
+ atomic_dec(&mudev->mdev_opened);
+
+ /* TODO: Replace any pending mucmd back in cmd_list. */
+}
+
+static int
+pin_pages(struct mudev_cmd *mucmd, char __user *buf, size_t count,
+ int writeable)
+{
+ mucmd->pg_map.len = count;
+ return do_pin_pages(buf, count, writeable, &mucmd->pg_map);
+}
+
+void dump_buffer(unsigned char const *const buf, uint32_t count)
+{
+#if defined(DEBUG)
+ print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 4, 1, buf, count,
+ false);
+#endif
+}
+
+ssize_t muser_read(struct mdev_device *mdev, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+ struct mudev_cmd mucmd = { 0 };
+ int err;
+
+ WARN_ON(mudev == NULL);
+
+ /* Setup mucmd and pin pages of the calling context. */
+ mucmd.type = MUSER_READ;
+ err = pin_pages(&mucmd, buf, count, 1);
+ if (err != 0)
+ return err;
+
+ /* Setup muser_cmd for server context. */
+ mucmd.muser_cmd.type = MUSER_READ;
+ mucmd.muser_cmd.rw.count = count;
+ mucmd.muser_cmd.rw.pos = *ppos;
+
+ muser_dbg("R %lx@%llx", mucmd.muser_cmd.rw.count,
+ mucmd.muser_cmd.rw.pos);
+
+ /* Process mudev_cmd in libmuser context. */
+ err = muser_process_cmd(mudev, &mucmd);
+ if (err != 0)
+ count = -1;
+ *ppos = mucmd.muser_cmd.rw.pos;
+
+ unpin_pages(&mucmd.pg_map);
+
+ dump_buffer(buf, count);
+ return count;
+}
+
+ssize_t muser_write(struct mdev_device *mdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+ struct mudev_cmd mucmd = { 0 };
+ int err;
+ size_t _count = count;
+ loff_t _pos = *ppos;
+
+ muser_dbg("W %lx@%llx", count, *ppos);
+ dump_buffer(buf, count);
+
+ /* Setup mucmd and pin pages of the calling context. */
+ mucmd.type = MUSER_WRITE;
+ err = pin_pages(&mucmd, (char __user *)buf, count, 0);
+ if (err != 0)
+ return err;
+
+ /* Setup muser_cmd for libmuser context. */
+ mucmd.muser_cmd.type = MUSER_WRITE;
+ mucmd.muser_cmd.rw.count = count;
+ mucmd.muser_cmd.rw.pos = *ppos;
+
+ /* Process mudev_cmd in server context. */
+ err = muser_process_cmd(mudev, &mucmd);
+ if (err != 0)
+ count = -1;
+ *ppos = mucmd.muser_cmd.rw.pos;
+
+ unpin_pages(&mucmd.pg_map);
+
+ if (mucmd.muser_cmd.err)
+ muser_info("PCI config write %ld@0x%llx not handled: %d",
+ _count, _pos, mucmd.muser_cmd.err);
+
+ return count;
+}
+
+static int
+bounce_fds(struct mudev_cmd *mucmd, void __user *data, int user_data_size)
+{
+ int count = mucmd->muser_cmd.ioctl.data.irq_set.count;
+ int data_size = count * sizeof(int32_t);
+ int *user_fds;
+ int i;
+ int ret = 0;
+
+ if (user_data_size < data_size)
+ return -EINVAL;
+
+ mucmd->fds = kcalloc(count, sizeof(*mucmd->fds), GFP_KERNEL);
+ if (mucmd->fds == NULL)
+ return -ENOMEM;
+
+ user_fds = memdup_user(data, data_size);
+ if (IS_ERR(user_fds)) {
+ kfree(mucmd->fds);
+ mucmd->fds = NULL;
+ return PTR_ERR(user_fds);
+ }
+
+ for (i = 0; i < count; i++) {
+ if (user_fds[i] == -1)
+ continue;
+ mucmd->fds[i] = fget(user_fds[i]);
+ if (mucmd->fds[i] == NULL) {
+ ret = -EBADF;
+ goto err;
+ }
+ }
+
+ kfree(user_fds);
+
+ return 0;
+
+err:
+ for (i--; i >= 0; i--)
+ fput(mucmd->fds[i]);
+ kfree(user_fds);
+ kfree(mucmd->fds);
+ mucmd->fds = NULL;
+
+ return ret;
+}
+
+static unsigned int get_minsz(unsigned int cmd)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ return offsetofend(struct vfio_device_info, num_irqs);
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return offsetofend(struct vfio_region_info, offset);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return offsetofend(struct vfio_irq_info, count);
+ case VFIO_DEVICE_SET_IRQS:
+ return offsetofend(struct vfio_irq_set, count);
+ }
+ return -1;
+}
+
+static unsigned int get_argsz(unsigned int cmd, struct mudev_cmd *mucmd)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ return mucmd->muser_cmd.ioctl.data.dev_info.argsz;
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return mucmd->muser_cmd.ioctl.data.reg_info.argsz;
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return mucmd->muser_cmd.ioctl.data.irq_info.argsz;
+ case VFIO_DEVICE_SET_IRQS:
+ return mucmd->muser_cmd.ioctl.data.irq_set.argsz;
+ }
+ return -1;
+}
+
+static int muser_ioctl_setup_cmd(struct mudev_cmd *mucmd, unsigned int cmd,
+ unsigned long arg)
+{
+ unsigned int minsz;
+ unsigned int argsz;
+ int err;
+
+ /* Determine smallest argsz we need for this command. */
+ minsz = get_minsz(cmd);
+ if (minsz == -1)
+ return -EOPNOTSUPP;
+
+ /* Copy caller-provided arg. */
+ err = muser_copyin(&mucmd->muser_cmd.ioctl.data, (void __user *)arg,
+ minsz);
+ if (unlikely(err))
+ return err;
+
+ /* Fetch argsz provided by caller. */
+ argsz = get_argsz(cmd, mucmd);
+ if (argsz == -1)
+ return -EINVAL;
+
+ /* Ensure provided size is at least the minimum required. */
+ if (argsz < minsz)
+ return -EINVAL;
+
+ /* Fetch potential data provided on SET_IRQS. */
+ if (cmd == VFIO_DEVICE_SET_IRQS) {
+ unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags;
+
+ switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ /* Lookup eventfds and bounce references to mucmd. */
+ err = bounce_fds(mucmd, (void __user *) (arg + minsz),
+ argsz - minsz);
+ if (err) {
+ muser_dbg("failed to bounce fds: %d\n", err);
+ return err;
+ }
+ break;
+ }
+ }
+
+ /* Pin pages of the calling context. */
+ err = pin_pages(mucmd, (char __user *)arg, argsz, 1);
+ if (unlikely(err)) {
+ muser_dbg("failed to pin pages: %d\n", err);
+ return err;
+ }
+
+ return err;
+}
+
+static long muser_ioctl(struct mdev_device *mdev, unsigned int cmd,
+ unsigned long arg)
+{
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+ struct mudev_cmd mucmd = { 0 };
+ int err;
+
+ muser_dbg("mdev=%p, cmd=%u, arg=0x%lX\n", mdev, cmd, arg);
+
+ if (cmd == VFIO_DEVICE_RESET) {
+ /*
+ * QEMU's vfio-pci (see vfio_pci_reset()) takes care of
+ * enabling/disabling interrupts.
+ *
+ * FIXME:
+ * No need to block pci config access as only one
+ * mdev_parent_ops is allowed to execute at a time.
+ *
+ * Returning -EAGAIN if client tries to send multiple resets.
+ */
+ if (!device_trylock(mudev->dev))
+ return -EAGAIN;
+ } else {
+ err = muser_ioctl_setup_cmd(&mucmd, cmd, arg);
+ if (err)
+ return err;
+ }
+
+ /* Setup common mucmd records. */
+ mucmd.type = MUSER_IOCTL;
+ mucmd.muser_cmd.type = MUSER_IOCTL;
+ mucmd.muser_cmd.ioctl.vfio_cmd = cmd;
+
+ /* Process mudev_cmd in server context. */
+ err = muser_process_cmd(mudev, &mucmd);
+ if (err != 0) {
+ muser_dbg("failed to process command: %d\n", err);
+ err = -1;
+ }
+
+ if (cmd == VFIO_DEVICE_RESET) {
+ device_unlock(mudev->dev);
+ } else {
+ /* Release resources. */
+ unpin_pages(&mucmd.pg_map);
+
+ /* maybe allocated for VFIO_IRQ_SET_DATA_EVENTFD */
+ kfree(mucmd.fds);
+ kfree(mucmd.data_fds);
+ }
+
+ return err;
+}
+
+static int muser_mmap(struct mdev_device *const mdev,
+ struct vm_area_struct *const vma)
+{
+ struct muser_dev *mudev = mdev_get_drvdata(mdev);
+ struct mudev_cmd mucmd = { 0 };
+ int err;
+
+ BUG_ON(!mudev);
+ BUG_ON(!vma);
+
+ /*
+ * Checking vm_flags cannot be easily done in user space as we can't
+ * access mm.h, so we have to do it here. Maybe implement the reverse
+ * of calc_vm_prot_bits/calc_vm_flag_bits?
+ */
+ if ((vma->vm_flags & ~(VM_READ | VM_WRITE | VM_SHARED | VM_MAYREAD |
+ VM_MAYWRITE | VM_MAYEXEC | VM_MAYSHARE))) {
+ muser_dbg("bag flags=0x%lx", vma->vm_flags);
+ return -EINVAL;
+ }
+
+ mucmd.type = MUSER_MMAP;
+ mucmd.muser_cmd.type = MUSER_MMAP;
+ mucmd.muser_cmd.mmap.request.start = vma->vm_start;
+ mucmd.muser_cmd.mmap.request.end = vma->vm_end;
+ mucmd.muser_cmd.mmap.request.pgoff = vma->vm_pgoff;
+ mucmd.mmap_len = vma->vm_end - vma->vm_start;
+
+ /* Process mudev_cmd in server context. */
+ err = muser_process_cmd(mudev, &mucmd);
+ if (unlikely(err)) {
+ muser_info("failed to mmap: %d", err);
+ return err;
+ }
+
+ return vm_insert_pages(vma, mucmd.pg_map.pages, mucmd.pg_map.nr_pages);
+}
+
+struct mdev_parent_ops muser_mdev_fops = {
+ .owner = THIS_MODULE,
+ .supported_type_groups = mdev_type_groups,
+ .create = muser_create,
+ .remove = muser_remove,
+ .open = muser_open,
+ .release = muser_close,
+ .read = muser_read,
+ .write = muser_write,
+ .ioctl = muser_ioctl,
+ .mmap = muser_mmap,
+};
+
+/* copy vfio-client pages(mucmd.pg_map) to server(arg) */
+static int bounce_out(void __user *arg, size_t argsz, struct mudev_cmd *mucmd)
+{
+ unsigned long to_copy, left;
+ void __user *to;
+ void *from;
+ unsigned int offset;
+ int i, ret = 0;
+
+ left = mucmd->pg_map.len;
+ if (argsz < left)
+ return -EINVAL;
+
+ offset = mucmd->pg_map.offset;
+
+ for (i = 0; i < mucmd->pg_map.nr_pages && ret == 0; i++) {
+ to_copy = min(left, PAGE_SIZE - offset);
+ to = arg + (mucmd->pg_map.len - left);
+ from = page_to_virt(mucmd->pg_map.pages[i]) + offset;
+
+ ret = muser_copyout(to, from, to_copy);
+ if (ret)
+ return ret;
+
+ left -= to_copy;
+
+ /* Must be zero after first iteration. */
+ offset = 0;
+ }
+ WARN_ON(left != 0);
+
+ return 0;
+}
+
+/* copy from server(uaddr) to vfio-client pages(mucmd.pg_map) */
+static int bounce_in(struct mudev_cmd *mucmd, void __user *uaddr)
+{
+ unsigned long to_copy, left;
+ void __user *from;
+ void *to;
+ unsigned int offset;
+ int i, ret;
+
+ left = mucmd->pg_map.len;
+ offset = mucmd->pg_map.offset;
+
+ for (i = 0; i < mucmd->pg_map.nr_pages; i++) {
+ to_copy = min(left, PAGE_SIZE - offset);
+ from = uaddr + (mucmd->pg_map.len - left);
+ to = page_to_virt(mucmd->pg_map.pages[i]) + offset;
+
+ ret = muser_copyin(to, from, to_copy);
+ if (ret)
+ return ret;
+
+ left -= to_copy;
+
+ /* Must be zero after first iteration. */
+ offset = 0;
+ }
+ WARN_ON(left != 0);
+
+ return 0;
+}
+
+static long install_fds(struct mudev_cmd *mucmd)
+{
+ int count = mucmd->muser_cmd.ioctl.data.irq_set.count;
+ int i;
+ long ret;
+
+ mucmd->data_fds = kcalloc(count, sizeof(int32_t), GFP_KERNEL);
+ if (mucmd->data_fds == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < count; i++) {
+ if (mucmd->fds[i] == NULL) {
+ mucmd->data_fds[i] = -1;
+ continue;
+ }
+ mucmd->data_fds[i] = get_unused_fd_flags(0);
+ if (mucmd->data_fds[i] < 0) {
+ ret = mucmd->data_fds[i];
+ muser_err("unable to get unused fd: %ld", ret);
+ goto err;
+ }
+ fd_install(mucmd->data_fds[i], mucmd->fds[i]);
+ }
+
+ return 0;
+
+err:
+ for (i--; i >= 0; i--)
+ put_unused_fd(mucmd->data_fds[i]);
+ kfree(mucmd->data_fds);
+
+ return ret;
+}
+
+static inline int maybe_install_fds(struct mudev_cmd *mucmd)
+{
+ unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags;
+ long ret = 0;
+
+ if ((mucmd->muser_cmd.type == MUSER_IOCTL) &&
+ (mucmd->muser_cmd.ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS)) {
+ switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ ret = install_fds(mucmd);
+ if (unlikely(ret))
+ muser_dbg("failed to install fds: %ld", ret);
+ break;
+ /* TODO: SET_DATA_BOOL */
+ }
+ }
+
+ return ret;
+}
+
+static inline int mmap_done(struct mudev_cmd * const mucmd)
+{
+ struct muser_cmd *cmd = &mucmd->muser_cmd;
+ char __user *addr = (char __user *) cmd->mmap.response.addr;
+ int ret;
+
+ if (cmd->err < 0)
+ return -1;
+ ret = do_pin_pages(addr, mucmd->mmap_len, 1, &mucmd->pg_map);
+ if (ret) {
+ muser_alert("failed to pin pages: %d", ret);
+ mucmd->pg_map.pages = NULL;
+ mucmd->pg_map.nr_pages = 0;
+ }
+ return ret;
+}
+
+static long libmuser_unl_ioctl(struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ struct muser_dev *mudev = filep->private_data;
+ struct mudev_cmd *mucmd;
+ unsigned long offset;
+ long ret = -EINVAL;
+
+ WARN_ON(mudev == NULL);
+ switch (cmd) {
+ case MUSER_DEV_CMD_WAIT:
+ /* Block until a request comes from vfio. */
+ ret = wait_event_interruptible(mudev->user_wait_q,
+ !list_empty(&mudev->cmd_list));
+ if (unlikely(ret)) {
+ muser_dbg("failed to wait for user space: %ld", ret);
+ goto out;
+ }
+
+ /* Pick and remove the mucmd from the cmd_list. */
+ mutex_lock(&mudev->dev_lock);
+ WARN_ON(list_empty(&mudev->cmd_list));
+ mucmd = list_first_entry(&mudev->cmd_list, struct mudev_cmd,
+ entry);
+ list_del(&mucmd->entry);
+ mutex_unlock(&mudev->dev_lock);
+
+ /* Keep a reference to mudev_cmd in mudev. */
+ WARN_ON(mudev->mucmd_pending != NULL);
+ mudev->mucmd_pending = mucmd;
+ /* TODO: These WARN_ON()s should really just detach mudev. */
+
+ /* Populate userspace with mucmd. */
+ ret = muser_copyout((void __user *)arg, &mucmd->muser_cmd,
+ sizeof(struct muser_cmd));
+ if (ret)
+ return -EFAULT;
+
+ /* Install FDs on VFIO_SET_IRQS */
+ ret = maybe_install_fds(mucmd);
+ if (ret)
+ return ret;
+
+ break;
+ case MUSER_DEV_CMD_DONE:
+ /* This is only called when a command is pending. */
+ if (mudev->mucmd_pending == NULL) {
+ muser_dbg("done but no command pending");
+ return -1;
+ }
+
+ /* Fetch (and clear) the pending command. */
+ mucmd = mudev->mucmd_pending;
+ mudev->mucmd_pending = NULL;
+
+ /* Fetch response from userspace. */
+ ret = muser_copyin(&mucmd->muser_cmd, (void __user *)arg,
+ sizeof(struct muser_cmd));
+ if (ret)
+ goto out;
+
+ switch (mucmd->type) {
+ case MUSER_IOCTL:
+ offset = offsetof(struct muser_cmd, ioctl);
+ offset += offsetof(struct muser_cmd_ioctl, data);
+ ret = bounce_in(mucmd, (void __user *)(arg + offset));
+ break;
+ case MUSER_MMAP:
+ ret = mmap_done(mucmd);
+ break;
+ case MUSER_WRITE:
+ case MUSER_READ:
+ case MUSER_DMA_MMAP:
+ case MUSER_DMA_MUNMAP:
+ break;
+ default:
+ muser_alert("bad command %d", mucmd->type);
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Wake up vfio client. */
+ up(&mudev->sem);
+ break;
+
+ default:
+ muser_info("bad ioctl 0x%x", cmd);
+ return -1;
+ }
+
+out:
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long libmuser_compat_ioctl(struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ arg = (unsigned long)compat_ptr(arg);
+ return libmuser_unl_ioctl(filep, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+static struct muser_dev *muser_get_dev_from_minor(int minor)
+{
+ struct muser_dev *mudev;
+
+ /* Locate mudev using idr. */
+ mutex_lock(&muser.muser_lock);
+ mudev = idr_find(&muser.dev_idr, minor);
+ mutex_unlock(&muser.muser_lock);
+
+ return mudev;
+}
+
+static int libmuser_open(struct inode *inode, struct file *filep)
+{
+ struct muser_dev *mudev;
+ int opened;
+
+ /* Fetch corresponding mudev. */
+ mudev = muser_get_dev_from_minor(iminor(inode));
+ if (!mudev)
+ return -ENOENT;
+
+ /* Allow only one server for each mudev. */
+ opened = atomic_cmpxchg(&mudev->srv_opened, 0, 1);
+ if (opened)
+ return -EBUSY;
+
+ WARN_ON(filep->private_data != NULL);
+ filep->private_data = mudev;
+
+ return 0;
+}
+
+static int libmuser_release(struct inode *inode, struct file *filep)
+{
+ struct muser_dev *mudev = filep->private_data;
+ int err;
+
+ WARN_ON(mudev == NULL);
+ mutex_lock(&mudev->dev_lock);
+ /*
+ * FIXME must be per filep
+ */
+ if (mudev->mucmd_pending) {
+ muser_info("moving command back in list");
+ list_add_tail(&mudev->mucmd_pending->entry, &mudev->cmd_list);
+ mudev->mucmd_pending = NULL;
+ }
+ mutex_unlock(&mudev->dev_lock);
+
+ err = dma_unmap_all(mudev, true);
+ if (unlikely(err))
+ muser_alert("failed to remove DMA maps");
+
+ filep->private_data = NULL;
+ atomic_dec(&mudev->srv_opened);
+
+ return 0;
+}
+
+static inline int irq_set_data_eventfd(void __user * const buf,
+ struct mudev_cmd * const mucmd)
+{
+ return muser_copyout((void __user *)buf, mucmd->data_fds,
+ sizeof(__s32) * mucmd->muser_cmd.ioctl.data.irq_set.count);
+}
+
+static inline int irq_set_data_bool(void __user * const buf,
+ struct mudev_cmd * const mucmd)
+{
+ return muser_copyout((void __user *)buf, mucmd->data_fds,
+ sizeof(__u8) * mucmd->muser_cmd.ioctl.data.irq_set.count);
+}
+
+/*
+ * Called by libmuser for kernel->user transfers.
+ */
+static ssize_t libmuser_read(struct file *filp, char __user *buf,
+ size_t bufsz, loff_t *ppos)
+{
+ struct muser_dev *mudev = filp->private_data;
+ struct mudev_cmd *mucmd;
+ int ret = -EINVAL;
+ uint32_t irq_set_flags;
+
+ /* check mudev before dereferencing it for the pending command */
+ if (!mudev || !mudev->mucmd_pending) {
+ muser_dbg("bad arguments");
+ return -EINVAL;
+ }
+ mucmd = mudev->mucmd_pending;
+
+ if (!access_ok(buf, bufsz)) {
+ muser_dbg("bad permissions");
+ return -EFAULT;
+ }
+
+ switch (mucmd->type) {
+ case MUSER_WRITE:
+ ret = bounce_out(buf, bufsz, mucmd);
+ if (ret) {
+ muser_dbg("failed to copy to user: %d", ret);
+ goto err;
+ }
+ break;
+ case MUSER_IOCTL:
+ /* FIXME move case into separate function */
+ if (mucmd->muser_cmd.ioctl.vfio_cmd != VFIO_DEVICE_SET_IRQS) {
+ muser_dbg("expected VFIO command %d, got %d instead",
+ VFIO_DEVICE_SET_IRQS,
+ mucmd->muser_cmd.ioctl.vfio_cmd);
+ goto err;
+ }
+ irq_set_flags = mucmd->muser_cmd.ioctl.data.irq_set.flags &
+ VFIO_IRQ_SET_DATA_TYPE_MASK;
+ switch (irq_set_flags) {
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ ret = irq_set_data_eventfd((void __user *)buf, mucmd);
+ if (unlikely(ret)) {
+ muser_dbg("failed to set data eventfd: %d",
+ ret);
+ goto err;
+ }
+ break;
+ case VFIO_IRQ_SET_DATA_BOOL:
+ ret = irq_set_data_bool((void __user *)buf, mucmd);
+ if (unlikely(ret))
+ goto err;
+ break;
+ default:
+ muser_dbg("bad VFIO set IRQ flags %d", irq_set_flags);
+ goto err;
+ }
+ break;
+ default:
+ muser_dbg("bad muser command %d", mucmd->type);
+ goto err;
+ }
+ return bufsz;
+
+err:
+ return ret;
+}
+
+/*
+ * Called by libmuser for user->kernel transfers.
+ */
+static ssize_t libmuser_write(struct file *filp, const char __user *buf,
+ size_t bufsz, loff_t *ppos)
+{
+ struct muser_dev *mudev = filp->private_data;
+ struct mudev_cmd *mucmd;
+ struct muser_cmd muser_cmd;
+ int ret;
+
+ /* check mudev before dereferencing it for the pending command */
+ if (!mudev || !mudev->mucmd_pending) {
+ muser_dbg("bad arguments");
+ return -EINVAL;
+ }
+ mucmd = mudev->mucmd_pending;
+
+ if (!access_ok(buf, bufsz)) {
+ muser_dbg("bad permissions");
+ return -EFAULT;
+ }
+
+ ret = muser_copyin(&muser_cmd, (void __user *)buf,
+ sizeof(struct muser_cmd));
+ if (ret)
+ return ret;
+
+ if (mucmd->type != muser_cmd.type) {
+ muser_dbg("bad command %d", muser_cmd.type);
+ return -EINVAL;
+ }
+
+ WARN_ON(muser_cmd.type != MUSER_READ);
+ ret = bounce_in(mucmd, muser_cmd.rw.buf);
+ if (ret)
+ return ret;
+
+ return bufsz;
+}
+
+static const struct file_operations libmuser_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = libmuser_unl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = libmuser_compat_ioctl,
+#endif
+ .open = libmuser_open,
+ .release = libmuser_release,
+ .mmap = libmuser_mmap,
+ .read = libmuser_read,
+ .write = libmuser_write,
+};
+
+static void muser_device_release(struct device *dev)
+{
+ muser_info("muser dev released\n");
+}
+
+static char *muser_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, DRIVER_NAME "/%s", dev_name(dev));
+}
+
+static int __init muser_init(void)
+{
+ int ret;
+
+ /* Initialise idr. */
+ idr_init(&muser.dev_idr);
+ mutex_init(&muser.muser_lock);
+ INIT_LIST_HEAD(&muser.dev_list);
+
+ /* Initialise class. */
+ muser.class = class_create(THIS_MODULE, DRIVER_NAME);
+ if (IS_ERR(muser.class))
+ return PTR_ERR(muser.class);
+ muser.class->devnode = muser_devnode;
+
+ /* Allocate and register a chardev for muser devices. */
+ ret = alloc_chrdev_region(&muser.muser_devt, 0, MINORMASK + 1,
+ DRIVER_NAME);
+ if (ret)
+ goto err_alloc_chrdev;
+
+ cdev_init(&muser.muser_cdev, &libmuser_fops);
+ ret = cdev_add(&muser.muser_cdev, muser.muser_devt, MINORMASK + 1);
+ if (ret)
+ goto err_cdev_add;
+
+ muser.dev.class = muser.class;
+ muser.dev.release = muser_device_release;
+ dev_set_name(&muser.dev, "%s", DRIVER_NAME);
+
+ ret = device_register(&muser.dev);
+ if (ret)
+ goto err_device_register;
+
+ /* Register ourselves with mdev. */
+ ret = mdev_register_device(&muser.dev, &muser_mdev_fops);
+ if (ret)
+ goto err_mdev_register_device;
+
+ return 0;
+
+err_mdev_register_device:
+ device_unregister(&muser.dev);
+err_device_register:
+ cdev_del(&muser.muser_cdev);
+err_cdev_add:
+ unregister_chrdev_region(muser.muser_devt, MINORMASK + 1);
+err_alloc_chrdev:
+ class_destroy(muser.class);
+ muser.class = NULL;
+ return ret;
+}
+
+static void __exit muser_cleanup(void)
+{
+ struct muser_dev *mudev, *tmp;
+
+ /* Remove all devices. */
+ mutex_lock(&muser.muser_lock);
+ list_for_each_entry_safe(mudev, tmp, &muser.dev_list, dlist_entry) {
+ __muser_deinit_dev(mudev);
+ kfree(mudev);
+ }
+ mutex_unlock(&muser.muser_lock);
+
+ /* Unregister with mdev. */
+ muser.dev.bus = NULL;
+ mdev_unregister_device(&muser.dev);
+
+ /* Cleanup everything else. */
+ device_unregister(&muser.dev);
+ idr_destroy(&muser.dev_idr);
+ cdev_del(&muser.muser_cdev);
+ unregister_chrdev_region(muser.muser_devt, MINORMASK + 1);
+ class_destroy(muser.class);
+ muser.class = NULL;
+}
+
+module_init(muser_init);
+module_exit(muser_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kmod/muser.h b/kmod/muser.h
new file mode 100644
index 0000000..14fecd6
--- /dev/null
+++ b/kmod/muser.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (c) 2019, Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ */
+
+#ifndef _LINUX_MUSER_H
+#define _LINUX_MUSER_H
+
+#ifndef __KERNEL__
+#include <sys/types.h>
+#endif
+
+#include <linux/ioctl.h>
+#include <linux/vfio.h>
+
+#define MUSER_DEVNODE "muser"
+
+enum muser_cmd_type {
+ MUSER_IOCTL = 1,
+ MUSER_READ,
+ MUSER_WRITE,
+ MUSER_MMAP,
+ MUSER_DMA_MMAP,
+ MUSER_DMA_MUNMAP,
+};
+
+struct muser_cmd_rw {
+ size_t count;
+ loff_t pos;
+ char *buf; /* only used when replying to MUSER_READ via write(2) */
+};
+
+struct muser_cmd_ioctl {
+ int vfio_cmd;
+ union {
+ struct vfio_device_info dev_info;
+ struct vfio_region_info reg_info;
+ struct vfio_irq_info irq_info;
+ struct vfio_irq_set irq_set;
+ } data;
+};
+
+union muser_cmd_mmap {
+ struct {
+ unsigned long start;
+ unsigned long end;
+ unsigned long flags;
+ unsigned long pgoff;
+ } request;
+ struct {
+ unsigned long addr;
+ } response;
+};
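+
+/*
+ * Example flow (see the handlers in lib/libmuser.c): for MUSER_MMAP the
+ * kernel fills in .request and libmuser returns the mapped address in
+ * .response.addr; for MUSER_DMA_MMAP/MUNMAP, .response.addr carries 0 on
+ * success and -1 on error.
+ */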
+
+struct muser_cmd {
+ enum muser_cmd_type type;
+ union {
+ struct muser_cmd_rw rw;
+ struct muser_cmd_ioctl ioctl;
+ union muser_cmd_mmap mmap;
+ };
+ int err;
+};
+
+/* ioctl cmds valid for /dev/muser/<uuid> */
+#define MUSER_DEV_CMD_WAIT _IOW('M', 1, struct muser_cmd)
+#define MUSER_DEV_CMD_DONE _IOW('M', 2, struct muser_cmd)
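+
+/*
+ * Illustrative sketch (not part of this patch) of the command protocol the
+ * two ioctls above implement; lib/libmuser.c's drive_loop() is the real
+ * implementation. handle() stands in for the user's emulation code.
+ *
+ *   struct muser_cmd cmd;
+ *   int fd = open("/dev/muser/<uuid>", O_RDWR);
+ *
+ *   for (;;) {
+ *       ioctl(fd, MUSER_DEV_CMD_WAIT, &cmd);   // block until a command
+ *       cmd.err = handle(&cmd);                // emulate the access
+ *       ioctl(fd, MUSER_DEV_CMD_DONE, &cmd);   // complete it
+ *   }
+ */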
+
+#endif /* _LINUX_MUSER_H */
diff --git a/lib/.indent.pro b/lib/.indent.pro
new file mode 100644
index 0000000..52ef8f2
--- /dev/null
+++ b/lib/.indent.pro
@@ -0,0 +1,4 @@
+-nbad -bap -nbc -bbo -hnl -br -brs -c33 -cd33 -ncdb -ce -ci4
+-cli0 -d0 -di1 -nfc1 -i4 -ip0 -l80 -lp -npcs -nprs -psl -sai
+-saf -saw -ncs -nsc -nsob -nfca -cp33 -ss -ts8 -il0
+-nut -blf
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..6d3d0ae
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+add_library(muser SHARED
+ ../kmod/muser.h
+ muser.h
+ pci.h
+ pmcap.h
+ msicap.h
+ pxcap.h
+ common.h
+ dma.h
+ dma.c
+ libmuser.c
+ libmuser_pci.c)
+set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;pmcap.h;msicap.h;pxcap.h")
+install(TARGETS muser
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/muser)
diff --git a/lib/common.h b/lib/common.h
new file mode 100644
index 0000000..4fbc048
--- /dev/null
+++ b/lib/common.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <stdint.h>
+#include <unistd.h> /* sysconf(), used by PAGE_SIZE below */
+
+#include "muser.h" /* lm_ctx_t, lm_log_lvl_t */
+
+#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
+#define PAGE_ALIGNED(x) (((x) & ((typeof(x))(PAGE_SIZE) - 1)) == 0)
+
+#define BIT(nr) (1UL << (nr))
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+#define likely(e) __builtin_expect(!!(e), 1)
+#define unlikely(e) __builtin_expect(!!(e), 0)
+
+#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
+#define ROUND_UP(x, a) ROUND_DOWN((x) + (a) - 1, a)
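+/* e.g. ROUND_DOWN(0x1234, 0x1000) == 0x1000 and
+ * ROUND_UP(0x1234, 0x1000) == 0x2000; a must be a power of two. */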
+
+void lm_log(lm_ctx_t const *const lm_ctx, const lm_log_lvl_t lvl,
+ char const *const fmt, ...);
+
+void dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+ unsigned char const *const buf, uint32_t count);
+
+
+#endif /* __COMMON_H__ */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/dma.c b/lib/dma.c
new file mode 100644
index 0000000..5c9455f
--- /dev/null
+++ b/lib/dma.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Mike Cui <cui@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/param.h>
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <errno.h>
+
+#include "dma.h"
+
+static inline ssize_t
+fd_get_blocksize(int fd)
+{
+ struct stat st;
+
+ if (fstat(fd, &st) != 0)
+ return -1;
+
+ return st.st_blksize;
+}
+
+/* Returns true if the two fds refer to the same file,
+ false if either fd is invalid. */
+static inline bool
+fds_are_same_file(int fd1, int fd2)
+{
+ struct stat st1, st2;
+
+ return (fstat(fd1, &st1) == 0 && fstat(fd2, &st2) == 0 &&
+ st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+}
+
+dma_controller_t *
+dma_controller_create(int max_regions)
+{
+ dma_controller_t *dma;
+
+ dma = malloc(offsetof(dma_controller_t, regions) +
+ max_regions * sizeof(dma->regions[0]));
+
+ if (dma == NULL) {
+ return NULL;
+ }
+
+ dma->max_regions = max_regions;
+ dma->nregions = 0;
+ memset(dma->regions, 0, max_regions * sizeof(dma->regions[0]));
+
+ return dma;
+}
+
+static void
+_dma_controller_do_remove_region(dma_memory_region_t * const region)
+{
+ assert(region);
+#if DMA_MAP_FAST_IMPL
+ dma_unmap_region(region, region->virt_addr, region->size);
+#endif
+ (void)close(region->fd);
+}
+
+/* FIXME not thread safe */
+int
+dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr,
+ size_t size, int fd)
+{
+ int idx;
+ dma_memory_region_t *region;
+
+ assert(dma);
+
+ for (idx = 0; idx < dma->nregions; idx++) {
+ region = &dma->regions[idx];
+ if (region->dma_addr == dma_addr && region->size == size &&
+ fds_are_same_file(region->fd, fd)) {
+ _dma_controller_do_remove_region(region);
+ if (dma->nregions > 1)
+ memcpy(region, &dma->regions[dma->nregions - 1],
+ sizeof *region);
+ dma->nregions--;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static inline void
+dma_controller_remove_regions(lm_ctx_t * const ctx,
+ dma_controller_t * const dma)
+{
+ int i;
+
+ assert(dma);
+
+ for (i = 0; i < dma->nregions; i++) {
+ dma_memory_region_t *region = &dma->regions[i];
+
+ lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n",
+ region->virt_addr, region->dma_addr);
+
+ _dma_controller_do_remove_region(region);
+ }
+}
+
+void
+dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma)
+{
+ dma_controller_remove_regions(ctx, dma);
+ free(dma);
+}
+
+int
+dma_controller_add_region(lm_ctx_t * const lm_ctx, dma_controller_t * dma,
+ dma_addr_t dma_addr, size_t size,
+ int fd, off_t offset)
+{
+ int idx;
+ dma_memory_region_t *region;
+ int page_size;
+
+ for (idx = 0; idx < dma->nregions; idx++) {
+ region = &dma->regions[idx];
+
+ /* First check if this is the same exact region. */
+ if (region->dma_addr == dma_addr && region->size == size) {
+ if (offset != region->offset) {
+ lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, "
+ "want=%d, existing=%d\n",
+ dma_addr, size, offset, region->offset);
+ goto err;
+ }
+ if (!fds_are_same_file(region->fd, fd)) {
+ /*
+ * Printing the file descriptors here doesn't really make
+ * sense as they can be different but actually pointing to
+ * the same file, however in the majority of cases we'll be
+ * using a single fd.
+ */
+ lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, "
+ "existing fd=%d\n", fd, region->fd);
+ goto err;
+ }
+ return idx;
+ }
+
+ /* Check for overlap, i.e. start of one region is within another. */
+ if ((dma_addr >= region->dma_addr &&
+ dma_addr < region->dma_addr + region->size) ||
+ (region->dma_addr >= dma_addr &&
+ region->dma_addr < dma_addr + size)) {
+ lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA "
+ "region %lx-%lx\n", dma_addr, size, region->dma_addr,
+ region->size);
+ goto err;
+ }
+ }
+
+ if (dma->nregions == dma->max_regions) {
+ idx = dma->max_regions;
+ lm_log(lm_ctx, LM_ERR, "reached maxed regions\n");
+ goto err;
+ }
+
+ idx = dma->nregions;
+ region = &dma->regions[idx];
+
+ page_size = fd_get_blocksize(fd);
+ if (page_size < 0) {
+ lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size);
+ goto err;
+ }
+ page_size = MAX(page_size, getpagesize());
+
+ region->dma_addr = dma_addr;
+ region->size = size;
+ region->page_size = page_size;
+ region->offset = offset;
+
+ region->fd = dup(fd); // dup the fd to get our own private copy
+ if (region->fd < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n",
+ strerror(errno));
+ goto err;
+ }
+#if DMA_MAP_FAST_IMPL
+ region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE,
+ 0, region->size);
+ if (region->virt_addr == MAP_FAILED) {
+ lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n",
+ dma_addr, dma_addr + size, strerror(errno));
+ close(region->fd);
+ goto err;
+ }
+#endif
+
+ dma->nregions++;
+
+ return idx;
+
+err:
+ return -idx - 1;
+}
+
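+/*
+ * Widens [*offset, *offset + *size) to page boundaries so the values can be
+ * passed to mmap(): e.g. with page_size 0x1000, offset 0x1234 and size 0x100
+ * become offset 0x1000 and size 0x1000.
+ */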
+static inline void
+mmap_round(size_t * offset, size_t * size, size_t page_size)
+{
+ size_t offset_orig = *offset;
+ *offset = ROUND_DOWN(offset_orig, page_size);
+ *size = ROUND_UP(offset_orig + *size, page_size) - *offset;
+}
+
+void *
+dma_map_region(dma_memory_region_t * region, int prot,
+ size_t offset, size_t len)
+{
+ size_t mmap_offset, mmap_size = len;
+ char *mmap_base;
+
+ if (offset >= region->size || offset + len > region->size) {
+ return MAP_FAILED;
+ }
+
+ offset += region->offset;
+ mmap_offset = offset;
+ mmap_round(&mmap_offset, &mmap_size, region->page_size);
+
+ // Do the mmap.
+ mmap_base = mmap(NULL, mmap_size, prot, MAP_SHARED,
+ region->fd, mmap_offset);
+ if (mmap_base == MAP_FAILED) {
+ return mmap_base;
+ }
+ // Do not dump.
+ madvise(mmap_base, mmap_size, MADV_DONTDUMP);
+
+ return mmap_base + (offset - mmap_offset);
+}
+
+void
+dma_unmap_region(dma_memory_region_t * region, void *virt_addr, size_t len)
+{
+ mmap_round((size_t *)&virt_addr, &len, region->page_size);
+ munmap(virt_addr, len);
+}
+
+int
+_dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len,
+ dma_scattergather_t * sg, int max_sg)
+{
+ int idx;
+ int cnt = 0;
+ bool found = true; // Whether the current region is found.
+
+ while (found && len > 0) {
+ found = false;
+ for (idx = 0; idx < dma->nregions; idx++) {
+ const dma_memory_region_t *const region = &dma->regions[idx];
+ const dma_addr_t region_end = region->dma_addr + region->size;
+
+ while (dma_addr >= region->dma_addr && dma_addr < region_end) {
+ size_t region_len = MIN(region_end - dma_addr, len);
+
+ if (cnt < max_sg) {
+ sg[cnt].region = idx;
+ sg[cnt].offset = dma_addr - region->dma_addr;
+ sg[cnt].length = region_len;
+ }
+
+ cnt++;
+
+ // dma_addr found, may need to start from the top for the
+ // next dma_addr.
+ found = true;
+ dma_addr += region_len;
+ len -= region_len;
+
+ if (len == 0) {
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ if (!found) {
+ // There is still a region which was not found.
+ assert(len > 0);
+ cnt = -1;
+ } else if (cnt > max_sg) {
+ cnt = -cnt - 1;
+ }
+ return cnt;
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/dma.h b/lib/dma.h
new file mode 100644
index 0000000..80afaec
--- /dev/null
+++ b/lib/dma.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Mike Cui <cui@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef DMA_DMA_H
+#define DMA_DMA_H
+
+/*
+ * This library emulates a DMA controller for a device emulation application to
+ * perform DMA operations on a foreign memory space.
+ *
+ * Concepts:
+ * - A DMA controller has its own 64-bit DMA address space.
+ * - Foreign memory is made available to the DMA controller in linear chunks
+ * called memory regions.
+ * - Each memory region is backed by a file descriptor and
+ * is registered with the DMA controller at a unique, non-overlapping
+ * linear span of the DMA address space.
+ * - To perform DMA, the application should first build a scatter-gather
+ * list (sglist) of dma_scattergather_t from DMA addresses. Then the sglist
+ * can be mapped using dma_map_sg() into the process's virtual address space
+ * as an iovec for direct access, and unmapped using dma_unmap_sg() when done.
+ * - dma_map_addr() and dma_unmap_addr() helper functions are provided
+ * for mapping DMA regions that can fit into one scatter-gather entry.
+ *
+ * This library can be compiled to function in two modes as defined by the
+ * following macros.
+ * - DMA_MAP_FAST (default): Every region is mapped into the application's
+ * virtual address space at registration time with R/W permissions.
+ * dma_map_sg() ignores all protection bits and only does lookups and
+ * returns pointers to the previously mapped regions. dma_unmap_sg() is
+ * effectively a no-op.
+ * - DMA_MAP_PROTECTED: Every call to dma_map_sg() does mmap()s and
+ * dma_unmap_sg() does munmap()s. All permission bits are honored. This mode
+ * is obviously much slower if used in the fast path. It may be useful to
+ * have the extra protection if the fast path does not need direct virtual
+ * memory access to foreign memory and data is accessed using a different
+ * method (e.g. RDMA, vfio-iommu). It can also be useful in debugging to
+ * make sure we are not writing to guest memory that's read-only for the
+ * device.
+ */
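+
+/*
+ * Illustrative sketch (not part of this patch): a device emulation reading
+ * 4KiB of guest memory at IOVA iova through an already-created controller.
+ * ctx, dma, iova and consume() are assumptions for the example.
+ *
+ *   dma_scattergather_t sg[8];
+ *   struct iovec iov[8];
+ *
+ *   int n = dma_addr_to_sg(ctx, dma, iova, 4096, sg, 8);
+ *   if (n > 0 && dma_map_sg(dma, PROT_READ, sg, iov, n) == 0) {
+ *       consume(iov, n);                 // direct access to guest memory
+ *       dma_unmap_sg(dma, sg, iov, n);
+ *   }
+ */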
+
+#ifdef DMA_MAP_PROTECTED
+#undef DMA_MAP_FAST
+#define DMA_MAP_FAST_IMPL 0
+#else
+#define DMA_MAP_FAST_IMPL 1
+#endif
+
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "muser.h"
+#include "common.h"
+
+typedef struct {
+ dma_addr_t dma_addr; // DMA address of this region
+ size_t size; // Size of this region
+ int fd; // File descriptor to mmap
+ int page_size; // Page size of this fd
+ off_t offset; // File offset
+#if DMA_MAP_FAST_IMPL
+ void *virt_addr; // Virtual address of this region
+#endif
+} dma_memory_region_t;
+
+typedef struct {
+ int max_regions;
+ int nregions;
+ dma_memory_region_t regions[0];
+} dma_controller_t;
+
+dma_controller_t *dma_controller_create(int max_regions);
+void dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma);
+
+/* Registers a new memory region.
+ * Returns:
+ * - On success, a non-negative region number
+ * - On failure, a negative integer (-x - 1) where x is the region number
+ * where this region would have been mapped to if the call could succeed
+ * (e.g. due to conflict with existing region).
+ */
+int dma_controller_add_region(lm_ctx_t * const ctx, dma_controller_t * dma,
+ dma_addr_t dma_addr, size_t size,
+ int fd, off_t offset);
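+/* e.g. a return of -3 means the conflict was at (or the region would have
+ * landed in) slot 2; callers recover the index with (-ret - 1). */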
+
+int dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr,
+ size_t size, int fd);
+
+// Helper for dma_addr_to_sg() slow path.
+int _dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len,
+ dma_scattergather_t * sg, int max_sg);
+
+/* Takes a linear dma address span and returns a sg list suitable for DMA.
+ * A single linear dma address span may need to be split into multiple
+ * scatter gather regions due to limitations of how memory can be mapped.
+ *
+ * Returns:
+ * - On success, number of scatter gather entries created.
+ * - On failure:
+ * -1 if the dma address span is invalid
+ * (-x - 1) if @max_sg is too small, where x is the number of sg entries
+ * necessary to complete this request.
+ */
+static inline int
+dma_addr_to_sg(lm_ctx_t * const ctx, const dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len,
+ dma_scattergather_t * sg, int max_sg)
+{
+ static __thread int region_hint;
+ int cnt;
+
+ const dma_memory_region_t *const region = &dma->regions[region_hint];
+ const dma_addr_t region_end = region->dma_addr + region->size;
+
+ // Fast path: single region.
+ if (likely(max_sg > 0 && len > 0 &&
+ dma_addr >= region->dma_addr && dma_addr + len <= region_end)) {
+ sg->region = region_hint;
+ sg->offset = dma_addr - region->dma_addr;
+ sg->length = len;
+ return 1;
+ }
+ // Slow path: search through regions.
+ cnt = _dma_addr_sg_split(ctx, dma, dma_addr, len, sg, max_sg);
+ if (likely(cnt > 0)) {
+ region_hint = sg->region;
+ }
+ return cnt;
+}
+
+void *dma_map_region(dma_memory_region_t * region, int prot,
+ size_t offset, size_t len);
+
+void dma_unmap_region(dma_memory_region_t * region,
+ void *virt_addr, size_t len);
+
+static inline int
+dma_map_sg(dma_controller_t * dma, int prot,
+ const dma_scattergather_t * sg, struct iovec *iov, int cnt)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dma_memory_region_t *const region = &dma->regions[sg[i].region];
+
+#if DMA_MAP_FAST_IMPL
+ iov[i].iov_base = (char *)region->virt_addr + sg[i].offset;
+#else
+ iov[i].iov_base = dma_map_region(region, prot,
+ sg[i].offset, sg[i].length);
+ if (iov[i].iov_base == MAP_FAILED) {
+ return -1;
+ }
+#endif
+ iov[i].iov_len = sg[i].length;
+ }
+
+ return 0;
+}
+
+static inline void
+dma_unmap_sg(dma_controller_t * dma,
+ const dma_scattergather_t * sg, struct iovec *iov, int cnt)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dma_memory_region_t *const region = &dma->regions[sg[i].region];
+ if (!DMA_MAP_FAST_IMPL) {
+ dma_unmap_region(region, iov[i].iov_base, iov[i].iov_len);
+ }
+ }
+}
+
+static inline void *
+dma_map_addr(lm_ctx_t * const ctx, dma_controller_t * dma, int prot,
+ dma_addr_t dma_addr, uint32_t len)
+{
+ dma_scattergather_t sg;
+ struct iovec iov;
+
+ if (dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1) == 1 &&
+ dma_map_sg(dma, prot, &sg, &iov, 1) == 0) {
+ return iov.iov_base;
+ }
+
+ return NULL;
+}
+
+static inline void
+dma_unmap_addr(lm_ctx_t * const ctx, dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len, void *addr)
+{
+ dma_scattergather_t sg;
+ struct iovec iov = {
+ .iov_base = addr,
+ .iov_len = len,
+ };
+ int r;
+
+ r = dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1);
+ assert(r == 1);
+ (void)r; /* silence unused-variable warning when NDEBUG is set */
+
+ dma_unmap_sg(dma, &sg, &iov, 1);
+}
+
+#endif /* DMA_DMA_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/libmuser.c b/lib/libmuser.c
new file mode 100644
index 0000000..ba016fe
--- /dev/null
+++ b/lib/libmuser.c
@@ -0,0 +1,1063 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <stdarg.h>
+
+#include "../kmod/muser.h"
+#include "muser.h"
+#include "dma.h"
+
+typedef enum {
+ IRQ_NONE = 0,
+ IRQ_INTX,
+ IRQ_MSI,
+ IRQ_MSIX,
+} irq_type_t;
+
+typedef struct {
+ irq_type_t type; /* irq type this device is using */
+ int err_efd; /* eventfd for irq err */
+ int req_efd; /* eventfd for irq req */
+ uint32_t max_ivs; /* maximum number of ivs supported */
+ int efds[0]; /* XXX must be last */
+} lm_irqs_t;
+
+/*
+ * Macro that ensures that a particular struct member is last. Doesn't work for
+ * flexible array members.
+ */
+#define MUST_BE_LAST(s, m, t) \
+ _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \
+ #t " " #m " must be last member in " #s)
+
+struct lm_ctx {
+ void *pvt;
+ dma_controller_t *dma;
+ int fd;
+ bool extended;
+ lm_fops_t fops;
+ lm_log_lvl_t log_lvl;
+ lm_log_fn_t *log;
+ lm_pci_info_t pci_info;
+ lm_pci_config_space_t *pci_config_space;
+ lm_irqs_t irqs; /* XXX must be last */
+};
+MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t);
+
+#define LM_CTX_SIZE(irqs) (sizeof(lm_ctx_t) + sizeof(int) * (irqs))
+#define LM2VFIO_IRQT(type) ((type) - 1)
+
+void lm_log(const lm_ctx_t * const ctx, const lm_log_lvl_t lvl,
+ const char *const fmt, ...)
+{
+ va_list ap;
+ char buf[BUFSIZ];
+
+ assert(ctx);
+
+ if (!ctx->log || lvl > ctx->log_lvl || !fmt) {
+ return;
+ }
+
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof buf, fmt, ap);
+ va_end(ap);
+ ctx->log(ctx->pvt, buf);
+}
+
+static long irqs_disable(lm_ctx_t * lm_ctx, uint32_t index)
+{
+ int *irq_efd = NULL;
+ uint32_t i;
+
+ assert(lm_ctx != NULL);
+ assert(index < LM_DEV_NUM_IRQS);
+
+ switch (index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ lm_ctx->irqs.type = IRQ_NONE;
+ for (i = 0; i < lm_ctx->irqs.max_ivs; i++) {
+ if (lm_ctx->irqs.efds[i] >= 0) {
+ (void) close(lm_ctx->irqs.efds[i]);
+ lm_ctx->irqs.efds[i] = -1;
+ }
+ }
+ return 0;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ irq_efd = &lm_ctx->irqs.err_efd;
+ break;
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ irq_efd = &lm_ctx->irqs.req_efd;
+ break;
+ }
+
+ if (irq_efd != NULL) {
+ (void)close(*irq_efd);
+ *irq_efd = -1;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
+{
+ int efd, i;
+ long ret;
+ eventfd_t val;
+
+ for (i = irq_set->start; i < irq_set->start + irq_set->count; i++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0) {
+ val = 1;
+ ret = eventfd_write(efd, val);
+ if (ret == -1) {
+ return -errno;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ uint8_t *d8;
+ int efd, i;
+ long ret;
+ eventfd_t val;
+
+ assert(data != NULL);
+ for (i = irq_set->start, d8 = data; i < irq_set->start + irq_set->count;
+ i++, d8++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0 && *d8 == 1) {
+ val = 1;
+ ret = eventfd_write(efd, val);
+ if (ret == -1) {
+ return -errno;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ int32_t *d32;
+ int efd, i;
+
+ assert(data != NULL);
+ for (i = irq_set->start, d32 = data; i < irq_set->start + irq_set->count;
+ i++, d32++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0) {
+ (void) close(efd);
+ lm_ctx->irqs.efds[i] = -1;
+ }
+ if (*d32 >= 0) {
+ lm_ctx->irqs.efds[i] = *d32;
+ }
+ }
+
+ return 0;
+}
+
+static long
+irqs_trigger(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ int err = 0;
+
+ assert(lm_ctx != NULL);
+ assert(irq_set != NULL);
+
+ if (irq_set->count == 0) {
+ return irqs_disable(lm_ctx, irq_set->index);
+ }
+
+ switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+ case VFIO_IRQ_SET_DATA_NONE:
+ err = irqs_set_data_none(lm_ctx, irq_set);
+ break;
+ case VFIO_IRQ_SET_DATA_BOOL:
+ err = irqs_set_data_bool(lm_ctx, irq_set, data);
+ break;
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ err = irqs_set_data_eventfd(lm_ctx, irq_set, data);
+ break;
+ }
+
+ return err;
+}
+
+static long
+dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
+{
+ lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+ uint32_t a_type, d_type;
+
+ assert(lm_ctx != NULL);
+ assert(irq_set != NULL);
+
+ // Separate action and data types from flags.
+ a_type = (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK);
+ d_type = (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK);
+
+ // Ensure index is within bounds.
+ if (irq_set->index >= LM_DEV_NUM_IRQS) {
+ return -EINVAL;
+ }
+
+ /* TODO make each condition a function */
+
+ // Only one of MASK/UNMASK/TRIGGER is valid.
+ if ((a_type != VFIO_IRQ_SET_ACTION_MASK) &&
+ (a_type != VFIO_IRQ_SET_ACTION_UNMASK) &&
+ (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) {
+ return -EINVAL;
+ }
+ // Only one of NONE/BOOL/EVENTFD is valid.
+ if ((d_type != VFIO_IRQ_SET_DATA_NONE) &&
+ (d_type != VFIO_IRQ_SET_DATA_BOOL) &&
+ (d_type != VFIO_IRQ_SET_DATA_EVENTFD)) {
+ return -EINVAL;
+ }
+ // Ensure irq_set's start and count are within bounds.
+ if ((irq_set->start >= pci_info->irq_count[irq_set->index]) ||
+ (irq_set->start + irq_set->count > pci_info->irq_count[irq_set->index])) {
+ return -EINVAL;
+ }
+ // Only TRIGGER is valid for ERR/REQ.
+ if (((irq_set->index == VFIO_PCI_ERR_IRQ_INDEX) ||
+ (irq_set->index == VFIO_PCI_REQ_IRQ_INDEX)) &&
+ (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) {
+ return -EINVAL;
+ }
+ // count == 0 is only valid with ACTION_TRIGGER and DATA_NONE.
+ if ((irq_set->count == 0) && ((a_type != VFIO_IRQ_SET_ACTION_TRIGGER) ||
+ (d_type != VFIO_IRQ_SET_DATA_NONE))) {
+ return -EINVAL;
+ }
+ // If IRQs are set, ensure index matches what's enabled for the device.
+ if ((irq_set->count != 0) && (lm_ctx->irqs.type != IRQ_NONE) &&
+ (irq_set->index != LM2VFIO_IRQT(lm_ctx->irqs.type))) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static long
+dev_set_irqs(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ long ret;
+
+ assert(lm_ctx != NULL);
+ assert(irq_set != NULL);
+
+ // Ensure irq_set is valid.
+ ret = dev_set_irqs_validate(lm_ctx, irq_set);
+ if (ret != 0) {
+ return ret;
+ }
+
+ switch (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK: // fallthrough
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ // We're always edge-triggered without un/mask support.
+ return 0;
+ }
+
+ return irqs_trigger(lm_ctx, irq_set, data);
+}
+
+static long dev_get_irqinfo(lm_ctx_t * lm_ctx, struct vfio_irq_info *irq_info)
+{
+ assert(lm_ctx != NULL);
+ assert(irq_info != NULL);
+ lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+ // Ensure provided argsz is sufficiently big and index is within bounds.
+ if ((irq_info->argsz < sizeof(struct vfio_irq_info)) ||
+ (irq_info->index >= LM_DEV_NUM_IRQS)) {
+ return -EINVAL;
+ }
+
+ irq_info->count = pci_info->irq_count[irq_info->index];
+ irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
+
+ return 0;
+}
+
+static long
+dev_get_reginfo(lm_ctx_t * lm_ctx, struct vfio_region_info *reg_info)
+{
+ assert(lm_ctx != NULL);
+ assert(reg_info != NULL);
+ lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+ // Ensure provided argsz is sufficiently big and index is within bounds.
+ if ((reg_info->argsz < sizeof(struct vfio_region_info)) ||
+ (reg_info->index >= LM_DEV_NUM_REGS)) {
+ return -EINVAL;
+ }
+
+ reg_info->offset = pci_info->reg_info[reg_info->index].offset;
+ reg_info->flags = pci_info->reg_info[reg_info->index].flags;
+ reg_info->size = pci_info->reg_info[reg_info->index].size;
+
+ lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", reg_info->index);
+ dump_buffer(lm_ctx, "", (unsigned char *)reg_info, sizeof *reg_info);
+
+ return 0;
+}
+
+static long dev_get_info(struct vfio_device_info *dev_info)
+{
+ assert(dev_info != NULL);
+
+ // Ensure provided argsz is sufficiently big.
+ if (dev_info->argsz < sizeof(struct vfio_device_info)) {
+ return -EINVAL;
+ }
+
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI | VFIO_DEVICE_FLAGS_RESET;
+ dev_info->num_regions = LM_DEV_NUM_REGS;
+ dev_info->num_irqs = LM_DEV_NUM_IRQS;
+
+ return 0;
+}
+
+static long
+do_muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
+{
+ int err = -ENOTSUP;
+
+ assert(lm_ctx != NULL);
+ switch (cmd_ioctl->vfio_cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ err = dev_get_info(&cmd_ioctl->data.dev_info);
+ break;
+ case VFIO_DEVICE_GET_REGION_INFO:
+ err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info);
+ break;
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info);
+ break;
+ case VFIO_DEVICE_SET_IRQS:
+ err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data);
+ break;
+ case VFIO_DEVICE_RESET:
+ if (lm_ctx->fops.reset) {
+ return lm_ctx->fops.reset(lm_ctx->pvt);
+ }
+ }
+
+ return err;
+}
+
+static int muser_dma_unmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+ int err;
+
+ lm_log(lm_ctx, LM_INF, "removing DMA region %lx-%lx\n",
+ cmd->mmap.request.start, cmd->mmap.request.end);
+
+ if (lm_ctx->dma == NULL) {
+ lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
+ cmd->mmap.response.addr = -1;
+ return -1;
+ }
+
+ err = dma_controller_remove_region(lm_ctx->dma,
+ cmd->mmap.request.start,
+ cmd->mmap.request.end -
+ cmd->mmap.request.start, lm_ctx->fd);
+ if (err != 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to remove DMA region %lx-%lx: %s\n",
+ cmd->mmap.request.start, cmd->mmap.request.end, strerror(err));
+ }
+
+ cmd->mmap.response.addr = err;
+
+ return err;
+}
+
+static int muser_dma_map(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+ int err;
+
+ lm_log(lm_ctx, LM_INF, "adding DMA region %lx-%lx\n",
+ cmd->mmap.request.start, cmd->mmap.request.end);
+
+ if (lm_ctx->dma == NULL) {
+ lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
+ cmd->mmap.response.addr = -1;
+ return -1;
+ }
+
+ if (cmd->mmap.request.start >= cmd->mmap.request.end) {
+ lm_log(lm_ctx, LM_ERR, "bad DMA region %lx-%lx\n",
+ cmd->mmap.request.start, cmd->mmap.request.end);
+ cmd->mmap.response.addr = -1;
+ return -1;
+ }
+ err = dma_controller_add_region(lm_ctx, lm_ctx->dma,
+ cmd->mmap.request.start,
+ cmd->mmap.request.end -
+ cmd->mmap.request.start, lm_ctx->fd, 0);
+ if (err < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to add DMA region %lx-%lx: %d\n",
+ cmd->mmap.request.start, cmd->mmap.request.end, err);
+ cmd->mmap.response.addr = -1;
+ return -1;
+ }
+
+ // TODO: Are we just abusing response.addr as a rc?
+ cmd->mmap.response.addr = 0;
+
+ return 0;
+}
+
+static int muser_mmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+ unsigned long addr;
+ unsigned long pgoff = cmd->mmap.request.pgoff;
+
+ addr = lm_ctx->fops.mmap(lm_ctx->pvt, pgoff);
+ cmd->mmap.response.addr = addr;
+
+ if ((void *)addr == MAP_FAILED) {
+ cmd->err = -1;
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+post_read(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd,
+ char *const data, const size_t offset, ssize_t ret)
+{
+ if (ret != (ssize_t)cmd->rw.count) {
+ /* FIXME shouldn't we still reply to the kernel in case of error? */
+ lm_log(lm_ctx, LM_ERR, "%s: bad fops read: %zd/%zu, %s\n",
+ __func__, ret, cmd->rw.count, strerror(errno));
+ return ret;
+ }
+
+ /*
+ * TODO the kernel will first copy the command and then will use the .buf
+ * pointer to copy the data. Does it make sense to use writev in order to
+ * get rid of the .buf member? The 1st element of the iovec will be the
+ * command and the 2nd the data.
+ */
+ cmd->rw.buf = data;
+ ret = write(lm_ctx->fd, cmd, sizeof(*cmd));
+ if (ret != (ssize_t)sizeof(*cmd)) {
+ lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %zd/%zu, %s\n",
+ __func__, ret, sizeof(*cmd), strerror(errno));
+ }
+ return ret;
+}
+
+int
+lm_get_region(lm_ctx_t * const lm_ctx, const loff_t pos, const size_t count,
+ loff_t * const off)
+{
+ assert(lm_ctx);
+ assert(off);
+ lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+ int i;
+
+ for (i = 0; i < LM_DEV_NUM_REGS; i++) {
+ const lm_reg_info_t * const reg_info = &pci_info->reg_info[i];
+ if (pos >= reg_info->offset) {
+ if (pos - reg_info->offset + count <= reg_info->size) {
+ *off = pos - reg_info->offset;
+ return i;
+ }
+ }
+ }
+ return -ENOENT;
+}
+
+static ssize_t
+do_access(lm_ctx_t * const lm_ctx, char * const buf, size_t count, loff_t pos,
+ const bool is_write)
+{
+ int idx;
+ loff_t offset;
+ int ret = -EINVAL;
+ lm_pci_info_t *pci_info;
+
+ assert(lm_ctx != NULL);
+ assert(buf != NULL);
+ assert(count > 0);
+
+ pci_info = &lm_ctx->pci_info;
+ idx = lm_get_region(lm_ctx, pos, count, &offset);
+ if (idx < 0) {
+ lm_log(lm_ctx, LM_ERR, "invalid region %d\n", idx);
+ return idx;
+ }
+
+ /*
+ * TODO we should check at device registration time that all necessary
+ * callbacks are there in order to avoid having to check at runtime
+ */
+ switch (idx) {
+ case LM_DEV_BAR0_REG_IDX ... LM_DEV_BAR5_REG_IDX:
+ if (pci_info->bar_fn)
+ return pci_info->bar_fn(lm_ctx->pvt, idx, buf, count, offset, is_write);
+ break;
+ case LM_DEV_ROM_REG_IDX:
+ if (pci_info->rom_fn)
+ return pci_info->rom_fn(lm_ctx->pvt, buf, count, offset, is_write);
+ break;
+ case LM_DEV_CFG_REG_IDX:
+ if (pci_info->pci_config_fn)
+ return pci_info->pci_config_fn(lm_ctx->pvt, buf, count, offset,
+ is_write);
+ break;
+ case LM_DEV_VGA_REG_IDX:
+ if (pci_info->vga_fn)
+ return pci_info->vga_fn(lm_ctx->pvt, buf, count, offset, is_write);
+ break;
+ default:
+ lm_log(lm_ctx, LM_ERR, "bad region %d\n", idx);
+ return ret;
+ }
+
+ if (is_write && lm_ctx->fops.write) {
+ ret = lm_ctx->fops.write(lm_ctx->pvt, idx, buf, count, pos);
+ } else if (lm_ctx->fops.read) {
+ ret = lm_ctx->fops.read(lm_ctx->pvt, idx, buf, count, pos);
+ } else {
+ lm_log(lm_ctx, LM_ERR, "no R/W callback, region %d, %x@%lx\n",
+ idx, count, pos);
+ }
+
+ return ret;
+}
+
+/*
+ * TODO function name is the same as the lm_access_t type, fix
+ */
+ssize_t
+lm_access(lm_ctx_t * const lm_ctx, char *buf, size_t count,
+ loff_t * const ppos, const bool is_write)
+{
+ unsigned int done = 0;
+ int ret;
+
+ assert(lm_ctx != NULL);
+ /* buf and ppos can be NULL if count is 0 */
+
+ while (count) {
+ size_t size;
+ if (count >= 8 && !(*ppos % 8)) {
+ size = 8;
+ } else if (count >= 4 && !(*ppos % 4)) {
+ size = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ size = 2;
+ } else {
+ size = 1;
+ }
+ ret = do_access(lm_ctx, buf, size, *ppos, is_write);
+ if (ret <= 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to %s %lx@%llx: %s\n",
+ is_write ? "write" : "read", *ppos, size, strerror(-ret));
+ return -EFAULT;
+ }
+ count -= size;
+ done += size;
+ *ppos += size;
+ buf += size;
+ }
+ return done;
+}
+
+
+static inline int
+muser_access(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd,
+ const bool is_write)
+{
+ char *data;
+ int err = 0;
+ size_t count = 0;
+ ssize_t ret;
+
+ /* TODO how big do we expect count to be? Can we use alloca(3) instead? */
+ data = calloc(1, cmd->rw.count);
+ if (data == NULL) {
+ lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n");
+ return -1;
+ }
+
+ lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count,
+ cmd->rw.pos);
+
+ /* copy data to be written from kernel to user space */
+ if (is_write) {
+ err = read(lm_ctx->fd, data, cmd->rw.count);
+ if (err != (int)cmd->rw.count) {
+ lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n",
+ strerror(errno));
+ err = -1;
+ goto out;
+ }
+ err = 0;
+ dump_buffer(lm_ctx, "buffer write", data, cmd->rw.count);
+ }
+
+ count = cmd->rw.count;
+ cmd->err = muser_pci_hdr_access(lm_ctx, &cmd->rw.count, &cmd->rw.pos,
+ is_write, data);
+ if (cmd->err) {
+ lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err);
+ }
+ count -= cmd->rw.count;
+ ret = lm_access(lm_ctx, data + count, cmd->rw.count, &cmd->rw.pos,
+ is_write);
+ if (!is_write) {
+ err = post_read(lm_ctx, cmd, data, count, ret);
+ dump_buffer(lm_ctx, "buffer read", data, cmd->rw.count);
+ }
+
+out:
+ free(data);
+
+ return err;
+}
+
+static int
+muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+ void *data = NULL;
+ size_t size = 0;
+ int ret;
+
+ /* TODO make this a function that returns the size */
+ if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) {
+ uint32_t flags = cmd->ioctl.data.irq_set.flags;
+ switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count;
+ break;
+ case VFIO_IRQ_SET_DATA_BOOL:
+ size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count;
+ break;
+ }
+ }
+
+ if (size != 0) {
+ data = calloc(1, size);
+ if (data == NULL) {
+#ifdef DEBUG
+ perror("calloc");
+#endif
+ return -1;
+ }
+
+ ret = read(lm_ctx->fd, data, size);
+ if (ret != (int)size) {
+#ifdef DEBUG
+ perror("read failed");
+#endif
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data);
+
+out:
+
+ free(data);
+ return ret;
+}
+
+static int drive_loop(lm_ctx_t *lm_ctx)
+{
+ struct muser_cmd cmd = { 0 };
+ int err;
+
+ do {
+ err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd);
+ if (err < 0) {
+ return err;
+ }
+
+ switch (cmd.type) {
+ case MUSER_IOCTL:
+ err = muser_ioctl(lm_ctx, &cmd);
+ break;
+ case MUSER_READ:
+ case MUSER_WRITE:
+ err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE);
+ break;
+ case MUSER_MMAP:
+ err = muser_mmap(lm_ctx, &cmd);
+ break;
+ case MUSER_DMA_MMAP:
+ err = muser_dma_map(lm_ctx, &cmd);
+ break;
+ case MUSER_DMA_MUNMAP:
+ err = muser_dma_unmap(lm_ctx, &cmd);
+ break;
+ default:
+ lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type);
+ continue;
+ }
+ cmd.err = err;
+ err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd);
+ if (err < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n",
+ strerror(errno));
+ }
+ // TODO: Figure out a clean way to get out of the loop.
+ } while (1);
+
+ return err;
+}
+
+int
+lm_ctx_drive(lm_ctx_t * lm_ctx)
+{
+
+ if (lm_ctx == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ return drive_loop(lm_ctx);
+}
+
+static int
+dev_detach(int dev_fd)
+{
+ return close(dev_fd);
+}
+
+static int
+dev_attach(const char *uuid)
+{
+ char *path;
+ int dev_fd;
+ int err;
+
+ err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid);
+ if (err == -1) {
+ return -1;
+ }
+
+ dev_fd = open(path, O_RDWR);
+
+ free(path);
+
+ return dev_fd;
+}
+
+void *
+lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset)
+{
+ off_t lm_off;
+
+ if ((lm_ctx == NULL) || (length == 0) || !PAGE_ALIGNED(offset)) {
+ errno = EINVAL;
+ return MAP_FAILED;
+ }
+
+ lm_off = offset | BIT(63);
+ return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
+ lm_ctx->fd, lm_off);
+}
+
+int
+lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector)
+{
+ eventfd_t val = 1;
+
+ if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (lm_ctx->irqs.efds[vector] == -1) {
+ errno = ENOENT;
+ return -1;
+ }
+
+ return eventfd_write(lm_ctx->irqs.efds[vector], val);
+}
+
+void
+lm_ctx_destroy(lm_ctx_t * lm_ctx)
+{
+ if (lm_ctx == NULL) {
+ return;
+ }
+
+ free(lm_ctx->pci_config_space);
+ dev_detach(lm_ctx->fd);
+ if (lm_ctx->dma != NULL) {
+ dma_controller_destroy(lm_ctx, lm_ctx->dma);
+ }
+ free(lm_ctx);
+ // FIXME: Maybe close any open irq efds? Unmap stuff?
+}
+
+static void
+init_pci_hdr(lm_pci_hdr_t * const hdr, const lm_pci_hdr_id_t * const id,
+ const lm_pci_hdr_cc_t * const cc)
+{
+ assert(hdr);
+ assert(id);
+ assert(cc);
+
+ hdr->id = *id;
+ hdr->cc = *cc;
+
+ hdr->ss.vid = hdr->id.vid;
+ hdr->ss.sid = hdr->id.did;
+}
+
+lm_ctx_t *
+lm_ctx_create(lm_dev_info_t * const dev_info)
+{
+ lm_ctx_t *lm_ctx;
+ uint32_t max_ivs = 0;
+ uint32_t i;
+ int err = 0;
+ size_t size;
+
+ if (dev_info == NULL) {
+ err = EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < LM_DEV_NUM_IRQS; i++) {
+ if (max_ivs < dev_info->pci_info.irq_count[i]) {
+ max_ivs = dev_info->pci_info.irq_count[i];
+ }
+ }
+
+ lm_ctx = calloc(1, LM_CTX_SIZE(max_ivs));
+ if (lm_ctx == NULL) {
+ err = errno;
+ goto out;
+ }
+
+ memcpy(&lm_ctx->pci_info, &dev_info->pci_info, sizeof(lm_pci_info_t));
+
+ lm_ctx->fd = dev_attach(dev_info->uuid);
+ if (lm_ctx->fd == -1) {
+ err = errno;
+ goto out;
+ }
+
+ if (dev_info->nr_dma_regions > 0) {
+ lm_ctx->dma = dma_controller_create(dev_info->nr_dma_regions);
+ if (lm_ctx->dma == NULL) {
+ err = errno;
+ goto out;
+ }
+ }
+
+ lm_ctx->pci_info.irq_count[LM_DEV_ERR_IRQ_IDX] = 1;
+ lm_ctx->pci_info.irq_count[LM_DEV_REQ_IRQ_IDX] = 1;
+
+ lm_ctx->extended = dev_info->extended;
+ if (lm_ctx->extended) {
+ size = PCI_EXTENDED_CONFIG_SPACE_SIZEOF;
+ } else {
+ size = PCI_CONFIG_SPACE_SIZEOF;
+ }
+ lm_ctx->pci_config_space = calloc(size, 1);
+ if (!lm_ctx->pci_config_space) {
+ err = errno;
+ goto out;
+ }
+
+ init_pci_hdr(&lm_ctx->pci_config_space->hdr, &dev_info->id, &dev_info->cc);
+ for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_config_space->hdr.bars); i++) {
+ if ((dev_info->pci_info.reg_info[i].flags & LM_REG_FLAG_MEM) == 0) {
+ lm_ctx->pci_config_space->hdr.bars[i].io.region_type |= 0x1;
+ }
+ }
+
+ lm_ctx->fops = dev_info->fops;
+ lm_ctx->pvt = dev_info->pvt;
+
+ for (i = 0; i < max_ivs; i++) {
+ lm_ctx->irqs.efds[i] = -1;
+ }
+ lm_ctx->irqs.err_efd = -1;
+ lm_ctx->irqs.req_efd = -1;
+ lm_ctx->irqs.type = IRQ_NONE;
+ lm_ctx->irqs.max_ivs = max_ivs;
+
+ lm_ctx->log = dev_info->log;
+ lm_ctx->log_lvl = dev_info->log_lvl;
+
+ lm_ctx->pci_info.bar_fn = dev_info->pci_info.bar_fn;
+ lm_ctx->pci_info.rom_fn = dev_info->pci_info.rom_fn;
+ lm_ctx->pci_info.pci_config_fn = dev_info->pci_info.pci_config_fn;
+ lm_ctx->pci_info.vga_fn = dev_info->pci_info.vga_fn;
+
+out:
+ if (err) {
+ if (lm_ctx) {
+ if (lm_ctx->dma != NULL) {
+ dma_controller_destroy(lm_ctx, lm_ctx->dma);
+ }
+ if (lm_ctx->fd != -1) {
+ dev_detach(lm_ctx->fd);
+ }
+ free(lm_ctx->pci_config_space);
+ free(lm_ctx);
+ lm_ctx = NULL;
+ }
+ errno = err;
+ }
+ return lm_ctx;
+}
+
+void
+dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+ unsigned char const *const buf, const uint32_t count)
+{
+#ifdef DEBUG
+ int i;
+ const size_t bytes_per_line = 0x8;
+
+ if (strcmp(prefix, "")) {
+ lm_log(lm_ctx, LM_DBG, "%s\n", prefix);
+ }
+ for (i = 0; i < (int)count; i++) {
+ if (i % bytes_per_line != 0) {
+ lm_log(lm_ctx, LM_DBG, " ");
+ }
+ /* TODO valgrind emits a warning if count is 1 */
+ lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i));
+ if ((i + 1) % bytes_per_line == 0) {
+ lm_log(lm_ctx, LM_DBG, "\n");
+ }
+ }
+ if (i % bytes_per_line != 0) {
+ lm_log(lm_ctx, LM_DBG, "\n");
+ }
+#endif
+}
+
+/*
+ * Returns a pointer to the standard part of the PCI configuration space.
+ */
+inline lm_pci_config_space_t *
+lm_get_pci_config_space(lm_ctx_t * const lm_ctx)
+{
+ assert(lm_ctx != NULL);
+ return lm_ctx->pci_config_space;
+}
+
+/*
+ * Returns a pointer to the non-standard part of the PCI configuration space.
+ */
+inline uint8_t *
+lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx)
+{
+ assert(lm_ctx != NULL);
+ return (uint8_t *)&lm_ctx->pci_config_space->non_std;
+}
+
+inline lm_reg_info_t *
+lm_get_region_info(lm_ctx_t * const lm_ctx)
+{
+ assert(lm_ctx != NULL);
+ return lm_ctx->pci_info.reg_info;
+}
+
+inline int
+lm_addr_to_sg(lm_ctx_t * const lm_ctx, dma_addr_t dma_addr,
+ uint32_t len, dma_scattergather_t * sg, int max_sg)
+{
+ return dma_addr_to_sg(lm_ctx, lm_ctx->dma, dma_addr, len, sg, max_sg);
+}
+
+inline int
+lm_map_sg(lm_ctx_t * const lm_ctx, int prot,
+ const dma_scattergather_t * sg, struct iovec *iov, int cnt)
+{
+ return dma_map_sg(lm_ctx->dma, prot, sg, iov, cnt);
+}
+
+inline void
+lm_unmap_sg(lm_ctx_t * const lm_ctx, const dma_scattergather_t * sg,
+ struct iovec *iov, int cnt)
+{
+ return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt);
+}
+
+int
+lm_ctx_run(lm_ctx_t * const lm_ctx)
+{
+ int ret = lm_ctx_drive(lm_ctx);
+
+ lm_ctx_destroy(lm_ctx);
+ return ret;
+}
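+
+/*
+ * Illustrative sketch (not part of this patch) of a minimal consumer of the
+ * API above; the uuid and the elided reg_info contents are placeholders.
+ *
+ *   lm_dev_info_t info = {
+ *       .uuid = "...",
+ *       .pci_info = {
+ *           .irq_count = { 1 },   // a single INTx vector
+ *           .reg_info = { ... },  // BARs and config space geometry
+ *       },
+ *   };
+ *   lm_ctx_t *ctx = lm_ctx_create(&info);
+ *
+ *   if (ctx != NULL) {
+ *       lm_ctx_run(ctx);  // drive the device; destroys ctx on return
+ *   }
+ */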
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/libmuser_pci.c b/lib/libmuser_pci.c
new file mode 100644
index 0000000..df45336
--- /dev/null
+++ b/lib/libmuser_pci.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/param.h>
+#include <errno.h>
+
+#include <linux/pci_regs.h>
+#include <linux/vfio.h>
+
+#include "muser.h"
+#include "pci.h"
+#include "common.h"
+
+static inline void
+muser_pci_hdr_write_bar(lm_ctx_t * const lm_ctx, const uint16_t bar_index,
+ const char *const buf)
+{
+ uint32_t cfg_addr;
+ uint32_t *bar;
+ unsigned long mask;
+ lm_reg_info_t *reg_info;
+
+ assert(lm_ctx);
+
+ reg_info = lm_get_region_info(lm_ctx);
+ if (reg_info[bar_index].size == 0) {
+ return;
+ }
+
+ bar = (uint32_t *)&lm_get_pci_config_space(lm_ctx)->hdr.bars[bar_index];
+ cfg_addr = *(uint32_t *) buf;
+
+ lm_log(lm_ctx, LM_DBG, "BAR%d addr 0x%x\n", bar_index, cfg_addr);
+
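+ /*
+ * BAR sizing: after the guest writes all ones, reads of the BAR must
+ * return the size mask; for a power-of-two region size that is its
+ * two's complement, per the PCI spec.
+ */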
+ if (cfg_addr == 0xffffffff) {
+ cfg_addr = ~(reg_info[bar_index].size) + 1;
+ }
+
+ if ((reg_info[bar_index].flags & LM_REG_FLAG_MEM)) {
+ mask = PCI_BASE_ADDRESS_MEM_MASK;
+ } else {
+ mask = PCI_BASE_ADDRESS_IO_MASK;
+ }
+ cfg_addr |= (*bar & ~mask);
+
+ *bar = htole32(cfg_addr);
+}
+
+#define BAR_INDEX(offset) (((offset) - PCI_BASE_ADDRESS_0) >> 2)
+
+static int
+handle_command_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci,
+ const char * const buf, const size_t count)
+{
+ uint16_t v;
+
+ assert(ctx);
+
+ if (count != 2) {
+ lm_log(ctx, LM_ERR, "bad write command size %d\n", count);
+ return -EINVAL;
+ }
+
+ assert(pci);
+ assert(buf);
+
+ v = *(uint16_t*)buf;
+
+ if ((v & PCI_COMMAND_IO) == PCI_COMMAND_IO) {
+ if (!pci->hdr.cmd.iose) {
+ pci->hdr.cmd.iose = 0x1;
+ lm_log(ctx, LM_INF, "I/O space enabled\n");
+ }
+ v &= ~PCI_COMMAND_IO;
+ } else {
+ if (pci->hdr.cmd.iose) {
+ pci->hdr.cmd.iose = 0x0;
+ lm_log(ctx, LM_INF, "I/O space disabled\n");
+ }
+ }
+
+ if ((v & PCI_COMMAND_MEMORY) == PCI_COMMAND_MEMORY) {
+ if (!pci->hdr.cmd.mse) {
+ pci->hdr.cmd.mse = 0x1;
+ lm_log(ctx, LM_INF, "memory space enabled\n");
+ }
+ v &= ~PCI_COMMAND_MEMORY;
+ } else {
+ if (pci->hdr.cmd.mse) {
+ pci->hdr.cmd.mse = 0x0;
+ lm_log(ctx, LM_INF, "memory space disabled\n");
+ }
+ }
+
+ if ((v & PCI_COMMAND_MASTER) == PCI_COMMAND_MASTER) {
+ if (!pci->hdr.cmd.bme) {
+ pci->hdr.cmd.bme = 0x1;
+ lm_log(ctx, LM_INF, "bus master enabled\n");
+ }
+ v &= ~PCI_COMMAND_MASTER;
+ } else {
+ if (pci->hdr.cmd.bme) {
+ pci->hdr.cmd.bme = 0x0;
+ lm_log(ctx, LM_INF, "bus master disabled\n");
+ }
+ }
+
+ if ((v & PCI_COMMAND_SERR) == PCI_COMMAND_SERR) {
+ if (!pci->hdr.cmd.see) {
+ pci->hdr.cmd.see = 0x1;
+ lm_log(ctx, LM_INF, "SERR# enabled\n");
+ }
+ v &= ~PCI_COMMAND_SERR;
+ } else {
+ if (pci->hdr.cmd.see) {
+ pci->hdr.cmd.see = 0x0;
+ lm_log(ctx, LM_INF, "SERR# disabled\n");
+ }
+ }
+
+ if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) {
+ if (!pci->hdr.cmd.id) {
+ pci->hdr.cmd.id = 0x1;
+ lm_log(ctx, LM_INF, "INTx emulation enabled\n");
+ }
+ v &= ~PCI_COMMAND_INTX_DISABLE;
+ } else {
+ if (pci->hdr.cmd.id) {
+ pci->hdr.cmd.id = 0x0;
+ lm_log(ctx, LM_INF, "INTx emulation disabled\n");
+ }
+ }
+
+ if (v) {
+ lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+handle_erom_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci,
+ const char *const buf, const size_t count)
+{
+ uint32_t v;
+
+ assert(ctx);
+ assert(pci);
+
+ if (count != 0x4) {
+ lm_log(ctx, LM_ERR, "bad EROM count %d\n", count);
+ return -EINVAL;
+ }
+ v = *(uint32_t*)buf;
+
+ if (v == (uint32_t)PCI_ROM_ADDRESS_MASK) {
+ lm_log(ctx, LM_INF, "write mask to EROM ignored\n");
+ } else if (v == 0) {
+ lm_log(ctx, LM_INF, "cleared EROM\n");
+ pci->hdr.erom = 0;
+ } else if (v == ~PCI_ROM_ADDRESS_ENABLE) {
+ lm_log(ctx, LM_INF, "EROM disable ignored\n");
+ } else {
+ lm_log(ctx, LM_ERR, "bad write to EROM 0x%x bytes\n", v);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static inline int
+muser_pci_hdr_write(lm_ctx_t * const lm_ctx, const uint16_t offset,
+ const char *const buf, const size_t count)
+{
+ uint32_t *bar;
+ lm_pci_config_space_t *pci;
+ int ret = 0;
+
+ assert(lm_ctx);
+ assert(buf);
+
+ pci = lm_get_pci_config_space(lm_ctx);
+
+ switch (offset) {
+ case PCI_COMMAND:
+ ret = handle_command_write(lm_ctx, pci, buf, count);
+ break;
+ case PCI_STATUS:
+ lm_log(lm_ctx, LM_INF, "write to status ignored\n");
+ break;
+ case PCI_INTERRUPT_PIN:
+ lm_log(lm_ctx, LM_ERR, "attempt to write read-only field IPIN\n");
+ ret = -EINVAL;
+ break;
+ case PCI_INTERRUPT_LINE:
+ pci->hdr.intr.iline = buf[0];
+ break;
+ case PCI_LATENCY_TIMER:
+ pci->hdr.mlt = (uint8_t)buf[0];
+ lm_log(lm_ctx, LM_INF, "set to latency timer to %hhx\n", pci->hdr.mlt);
+ break;
+ case PCI_BASE_ADDRESS_0:
+ case PCI_BASE_ADDRESS_1:
+ case PCI_BASE_ADDRESS_2:
+ case PCI_BASE_ADDRESS_3:
+ case PCI_BASE_ADDRESS_4:
+ case PCI_BASE_ADDRESS_5:
+ muser_pci_hdr_write_bar(lm_ctx, BAR_INDEX(offset), buf);
+ break;
+ case PCI_ROM_ADDRESS:
+ ret = handle_erom_write(lm_ctx, pci, buf, count);
+ break;
+ default:
+ lm_log(lm_ctx, LM_INF, "PCI config write %x@%x not handled\n",
+ count, offset);
+ ret = -EINVAL;
+ }
+
+ dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, sizeof(pci->hdr.raw));
+
+ return ret;
+}
+
+/*
+ * Handles a read from or write to the standard PCI header.
+ *
+ * @lm_ctx: the libmuser context
+ * @count: in/out: bytes remaining in the access; reduced by the number of
+ * bytes handled here
+ * @pos: in/out: current offset; advanced by the number of bytes handled
+ * @is_write: whether this is a write
+ * @buf: source data for a write, destination buffer for a read
+ */
+static inline int
+muser_do_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+ loff_t * const pos, const bool is_write,
+ unsigned char *const buf)
+{
+ size_t _count;
+ loff_t _pos;
+ int err = 0;
+
+ assert(lm_ctx);
+ assert(count);
+ assert(pos);
+ assert(buf);
+
+ _pos = *pos - lm_get_region_info(lm_ctx)[LM_DEV_CFG_REG_IDX].offset;
+ _count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos);
+
+ if (is_write) {
+ err = muser_pci_hdr_write(lm_ctx, _pos, buf, _count);
+ } else {
+ memcpy(buf, lm_get_pci_config_space(lm_ctx)->hdr.raw + _pos, _count);
+ }
+ *pos += _count;
+ *count -= _count;
+ return err;
+}
+
+static inline bool
+muser_is_pci_hdr_access(const lm_reg_info_t * const reg_info, const loff_t pos)
+{
+ const off_t off = (loff_t) reg_info[LM_DEV_CFG_REG_IDX].offset;
+ return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF;
+}
+
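+/*
+ * Handles the part of an access that falls within the standard PCI header.
+ * Accesses that lie entirely outside the header are left untouched for the
+ * caller to process.
+ */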
+int
+muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+ loff_t * const pos, const bool is_write,
+ unsigned char *const buf)
+{
+ assert(lm_ctx);
+ assert(count);
+ assert(pos);
+
+ if (!muser_is_pci_hdr_access(lm_get_region_info(lm_ctx), *pos)) {
+ return 0;
+ }
+ return muser_do_pci_hdr_access(lm_ctx, count, pos, is_write, buf);
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/msicap.h b/lib/msicap.h
new file mode 100644
index 0000000..bfcf1cd
--- /dev/null
+++ b/lib/msicap.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+struct mid {
+ unsigned int cid:8;
+ unsigned int next:8;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct mid) == 0x2, "bad MID size");
+
+struct mc {
+ unsigned int msie:1;
+ unsigned int mmc:3;
+ unsigned int mme:3;
+ unsigned int c64:1;
+ unsigned int pvm:1;
+ unsigned int res1:7;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct mc) == 0x2, "bad MC size");
+
+struct ma {
+ unsigned int res1:2;
+ unsigned int addr:30;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct ma) == 0x4, "bad MA size");
+
+struct msicap {
+ struct mid mid;
+ struct mc mc;
+ struct ma ma;
+ uint32_t mua;
+ uint16_t md;
+ uint16_t padding;
+ uint32_t mmask;
+ uint32_t mpend;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size");
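+
+/*
+ * Usage sketch (illustrative only, not part of this header): before
+ * signalling MSI, a device model would typically check that the guest has
+ * enabled MSI and that the vector falls within the enabled count, e.g.:
+ *
+ * static inline bool
+ * msi_vector_ok(const struct msicap *cap, unsigned int vector)
+ * {
+ * return cap->mc.msie && vector < (1u << cap->mc.mme);
+ * }
+ */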
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/muser.h b/lib/muser.h
new file mode 100644
index 0000000..a844f5c
--- /dev/null
+++ b/lib/muser.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LIB_MUSER_H
+#define LIB_MUSER_H
+
+#include <stdint.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "pci.h"
+
+/**
+ * lm_fops_t - driver callbacks
+ *
+ * @read: read from a device region
+ * @write: write to a device region
+ * @mmap: mmap a device region
+ * @reset: reset the device
+ */
+typedef struct {
+ ssize_t (*read) (void *pvt, const int index, char *buf, size_t count,
+ loff_t pos);
+ ssize_t (*write) (void *pvt, const int index, char *buf, size_t count,
+ loff_t pos);
+ unsigned long (*mmap) (void *pvt, unsigned long pgoff);
+ int (*reset) (void *pvt);
+} lm_fops_t;
+
+
+/**
+ * Callback function signatures for each region type.
+ *
+ * @lm_bar_access_t: typedef for BAR access functions.
+ * @lm_non_bar_access_t: typedef for non-BAR (ROM, PCI config,
+ * VGA) access functions.
+ */
+typedef ssize_t (lm_bar_access_t) (void *pvt, const int region_index,
+ char * const buf, size_t count,
+ loff_t offset, const bool is_write);
+typedef ssize_t (lm_non_bar_access_t) (void *pvt, char * const buf,
+ size_t count, loff_t offset,
+ const bool is_write);
+typedef struct {
+ uint32_t irq_count[LM_DEV_NUM_IRQS];
+ lm_reg_info_t reg_info[LM_DEV_NUM_REGS];
+
+ /* Optional PCI region access callbacks. */
+ lm_bar_access_t *bar_fn;
+ lm_non_bar_access_t *rom_fn;
+ lm_non_bar_access_t *pci_config_fn;
+ lm_non_bar_access_t *vga_fn;
+} lm_pci_info_t;
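+
+/*
+ * Usage sketch (illustrative only): a BAR access callback backing BAR0 with
+ * a plain buffer held behind the private pointer; "struct my_dev" and its
+ * "bar0" member are hypothetical, and bounds checking is elided for brevity.
+ *
+ * static ssize_t
+ * my_bar_access(void *pvt, const int region_index, char * const buf,
+ * size_t count, loff_t offset, const bool is_write)
+ * {
+ * struct my_dev *d = pvt;
+ *
+ * if (region_index != LM_DEV_BAR0_REG_IDX)
+ * return -ENOTSUP;
+ * if (is_write)
+ * memcpy(d->bar0 + offset, buf, count);
+ * else
+ * memcpy(buf, d->bar0 + offset, count);
+ * return count;
+ * }
+ */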
+
+/**
+ * Callback function signature for log function
+ *
+ * @lm_log_fn_t: typedef for log function.
+ */
+typedef void (lm_log_fn_t) (void *pvt, const char *const msg);
+
+/**
+ * Device information structure, used to create the lm_ctx.
+ * To be filled in and passed to lm_ctx_create().
+ */
+typedef struct {
+ char *uuid;
+ void *pvt;
+ /*
+ * whether an extended PCI configuration space should be created
+ */
+ bool extended;
+ int nr_dma_regions;
+ lm_log_fn_t *log;
+ lm_log_lvl_t log_lvl;
+ lm_fops_t fops;
+ lm_pci_hdr_id_t id;
+ lm_pci_hdr_cc_t cc;
+ lm_pci_info_t pci_info;
+} lm_dev_info_t;
+
+/**
+ * Creates libmuser context.
+ *
+ * Arguments:
+ * @dev_info: device information used to create the context.
+ */
+lm_ctx_t *lm_ctx_create(lm_dev_info_t * dev_info);
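+
+/*
+ * Typical lifecycle (illustrative sketch; the UUID and IDs below are
+ * placeholders):
+ *
+ * lm_dev_info_t info = {
+ * .uuid = "00000000-0000-0000-0000-000000000000",
+ * .id = { .vid = 0x1234, .did = 0x5678 },
+ * };
+ * lm_ctx_t *ctx = lm_ctx_create(&info);
+ * if (ctx != NULL) {
+ * lm_ctx_drive(ctx);
+ * lm_ctx_destroy(ctx);
+ * }
+ */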
+
+/**
+ * Destroys libmuser context.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to destroy.
+ */
+void lm_ctx_destroy(lm_ctx_t * lm_ctx);
+
+/**
+ * Once the lm_ctx is configured, lm_ctx_drive() drives it: it waits for
+ * commands coming from muser.ko and processes them.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to drive.
+ */
+int lm_ctx_drive(lm_ctx_t * lm_ctx);
+
+/**
+ * Creates a mapping of a BAR into the caller's virtual memory. It should be
+ * called from lm_fops_t->mmap.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to create the mapping from.
+ * @length: length of the mapping.
+ * @offset: offset of the mapping.
+ */
+void *lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset);
+
+/**
+ * Triggers an interrupt.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to trigger the interrupt on.
+ * @vector: vector to trigger the interrupt on.
+ */
+int lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector);
+
+/* Helper functions */
+
+int lm_ctx_run(lm_ctx_t * const ctx);
+
+uint8_t *lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx);
+
+int lm_addr_to_sg(lm_ctx_t * const ctx, dma_addr_t dma_addr, uint32_t len,
+ dma_scattergather_t * sg, int max_sg);
+
+int
+lm_map_sg(lm_ctx_t * const ctx, int prot, const dma_scattergather_t * sg,
+ struct iovec *iov, int cnt);
+
+void
+lm_unmap_sg(lm_ctx_t * const ctx, const dma_scattergather_t * sg,
+ struct iovec *iov, int cnt);
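+
+/*
+ * Usage sketch (illustrative only): reading guest memory at DMA address
+ * dma_addr into dst, assuming the range fits in a single scatter-gather
+ * entry; error handling is elided.
+ *
+ * dma_scattergather_t sg;
+ * struct iovec iov;
+ *
+ * if (lm_addr_to_sg(ctx, dma_addr, len, &sg, 1) == 1 &&
+ * lm_map_sg(ctx, PROT_READ, &sg, &iov, 1) == 0) {
+ * memcpy(dst, iov.iov_base, iov.iov_len);
+ * lm_unmap_sg(ctx, &sg, &iov, 1);
+ * }
+ */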
+
+int
+lm_get_region(lm_ctx_t * const ctx, const loff_t pos,
+ const size_t count, loff_t * const off);
+
+#ifdef DEBUG
+void
+dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+ unsigned char const *const buf, const uint32_t count);
+#endif
+
+#endif /* LIB_MUSER_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pci.h b/lib/pci.h
new file mode 100644
index 0000000..4b7132a
--- /dev/null
+++ b/lib/pci.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LIBMUSER_PCI_H
+#define LIBMUSER_PCI_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <linux/pci_regs.h>
+
+struct lm_ctx;
+typedef struct lm_ctx lm_ctx_t;
+
+typedef uint64_t dma_addr_t;
+
+typedef struct {
+ int region;
+ int length;
+ uint64_t offset;
+} dma_scattergather_t;
+
+typedef struct lm_reg_info lm_reg_info_t;
+typedef struct lm_pci_config_space lm_pci_config_space_t;
+
+typedef enum {
+ LM_ERR,
+ LM_INF,
+ LM_DBG
+} lm_log_lvl_t;
+
+#define PCI_CONFIG_SPACE_SIZEOF 0x100
+#define PCI_EXTENDED_CONFIG_SPACE_SIZEOF 0x1000
+
+enum {
+ LM_DEV_BAR0_REG_IDX,
+ LM_DEV_BAR1_REG_IDX,
+ LM_DEV_BAR2_REG_IDX,
+ LM_DEV_BAR3_REG_IDX,
+ LM_DEV_BAR4_REG_IDX,
+ LM_DEV_BAR5_REG_IDX,
+ LM_DEV_ROM_REG_IDX,
+ LM_DEV_CFG_REG_IDX,
+ LM_DEV_VGA_REG_IDX,
+ LM_DEV_NUM_REGS = 9
+};
+
+/*
+ * TODO many of these field sizes are already defined in pci_regs.h; use
+ * those instead?
+ */
+
+typedef union {
+ uint32_t raw;
+ struct {
+ uint16_t vid;
+ uint16_t sid;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_ss_t;
+_Static_assert(sizeof(lm_pci_hdr_ss_t) == 0x4, "bad SS size");
+
+typedef union {
+ uint8_t raw;
+} __attribute__ ((packed)) lm_pci_hdr_bist_t;
+_Static_assert(sizeof(lm_pci_hdr_bist_t) == 0x1, "bad BIST size");
+
+typedef union {
+ uint32_t raw;
+ union {
+ struct {
+ unsigned int region_type:1;
+ unsigned int locatable:2;
+ unsigned int prefetchable:1;
+ unsigned int base_address:28;
+ } __attribute__ ((packed)) mem;
+ struct {
+ unsigned int region_type:1;
+ unsigned int reserved:1;
+ unsigned int base_address:30;
+ } __attribute__ ((packed)) io;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_bar_t;
+_Static_assert(sizeof(lm_bar_t) == 0x4, "bad BAR size");
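+
+/*
+ * Sketch (illustrative): the base address encoded in a memory BAR occupies
+ * the top 28 bits, so the actual address is recovered by shifting:
+ *
+ * uint64_t base = (uint64_t)bar->mem.base_address << 4;
+ */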
+
+typedef union {
+ uint8_t raw;
+} __attribute__ ((packed)) lm_pci_hdr_htype_t;
+_Static_assert(sizeof(lm_pci_hdr_htype_t) == 0x1, "bad HTYPE size");
+
+typedef union {
+ uint8_t raw[3];
+ struct {
+ uint8_t pi;
+ uint8_t scc;
+ uint8_t bcc;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_cc_t;
+_Static_assert(sizeof(lm_pci_hdr_cc_t) == 0x3, "bad CC size");
+
+/* device status */
+typedef union {
+ uint16_t raw;
+ struct {
+ unsigned int res1:3;
+ unsigned int is:1;
+ unsigned int cl:1;
+ unsigned int c66:1;
+ unsigned int res2:1;
+ unsigned int fbc:1;
+ unsigned int dpd:1;
+ unsigned int devt:2;
+ unsigned int sta:1;
+ unsigned int rta:1;
+ unsigned int rma:1;
+ unsigned int sse:1;
+ unsigned int dpe:1;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_sts_t;
+_Static_assert(sizeof(lm_pci_hdr_sts_t) == 0x2, "bad STS size");
+
+typedef union {
+ uint16_t raw;
+ struct {
+ uint8_t iose:1;
+ uint8_t mse:1;
+ uint8_t bme:1;
+ uint8_t sce:1;
+ uint8_t mwie:1;
+ uint8_t vga:1;
+ uint8_t pee:1;
+ uint8_t zero:1;
+ uint8_t see:1;
+ uint8_t fbe:1;
+ uint8_t id:1;
+ uint8_t res1:5;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_cmd_t;
+_Static_assert(sizeof(lm_pci_hdr_cmd_t) == 0x2, "bad CMD size");
+
+typedef union {
+ uint32_t raw;
+ struct {
+ uint16_t vid;
+ uint16_t did;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_id_t;
+_Static_assert(sizeof(lm_pci_hdr_id_t) == 0x4, "bad ID size");
+
+typedef union {
+ uint16_t raw;
+ struct {
+ uint8_t iline;
+ uint8_t ipin;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_intr_t;
+_Static_assert(sizeof(lm_pci_hdr_intr_t) == 0x2, "bad INTR size");
+
+typedef union {
+ uint8_t raw[PCI_STD_HEADER_SIZEOF];
+ struct {
+ lm_pci_hdr_id_t id;
+ lm_pci_hdr_cmd_t cmd;
+ lm_pci_hdr_sts_t sts;
+ uint8_t rid;
+ lm_pci_hdr_cc_t cc;
+ uint8_t cls;
+ uint8_t mlt;
+ lm_pci_hdr_htype_t htype;
+ lm_pci_hdr_bist_t bist;
+#define PCI_BARS_NR 6
+ lm_bar_t bars[PCI_BARS_NR];
+ uint32_t ccptr;
+ lm_pci_hdr_ss_t ss;
+ uint32_t erom;
+ uint8_t cap;
+ uint8_t res1[7];
+ lm_pci_hdr_intr_t intr;
+ uint8_t mgnt;
+ uint8_t mlat;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_t;
+_Static_assert(sizeof(lm_pci_hdr_t) == 0x40, "bad PCI header size");
+
+typedef struct {
+ uint8_t raw[PCI_CONFIG_SPACE_SIZEOF - PCI_STD_HEADER_SIZEOF];
+} __attribute__ ((packed)) lm_pci_non_std_config_space_t;
+_Static_assert(sizeof(lm_pci_non_std_config_space_t) == 0xc0,
+ "bad non-standard PCI configuration space size");
+
+struct lm_pci_config_space {
+ union {
+ uint8_t raw[PCI_CONFIG_SPACE_SIZEOF];
+ struct {
+ lm_pci_hdr_t hdr;
+ lm_pci_non_std_config_space_t non_std;
+ } __attribute__ ((packed));
+ } __attribute__ ((packed));
+ uint8_t extended[];
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct lm_pci_config_space) == 0x100,
+ "bad PCI configuration space size");
+
+// Region flags.
+#define LM_REG_FLAG_READ (1 << 0)
+#define LM_REG_FLAG_WRITE (1 << 1)
+#define LM_REG_FLAG_MMAP (1 << 2) // TODO: how does this relate to an I/O BAR?
+#define LM_REG_FLAG_RW (LM_REG_FLAG_READ | LM_REG_FLAG_WRITE)
+#define LM_REG_FLAG_MEM (1 << 3) // if unset, the BAR is I/O
+
+struct lm_reg_info {
+ uint32_t flags;
+ uint32_t size;
+ uint64_t offset;
+};
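+
+/*
+ * Sketch (illustrative): describing a 4KB read/write, mmap-able memory BAR.
+ *
+ * lm_reg_info_t bar0 = {
+ * .flags = LM_REG_FLAG_RW | LM_REG_FLAG_MEM | LM_REG_FLAG_MMAP,
+ * .size = 0x1000,
+ * };
+ */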
+
+enum {
+ LM_DEV_INTX_IRQ_IDX,
+ LM_DEV_MSI_IRQ_IDX,
+ LM_DEV_MSIX_IRQ_IDX,
+ LM_DEV_ERR_IRQ_IDX,
+ LM_DEV_REQ_IRQ_IDX,
+ LM_DEV_NUM_IRQS = 5
+};
+
+/*
+ * Returns a pointer to the PCI configuration space.
+ */
+lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t * const lm_ctx);
+
+lm_reg_info_t *lm_get_region_info(lm_ctx_t * const lm_ctx);
+
+/*
+ * TODO the rest of these functions don't need to be public, put them in a
+ * private header file so libmuser.c can use them.
+ * TODO replace the "muser" prefix
+ */
+int
+muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+ loff_t * const pos, const bool write,
+ unsigned char *const buf);
+
+#endif /* LIBMUSER_PCI_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pmcap.h b/lib/pmcap.h
new file mode 100644
index 0000000..2757a3e
--- /dev/null
+++ b/lib/pmcap.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+struct pid {
+ unsigned int cid:8;
+ unsigned int next:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pid) == 0x2, "bad PID size");
+
+struct pc {
+ unsigned int vs:3;
+ unsigned int pmec:1;
+ unsigned int res:1;
+ unsigned int dsi:1;
+ unsigned int auxc:3;
+ unsigned int d1s:1;
+ unsigned int d2s:1;
+ unsigned int psup:5;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pc) == 0x2, "bad PC size");
+
+struct pmcs {
+ unsigned int ps:2;
+ unsigned int res1:1;
+ unsigned int nsfrst:1;
+ unsigned int res2:4;
+ unsigned int pmee:1;
+ unsigned int dse:4;
+ unsigned int dsc:2;
+ unsigned int pmes:1;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pmcs) == 0x2, "bad PMCS size");
+
+struct pmcap {
+ struct pid pid;
+ struct pc pc;
+ struct pmcs pmcs;
+} __attribute__((packed)) __attribute__ ((aligned(8)));
+_Static_assert(sizeof(struct pmcap) == 0x8, "bad PMCAP size");
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pxcap.h b/lib/pxcap.h
new file mode 100644
index 0000000..fbea685
--- /dev/null
+++ b/lib/pxcap.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+struct pxid {
+ unsigned int cid:8;
+ unsigned int next:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size");
+
+struct pxcap {
+ unsigned int ver:4;
+ unsigned int dpt:4;
+ unsigned int si:1;
+ unsigned int imn:5;
+ unsigned int res1:2;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxcap) == 0x2, "bad PXCAP size");
+
+struct pxdcap {
+ unsigned int mps:3;
+ unsigned int pfs:2;
+ unsigned int etfs:1;
+ unsigned int l0sl:3;
+ unsigned int l1l:3;
+ unsigned int per:1;
+ unsigned int res1:2;
+ unsigned int csplv:8;
+ unsigned int cspls:2;
+ unsigned int flrc:1;
+ unsigned int res2:3;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdcap) == 0x4, "bad PXDCAP size");
+
+union pxdc {
+ uint16_t raw;
+ struct {
+ unsigned int cere:1;
+ unsigned int nfere:1;
+ unsigned int fere:1;
+ unsigned int urre:1;
+ unsigned int ero:1;
+ unsigned int mps:3;
+ unsigned int ete:1;
+ unsigned int pfe:1;
+ unsigned int appme:1;
+ unsigned int ens:1;
+ unsigned int mrrs:3;
+ unsigned int iflr:1;
+ } __attribute__((packed));
+} __attribute__((packed));
+_Static_assert(sizeof(union pxdc) == 0x2, "bad PXDC size");
+
+/* TODO not defining individual fields for now since they all reset to 0 */
+struct pxds {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxds) == 0x2, "bad PXDS size");
+
+struct pxlcap {
+ unsigned int stuff:32;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxlcap) == 0x4, "bad PXLCAP size");
+
+struct pxlc {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxlc) == 0x2, "bad PXLC size");
+
+struct pxls {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxls) == 0x2, "bad PXLS size");
+
+struct pxdcap2 {
+ unsigned int ctrs:4;
+ unsigned int ctds:1;
+ unsigned int arifs:1;
+ unsigned int aors:1;
+ unsigned int aocs32:1;
+ unsigned int aocs64:1;
+ unsigned int ccs128:1;
+ unsigned int nprpr:1;
+ unsigned int ltrs:1;
+ unsigned int tphcs:2;
+ unsigned int obffs:2;
+ unsigned int effs:1;
+ unsigned int eetps:1;
+ unsigned int meetp:2;
+ unsigned int res1:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdcap2) == 0x4, "bad PXDCAP2 size");
+
+struct pxdc2 {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size");
+
+/* TODO name conflicts with PXCAP */
+struct PCI_Express_Capability {
+ struct pxid pxid;
+ struct pxcap pxcap;
+ struct pxdcap pxdcap;
+ union pxdc pxdc;
+ struct pxds pxds;
+ struct pxlcap pxlcap;
+ struct pxlc pxlc;
+ struct pxls pxls;
+ uint8_t pad[0x10];
+ struct pxdcap2 pxdcap2;
+ struct pxdc2 pxdc2;
+} __attribute__((packed));
+_Static_assert(sizeof(struct PCI_Express_Capability) == 0x2a,
+ "bad PCI Express Capability size");
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/patches/vfio.diff b/patches/vfio.diff
new file mode 100644
index 0000000..d19da2e
--- /dev/null
+++ b/patches/vfio.diff
@@ -0,0 +1,192 @@
+diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
+index a3030cd..ab1b82c 100644
+--- a/drivers/vfio/vfio.c
++++ b/drivers/vfio/vfio.c
+@@ -2019,15 +2019,24 @@ static int vfio_register_iommu_notifier(struct vfio_group *group,
+ int ret;
+
+ ret = vfio_group_add_container_user(group);
+- if (ret)
++ if (ret) {
++ pr_info("vfio_group_add_container_user failed with %d\n", ret);
+ return -EINVAL;
++ }
+
+ container = group->container;
+ driver = container->iommu_driver;
+- if (likely(driver && driver->ops->register_notifier))
++ if (likely(driver && driver->ops->register_notifier)) {
+ ret = driver->ops->register_notifier(container->iommu_data,
+- events, nb);
+- else
++ events, nb);
++ if (unlikely(!ret) && driver->ops->retro_notify) {
++ ret = driver->ops->retro_notify(container->iommu_data);
++ if (unlikely((ret & NOTIFY_BAD) == NOTIFY_BAD))
++ ret = -ENOTTY;
++ else
++ ret = 0;
++ }
++ } else
+ ret = -ENOTTY;
+
+ vfio_group_try_dissolve_container(group);
+@@ -2140,6 +2149,7 @@ int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
+ ret = vfio_register_group_notifier(group, events, nb);
+ break;
+ default:
++ pr_info("bad notification type %d\n", type);
+ ret = -EINVAL;
+ }
+
+diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
+index d0f731c..b47b8f96 100644
+--- a/drivers/vfio/vfio_iommu_type1.c
++++ b/drivers/vfio/vfio_iommu_type1.c
+@@ -558,8 +558,10 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ return -EINVAL;
+
+ /* Supported for v2 version only */
+- if (!iommu->v2)
++ if (!iommu->v2) {
++ pr_debug("non v2 IOMMU\n");
+ return -EACCES;
++ }
+
+ mutex_lock(&iommu->lock);
+
+@@ -1050,6 +1052,30 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+ return ret;
+ }
+
++static int vfio_dma_map_trigger_notifiers(struct vfio_iommu * const iommu,
++ struct vfio_dma const * const dma)
++
++{
++ struct vfio_iommu_type1_dma_map nb_map = {0};
++
++ BUG_ON(!iommu);
++ BUG_ON(!dma);
++
++ nb_map.flags = dma->prot;
++
++ if ((dma->prot & IOMMU_READ) == IOMMU_READ)
++ nb_map.flags |= VFIO_DMA_MAP_FLAG_READ;
++ if ((dma->prot & IOMMU_WRITE) == IOMMU_WRITE)
++ nb_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
++ nb_map.vaddr = dma->vaddr;
++ nb_map.iova = dma->iova;
++ nb_map.size = dma->size;
++
++ return blocking_notifier_call_chain(&iommu->notifier,
++ VFIO_IOMMU_NOTIFY_DMA_MAP,
++ &nb_map);
++}
++
+ static int vfio_dma_do_map(struct vfio_iommu *iommu,
+ struct vfio_iommu_type1_dma_map *map)
+ {
+@@ -1139,13 +1165,25 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
+ vfio_link_dma(iommu, dma);
+
+ /* Don't pin and map if container doesn't contain IOMMU capable domain*/
+- if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
++ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+ dma->size = size;
+- else
++ ret = 0;
++ } else
+ ret = vfio_pin_map_dma(iommu, dma, size);
+
+ out_unlock:
+ mutex_unlock(&iommu->lock);
++ /* FIXME is the following safe without having acquired the mutex? */
++ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) && !ret) {
++ ret = vfio_dma_map_trigger_notifiers(iommu, dma);
++ /* FIXME proceed or clean up and fail? */
++ if ((ret & NOTIFY_BAD) == NOTIFY_BAD) {
++ pr_debug("failed to trigger notifier(s): %d\n", ret);
++ ret = -EINVAL;
++ } else
++ ret = 0;
++ }
++
+ return ret;
+ }
+
+@@ -1504,8 +1542,11 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
+
+ dma = rb_entry(n, struct vfio_dma, node);
+
+- if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
++ if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) {
++ pr_debug("DMA region %llx-%llx still pinned\n",
++ dma->iova, dma->iova + dma->size);
+ break;
++ }
+ }
+ /* mdev vendor driver must unregister notifier */
+ WARN_ON(iommu->notifier.head);
+@@ -1740,7 +1781,7 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data,
+ struct vfio_iommu *iommu = iommu_data;
+
+ /* clear known events */
+- *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
++ *events &= ~(VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP);
+
+ /* refuse to register if still events remaining */
+ if (*events)
+@@ -1749,6 +1790,25 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data,
+ return blocking_notifier_chain_register(&iommu->notifier, nb);
+ }
+
++static int vfio_iommu_type1_retro_notify(void *iommu_data)
++{
++ int err = NOTIFY_OK;
++ struct vfio_iommu *iommu;
++ struct vfio_dma *pos, *n;
++
++ BUG_ON(!iommu_data);
++
++ iommu = (struct vfio_iommu*)iommu_data;
++
++ rbtree_postorder_for_each_entry_safe(pos, n, &iommu->dma_list, node) {
++ err = vfio_dma_map_trigger_notifiers(iommu, pos);
++ if ((err & NOTIFY_BAD) == NOTIFY_BAD)
++ break;
++ }
++
++ return err;
++}
++
+ static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
+ struct notifier_block *nb)
+ {
+@@ -1769,6 +1829,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
+ .unpin_pages = vfio_iommu_type1_unpin_pages,
+ .register_notifier = vfio_iommu_type1_register_notifier,
+ .unregister_notifier = vfio_iommu_type1_unregister_notifier,
++ .retro_notify = vfio_iommu_type1_retro_notify,
+ };
+
+ static int __init vfio_iommu_type1_init(void)
+diff --git a/include/linux/vfio.h b/include/linux/vfio.h
+index 66741ab0..10ee80b 100644
+--- a/include/linux/vfio.h
++++ b/include/linux/vfio.h
+@@ -85,6 +85,7 @@ struct vfio_iommu_driver_ops {
+ struct notifier_block *nb);
+ int (*unregister_notifier)(void *iommu_data,
+ struct notifier_block *nb);
++ int (*retro_notify)(void *iommu_data);
+ };
+
+ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
+@@ -118,6 +119,7 @@ enum vfio_notify_type {
+
+ /* events for VFIO_IOMMU_NOTIFY */
+ #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
++#define VFIO_IOMMU_NOTIFY_DMA_MAP BIT(1)
+
+ /* events for VFIO_GROUP_NOTIFY */
+ #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0)
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
new file mode 100644
index 0000000..d12a813
--- /dev/null
+++ b/samples/CMakeLists.txt
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+add_executable(test_read test_read.c)
+add_executable(test_mmap test_mmap.c)
diff --git a/samples/test_mmap.c b/samples/test_mmap.c
new file mode 100644
index 0000000..02c32f1
--- /dev/null
+++ b/samples/test_mmap.c
@@ -0,0 +1,199 @@
+/*
+ * Userspace mediated device sample application
+ *
+ * Copyright (c) 2019, Nutanix Inc. All rights reserved.
+ * Author: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/vfio.h>
+#include <limits.h>
+#include <assert.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#define VFIO_PATH "/dev/vfio/"
+#define VFIO_CTR_PATH VFIO_PATH "vfio"
+#define SYSFS_PCI_DEV_PATH "/sys/bus/pci/devices/"
+#define SYSFS_IOMMU_GROUP "/iommu_group"
+
+static int
+pci_group_id(const char *bdf)
+{
+ char *dev_path;
+ char group_path[PATH_MAX];
+ int group_id;
+
+ assert(bdf);
+
+ asprintf(&dev_path, SYSFS_PCI_DEV_PATH "%s" SYSFS_IOMMU_GROUP, bdf);
+ memset(group_path, 0, sizeof(group_path));
+ readlink(dev_path, group_path, sizeof(group_path));
+ free(dev_path);
+ sscanf(basename(group_path), "%d", &group_id);
+ return group_id;
+}
+
+static inline void*
+test_map_dma(const int fd, const unsigned long size, const unsigned long iova)
+{
+ int err;
+ struct vfio_iommu_type1_dma_map dma_map = {
+ .argsz = sizeof(dma_map),
+ .size = size,
+ .iova = iova,
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ };
+
+ /* Allocate some space and setup a DMA mapping */
+ /* FIXME it *must* be MAP_SHARED */
+ dma_map.vaddr = (unsigned long long)mmap(0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (dma_map.vaddr == (unsigned long long)MAP_FAILED) {
+ perror("failed to mmap DMA buffer");
+ return NULL;
+ }
+ printf("%llx\n", dma_map.vaddr);
+ strcpy((char*)dma_map.vaddr, "foo");
+
+ fprintf(stderr, "attempting to MAP_DMA IOVA=%llx\n", dma_map.iova);
+
+ err = ioctl(fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (err) {
+ fprintf(stderr, "failed to MAP_DMA: %d (errno=%d)", err, errno);
+ return NULL;
+ }
+ printf("[%s]\n", (char*)dma_map.vaddr);
+
+ return (void*)dma_map.vaddr;
+}
+
+static inline void
+test_unmap_dma(const int fd, const unsigned long size, const unsigned long iova)
+{
+ int err;
+ struct vfio_iommu_type1_dma_unmap dma_unmap = {
+ .argsz = sizeof dma_unmap,
+ .size = size,
+ .iova = iova,
+ .flags = 0
+ };
+
+ err = ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+ if (err) {
+ perror("UNMAP_DMA\n");
+ return;
+ }
+ printf("unmapped IOVA=%llx\n", dma_unmap.iova);
+}
+
+int main(int argc, char * argv[])
+{
+ int err, vfio_ctr_fd, vfio_grp_fd, vfio_dev_fd;
+ char *grp_path;
+#ifdef DEBUG
+ struct vfio_group_status grp_status;
+#endif
+ struct vfio_iommu_type1_info iommu_info;
+ void *dma_map_addr = NULL;
+
+ if (argc != 2) {
+ printf("Usage: %s <device bdf in full>\n", argv[0]);
+ printf(" ex: %s 0000:82:00.0\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ vfio_ctr_fd = open(VFIO_CTR_PATH, O_RDWR);
+ assert(vfio_ctr_fd >= 0);
+
+#ifdef DEBUG
+ err = ioctl(vfio_ctr_fd, VFIO_GET_API_VERSION);
+ assert(err == VFIO_API_VERSION);
+ err = ioctl(vfio_ctr_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+ assert(err == 1);
+#endif
+
+ // Open the VFIO entry for this device's IOMMU GROUP.
+ err = asprintf(&grp_path, VFIO_PATH "%d", pci_group_id(argv[1]));
+ assert(err > 0);
+ vfio_grp_fd = open(grp_path, O_RDWR);
+ assert(vfio_grp_fd >= 0);
+ free(grp_path);
+
+#ifdef DEBUG
+ // Ensure group is viable.
+ memset(&grp_status, 0, sizeof(grp_status));
+ grp_status.argsz = sizeof(grp_status);
+ err = ioctl(vfio_grp_fd, VFIO_GROUP_GET_STATUS, &grp_status);
+ assert(!err);
+ assert((grp_status.flags & VFIO_GROUP_FLAGS_VIABLE) == 1);
+#endif
+
+ // Add the group to the container.
+ err = ioctl(vfio_grp_fd, VFIO_GROUP_SET_CONTAINER, &vfio_ctr_fd);
+ assert(!err);
+
+ // Enable IOMMU type 1 on container.
+ err = ioctl(vfio_ctr_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
+ assert(!err);
+
+ // Fetch IOMMU information from VFIO.
+ memset(&iommu_info, 0, sizeof(iommu_info));
+ iommu_info.argsz = sizeof(iommu_info);
+ err = ioctl(vfio_ctr_fd, VFIO_IOMMU_GET_INFO, &iommu_info);
+ assert(!err);
+
+ // Get a device fd from VFIO.
+ vfio_dev_fd = ioctl(vfio_grp_fd, VFIO_GROUP_GET_DEVICE_FD, argv[1]);
+ assert(vfio_dev_fd >= 0);
+
+ void *p;
+ p = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE,
+ MAP_SHARED, vfio_dev_fd, 0);
+ assert(p != MAP_FAILED);
+ printf("%p\n", p);
+ printf("%s\n", (char*)p);
+
+ dma_map_addr = test_map_dma(vfio_ctr_fd, 4096, 0xdeadbeef000);
+ if (!dma_map_addr)
+ exit(EXIT_FAILURE);
+ test_unmap_dma(vfio_ctr_fd, 4096, 0xdeadbeef000);
+
+ return 0;
+}
diff --git a/samples/test_read.c b/samples/test_read.c
new file mode 100644
index 0000000..24af454
--- /dev/null
+++ b/samples/test_read.c
@@ -0,0 +1,233 @@
+/*
+ * Userspace mediated device sample application
+ *
+ * Copyright (c) 2019, Nutanix Inc. All rights reserved.
+ * Author: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/vfio.h>
+#include <limits.h>
+#include <assert.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#define VFIO_PATH "/dev/vfio/"
+#define VFIO_CTR_PATH VFIO_PATH "vfio"
+#define SYSFS_MUSER_DEV_PATH "/sys/class/muser/muser/"
+#define SYSFS_IOMMU_GROUP "/iommu_group"
+
+static int
+test_read(int vfio_dev_fd, off_t offset)
+{
+ ssize_t bytes;
+ char buf[256];
+ int i;
+
+ memset(buf, 0, sizeof(buf));
+ printf("* Reading %zu bytes\n", sizeof(buf));
+ bytes = pread(vfio_dev_fd, buf, sizeof(buf), offset);
+ assert(bytes == (ssize_t)sizeof(buf));
+ printf("** Read %zd bytes\n", bytes);
+
+ for (i = 0; i < (int)sizeof(buf); i++) {
+ if (i % 16 == 0) {
+ printf("%04X:", i);
+ }
+ printf(" %02hhX", buf[i]);
+ if (i % 16 == 15) {
+ printf("\n");
+ }
+ }
+ if (i % 16 != 0) {
+ printf("\n");
+ }
+
+ return 0;
+}
+
+static int
+pci_group_id(const char *uuid)
+{
+ char *dev_path;
+ char group_path[PATH_MAX];
+ int group_id;
+
+ assert(uuid != NULL);
+
+ asprintf(&dev_path, SYSFS_MUSER_DEV_PATH "%s" SYSFS_IOMMU_GROUP, uuid);
+ memset(group_path, 0, sizeof(group_path));
+ readlink(dev_path, group_path, sizeof(group_path));
+ free(dev_path);
+ sscanf(basename(group_path), "%d", &group_id);
+ return group_id;
+}
+
+int
+main(int argc, char * argv[])
+{
+ int vfio_ctr_fd, vfio_grp_fd, vfio_dev_fd;
+ char *grp_path;
+ int i;
+ int err;
+
+ if (argc != 2) {
+ printf("Usage: %s <muser_dev_uuid>\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ // Create a new VFIO container.
+ printf("* Creating new VFIO container...\n");
+ vfio_ctr_fd = open(VFIO_CTR_PATH, O_RDWR);
+ assert(vfio_ctr_fd >= 0);
+ printf("** vfio_ctr_fd = %d\n", vfio_ctr_fd);
+
+ // Ensure kernel VFIO is compatible.
+ printf("* Fetching VFIO API version...\n");
+ err = ioctl(vfio_ctr_fd, VFIO_GET_API_VERSION);
+ assert(err == VFIO_API_VERSION);
+
+ // Ensure VFIO supports TYPE1 IOMMU.
+ printf("* Checking for IOMMU TYPE1 extension in VFIO...\n");
+ err = ioctl(vfio_ctr_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+ assert(err == 1);
+
+ // Open the VFIO entry for this device's IOMMU GROUP.
+ err = asprintf(&grp_path, VFIO_PATH "%d", pci_group_id(argv[1]));
+ assert(err > 0);
+ printf("* Opening the VFIO group (%s)...\n", grp_path);
+ vfio_grp_fd = open(grp_path, O_RDWR);
+ assert(vfio_grp_fd >= 0);
+ printf("** vfio_grp_fd = %d\n", vfio_grp_fd);
+ free(grp_path);
+
+ // Ensure group is viable.
+ struct vfio_group_status grp_status;
+ printf("* Ensuring all devices in this group are bound to VFIO...\n");
+ memset(&grp_status, 0, sizeof(grp_status));
+ grp_status.argsz = sizeof(grp_status);
+ err = ioctl(vfio_grp_fd, VFIO_GROUP_GET_STATUS, &grp_status);
+ assert(!err);
+ assert((grp_status.flags & VFIO_GROUP_FLAGS_VIABLE) == 1);
+
+ // Add the group to the container.
+ printf("* Adding group to container...\n");
+ err = ioctl(vfio_grp_fd, VFIO_GROUP_SET_CONTAINER, &vfio_ctr_fd);
+ assert(!err);
+
+ // Enable IOMMU type 1 on container.
+ printf("* Setting IOMMU Type 1 on container...\n");
+ err = ioctl(vfio_ctr_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
+ assert(!err);
+
+ // Fetch IOMMU information from VFIO.
+ struct vfio_iommu_type1_info iommu_info;
+ printf("* Fetching IOMMU information...\n");
+ memset(&iommu_info, 0, sizeof(iommu_info));
+ iommu_info.argsz = sizeof(iommu_info);
+ err = ioctl(vfio_ctr_fd, VFIO_IOMMU_GET_INFO, &iommu_info);
+ assert(!err);
+
+ // Get a device fd from VFIO.
+ printf("* Getting a device (%s) fd from group...\n", argv[1]);
+ vfio_dev_fd = ioctl(vfio_grp_fd, VFIO_GROUP_GET_DEVICE_FD, argv[1]);
+ assert(vfio_dev_fd >= 0);
+ printf("** vfio_dev_fd = %d\n", vfio_dev_fd);
+
+ // Fetch device information.
+ printf("* Fetching device information...\n");
+ struct vfio_device_info dev_info;
+ memset(&dev_info, 0, sizeof(dev_info));
+ dev_info.argsz = sizeof(dev_info);
+ err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &dev_info);
+ assert(err == 0);
+ assert(dev_info.num_regions <= VFIO_PCI_NUM_REGIONS);
+
+ // Fetch region information for this device.
+ struct vfio_region_info reg_info[VFIO_PCI_NUM_REGIONS];
+ printf("* Fetching information for %u regions\n", dev_info.num_regions);
+ for (i = 0; i < (int)dev_info.num_regions; i++) {
+ memset(&reg_info[i], 0, sizeof(reg_info[i]));
+ reg_info[i].argsz = sizeof(reg_info[i]);
+ reg_info[i].index = i;
+ err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info[i]);
+ if (err != 0) {
+ // This region doesn't exist or isn't accessible.
+ printf("** %d: Region info unavailable\n", i);
+ memset(&reg_info[i], 0, sizeof(reg_info[i]));
+ } else {
+ printf("** %d: argsz=0x%X, flags=0x%X, index=0x%X, "
+ "size=0x%llX, offset=0x%llX\n",
+ i,
+ reg_info[i].argsz,
+ reg_info[i].flags,
+ reg_info[i].index,
+ reg_info[i].size,
+ reg_info[i].offset);
+ }
+ }
+
+ // Fetch irq information for this device.
+ struct vfio_irq_info irq_info[VFIO_PCI_NUM_IRQS];
+ printf("* Fetching information for %u irqs\n", dev_info.num_irqs);
+ for (i = 0; i < (int)dev_info.num_irqs; i++) {
+ memset(&irq_info[i], 0, sizeof(irq_info[i]));
+ irq_info[i].argsz = sizeof(irq_info[i]);
+ irq_info[i].index = i;
+ err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info[i]);
+ if (err != 0) {
+ // This irq doesn't exist or isn't accessible.
+ printf("** %d: Irq info unavailable\n", i);
+ memset(&irq_info[i], 0, sizeof(irq_info[i]));
+ } else {
+ printf("** %d: argsz=0x%X, flags=0x%X, index=0x%X, count=%u\n",
+ i,
+ irq_info[i].argsz,
+ irq_info[i].flags,
+ irq_info[i].index,
+ irq_info[i].count);
+ }
+ }
+
+ // Test.
+ err = test_read(vfio_dev_fd, reg_info[VFIO_PCI_CONFIG_REGION_INDEX].offset);
+ assert(!err);
+
+ return 0;
+}